From c6e39b7f334d0fa51c56c302511b697b207b454d Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 14 Mar 2023 18:50:54 +0200 Subject: [PATCH 01/41] Add dhp-impact-indicators --- .../dhp-impact-indicators/README.txt | 13 ++++++ dhp-workflows/dhp-impact-indicators/pom.xml | 41 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100644 dhp-workflows/dhp-impact-indicators/README.txt create mode 100644 dhp-workflows/dhp-impact-indicators/pom.xml diff --git a/dhp-workflows/dhp-impact-indicators/README.txt b/dhp-workflows/dhp-impact-indicators/README.txt new file mode 100644 index 000000000..788534c02 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/README.txt @@ -0,0 +1,13 @@ + + +## Checkout a specific release of the BIP-Ranker git repository + +* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. + +* Then perform the checkout with: + +``` +mvn scm:checkout +``` + +* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml new file mode 100644 index 000000000..b827f42a4 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -0,0 +1,41 @@ + + + 4.0.0 + + eu.dnetlib.dhp + dhp + 1.2.5-SNAPSHOT + ../pom.xml + + + dhp-impact-indicators + + + 8 + 8 + UTF-8 + + + + https://github.com/athenarc/Bip-Ranker + https://github.com/athenarc/Bip-Ranker.git + + + + + + org.apache.maven.plugins + maven-scm-plugin + 1.8.1 + + connection + 2 + tag + ${project.build.directory}/../src/main/bip-ranker + + + + + \ No newline at end of file From 720fd19b3957bdc4d746778b7ab2fc306c4b2d14 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 14 Mar 2023 19:28:27 +0200 Subject: [PATCH 02/41] Add dhp-impact-indicators workflow files --- dhp-workflows/dhp-impact-indicators/README.md | 23 + .../dhp-impact-indicators/README.txt | 13 - dhp-workflows/dhp-impact-indicators/pom.xml | 6 +- .../create_openaire_ranking_graph.py | 234 ++++++ .../main/resources/format_ranking_results.py | 770 ++++++++++++++++++ .../src/main/resources/get_ranking_files.sh | 14 + .../src/main/resources/job.properties | 86 ++ .../resources/map_openaire_ids_to_dois.py | 60 ++ .../src/main/resources/map_scores_to_dois.py | 145 ++++ .../src/main/resources/workflow.xml | 600 ++++++++++++++ 10 files changed, 1935 insertions(+), 16 deletions(-) create mode 100644 dhp-workflows/dhp-impact-indicators/README.md delete mode 100644 dhp-workflows/dhp-impact-indicators/README.txt create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md new file mode 100644 index 000000000..14f489da3 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -0,0 +1,23 @@ +# Ranking Workflow for Openaire 
Publications + +This project contains the files for running a paper ranking workflow on the openaire graph using apache oozie. +All scripts are written in python and the project setup follows the typical oozie workflow structure: + +- a workflow.xml file containing the workflow specification +- a job.properties file specifying parameter values for the parameters used by the workflow +- a set of python scripts used by the workflow + +**NOTE**: the workflow depends on the external library of ranking scripts called BiP! Ranker. +You can check out a specific tag/release of BIP! Ranker using maven, as described in the following section. + +## Check out a specific tag/release of BIP-Ranker + +* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. + +* Then, use maven to perform the checkout: + +``` +mvn scm:checkout +``` + +* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/README.txt b/dhp-workflows/dhp-impact-indicators/README.txt deleted file mode 100644 index 788534c02..000000000 --- a/dhp-workflows/dhp-impact-indicators/README.txt +++ /dev/null @@ -1,13 +0,0 @@ - - -## Checkout a specific release of the BIP-Ranker git repository - -* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. - -* Then perform the checkout with: - -``` -mvn scm:checkout -``` - -* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml index b827f42a4..b510635a6 100644 --- a/dhp-workflows/dhp-impact-indicators/pom.xml +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -20,7 +20,7 @@ https://github.com/athenarc/Bip-Ranker - https://github.com/athenarc/Bip-Ranker.git + scm:git:https://github.com/athenarc/Bip-Ranker.git @@ -31,8 +31,8 @@ 1.8.1 connection - 2 - tag + tag + v1.0.0 ${project.build.directory}/../src/main/bip-ranker diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py new file mode 100644 index 000000000..4cffa86a3 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py @@ -0,0 +1,234 @@ +#!/usr/bin/python3 + +# Create openaire id - openaire id graph from openaire data + +############################################################################################################# +# Program proceeds as follows: +# 1. We read the input folder provided from hdfs. +# This contains subfolders with openaire graph objects and openaire graph relations +# 2. We select all openaire graph objects of interest. We filter out based on visibility +# and inference criteria. We also filter out based on the availability of publication year +# 3. Get reference type dataframes from openaire. Then filter each one of them based on the +# existence of citing and cited in the above filtered dataset. Get only citations +# produced by publication objects, or otherresearchproducts of types: +# [TBD] +# 4. Get objects that don't appear in the relations (from those gathered in step 1) and add +# them to the graph +# 5. 
Group relations by citing paper and do graph-specific formatting +############################################################################################################# +# ---------- Imports ------------- # +import sys +# import pyspark +# from pyspark import SparkConf, SparkContext +from pyspark.sql import SparkSession +# Functions to effectively handle data +# manipulation for DataFrames +import pyspark.sql.functions as F +# Diagnostics +from timeit import default_timer as timer +# from datetime import timedelta, datetime +# -------------------------------- # + +if len(sys.argv) < 5: + print ("Usage: ./create_openaire_ranking_graph.py ") + sys.exit(0) + +# Inputs will be: + +# 1. Folder where openaire graph is stored +graph_folder = sys.argv[1] +# 2. Current year (this will be needed for filtering) +current_year = int(sys.argv[2]) +# 3. Number of partitions +num_partitions = int(sys.argv[3]) +# 4. where to write output +output_folder = sys.argv[4] + +# Lists of results types we want to inclued in the citations +# valid_result_types = ['publication', 'other'] +valid_result_types = ['publication'] +# list of types in otherresearchproduct which are considered valid for citations +valid_other = [''] + +# Create the spark session +spark = SparkSession.builder.appName('oa ranking graph creation').getOrCreate() +# Set context level logging to WARN +spark.sparkContext.setLogLevel("WARN") + +############################################################################################################################ +# 1. Get the research objects and filter based on conditions. +# These will also be the unique identifiers we should find in the final graph + +# Initialize an empty dataframe +oa_objects_df = None + +# There is a directory structure on hdfs under the provided path. +# We need to parse data from the folders: ["publication", "dataset", "software", "otherresearchproduct"] +# which are rankable oa result objects. 
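#
# Illustrative input record (assumed shape, inferred from the fields selected below;
# not actual data) - each subfolder is expected to hold newline-delimited JSON like:
#   {"id": "50|dedup_wf_001::...",
#    "resulttype": {"classname": "publication"},
#    "datainfo": {"deletedbyinference": false, "invisible": false},
#    "dateofacceptance": {"value": "2019-05-10"}}
# Only records with deletedbyinference = false and invisible = false are kept,
# and the acceptance year is extracted with F.year('dateofacceptance.value').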
+ +# Loop subfolders +for sub_folder in ["publication", "dataset", "software", "otherresearchproduct"]: + # Read the json data of the graph into a dataframe initially + if not oa_objects_df: + oa_objects_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname', 'datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year')) + oa_objects_df = oa_objects_df.where( 'datainfo.deletedbyinference = false' ).where( 'datainfo.invisible = false' ).repartition(num_partitions, 'id').cache() + # If we already have data, simply add more to it + else: + sub_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname','datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year')) + sub_df = sub_df.where( 'datainfo.deletedbyinference = false ' ).where( 'datainfo.invisible = false ').cache() + # Add the data to the openaire objects dataframe + oa_objects_df = oa_objects_df.union(sub_df).repartition(num_partitions, 'id').cache() + # Clear memory + sub_df.unpersist(True) + +# Remove those records without year +oa_objects_df = oa_objects_df.where(F.col('year').isNotNull()) + + +# Now replace years where > (current_year+1) with 0 +oa_objects_df = oa_objects_df.withColumn('clean_year', F.when(F.col('year').cast('int') > (current_year+1), 0).otherwise(F.col('year')))\ + .drop('year').withColumnRenamed('clean_year', 'year').repartition(num_partitions, 'id') + +# -------------------------------------------------------------------- # +''' +# Some diagnostics +print ("Min and max years:" ) +oa_objects_df.select(F.max('year')).show() +oa_objects_df.select(F.min('year')).show() + +# This should be slow due to not repartitioning by year +print ("Distinct years:") +oa_objects_df.select('year').distinct().sort(F.col('year')).show(5000, False) + +# Show distinct values of deletedbyinference and invisible to ensure we have the correct data +print ("Distinct deleted by inference:") +oa_objects_df.select('deletedbyinference').distinct().show() +print ("Distinct invisible values:") +oa_objects_df.select('invisible').distinct().show() + +# Output total count +print ("Total num of research objects: " + str(oa_objects_df.count())) +''' +# -------------------------------------------------------------------- # + +# Keep only required fields - we still keep resulttype.classname to +# filter the citation relationships we consider valid +oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache() +############################################################################################################################ +# 2. 
Get the relation objects and filter them based on their existence in the oa_objects_df +# NOTE: we are only interested in citations of type "cites" +# Further, we + +# Deprecated line +# references_df = spark.read.json(graph_folder + "/relation").select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass')\ +# .where( 'relClass = "References"' ).repartition(num_partitions, 'citing').drop('relClass') +# print ("References df has: " + str(references_df.count()) + " entries") + +# Collect only valid citations i.e., invisible = false & deletedbyinference=false +cites_df = spark.read.json(graph_folder + "/relation")\ + .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ + .where( (F.col('relClass') == "Cites") \ + & (F.col('dataInfo.deletedbyinference') == "false")\ + & (F.col('dataInfo.invisible') == "false"))\ + .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ + .repartition(num_partitions, 'citing').drop('relClass') +# print ("Cited df has: " + str(cites_df.count()) + " entries") + +# DEPRECATED +# cited_by_df = spark.read.json(graph_folder + "/relation").select(F.col('target').alias('citing'), F.col('source').alias('cited'), 'relClass')\ +# .where( 'relClass = "IsCitedBy"' ).repartition(num_partitions, 'citing').drop('relClass') +# print ("Cited by df has: " + str(cited_by_df.count()) + " entries") + +# DEPRECATED +# Keep only relations where citing and cited are in the oa_objects_df +# references_df = references_df.join(oa_objects_df.select('id'), references_df.citing == oa_objects_df.id).drop('id') +# references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache() +# print ("References df now has: " + str(references_df.count()) + " entries") + +cites_df = cites_df.join(oa_objects_df.select('id'), cites_df.citing == oa_objects_df.id).where( F.col('resulttype.classname').isin(valid_result_types) ).drop('id').drop('resulttype.classname') +cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').drop('resulttype.classname').distinct().repartition(num_partitions, 'citing').cache() +# TODO: add here a clause filtering out the citations +# originating from "other" types of research objects which we consider valid + +# print ("Cites df now has: " + str(cites_df.count()) + " entries") + +# DEPRECATED +# cited_by_df = cited_by_df.join(oa_objects_df.select('id'), cited_by_df.citing == oa_objects_df.id).drop('id') +# cited_by_df = cited_by_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cited_by_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache() +# print ("Cited BY df now has: " + str(cited_by_df.count()) + " entries") + +# DEPRECATED +# Join all the above into a single set +# citations_df = references_df.union(cites_df).distinct().repartition(num_partitions, 'citing').cache() +# Free space +# references_df.unpersist(True) +# cites_df.unpersist(True) + +# citations_df = citations_df.union(cited_by_df).distinct().repartition(num_partitions, 'citing').cache() + +# ALL citations we keep are in the cited_df dataframe +citations_df = cites_df + +''' +# Show schema +print ("Citation schema:") +citations_df.printSchema() +print ("Objects schema:") +oa_objects_df.printSchema() +''' + +# Free 
space +# cited_by_df.unpersist(True) + +# Show total num of unique citations +num_unique_citations = citations_df.count() +print ("Total unique citations: " + str(num_unique_citations)) +############################################################################################################################ +# 3. Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references) +dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\ + .select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing') +# Count dangling nodes +dangling_num = dangling_nodes.count() +print ("Number of dangling nodes: " + str(dangling_num)) +# print ("Dangling nodes sample:") +# dangling_nodes.show(10, False) +############################################################################################################################ +# 4. Group the citation dataframe by citing doi, and create the cited dois list. Add dangling nodes to the result +graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited')).repartition(num_partitions, 'citing').cache() +# Free space +citations_df.unpersist(True) + +num_nodes = graph.count() +print ("Entries in graph before dangling nodes:" + str(num_nodes)) +# print ("Sample in graph: ") +# graph.show(10, False) + +# Add dangling nodes +graph = graph.union(dangling_nodes).repartition(num_partitions, 'citing') +# Count current number of results +num_nodes = graph.count() +print ("Num entries after adding dangling nodes: " + str(num_nodes)) + +# Add publication year +graph = graph.join(oa_objects_df, graph.citing == oa_objects_df.id).select('citing', 'cited', 'year').cache() +num_nodes_final = graph.count() +print ("After adding year: " + str(num_nodes_final)) +# print ("Graph sample:") +# graph.show(20, False) +# Calculate initial score of nodes (1/N) +initial_score = float(1)/float(num_nodes_final) +############################################################################################################################ +# 5. Write graph to output file! +print("Writing output to: " + output_folder) + +graph.select('citing', F.concat_ws("|", F.concat_ws(",",'cited'), F.when(F.col('cited').getItem(1) != "0", F.size('cited')).otherwise(F.lit("0")), F.lit(str(initial_score)) ).alias('cited'), 'year').withColumn('prev_pr', F.lit("0")).select('citing', 'cited', 'prev_pr', 'year')\ + .write.mode("overwrite").option("delimiter","\t").csv(output_folder, compression="gzip") + +if num_nodes_final != num_nodes: + print ("WARNING: the number of nodes after keeping only nodes where year is available went from: " + str(num_nodes) + " to " + str(num_nodes_final) + "\n") + print ("Check for any mistakes...") + +############################################################################################################################ +print ("\nDONE!\n\n") +# Wrap up +spark.stop() diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py new file mode 100644 index 000000000..60c71e52f --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py @@ -0,0 +1,770 @@ +# This program reads hdfs directories containing ranking results from openaire's cluster. 
+# Based on the parameters provided by the user, it will create different types of output files. + +# Modes available are: +# 1. bip +# This will result in output of the form required for bip-finder's update. +# Its lines conform to the following format: +# \t \t \t \t \t \t \t <3y_cc> \t <3y_cc_normalized> \t \t + +# 2. zenodo +# This is the format used in zenodo for Bip-DB. (6 way classes will be named C1, C2, ..., C6) +# This should output two files per ranking method with each line having the following data: +# a. <6-way-class> +# NOTE: this should also run for openaire-id files, hence we should have a total of 4 files per ranking (2 for each type of identifier) +# In 'zenodo' mode the user specifies only a single file, for which zenodo-based output will be created + +# 3. json +# This if the format used to provide openAIRE / claudio with data containing 1 json per identifier +# An example of such a json format follows: +#{ +# "50|dedup_wf_001::08823c8f5c3ca2eae523817036cdda67": [ +# { +# "id": "influence", +# "unit": [ +# { +# "key": "score", +# "value": "5.06690394631e-09" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "popularity_alt", +# "unit": [ +# { +# "key": "score", +# "value": "0.0" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "popularity", +# "unit": [ +# { +# "key": "score", +# "value": "3.11855618382e-09" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "influence_alt", +# "unit": [ +# { +# "key": "score", +# "value": "0.0" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# }, +# { +# "id": "impulse", +# "unit": [ +# { +# "key": "score", +# "value": "0.0" +# }, +# { +# "key": "class", +# "value": "C" +# } +# ] +# } +# ] +#} + + +################################################################################################# +# Imports +import sys +import time + +# Sparksession lib to communicate with cluster via session object +from pyspark.sql import SparkSession + +# Import sql types to define the schema of score output files +from pyspark.sql.types import * + +# Import sql functions with shorthand alias +import pyspark.sql.functions as F +from pyspark.sql.functions import udf + +# Json specific encoding +import json +################################################################################################# +# Clean up directory name +def clean_directory_name(dir_name): + # We have a name with the form *_bip_universe_* or *_graph_universe_* + # and we need to keep the parts in * + dir_name_parts = dir_name.split('_') + dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] + + clean_name = '_'.join(dir_name_parts) + clean_name = clean_name.replace('_id', '_ids') + + clean_name = clean_name.replace('.txt', '') + clean_name = clean_name.replace('.gz', '') + + if 'openaire_ids_' in clean_name: + clean_name = clean_name.replace('openaire_ids_', '') + clean_name = clean_name + '_openaire_ids.txt.gz' + else: + clean_name = clean_name + '.txt.gz/' + + return clean_name +# --------------------------------------------------------------------------------------------- # +# User defined function to escape special characters in a string that will turn into a json key +@udf(StringType()) +def json_encode_key(doi_string): + return json.dumps(doi_string) +################################################################################################# +# 
--------------------------------------------------------------------------------------------- # +# Arguments from command line and initializations + +# Time initialization +start_time = time.time() + +# Check whether input is correct, otherwise exit with appropriate message +if len(sys.argv) < 2: + print ("Usage: ./format_ranking_results.py ") + sys.exit(0) + +# Define valid modes: +valid_modes = ['json', 'zenodo', 'bip', 'json-5-way'] +# Read mode provided by user +mode = sys.argv[1].strip() + +# If mode isn't valid, exit +if mode not in valid_modes: + print ("Usage: ./format_ranking_results.py \n") + print ("Invalid mode provided. Valid modes: ['zenodo', 'bip', 'json', 'json-5-way']") + sys.exit(0) + + +# Once here, we should be more or less okay to run. + +# Define the spark session object +spark = SparkSession.builder.appName('Parse Scores - ' + str(mode) + ' mode').getOrCreate() +# Set Log Level for spark session +spark.sparkContext.setLogLevel('WARN') + +# Here we define the schema shared by all score output files +# - citation count variants have a slightly different schema, due to their scores being integers +float_schema = StructType([ + StructField('id', StringType(), False), + StructField('score', FloatType(), False), + StructField('normalized_score', FloatType(), False), + StructField('3-way-class', StringType(), False), + StructField('5-way-class', StringType(), False) + ]) + +int_schema = StructType([ + StructField('id', StringType(), False), + StructField('score', IntegerType(), False), + StructField('normalized_score', FloatType(), False), + StructField('3-way-class', StringType(), False), + StructField('5-way-class', StringType(), False) + ]) + +# This schema concerns the output of the file +# containing the number of references of each doi +refs_schema = StructType([ + StructField('id', StringType(), False), + StructField('num_refs', IntegerType(), False), + ]) + +print("--- Initialization time: %s seconds ---" % (time.time() - start_time)) + +# --------------------------------------------------------------------------------------------- # + +# Time the main program execution +start_time = time.time() + +# The following is executed when the user requests the bip-update specific file +if mode == 'bip': + + # Read the remaining input files + if len(sys.argv) < 8: + print ("\n\nInsufficient input for 'bip' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + + # Read number of partitions: + num_partitions = int(sys.argv[-1]) + + + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + refs_dir = sys.argv[7] + + # Score-specific dataframe + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id') + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') + refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 
'id') + + # ----------- TESTING CODE --------------- # + # pagerank_entries = pagerank_df.count() + # attrank_entries = attrank_df.count() + # cc_entries = cc_df.count() + # impulse_entries = impulse_df.count() + # ram_entries = ram_df.count() + # refs_entries = refs_df.count() + + # print ("Pagerank:" + str(pagerank_entries)) + # print ("AttRank:" + str(attrank_entries)) + # print ("CC entries: " + str(cc_entries)) + # print ("Impulse entries: " + str(impulse_entries)) + # print ("Refs: " + str(refs_entries)) + # ---------------------------------------- # + + # Create a new dataframe with the required data + results_df = pagerank_df.select('id', F.col('score').alias('pagerank'), F.col('normalized_score').alias('pagerank_normalized')) + # Add attrank dataframe + results_df = results_df.join(attrank_df.select('id', 'score', 'normalized_score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', F.col('score').alias('attrank'), F.col('normalized_score').alias('attrank_normalized')) + + # Add citation count dataframe + results_df = results_df.join(cc_df.select('id', 'score', 'normalized_score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', F.col('score').alias('cc'), F.col('normalized_score').alias('cc_normalized')) + + # Add 3-year df + results_df = results_df.join(impulse_df.select('id', 'score', 'normalized_score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized', \ + F.col('score').alias('3-cc'), F.col('normalized_score').alias('3-cc_normalized')) + + # Add ram df + results_df = results_df.join(ram_df.select('id', 'score'), ['id'])\ + .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized',\ + '3-cc', '3-cc_normalized', F.col('score').alias('ram')) + + # Add references + results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \ + 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs') + + # Write resulting dataframe to file + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + output_dir = output_dir + '/bip_update_data.txt.gz' + + print("Writing to:" + output_dir) + results_df.write.mode('overwrite').option('delimiter','\t').option('header',True).csv(output_dir, compression='gzip') + +# The following is executed when the user requests the zenodo-specific file +elif mode == 'zenodo': + + # Read the remaining input files + if len(sys.argv) < 9: + print ("\n\nInsufficient input for 'zenodo' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + # Read number of partitions: + num_partitions = int(sys.argv[-2]) + graph_type = sys.argv[-1] + + if graph_type not in ['bip', 'openaire']: + graph_type = 'bip' + + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + + # Output directory is common for all files + output_dir_prefix = "/".join(pagerank_dir.split('/')[:-1]) + # Method-specific outputs + pagerank_output = clean_directory_name(pagerank_dir.split('/')[-1]) + attrank_output = clean_directory_name(attrank_dir.split('/')[-1]) + cc_output = clean_directory_name(cc_dir.split('/')[-1]) + impulse_output = clean_directory_name(impulse_dir.split('/')[-1]) + ram_output = clean_directory_name(ram_dir.split('/')[-1]) + + # --------- PageRank ----------- # + # Get per file the doi - score - 6-way 
classes and write it to output + print("Writing to: " + output_dir_prefix + '/' + pagerank_output) + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + pagerank_df = pagerank_df.withColumn('class', F.lit('C6')) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.drop('5-way-class') + + if graph_type == 'openaire': + pagerank_df = pagerank_df.where( ~F.col('id').like('10.%') ) + + # Write output + pagerank_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + pagerank_output, compression='gzip') + # --------- AttRank ----------- # + print("Writing to: " + output_dir_prefix + '/' + attrank_output) + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + attrank_df = attrank_df.withColumn('class', F.lit('C6')) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + attrank_df = attrank_df.drop('5-way-class') + + if graph_type == 'openaire': + attrank_df = attrank_df.where( ~F.col('id').like('10.%') ) + + # Write output + attrank_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + attrank_output, compression='gzip') + # --------- Citation Count ----------- # + print("Writing to: " + output_dir_prefix + '/' + cc_output) + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + cc_df = cc_df.withColumn('class', F.lit('C5')) + # cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + cc_df = 
cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + cc_df = cc_df.drop('5-way-class') + + if graph_type == 'openaire': + cc_df = cc_df.where( ~F.col('id').like('10.%') ) + + # Write output + cc_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + cc_output, compression='gzip') + # --------- Impulse ----------- # + print("Writing to: " + output_dir_prefix + '/' + impulse_output) + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + impulse_df = impulse_df.withColumn('class', F.lit('C5')) + # impulse_df = impulse_df.withColumn('class', F.when(F.col('6-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + impulse_df = impulse_df.drop('5-way-class') + + if graph_type == 'openaire': + impulse_df = impulse_df.where( ~F.col('id').like('10.%') ) + + # Write output + impulse_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + impulse_output, compression='gzip') + # --------- RAM ----------- # + print("Writing to: " + output_dir_prefix + '/' + ram_output) + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id').select('id', 'score', '5-way-class') + # Replace dataframe class names + ram_df = ram_df.withColumn('class', F.lit('C5')) + # ram_df = ram_df.withColumn('class', F.when(F.col('6-way-class') == F.lit('E'), F.lit('C5')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + ram_df = ram_df.drop('5-way-class') + + if graph_type == 'openaire': + ram_df = ram_df.where( ~F.col('id').like('10.%') ) + + # Write output + ram_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_dir_prefix + '/' + ram_output, compression='gzip') + +# The following produces the json file required by openaire +elif mode == 'json': + + # Read the remaining input files + if len(sys.argv) < 9: + print ("\n\nInsufficient input for 'json' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + # Read number of partitions: + num_partitions = int(sys.argv[-2]) + graph_type = sys.argv[-1] + + if graph_type not in ['bip', 'openaire']: + graph_type = 'bip' + + print ("Graph type: " + str(graph_type)) + + # File directories + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = 
sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + + print ("Reading files:") + print (pagerank_dir) + print (attrank_dir) + print (cc_dir) + print (impulse_dir) + print (ram_dir) + + # Score-specific dataframe - read inputs + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') + # --- Join the data of the various scores --- # + + # Create json data for pagerank + pagerank_df = pagerank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('id'), F.lit('influence')).alias('id_map'), F.col('influence_values')) + pagerank_df = pagerank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence'))).alias('influence_key'), F.to_json(F.col('influence_values')).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.expr('substring(influence_key, 0, length(influence_key)-1)').alias('influence_key'), 'influence_values') + pagerank_df = pagerank_df.select('id', 'influence_key', F.expr('substring(influence_values, 2, length(influence_values))').alias('influence_values')) + pagerank_df = pagerank_df.select('id', F.concat_ws(', ', F.col('influence_key'), F.col('influence_values')).alias('influence_json')) + + # Create json data for attrank + attrank_df = attrank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + attrank_df = attrank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.create_map(F.lit('id'), F.lit('popularity')).alias('id_map'), F.col('popularity_values')) + attrank_df = attrank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity'))).alias('popularity_key'), F.to_json(F.col('popularity_values')).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.expr('substring(popularity_key, 0, length(popularity_key)-1)').alias('popularity_key'), 'popularity_values') + attrank_df = attrank_df.select('id', 'popularity_key', F.expr('substring(popularity_values, 2, length(popularity_values))').alias('popularity_values')) + attrank_df = attrank_df.select('id', F.concat_ws(', ', F.col('popularity_key'), F.col('popularity_values')).alias('popularity_json')) + + # Create json data for CC + cc_df = cc_df.select('id', 
F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + cc_df = cc_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.create_map(F.lit('id'), F.lit('influence_alt')).alias('id_map'), F.col('influence_alt_values')) + cc_df = cc_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence_alt'))).alias('influence_alt_key'), F.to_json(F.col('influence_alt_values')).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.expr('substring(influence_alt_key, 0, length(influence_alt_key)-1)').alias('influence_alt_key'), 'influence_alt_values') + cc_df = cc_df.select('id', 'influence_alt_key', F.expr('substring(influence_alt_values, 2, length(influence_alt_values))').alias('influence_alt_values')) + cc_df = cc_df.select('id', F.concat_ws(', ', F.col('influence_alt_key'), F.col('influence_alt_values')).alias('influence_alt_json')) + + + # Create json data for RAM + ram_df = ram_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + ram_df = ram_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.create_map(F.lit('id'), F.lit('popularity_alt')).alias('id_map'), F.col('popularity_alt_values')) + ram_df = ram_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity_alt'))).alias('popularity_alt_key'), F.to_json(F.col('popularity_alt_values')).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.expr('substring(popularity_alt_key, 0, length(popularity_alt_key)-1)').alias('popularity_alt_key'), 'popularity_alt_values') + ram_df = ram_df.select('id', 'popularity_alt_key', F.expr('substring(popularity_alt_values, 2, length(popularity_alt_values))').alias('popularity_alt_values')) + ram_df = ram_df.select('id', F.concat_ws(', ', F.col('popularity_alt_key'), F.col('popularity_alt_values')).alias('popularity_alt_json')) + + # Create json data for impulse + impulse_df = impulse_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('3-way-class'))).alias('class_map')) + + impulse_df = impulse_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.create_map(F.lit('id'), F.lit('impulse')).alias('id_map'), F.col('impulse_values')) + impulse_df = impulse_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('impulse'))).alias('impulse_key'), F.to_json(F.col('impulse_values')).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.expr('substring(impulse_key, 0, length(impulse_key)-1)').alias('impulse_key'), 'impulse_values') + impulse_df = impulse_df.select('id', 'impulse_key', F.expr('substring(impulse_values, 2, length(impulse_values))').alias('impulse_values')) + impulse_df = impulse_df.select('id', F.concat_ws(', ', 
F.col('impulse_key'), F.col('impulse_values')).alias('impulse_json')) + + #Join dataframes together + results_df = pagerank_df.join(attrank_df, ['id']) + results_df = results_df.join(cc_df, ['id']) + results_df = results_df.join(ram_df, ['id']) + results_df = results_df.join(impulse_df, ['id']) + + print ("Json encoding DOI keys") + # Json encode doi strings + results_df = results_df.select(json_encode_key('id').alias('id'), 'influence_json', 'popularity_json', 'influence_alt_json', 'popularity_alt_json', 'impulse_json') + + # Concatenate individual json columns + results_df = results_df.select('id', F.concat_ws(', ', F.col('influence_json'), F.col('popularity_json'), F.col('influence_alt_json'), F.col('popularity_alt_json'), F.col('impulse_json') ).alias('json_data')) + results_df = results_df.select('id', F.concat_ws('', F.lit('['), F.col('json_data'), F.lit(']')).alias('json_data') ) + + # Filter out non-openaire ids if need + if graph_type == 'openaire': + results_df = results_df.where( ~F.col('id').like('"10.%') ) + + # Concatenate paper id and add opening and ending brackets + results_df = results_df.select(F.concat_ws('', F.lit('{'), F.col('id'), F.lit(': '), F.col('json_data'), F.lit('}')).alias('json') ) + + # -------------------------------------------- # + # Write json output - set the directory here + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + if graph_type == 'bip': + output_dir = output_dir + '/bip_universe_doi_scores/' + else: + output_dir = output_dir + '/openaire_universe_scores/' + + # Write the dataframe + print ("Writing output to: " + output_dir) + results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + + # Rename the files to .json.gz now + sc = spark.sparkContext + URI = sc._gateway.jvm.java.net.URI + Path = sc._gateway.jvm.org.apache.hadoop.fs.Path + FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem + # Get master prefix from input file path + master_prefix = "/".join(pagerank_dir.split('/')[:5]) + fs = FileSystem.get(URI(master_prefix), sc._jsc.hadoopConfiguration()) + path = Path(output_dir) + print ("Path is:" + path.toString()) + file_list = fs.listStatus(Path(output_dir)) + print ("Renaming files:") + for f in file_list: + initial_filename = f.getPath().toString() + if "part" in initial_filename: + print (initial_filename + " => " + initial_filename.replace(".txt.gz", ".json.gz")) + fs.rename(Path(initial_filename), Path(initial_filename.replace(".txt.gz", ".json.gz"))) + + + ''' + DEPRECATED: + # -------------------------------------------- # + # Write json output + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + if graph_type == 'bip': + output_dir = output_dir + '/bip_universe_doi_scores_txt/' + else: + output_dir = output_dir + '/openaire_universe_scores_txt/' + + print ("Writing output to: " + output_dir) + results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + print ("Done writing first results") + # Read results df as json and write it as json file + print ("Reading json input from: " + str(output_dir)) + resulds_df_json = spark.read.json(output_dir).cache() + # Write json to different dir + print ("Writing json output to: " + output_dir.replace("_txt", "")) + resulds_df_json.write.mode('overwrite').json(output_dir.replace("_txt", ""), compression='gzip') + ''' + +# The following produces the json file required by openaire +elif mode == 'json-5-way': + + # Read the remaining input files + if len(sys.argv) < 9: + print ("\n\nInsufficient input 
for 'json-5-way' mode.") + print ("File list required: <3-year citation count> \n") + sys.exit(0) + + # Read number of partitions: + num_partitions = int(sys.argv[-2]) + graph_type = sys.argv[-1] + + if graph_type not in ['bip', 'openaire']: + graph_type = 'bip' + + # File directories + pagerank_dir = sys.argv[2] + attrank_dir = sys.argv[3] + cc_dir = sys.argv[4] + impulse_dir = sys.argv[5] + ram_dir = sys.argv[6] + + # Score-specific dataframe - read inputs + pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') + impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') + ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') + # --- Join the data of the various scores --- # + + + # Replace 6-way classes with 5-way values + pagerank_df = pagerank_df.withColumn('class', F.lit('C5')) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + pagerank_df = pagerank_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + + # Create json data for pagerank + pagerank_df = pagerank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + + + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.create_map(F.lit('id'), F.lit('influence')).alias('id_map'), F.col('influence_values')) + pagerank_df = pagerank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence'))).alias('influence_key'), F.to_json(F.col('influence_values')).alias('influence_values') ) + pagerank_df = pagerank_df.select('id', F.expr('substring(influence_key, 0, length(influence_key)-1)').alias('influence_key'), 'influence_values') + pagerank_df = pagerank_df.select('id', 'influence_key', F.expr('substring(influence_values, 2, length(influence_values))').alias('influence_values')) + pagerank_df = pagerank_df.select('id', F.concat_ws(', ', F.col('influence_key'), F.col('influence_values')).alias('influence_json')) + + # Replace 6-way classes with 5 way classes for attrank + attrank_df = attrank_df.withColumn('class', F.lit('C5')) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), 
F.lit('C3')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + attrank_df = attrank_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + attrank_df = attrank_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for attrank + attrank_df = attrank_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + attrank_df = attrank_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.create_map(F.lit('id'), F.lit('popularity')).alias('id_map'), F.col('popularity_values')) + attrank_df = attrank_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity'))).alias('popularity_key'), F.to_json(F.col('popularity_values')).alias('popularity_values') ) + attrank_df = attrank_df.select('id', F.expr('substring(popularity_key, 0, length(popularity_key)-1)').alias('popularity_key'), 'popularity_values') + attrank_df = attrank_df.select('id', 'popularity_key', F.expr('substring(popularity_values, 2, length(popularity_values))').alias('popularity_values')) + attrank_df = attrank_df.select('id', F.concat_ws(', ', F.col('popularity_key'), F.col('popularity_values')).alias('popularity_json')) + + # Replace 6-way classes with 5 way classes for attrank + cc_df = cc_df.withColumn('class', F.lit('C5')) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + cc_df = cc_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + cc_df = cc_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for CC + cc_df = cc_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + cc_df = cc_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.create_map(F.lit('id'), F.lit('influence_alt')).alias('id_map'), F.col('influence_alt_values')) + cc_df = cc_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('influence_alt'))).alias('influence_alt_key'), F.to_json(F.col('influence_alt_values')).alias('influence_alt_values') ) + cc_df = cc_df.select('id', F.expr('substring(influence_alt_key, 0, length(influence_alt_key)-1)').alias('influence_alt_key'), 'influence_alt_values') + cc_df = cc_df.select('id', 'influence_alt_key', F.expr('substring(influence_alt_values, 2, length(influence_alt_values))').alias('influence_alt_values')) + cc_df = cc_df.select('id', F.concat_ws(', ', F.col('influence_alt_key'), F.col('influence_alt_values')).alias('influence_alt_json')) + + # Replace 6-way 
classes with 5 way classes for attrank + ram_df = ram_df.withColumn('class', F.lit('C5')) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + ram_df = ram_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + ram_df = ram_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for RAM + ram_df = ram_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + ram_df = ram_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.create_map(F.lit('id'), F.lit('popularity_alt')).alias('id_map'), F.col('popularity_alt_values')) + ram_df = ram_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('popularity_alt'))).alias('popularity_alt_key'), F.to_json(F.col('popularity_alt_values')).alias('popularity_alt_values') ) + ram_df = ram_df.select('id', F.expr('substring(popularity_alt_key, 0, length(popularity_alt_key)-1)').alias('popularity_alt_key'), 'popularity_alt_values') + ram_df = ram_df.select('id', 'popularity_alt_key', F.expr('substring(popularity_alt_values, 2, length(popularity_alt_values))').alias('popularity_alt_values')) + ram_df = ram_df.select('id', F.concat_ws(', ', F.col('popularity_alt_key'), F.col('popularity_alt_values')).alias('popularity_alt_json')) + + # Replace 6-way classes with 5 way classes for attrank + impulse_df = impulse_df.withColumn('class', F.lit('C5')) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('D'), F.lit('C4')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('C'), F.lit('C3')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('B'), F.lit('C2')).otherwise(F.col('class')) ) + impulse_df = impulse_df.withColumn('class', F.when(F.col('5-way-class') == F.lit('A'), F.lit('C1')).otherwise(F.col('class')) ) + impulse_df = impulse_df.drop('5-way-class').withColumnRenamed('class', '5-way-class') + + # Create json data for impulse + impulse_df = impulse_df.select('id', F.map_concat( + F.create_map(F.lit('key'), F.lit('score')), + F.create_map(F.lit('value'), F.col('score'))).alias('score_map'), + F.map_concat( + F.create_map(F.lit('key'), F.lit('class')), + F.create_map(F.lit('value'), F.col('5-way-class'))).alias('class_map')) + + impulse_df = impulse_df.select('id', F.create_map(F.lit('unit'), F.array([F.col('score_map'), F.col('class_map')]) ).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.create_map(F.lit('id'), F.lit('impulse')).alias('id_map'), F.col('impulse_values')) + impulse_df = impulse_df.select('id', F.to_json(F.create_map(F.lit('id'), F.lit('impulse'))).alias('impulse_key'), F.to_json(F.col('impulse_values')).alias('impulse_values') ) + impulse_df = impulse_df.select('id', F.expr('substring(impulse_key, 0, length(impulse_key)-1)').alias('impulse_key'), 
'impulse_values') + impulse_df = impulse_df.select('id', 'impulse_key', F.expr('substring(impulse_values, 2, length(impulse_values))').alias('impulse_values')) + impulse_df = impulse_df.select('id', F.concat_ws(', ', F.col('impulse_key'), F.col('impulse_values')).alias('impulse_json')) + + #Join dataframes together + results_df = pagerank_df.join(attrank_df, ['id']) + results_df = results_df.join(cc_df, ['id']) + results_df = results_df.join(ram_df, ['id']) + results_df = results_df.join(impulse_df, ['id']) + + print ("Json encoding DOI keys") + # Json encode doi strings + results_df = results_df.select(json_encode_key('id').alias('id'), 'influence_json', 'popularity_json', 'influence_alt_json', 'popularity_alt_json', 'impulse_json') + + # Concatenate individual json columns + results_df = results_df.select('id', F.concat_ws(', ', F.col('influence_json'), F.col('popularity_json'), F.col('influence_alt_json'), F.col('popularity_alt_json'), F.col('impulse_json') ).alias('json_data')) + results_df = results_df.select('id', F.concat_ws('', F.lit('['), F.col('json_data'), F.lit(']')).alias('json_data') ) + + # Filter out non-openaire ids if need + if graph_type == 'openaire': + results_df = results_df.where( ~F.col('id').like('10.%') ) + + # Concatenate paper id and add opening and ending brackets + results_df = results_df.select(F.concat_ws('', F.lit('{'), F.col('id'), F.lit(': '), F.col('json_data'), F.lit('}')).alias('json') ) + + # TEST output and count + # results_df.show(20, False) + # print ("Results #" + str(results_df.count())) + + # -------------------------------------------- # + # Write json output + output_dir = "/".join(pagerank_dir.split('/')[:-1]) + if graph_type == 'bip': + output_dir = output_dir + '/bip_universe_doi_scores_5_classes/' + else: + output_dir = output_dir + '/openaire_universe_scores_5_classes/' + + print ("Writing output to: " + output_dir) + results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + +# Close spark session +spark.stop() + +print("--- Main program execution time: %s seconds ---" % (time.time() - start_time)) +print("--- Finished --- \n\n") + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh new file mode 100644 index 000000000..4d0fedba9 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh @@ -0,0 +1,14 @@ +ranking_results_folder=$1; + +pr_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/PR_.*" | grep -o "PR.*"`; +attrank_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/AttRank.*" | grep -o "AttRank.*"`; +cc_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/CC_.*" | grep -o "CC.*"`; +impulse_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/3-year_.*" | grep -o "3-year.*"`; +ram_file=`hdfs dfs -ls ${ranking_results_folder}/ | grep "/RAM_.*" | grep -o "RAM.*"`; + +echo "pr_file=${pr_file}"; +echo "attrank_file=${attrank_file}"; +echo "cc_file=${cc_file}"; +echo "impulse_file=${impulse_file}"; +echo "ram_file=${ram_file}"; +# echo "TEST=`hdfs dfs -ls ${ranking_results_folder}/`"; diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties new file mode 100644 index 000000000..9ad9def21 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties @@ -0,0 +1,86 @@ +# The following set of properties are defined 
in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters +# and concern the parameterization required for running workflows on the @GARR cluster + +dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos +dhp.hadoop.frontend.user.name=ilias.kanellos +dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl +dhp.hadoop.frontend.port.ssh=22 +oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie +jobTracker=yarnRM +nameNode=hdfs://nameservice1 +oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log +maven.executable=mvn +sparkDriverMemory=7G +sparkExecutorMemory=7G +sparkExecutorCores=4 +# The above is given differently in an example I found online +oozie.action.sharelib.for.spark=spark2 +oozieActionShareLibForSpark2=spark2 +spark2YarnHistoryServerAddress=http://iis-cdh5-test-gw.ocean.icm.edu.pl:18089 +spark2EventLogDir=/user/spark/spark2ApplicationHistory +sparkSqlWarehouseDir=/user/hive/warehouse +hiveMetastoreUris=thrift://iis-cdh5-test-m3.ocean.icm.edu.pl:9083 +# This MAY avoid the no library used error +oozie.use.system.libpath=true +# Some stuff copied from openaire's jobs +spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener +spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener + + +# Some stuff copied from openaire's jobs +spark2ExtraListeners=com.cloudera.spark.lineage.NavigatorAppListener +spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListener + +# ------------------------------------------------------------------------------ # +# The following set of properties are my own custom ones + +# Based on the page linked to at the start of the file, if we use yarn as a resource manager, its address is given as follows +resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster + +# current year used when creating graph / by some ranking methods +currentYear=2024 + +# Alpha value for pagerank +pageRankAlpha=0.5 +# AttRank values +attrankAlpha=0.2 +attrankBeta=0.5 +attrankGamma=0.3 +attrankRho=-0.16 +# attrankCurrentYear=2023 +attrankStartYear=2021 + +# Ram values +ramGamma=0.6 +# ramCurrentYear=2023 + +# Convergence error for pagerank +convergenceError=0.000000000001 + +# I think this should be the oozie workflow directory +oozieWorkflowPath=user/ilias.kanellos/workflow_example/ + +# The directory where the workflow data is/should be stored +workflowDataDir=user/ilias.kanellos/ranking_workflow + +# Directory where dataframes are checkpointed +checkpointDir=${nameNode}/${workflowDataDir}/check/ + +# The directory for the doi-based bip graph +bipGraphFilePath=${nameNode}/${workflowDataDir}/bipdbv8_graph + +# The folder from which synonyms of openaire-ids are read +# openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/ +openaireDataInput=${/tmp/prod_provision/graph/18_graph_blacklisted} + +# A folder where we will write the openaire to doi mapping +synonymFolder=${nameNode}/${workflowDataDir}/openaireid_to_dois/ + +# This will be where we store the openaire graph input. 
They told us on GARR to use a directory under /data +openaireGraphInputPath=${nameNode}/${workflowDataDir}/openaire_id_graph + +# The workflow application path +wfAppPath=${nameNode}/${oozieWorkflowPath} +# The following is needed as a property of a workflow +oozie.wf.application.path=${wfAppPath} + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py new file mode 100644 index 000000000..7997eec82 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py @@ -0,0 +1,60 @@ +import json +import sys +from pyspark.sql import SparkSession +from pyspark import SparkConf, SparkContext + +if len(sys.argv) != 3: + print("Usage: map_openaire_ids_to_dois.py ") + sys.exit(-1) + +conf = SparkConf().setAppName('BIP!: Map OpenAIRE IDs to DOIs') +sc = SparkContext(conf = conf) +spark = SparkSession.builder.appName('BIP!: Map OpenAIRE IDs to DOIs').getOrCreate() +sc.setLogLevel('OFF') + +src_dir = sys.argv[1] +output = sys.argv[2] + +# src_dir = "/tmp/beta_provision/graph/21_graph_cleaned/" +# output = '/tmp/openaireid_to_dois/' + +def transform(doc): + + # get publication year from 'doc.dateofacceptance.value' + dateofacceptance = doc.get('dateofacceptance', {}).get('value') + + year = 0 + + if (dateofacceptance is not None): + year = dateofacceptance.split('-')[0] + + # for each pid get 'pid.value' if 'pid.qualifier.classid' equals to 'doi' + dois = [ pid['value'] for pid in doc.get('pid', []) if (pid.get('qualifier', {}).get('classid') == 'doi' and pid['value'] is not None)] + + num_dois = len(dois) + + # exlcude openaire ids that do not correspond to DOIs + if (num_dois == 0): + return None + + fields = [ doc['id'], str(num_dois), chr(0x02).join(dois), str(year) ] + + return '\t'.join([ v.encode('utf-8') for v in fields ]) + +docs = None + +for result_type in ["publication", "dataset", "software", "otherresearchproduct"]: + + tmp = sc.textFile(src_dir + result_type).map(json.loads) + + if (docs is None): + docs = tmp + else: + # append all result types in one RDD + docs = docs.union(tmp) + +docs = docs.filter(lambda d: d.get('dataInfo', {}).get('deletedbyinference') == False and d.get('dataInfo', {}).get('invisible') == False) + +docs = docs.map(transform).filter(lambda d: d is not None) + +docs.saveAsTextFile(output) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py new file mode 100644 index 000000000..0d294e045 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py @@ -0,0 +1,145 @@ +# This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow +# and uses this mapping to create doi-based score files in the format required by BiP! DB. +# This is done by reading each openaire-id based ranking file and joining the openaire based +# score and classes to all the corresponding dois. 
+#################################################################################################
+# Imports
+import sys
+
+# Sparksession lib to communicate with cluster via session object
+from pyspark.sql import SparkSession
+
+# Import sql types to define schemas
+from pyspark.sql.types import *
+
+# Import sql functions with shorthand alias
+import pyspark.sql.functions as F
+# from pyspark.sql.functions import udf
+#################################################################################################
+#################################################################################################
+# Clean up directory name
+def clean_directory_name(dir_name):
+    # We have a name with the form *_bip_universe_* or *_graph_universe_*
+    # and we need to keep the parts in *
+    dir_name_parts = dir_name.split('_')
+    dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)]
+
+    clean_name = '_'.join(dir_name_parts)
+
+    if '_ids' not in clean_name:
+        clean_name = clean_name.replace('id_', 'ids_')
+
+    # clean_name = clean_name.replace('.txt', '')
+    # clean_name = clean_name.replace('.gz', '')
+
+    if 'openaire_ids_' in clean_name:
+        clean_name = clean_name.replace('openaire_ids_', '')
+        # clean_name = clean_name + '.txt.gz'
+    # else:
+        # clean_name = clean_name + '.txt.gz'
+
+    return clean_name
+#################################################################################################
+if len(sys.argv) < 3:
+    print ("Usage: ./map_scores_to_dois.py <...etc...>")
+    sys.exit(-1)
+
+# Read arguments
+synonyms_folder = sys.argv[1]
+num_partitions = int(sys.argv[2])
+input_file_list = [argument for argument in sys.argv[3:]]
+input_file_list = [clean_directory_name(item) for item in input_file_list]
+
+# Prepare output specific variables
+output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list]
+output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list]
+
+# --- INFO MESSAGES --- #
+print ("\n\n----------------------------")
+print ("Mapping openaire ids to DOIs")
+print ("Reading input from: " + synonyms_folder)
+print ("Num partitions: " + str(num_partitions))
+print ("Input files: " + " -- ".join(input_file_list))
+print ("Output files: " + " -- ".join(output_file_list))
+print ("----------------------------\n\n")
+#######################################################################################
+# We will define the following schemas:
+# --> the schema of the openaire - doi mapping file [string - int - doi_list] (the separator of the doi-list is a non-printable character)
+# --> a schema for floating point ranking scores [string - float - string] (the latter string is the class)
+# --> a schema for integer ranking scores [string - int - string] (the latter string is the class)
+
+float_schema = StructType([
+    StructField('id', StringType(), False),
+    StructField('score', FloatType(), False),
+    StructField('class', StringType(), False)
+    ])
+
+int_schema = StructType([
+    StructField('id', StringType(), False),
+    StructField('score', IntegerType(), False),
+    StructField('class', StringType(), False)
+    ])
+
+# This schema concerns the openaire id - doi synonym file
+# (one row per openaire id, holding the number of dois and the doi list)
+synonyms_schema = StructType([
+    StructField('id', StringType(), False),
+    StructField('num_synonyms', IntegerType(), False),
+    StructField('doi_list', StringType(), False),
+    ])
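To make the expected inputs concrete, here is a small sketch (illustration only, not part of the workflow: the openaire id, DOIs and score below are invented) that builds one row shaped like the synonym file and one row shaped like a floating-point ranking file, using the `synonyms_schema` and `float_schema` defined above, then splits the 0x02-separated DOI list and joins the two. This is the same transformation the main program below applies to the real HDFS folders.

```
# Illustration only: toy data following the schemas defined above
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('Synonym join sketch').getOrCreate()

# openaire id, number of DOIs, DOI list separated by the non-printable 0x02 character
toy_synonyms = spark.createDataFrame(
    [('openaire::0001', 2, '10.1234/a' + chr(0x02) + '10.1234/b')], schema=synonyms_schema)

# openaire id, ranking score, ranking class
toy_scores = spark.createDataFrame(
    [('openaire::0001', 0.85, 'C2')], schema=float_schema)

# split the DOI list, explode to one row per DOI, then join the scores on the openaire id
toy_synonyms = toy_synonyms.select('id', F.explode(F.split(F.col('doi_list'), chr(0x02))).alias('doi'))
toy_synonyms.join(toy_scores, ['id']).select('doi', 'score', 'class').show()
# expected output: one row per DOI, each carrying the same score/class,
# i.e. (10.1234/a, 0.85, C2) and (10.1234/b, 0.85, C2)
```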
+####################################################################################### +# Start spark session +spark = SparkSession.builder.appName('Map openaire scores to DOIs').getOrCreate() +# Set Log Level for spark session +spark.sparkContext.setLogLevel('WARN') +####################################################################################### +# MAIN Program + +# Read and repartition the synonym folder - also cache it since we will need to perform multiple joins +synonym_df = spark.read.schema(synonyms_schema).option('delimiter', '\t').csv(synonyms_folder) +synonym_df = synonym_df.select('id', F.split(F.col('doi_list'), chr(0x02)).alias('doi_list')) +synonym_df = synonym_df.select('id', F.explode('doi_list').alias('doi')).repartition(num_partitions, 'id').cache() + +# TESTING +# print ("Synonyms: " + str(synonym_df.count())) +# print ("DF looks like this:" ) +# synonym_df.show(1000, False) + +print ("\n\n-----------------------------") +# Now we need to join the score files on the openaire-id with the synonyms and then keep +# only doi - score - class and write this to the output +for offset, input_file in enumerate(input_file_list): + + print ("Mapping scores from " + input_file) + + # Select correct schema + schema = int_schema + if "attrank" in input_file.lower() or "pr" in input_file.lower() or "ram" in input_file.lower(): + schema = float_schema + + # Load file to dataframe + ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id') + + # TESTING + # print ("Loaded df sample:") + # ranking_df.show(1000, False) + + # Join scores to synonyms and keep required fields + doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'class').repartition(num_partitions, 'doi').cache() + # Write output + output_file = output_file_list[offset] + print ("Writing to: " + output_file) + doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') + # Free memory? 
+ ranking_df.unpersist(True) + +print ("-----------------------------") +print ("\n\nFinished!\n\n") + + + + + + + + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml new file mode 100644 index 000000000..807c32063 --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml @@ -0,0 +1,600 @@ + + + + + + + + + + + ${resume eq "rankings-start"} + ${resume eq "impulse"} + ${resume eq "rankings-iterative"} + ${resume eq "format-results"} + ${resume eq "map-ids"} + ${resume eq "map-scores"} + ${resume eq "start"} + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + + + + + yarn-cluster + cluster + + + Openaire Ranking Graph Creation + + create_openaire_ranking_graph.py + + --executor-memory 20G --executor-cores 4 --driver-memory 20G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + + ${openaireDataInput} + + ${currentYear} + + 7680 + + ${openaireGraphInputPath} + + ${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Spark CC + + CC.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + + 7680 + + ${wfAppPath}/CC.py#CC.py + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Spark RAM + + TAR.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + ${ramGamma} + ${currentYear} + RAM + + 7680 + ${γιτ α} + + ${wfAppPath}/TAR.py#TAR.py + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Spark Impulse + + CC.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + + 7680 + 3 + + ${wfAppPath}/CC.py#CC.py + + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + + + + + + + + yarn-cluster + cluster + + + Spark Pagerank + + PageRank.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf 
spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + ${pageRankAlpha} + ${convergenceError} + ${checkpointDir} + + 7680 + dfs + + ${wfAppPath}/PageRank.py#PageRank.py + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + yarn-cluster + cluster + + + Spark AttRank + + AttRank.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireGraphInputPath} + ${attrankAlpha} + ${attrankBeta} + ${attrankGamma} + ${attrankRho} + ${currentYear} + ${attrankStartYear} + ${convergenceError} + ${checkpointDir} + + 7680 + dfs + + ${wfAppPath}/AttRank.py#AttRank.py + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + /usr/bin/bash + + get_ranking_files.sh + + /${workflowDataDir} + + + ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh + + + + + + + + + + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + yarn-cluster + cluster + + + Format Ranking Results JSON + + format_ranking_results.py + + --executor-memory 10G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + json + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + + 7680 + + openaire + + ${wfAppPath}/format_ranking_results.py#format_ranking_results.py + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + yarn-cluster + cluster + + + Format Ranking Results BiP! 
DB + + format_ranking_results.py + + --executor-memory 10G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + zenodo + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + + 7680 + + openaire + + ${wfAppPath}/format_ranking_results.py#format_ranking_results.py + + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + + + + + yarn-cluster + cluster + + + Openaire-DOI synonym collection + + map_openaire_ids_to_dois.py + + --executor-memory 18G --executor-cores 4 --driver-memory 15G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${openaireDataInput} + + ${synonymFolder} + + ${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py + + + + + + + + + + + + + + + + ${jobTracker} + + ${nameNode} + + + + yarn-cluster + cluster + + + Mapping Openaire Scores to DOIs + + map_scores_to_dois.py + + --executor-memory 18G --executor-cores 4 --driver-memory 15G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + ${synonymFolder} + + 7680 + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + + + ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py + + + + + + + + + + + + + + + + + + PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + From b5c252865c15605e8b6ff154891d82a7544763d8 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Mon, 20 Mar 2023 
15:38:36 +0200 Subject: [PATCH 03/41] Add filtering based on citation source --- .../main/resources/create_openaire_ranking_graph.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py index 4cffa86a3..cda12a77c 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py @@ -126,12 +126,19 @@ oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').disti # Collect only valid citations i.e., invisible = false & deletedbyinference=false cites_df = spark.read.json(graph_folder + "/relation")\ - .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ + .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'collectedfrom.value', 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ .where( (F.col('relClass') == "Cites") \ & (F.col('dataInfo.deletedbyinference') == "false")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ - .repartition(num_partitions, 'citing').drop('relClass') + .repartition(num_partitions, 'citing').drop('relClass')\ + .withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ + .drop('collectedfrom.value')\ + .where( + (F.array_contains(F.col('collected_lower'), "opencitations")) + | (F.array_contains(F.col('collected_lower'), "crossref")) + | (F.array_contains(F.col('collected_lower'), "mag")) + ).drop('collected_lower') # print ("Cited df has: " + str(cites_df.count()) + " entries") # DEPRECATED From 9dc8f0f05f2d527bccbde92680f864dbb635710f Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 21 Mar 2023 16:14:15 +0200 Subject: [PATCH 04/41] Add ActionSet step --- .../src/main/resources/job.properties | 6 +++ .../src/main/resources/workflow.xml | 51 ++++++++++++++++++- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties index 9ad9def21..a902c413f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties @@ -63,6 +63,9 @@ oozieWorkflowPath=user/ilias.kanellos/workflow_example/ # The directory where the workflow data is/should be stored workflowDataDir=user/ilias.kanellos/ranking_workflow +# Directory where json data containing scores will be output +bipScorePath=${workflowDataDir}/openaire_universe_scores/ + # Directory where dataframes are checkpointed checkpointDir=${nameNode}/${workflowDataDir}/check/ @@ -84,3 +87,6 @@ wfAppPath=${nameNode}/${oozieWorkflowPath} # The following is needed as a property of a workflow oozie.wf.application.path=${wfAppPath} +# Path where the final output should be? 
+actionSetOutputPath=${workflowDataDir}/bip_actionsets/ + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml index 807c32063..d99dc16a2 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml @@ -552,11 +552,50 @@ - + - + + + + + + + + + + + + + + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${actionSetOutputPath} + + + + + @@ -597,4 +636,12 @@ Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + From f992ecb6573b507351773096af78d65faef1baac Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 21 Mar 2023 18:03:55 +0200 Subject: [PATCH 05/41] Checkout BIP-Ranker during 'prepare-package' && add it in the oozie-package.tar.gz --- dhp-workflows/dhp-impact-indicators/README.md | 14 ++++++-------- dhp-workflows/dhp-impact-indicators/pom.xml | 19 +++++++++++++++---- .../create_openaire_ranking_graph.py | 0 .../format_ranking_results.py | 0 .../{ => eu.dnetlib}/get_ranking_files.sh | 0 .../resources/{ => eu.dnetlib}/job.properties | 0 .../map_openaire_ids_to_dois.py | 0 .../{ => eu.dnetlib}/map_scores_to_dois.py | 0 .../resources/{ => eu.dnetlib}/workflow.xml | 0 dhp-workflows/pom.xml | 1 + 10 files changed, 22 insertions(+), 12 deletions(-) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/create_openaire_ranking_graph.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/format_ranking_results.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/get_ranking_files.sh (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/job.properties (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/map_openaire_ids_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/map_scores_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{ => eu.dnetlib}/workflow.xml (100%) diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md index 14f489da3..45a4701e7 100644 --- a/dhp-workflows/dhp-impact-indicators/README.md +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -1,4 +1,4 @@ -# Ranking Workflow for Openaire Publications +# Ranking Workflow for OpenAIRE Publications This project contains the files for running a paper ranking workflow on the openaire graph using apache oozie. 
All scripts are written in python and the project setup follows the typical oozie workflow structure: @@ -7,17 +7,15 @@ All scripts are written in python and the project setup follows the typical oozi - a job.properties file specifying parameter values for the parameters used by the workflow - a set of python scripts used by the workflow -**NOTE**: the workflow depends on the external library of ranking scripts called BiP! Ranker. +**NOTE**: the workflow depends on the external library of ranking scripts called [BiP! Ranker](https://github.com/athenarc/Bip-Ranker). You can check out a specific tag/release of BIP! Ranker using maven, as described in the following section. -## Check out a specific tag/release of BIP-Ranker +## Build and deploy -* Edit the `scmVersion` of the maven-scm-plugin in the pom.xml to point to the tag/release version you want to check out. - -* Then, use maven to perform the checkout: +Use the following command for packaging: ``` -mvn scm:checkout +mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests ``` -* The code should be visible under `src/main/bip-ranker` folder. \ No newline at end of file +Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use. diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml index b510635a6..644b82c7b 100644 --- a/dhp-workflows/dhp-impact-indicators/pom.xml +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -5,9 +5,8 @@ 4.0.0 eu.dnetlib.dhp - dhp + dhp-workflows 1.2.5-SNAPSHOT - ../pom.xml dhp-impact-indicators @@ -16,6 +15,9 @@ 8 8 UTF-8 + + + v1.0.0 @@ -32,9 +34,18 @@ connection tag - v1.0.0 - ${project.build.directory}/../src/main/bip-ranker + ${bip.ranker.tag} + ${project.build.directory}/${oozie.package.file.name}/${oozieAppDir}/bip-ranker + + + checkout-bip-ranker + prepare-package + + checkout + + + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/create_openaire_ranking_graph.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/format_ranking_results.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/get_ranking_files.sh rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/job.properties rename to 
dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/map_openaire_ids_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/map_scores_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml diff --git a/dhp-workflows/pom.xml b/dhp-workflows/pom.xml index 541d59007..d054ba39b 100644 --- a/dhp-workflows/pom.xml +++ b/dhp-workflows/pom.xml @@ -38,6 +38,7 @@ dhp-usage-raw-data-update dhp-broker-events dhp-doiboost + dhp-impact-indicators From 3e8a4cf9521fdab068e47f48536e707d14f0ea18 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 21 Mar 2023 18:24:12 +0200 Subject: [PATCH 06/41] Rearrange resources folder structure --- .../create_openaire_ranking_graph.py | 11 +++- .../oozie_app}/format_ranking_results.py | 0 .../oozie_app}/get_ranking_files.sh | 0 .../oozie_app}/job.properties | 6 +++ .../oozie_app}/map_openaire_ids_to_dois.py | 0 .../oozie_app}/map_scores_to_dois.py | 0 .../impact_indicators/oozie_app}/workflow.xml | 51 ++++++++++++++++++- 7 files changed, 64 insertions(+), 4 deletions(-) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/create_openaire_ranking_graph.py (95%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/format_ranking_results.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/get_ranking_files.sh (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/job.properties (93%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/map_openaire_ids_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/map_scores_to_dois.py (100%) rename dhp-workflows/dhp-impact-indicators/src/main/resources/{eu.dnetlib => eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app}/workflow.xml (93%) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py similarity index 95% rename from 
dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 4cffa86a3..cda12a77c 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -126,12 +126,19 @@ oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').disti # Collect only valid citations i.e., invisible = false & deletedbyinference=false cites_df = spark.read.json(graph_folder + "/relation")\ - .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ + .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'collectedfrom.value', 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\ .where( (F.col('relClass') == "Cites") \ & (F.col('dataInfo.deletedbyinference') == "false")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ - .repartition(num_partitions, 'citing').drop('relClass') + .repartition(num_partitions, 'citing').drop('relClass')\ + .withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ + .drop('collectedfrom.value')\ + .where( + (F.array_contains(F.col('collected_lower'), "opencitations")) + | (F.array_contains(F.col('collected_lower'), "crossref")) + | (F.array_contains(F.col('collected_lower'), "mag")) + ).drop('collected_lower') # print ("Cited df has: " + str(cites_df.count()) + " entries") # DEPRECATED diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/format_ranking_results.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/get_ranking_files.sh rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_ranking_files.sh diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties similarity index 93% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index 9ad9def21..a902c413f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -63,6 +63,9 @@ 
oozieWorkflowPath=user/ilias.kanellos/workflow_example/ # The directory where the workflow data is/should be stored workflowDataDir=user/ilias.kanellos/ranking_workflow +# Directory where json data containing scores will be output +bipScorePath=${workflowDataDir}/openaire_universe_scores/ + # Directory where dataframes are checkpointed checkpointDir=${nameNode}/${workflowDataDir}/check/ @@ -84,3 +87,6 @@ wfAppPath=${nameNode}/${oozieWorkflowPath} # The following is needed as a property of a workflow oozie.wf.application.path=${wfAppPath} +# Path where the final output should be? +actionSetOutputPath=${workflowDataDir}/bip_actionsets/ + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_openaire_ids_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_openaire_ids_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/map_scores_to_dois.py rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml similarity index 93% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 807c32063..d99dc16a2 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu.dnetlib/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -552,11 +552,50 @@ - + - + + + + + + + + + + + + + + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${actionSetOutputPath} + + + + + @@ -597,4 +636,12 @@ Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + From 102aa5ab81bf2acf6b758b0255d4383f050d31d6 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 21 
Mar 2023 19:25:29 +0200 Subject: [PATCH 07/41] Add dependency to dhp-aggregation --- dhp-workflows/dhp-impact-indicators/pom.xml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/pom.xml b/dhp-workflows/dhp-impact-indicators/pom.xml index 644b82c7b..a9eb0a4a1 100644 --- a/dhp-workflows/dhp-impact-indicators/pom.xml +++ b/dhp-workflows/dhp-impact-indicators/pom.xml @@ -49,4 +49,14 @@ + + + + eu.dnetlib.dhp + dhp-aggregation + ${projectVersion} + compile + + + \ No newline at end of file From 7256c8d3c71c632ae0537e2c5ce585da738662b5 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 7 Apr 2023 16:30:12 +0300 Subject: [PATCH 08/41] Add script for aggregating impact indicators at the project level --- .../oozie_app/job.properties | 3 + .../oozie_app/projects_impact.py | 109 ++++++++++++++++++ .../impact_indicators/oozie_app/workflow.xml | 70 ++++++++++- 3 files changed, 176 insertions(+), 6 deletions(-) create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index a902c413f..f9f5519cc 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -90,3 +90,6 @@ oozie.wf.application.path=${wfAppPath} # Path where the final output should be? actionSetOutputPath=${workflowDataDir}/bip_actionsets/ +# The directory to store project impact indicators +projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators + diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py new file mode 100644 index 000000000..f01c92a0d --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py @@ -0,0 +1,109 @@ +import sys +from pyspark.sql import SparkSession +from pyspark import SparkConf, SparkContext +import pyspark.sql.functions as F +from pyspark.sql.types import StringType, IntegerType, StructType, StructField + +if len(sys.argv) < 8: + print("Usage: projects_impact.py ") + sys.exit(-1) + +appName = 'Project Impact Indicators' +conf = SparkConf().setAppName(appName) +sc = SparkContext(conf = conf) +spark = SparkSession.builder.appName(appName).getOrCreate() +sc.setLogLevel('OFF') + +# input parameters +relations_fd = sys.argv[1] +influence_fd = sys.argv[2] +popularity_fd = sys.argv[3] +cc_fd = sys.argv[4] +impulse_fd = sys.argv[5] +num_partitions = int(sys.argv[6]) +output_dir = sys.argv[7] + +# schema for impact indicator files +impact_files_schema = StructType([ + StructField('resultId', StringType(), False), + StructField('score', IntegerType(), False), + StructField('class', StringType(), False), +]) + +# list of impact indicators +impact_indicators = [ + ('influence', influence_fd, 'class'), + ('popularity', popularity_fd, 'class'), + ('impulse', impulse_fd, 'score'), + ('citation_count', cc_fd, 'score') +] + 
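Before the helper functions that follow, a toy example may help clarify what this aggregation produces. In the sketch below (illustration only; the project/result ids and numbers are invented), the 'class'-based indicators (influence, popularity) are first mapped to 0/1 flags, 0 for class C5 and 1 otherwise, so their per-project sums become counts of influential/popular results, while the 'score'-based indicators (impulse, citation_count) are summed as-is.

```
# Standalone illustration of the project-level aggregation on hard-coded rows
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.appName('Project impact sketch').getOrCreate()

# project-result pairs already joined with the per-result indicator values
toy = spark.createDataFrame(
    [('proj::1', 'res::a', 'C2', 'C5', 3, 12),
     ('proj::1', 'res::b', 'C5', 'C3', 0, 1)],
    ['projectId', 'resultId', 'influence', 'popularity', 'impulse', 'citation_count'])

# class columns -> 0/1 flags, as done in the loop further down
for class_col in ['influence', 'popularity']:
    toy = toy.withColumn(class_col, F.when(F.col(class_col).isin('C5'), 0).otherwise(1))

toy.groupBy('projectId').agg(
    F.sum('influence').alias('influence'),            # 1 influential result (res::a)
    F.sum('popularity').alias('popularity'),          # 1 popular result (res::b)
    F.sum('impulse').alias('impulse'),                # total impulse: 3
    F.sum('citation_count').alias('citation_count')   # total citation count: 13
).show()
```

These four per-project sums are what the actionset step added later in this patch series (see BipProjectModel further down) exposes as numOfInfluentialResults, numOfPopularResults, totalImpulse and totalCitationCount.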
+'''
+ * Read impact indicator file and return a dataframe with the following schema:
+ * resultId: String
+ * indicator_name: Integer
+'''
+def read_df(fd, indicator_name, column_name):
+    return spark.read.schema(impact_files_schema)\
+        .option('delimiter', '\t')\
+        .option('header', False)\
+        .csv(fd)\
+        .select('resultId', F.col(column_name).alias(indicator_name))\
+        .repartition(num_partitions, 'resultId')
+
+# Print dataframe schema, first 5 rows, and count
+def print_df(df):
+    df.show(50)
+    df.printSchema()
+    print(df.count())
+
+# Sets a null value to the column if the value is equal to the given value
+def set_class_value_to_null(column, value):
+    return F.when(column != value, column).otherwise(F.lit(None))
+
+# load and filter Project-to-Result relations
+print("Reading relations")
+relations = spark.read.json(relations_fd)\
+    .select(F.col('source').alias('projectId'), F.col('target').alias('resultId'), 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\
+    .where( (F.col('relClass') == 'produces') \
+        & (F.col('deletedbyinference') == "false")\
+        & (F.col('invisible') == "false"))\
+    .drop('deletedbyinference')\
+    .drop('invisible')\
+    .drop('relClass')\
+    .repartition(num_partitions, 'resultId')
+
+for indicator_name, fd, column_name in impact_indicators:
+
+    print("Reading {} '{}' field from file".format(indicator_name, column_name))
+    df = read_df(fd, indicator_name, column_name)
+
+    # sets a zero value to the indicator column if the value is C5
+    if (column_name == 'class'):
+        df = df.withColumn(indicator_name, F.when(F.col(indicator_name).isin("C5"), 0).otherwise(1))
+
+    # print_df(df)
+
+    print("Joining {} to relations".format(indicator_name))
+
+    # NOTE: we use inner join because we want to keep only the results that have an impact score
+    # also note that all impact scores have the same set of results
+    relations = relations.join(df, 'resultId', 'inner')\
+        .repartition(num_partitions, 'resultId')
+
+# uncomment to print non-null values count for each indicator
+# for indicator_name, fd, column_name in impact_indicators:
+# print("Counting non null values for {}".format(indicator_name))
+# print(relations.filter(F.col(indicator_name).isNotNull()).count())
+
+# sum the impact indicator values for each project
+relations.groupBy('projectId')\
+    .agg(\
+        F.sum('influence').alias('influence'),\
+        F.sum('popularity').alias('popularity'),\
+        F.sum('impulse').alias('impulse'),\
+        F.sum('citation_count').alias('citation_count')\
+    )\
+    .write.mode("overwrite")\
+    .option("delimiter", "\t")\
+    .csv(output_dir, compression="gzip")
\ No newline at end of file
diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
index d99dc16a2..8cd0b0d5d 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@@ -15,6 +15,8 @@ ${resume eq "map-ids"} ${resume eq "map-scores"} ${resume eq "start"} + ${resume eq "projects-impact"} + + @@ -334,7 +336,7 @@ ${nameNode} - + /usr/bin/bash get_ranking_files.sh @@ -558,7 +560,7 @@ - + @@ -592,11 +594,63 @@ --inputPath${bipScorePath} --outputPath${actionSetOutputPath} - + - - + + + + + + + ${jobTracker} + + ${nameNode} + + yarn-cluster + cluster + +
Spark Pagerank + + PageRank.py + + --executor-memory 18G --executor-cores 4 --driver-memory 10G + --master yarn + --deploy-mode cluster + --conf spark.sql.shuffle.partitions=7680 + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + + + + ${openaireDataInput}/relations + + + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + + + 7680 + + ${projectImpactIndicatorsOutput} + + + ${wfAppPath}/projects_impact.py#projects_impact.py + + + + + + + + + @@ -642,6 +696,10 @@ ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + + + + Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + From 23f58a86f177ac7fcbef5b3d5bff28e654299f07 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 18 Apr 2023 12:26:01 +0300 Subject: [PATCH 09/41] Change jar param in project impact indicators action --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 8cd0b0d5d..ac44d5c05 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -611,9 +611,9 @@ cluster - Spark Pagerank + Project Impact Indicators - PageRank.py + projects_impact.py --executor-memory 18G --executor-cores 4 --driver-memory 10G --master yarn From ee04cf92bf4030f9be3b4a34703198c3dd5ce424 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 26 Apr 2023 20:23:46 +0300 Subject: [PATCH 10/41] Add actionsets for project impact indicators --- .../bipfinder/SparkAtomicActionScoreJob.java | 63 +++++++---- .../score/deserializers/BipProjectModel.java | 69 ++++++++++++ .../deserializers/BipResultModel.java} | 8 +- .../PrepareBipFinder.java | 6 +- .../bipfinder/input_actionset_parameter.json | 6 ++ .../SparkAtomicActionScoreJobTest.java | 102 ++++++++++++++---- .../bipfinder/project_bip_scores.json | 4 + ...scores_oid.json => result_bip_scores.json} | 0 .../oozie_app/projects_impact.py | 13 ++- 9 files changed, 218 insertions(+), 53 deletions(-) create mode 100644 dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java rename dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/{BipDeserialize.java => score/deserializers/BipResultModel.java} (65%) create mode 100644 dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json rename dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/{bip_scores_oid.json => result_bip_scores.json} (100%) diff --git 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index ddf5f4adf..13ce1440a 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -9,6 +9,7 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -24,7 +25,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -56,18 +57,17 @@ public class SparkAtomicActionScoreJob implements Serializable { parser.parseArgument(args); - Boolean isSparkSessionManaged = Optional - .ofNullable(parser.get("isSparkSessionManaged")) - .map(Boolean::valueOf) - .orElse(Boolean.TRUE); - + Boolean isSparkSessionManaged = isSparkSessionManaged(parser); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); final String inputPath = parser.get("inputPath"); - log.info("inputPath {}: ", inputPath); + log.info("inputPath: {}", inputPath); final String outputPath = parser.get("outputPath"); - log.info("outputPath {}: ", outputPath); + log.info("outputPath: {}", outputPath); + + final String targetEntity = parser.get("targetEntity"); + log.info("targetEntity: {}", targetEntity); SparkConf conf = new SparkConf(); @@ -76,17 +76,48 @@ public class SparkAtomicActionScoreJob implements Serializable { isSparkSessionManaged, spark -> { removeOutputDir(spark, outputPath); - prepareResults(spark, inputPath, outputPath); - }); + + // follow different procedures for different target entities + switch (targetEntity) { + case "result": + prepareResults(spark, inputPath, outputPath); + break; + case "project": + prepareProjects(spark, inputPath, outputPath); + break; + default: + throw new RuntimeException("Unknown target entity: " + targetEntity); + } + } + ); + } + + private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) { + + // read input bip project scores + Dataset projectScores = readPath(spark, inputPath, BipProjectModel.class); + + projectScores.map( (MapFunction) bipProjectScores -> { + Project project = new Project(); + project.setId(bipProjectScores.getProjectId()); + project.setMeasures(bipProjectScores.toMeasures()); + return project; + }, Encoders.bean(Project.class)) + .toJavaRDD() + .map(p -> new AtomicAction(Project.class, p)) + .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + } private static void prepareResults(SparkSession spark, String bipScorePath, String outputPath) { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD bipDeserializeJavaRDD = sc + JavaRDD 
bipDeserializeJavaRDD = sc .textFile(bipScorePath) - .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + .map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class)); Dataset bipScores = spark .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { @@ -159,12 +190,4 @@ public class SparkAtomicActionScoreJob implements Serializable { HdfsSupport.remove(path, spark.sparkContext().hadoopConfiguration()); } - public static Dataset readPath( - SparkSession spark, String inputPath, Class clazz) { - return spark - .read() - .textFile(inputPath) - .map((MapFunction) value -> OBJECT_MAPPER.readValue(value, clazz), Encoders.bean(clazz)); - } - } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java new file mode 100644 index 000000000..77c1567a8 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java @@ -0,0 +1,69 @@ +package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; + +import com.opencsv.bean.CsvBindByPosition; +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; +import eu.dnetlib.dhp.schema.oaf.Measure; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import static eu.dnetlib.dhp.actionmanager.Constants.*; + +@NoArgsConstructor +@AllArgsConstructor +@Getter +@Setter +public class BipProjectModel { + String projectId; + + String numOfInfluentialResults; + + String numOfPopularResults; + + String totalImpulse; + + String totalCitationCount; + + // each project bip measure has exactly one value, hence one key-value pair + private Measure createMeasure(String measureId, String measureValue) { + + KeyValue kv = new KeyValue(); + kv.setKey("score"); + kv.setValue(measureValue); + kv.setDataInfo( + OafMapperUtils.dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils.qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "") + ); + + Measure measure = new Measure(); + measure.setId(measureId); + measure.setUnit(Collections.singletonList(kv)); + return measure; + } + public List toMeasures() { + return Arrays.asList( + createMeasure("numOfInfluentialResults", numOfInfluentialResults), + createMeasure("numOfPopularResults", numOfPopularResults), + createMeasure("totalImpulse", totalImpulse), + createMeasure("totalCitationCount", totalCitationCount) + ); + } + +} diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java similarity index 65% rename from dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java rename to dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java index a70bca618..06a173413 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/BipDeserialize.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java @@ -1,5 +1,7 @@ -package eu.dnetlib.dhp.actionmanager.bipmodel; +package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; + +import eu.dnetlib.dhp.actionmanager.bipmodel.Score; import java.io.Serializable; import java.util.ArrayList; @@ -11,9 +13,9 @@ import java.util.List; * Only needed for deserialization purposes */ -public class BipDeserialize extends HashMap> implements Serializable { +public class BipResultModel extends HashMap> implements Serializable { - public BipDeserialize() { + public BipResultModel() { super(); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index 80573c71a..efcb96a85 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -24,7 +24,7 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.BipDeserialize; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; @@ -82,9 +82,9 @@ public class PrepareBipFinder implements Serializable { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); - JavaRDD bipDeserializeJavaRDD = sc + JavaRDD bipDeserializeJavaRDD = sc .textFile(inputPath) - .map(item -> OBJECT_MAPPER.readValue(item, BipDeserialize.class)); + .map(item -> OBJECT_MAPPER.readValue(item, BipResultModel.class)); spark .createDataset(bipDeserializeJavaRDD.flatMap(entry -> entry.keySet().stream().map(key -> { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json index 7663a454b..d6b93c5af 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -16,5 +16,11 @@ "paramLongName": "outputPath", "paramDescription": "the path of the new ActionSet", "paramRequired": true + }, + { + "paramName": "te", + "paramLongName": "targetEntity", + "paramDescription": "the type of target entity to be enriched; currently supported one of { 'result', 'project' }", + "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index be82b9fc3..aa5a19f11 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ 
b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -6,8 +6,9 @@ import static org.junit.jupiter.api.Assertions.*; import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import java.util.List; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Project; import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -27,7 +28,6 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; -import eu.dnetlib.dhp.schema.oaf.Publication; import eu.dnetlib.dhp.schema.oaf.Result; public class SparkAtomicActionScoreJobTest { @@ -37,8 +37,11 @@ public class SparkAtomicActionScoreJobTest { private static SparkSession spark; private static Path workingDir; - private static final Logger log = LoggerFactory - .getLogger(SparkAtomicActionScoreJobTest.class); + + private final static String RESULT = "result"; + private final static String PROJECT = "project"; + + private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class); @BeforeAll public static void beforeAll() throws IOException { @@ -69,29 +72,31 @@ public class SparkAtomicActionScoreJobTest { spark.stop(); } + private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception { + SparkAtomicActionScoreJob.main( + new String[] { + "-isSparkSessionManaged", Boolean.FALSE.toString(), + "-inputPath", inputPath, + "-outputPath", outputPath, + "-targetEntity", targetEntity, + } + ); + } @Test - void testMatch() throws Exception { - String bipScoresPath = getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json") + void testResultScores() throws Exception { + final String targetEntity = RESULT; + String inputResultScores = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json") .getPath(); + String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; - SparkAtomicActionScoreJob - .main( - new String[] { - "-isSparkSessionManaged", - Boolean.FALSE.toString(), - "-inputPath", - - bipScoresPath, - - "-outputPath", - workingDir.toString() + "/actionSet" - }); + // execute the job to generate the action sets for result scores + runJob(inputResultScores, outputPath, targetEntity); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD tmp = sc - .sequenceFile(workingDir.toString() + "/actionSet", Text.class, Text.class) + .sequenceFile(outputPath, Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) .map(aa -> ((Result) aa.getPayload())); @@ -140,4 +145,61 @@ public class SparkAtomicActionScoreJobTest { } + @Test + void testProjectScores() throws Exception { + String targetEntity = PROJECT; + String inputResultScores = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") + .getPath(); + String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; + + // execute the job to generate the action sets for project scores + runJob(inputResultScores, outputPath, PROJECT); + + final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); + + JavaRDD projects = sc + .sequenceFile(outputPath, Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Project) 
aa.getPayload())); + + // test the number of projects + assertEquals(4, projects.count()); + + String testProjectId = "40|nih_________::c02a8233e9b60f05bb418f0c9b714833"; + + // count that the project with id testProjectId is present + assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count()); + + projects.filter(row -> row.getId().equals(testProjectId)) + .flatMap(r -> r.getMeasures().iterator()) + .foreach(m -> { + log.info(m.getId() + " " + m.getUnit()); + + // ensure that only one score is present for each bip impact measure + assertEquals(1, m.getUnit().size()); + + KeyValue kv = m.getUnit().get(0); + + // ensure that the correct key is provided, i.e. score + assertEquals("score", kv.getKey()); + + switch(m.getId()) { + case "numOfInfluentialResults": + assertEquals("0", kv.getValue()); + break; + case "numOfPopularResults": + assertEquals("1", kv.getValue()); + break; + case "totalImpulse": + assertEquals("25", kv.getValue()); + break; + case "totalCitationCount": + assertEquals("43", kv.getValue()); + break; + default: + fail("Unknown measure id in the context of projects"); + } + }); + } } diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json new file mode 100644 index 000000000..096268287 --- /dev/null +++ b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json @@ -0,0 +1,4 @@ +{"projectId":"40|nsf_________::d93e50d22374a1cf59f6a232413ea027","numOfInfluentialResults":0,"numOfPopularResults":10,"totalImpulse":181,"totalCitationCount":235} +{"projectId":"40|nih_________::1c93debc7085e440f245fbe70b2e8b21","numOfInfluentialResults":14,"numOfPopularResults":17,"totalImpulse":1558,"totalCitationCount":4226} +{"projectId":"40|nih_________::c02a8233e9b60f05bb418f0c9b714833","numOfInfluentialResults":0,"numOfPopularResults":1,"totalImpulse":25,"totalCitationCount":43} +{"projectId":"40|corda_______::d91dcf3a87dd7f72248fab0b8a4ba273","numOfInfluentialResults":2,"numOfPopularResults":3,"totalImpulse":78,"totalCitationCount":178} \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json b/dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json similarity index 100% rename from dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/bip_scores_oid.json rename to dhp-workflows/dhp-aggregation/src/test/resources/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py index f01c92a0d..d60f86e88 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/projects_impact.py @@ -96,14 +96,13 @@ for indicator_name, fd, column_name in impact_indicators: # print("Counting non null values for {}".format(indicator_name)) # print(relations.filter(F.col(indicator_name).isNotNull()).count()) -sum the impact 
indicator values for each project +# sum the impact indicator values for each project relations.groupBy('projectId')\ .agg(\ - F.sum('influence').alias('influence'),\ - F.sum('popularity').alias('popularity'),\ - F.sum('impulse').alias('impulse'),\ - F.sum('citation_count').alias('citation_count')\ + F.sum('influence').alias('numOfInfluentialResults'),\ + F.sum('popularity').alias('numOfPopularResults'),\ + F.sum('impulse').alias('totalImpulse'),\ + F.sum('citation_count').alias('totalCitationCount')\ )\ .write.mode("overwrite")\ - .option("delimiter", "\t")\ - .csv(output_dir, compression="gzip") \ No newline at end of file + .json(output_dir, compression="gzip") \ No newline at end of file From 815a4ddbbaf6fa68a23d576189db2ee03f97f828 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Wed, 26 Apr 2023 20:40:06 +0300 Subject: [PATCH 11/41] Add actionset creation for project bip indicators in workflow --- .../bipfinder/SparkAtomicActionScoreJob.java | 7 +- .../impact_indicators/oozie_app/workflow.xml | 81 +++++++++++-------- 2 files changed, 53 insertions(+), 35 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index 13ce1440a..8b8e05723 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -41,7 +41,8 @@ import scala.Tuple2; */ public class SparkAtomicActionScoreJob implements Serializable { - private static final String DOI = "doi"; + private static final String RESULT = "result"; + private static final String PROJECT = "project"; private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -79,10 +80,10 @@ public class SparkAtomicActionScoreJob implements Serializable { // follow different procedures for different target entities switch (targetEntity) { - case "result": + case RESULT: prepareResults(spark, inputPath, outputPath); break; - case "project": + case PROJECT: prepareProjects(spark, inputPath, outputPath); break; default: diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index ac44d5c05..c77443bd9 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -34,7 +34,6 @@ - yarn-cluster @@ -90,9 +89,8 @@ ${nameNode} - - - yarn-cluster + + yarn-cluster cluster @@ -131,7 +129,6 @@ ${jobTracker} ${nameNode} - yarn-cluster @@ -181,9 +178,8 @@ ${nameNode} - - - yarn-cluster + + yarn-cluster cluster @@ -235,7 +231,7 @@ - + yarn-cluster @@ -336,12 +332,12 @@ ${nameNode} - - /usr/bin/bash - - get_ranking_files.sh - - /${workflowDataDir} + + /usr/bin/bash + + get_ranking_files.sh + + /${workflowDataDir} ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh @@ -374,8 +370,8 @@ ${nameNode} - - yarn-cluster + + yarn-cluster cluster @@ -422,8 +418,8 @@ ${nameNode} - - yarn-cluster + + 
yarn-cluster cluster @@ -476,7 +472,6 @@ - yarn-cluster @@ -520,7 +515,6 @@ ${nameNode} - yarn-cluster cluster @@ -564,17 +558,12 @@ - - + - - + yarn cluster @@ -593,12 +582,12 @@ --inputPath${bipScorePath} --outputPath${actionSetOutputPath} - + --targetEntityresult + - @@ -645,13 +634,38 @@ - + + + + yarn + cluster + Produces the atomic action with the bip finder scores for projects + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${projectImpactIndicatorsOutput} + --outputPath${actionSetOutputPath} + --targetEntityproject + + + + + @@ -695,11 +709,14 @@ - ActionSet creation failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}] Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + From 614cc1089b975f8dc05df4f671029b5bdaa31d44 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Thu, 27 Apr 2023 12:37:15 +0300 Subject: [PATCH 12/41] Add separate forder for results && project actionsets --- .../graph/impact_indicators/oozie_app/workflow.xml | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index c77443bd9..5f67bb914 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -556,9 +556,12 @@ - - - + + + + + + @@ -581,7 +584,7 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --inputPath${bipScorePath} - --outputPath${actionSetOutputPath} + --outputPath${actionSetOutputPath}/results/ --targetEntityresult @@ -659,7 +662,7 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} --inputPath${projectImpactIndicatorsOutput} - --outputPath${actionSetOutputPath} + --outputPath${actionSetOutputPath}/projects/ --targetEntityproject From 09485fbee3d1c782af33756a73b59f53a90532b5 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 28 Apr 2023 13:09:13 +0300 Subject: [PATCH 13/41] Fixed unicode bug. 
Workflow ends after first script --- .../dhp-impact-indicators/src/main/resources/workflow.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml index d99dc16a2..a957f6c10 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/workflow.xml @@ -65,7 +65,7 @@ - + @@ -155,7 +155,7 @@ RAM 7680 - ${γιτ α} + ${checkpointDir} ${wfAppPath}/TAR.py#TAR.py From 90332439adc5e1400067fc61cefecbc39c9ab478 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 28 Apr 2023 13:45:19 +0300 Subject: [PATCH 14/41] Remove deletion of synonym folder --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 815096665..7aa95db22 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -31,9 +31,11 @@ ${nameNode} + yarn-cluster From 3de35fd6a310ca41c8fb7cdd1a1e1396a2067fba Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 11 May 2023 14:42:25 +0300 Subject: [PATCH 15/41] Produce 5 classes of ranking scores --- .../oozie_app/format_ranking_results.py | 31 ++++++++++++++++--- .../impact_indicators/oozie_app/workflow.xml | 2 +- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py index 60c71e52f..e7d62c2f1 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py @@ -421,7 +421,7 @@ elif mode == 'json': # Score-specific dataframe - read inputs pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') - attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id') cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') @@ -601,7 +601,7 @@ elif mode == 'json-5-way': # Score-specific dataframe - read inputs pagerank_df = spark.read.schema(float_schema).option('delimiter', 
'\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') - attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',False).csv(attrank_dir).repartition(num_partitions, 'id') + attrank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(attrank_dir).repartition(num_partitions, 'id') cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') @@ -753,15 +753,36 @@ elif mode == 'json-5-way': # -------------------------------------------- # # Write json output + # -------------------------------------------- # + # Write json output - set the directory here output_dir = "/".join(pagerank_dir.split('/')[:-1]) if graph_type == 'bip': - output_dir = output_dir + '/bip_universe_doi_scores_5_classes/' + output_dir = output_dir + '/bip_universe_doi_scores/' else: - output_dir = output_dir + '/openaire_universe_scores_5_classes/' - + output_dir = output_dir + '/openaire_universe_scores/' + + # Write the dataframe print ("Writing output to: " + output_dir) results_df.write.mode('overwrite').option('header', False).text(output_dir, compression='gzip') + # Rename the files to .json.gz now + sc = spark.sparkContext + URI = sc._gateway.jvm.java.net.URI + Path = sc._gateway.jvm.org.apache.hadoop.fs.Path + FileSystem = sc._gateway.jvm.org.apache.hadoop.fs.FileSystem + # Get master prefix from input file path + master_prefix = "/".join(pagerank_dir.split('/')[:5]) + fs = FileSystem.get(URI(master_prefix), sc._jsc.hadoopConfiguration()) + path = Path(output_dir) + print ("Path is:" + path.toString()) + file_list = fs.listStatus(Path(output_dir)) + print ("Renaming files:") + for f in file_list: + initial_filename = f.getPath().toString() + if "part" in initial_filename: + print (initial_filename + " => " + initial_filename.replace(".txt.gz", ".json.gz")) + fs.rename(Path(initial_filename), Path(initial_filename.replace(".txt.gz", ".json.gz"))) + # Close spark session spark.stop() diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 7aa95db22..f07a27244 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -390,7 +390,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - json + json-5-way ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} From 5ddbb4ad10f8885e6fdbc9c18e0356c2a25db63a Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 11 May 2023 15:36:47 +0300 Subject: [PATCH 16/41] Spark properties no longer hardcoded --- .../oa/graph/impact_indicators/oozie_app/job.properties | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git 
a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index f9f5519cc..7b4bb96cf 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -13,6 +13,14 @@ maven.executable=mvn sparkDriverMemory=7G sparkExecutorMemory=7G sparkExecutorCores=4 + +# Some memory and driver settings for more demanding tasks +sparkHighDriverMemory=20G +sparkNormalDriverMemory=10G + +sparkHighExecutorMemory=20G +sparkNormalExecutorMemory=10G + # The above is given differently in an example I found online oozie.action.sharelib.for.spark=spark2 oozieActionShareLibForSpark2=spark2 From 1788ac2d4d1403dc8ed4173e9487c9f1a8d1ba4c Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 12 May 2023 12:55:43 +0300 Subject: [PATCH 17/41] Correct filtering for MAG records --- .../oozie_app/create_openaire_ranking_graph.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index cda12a77c..182fd9309 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -137,7 +137,7 @@ cites_df = spark.read.json(graph_folder + "/relation")\ .where( (F.array_contains(F.col('collected_lower'), "opencitations")) | (F.array_contains(F.col('collected_lower'), "crossref")) - | (F.array_contains(F.col('collected_lower'), "mag")) + | (F.array_contains(F.col('collected_lower'), "microsoft academic graph")) ).drop('collected_lower') # print ("Cited df has: " + str(cites_df.count()) + " entries") From 07818131ef0067810953c8692d6559c56d25bb48 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 13:04:44 +0300 Subject: [PATCH 18/41] Update documentation --- dhp-workflows/dhp-impact-indicators/README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md index 45a4701e7..aad94ea19 100644 --- a/dhp-workflows/dhp-impact-indicators/README.md +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -15,7 +15,12 @@ You can check out a specific tag/release of BIP! Ranker using maven, as describe Use the following command for packaging: ``` -mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests +mvn package -Poozie-package -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests +``` + +Deploy and run: +``` +mvn package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/graph/impact_indicators -DskipTests ``` Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use. 
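
As an aside on the citation-source filter corrected in PATCH 17 above: `F.array_contains` performs exact element matching, so the lowercase literals must equal the full datasource names (presumably why the shorter `"mag"` literal matched nothing). The sketch below is a minimal, self-contained illustration of that behaviour; the toy DataFrame, the `transform`-based lowercasing step (assumes Spark >= 2.4) and the column names are illustrative assumptions, not the actual logic of `create_openaire_ranking_graph.py`.

```python
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.master("local[*]").appName("citation-source-filter-sketch").getOrCreate()

# Toy citation relations; 'collectedfrom' holds datasource names as free-text strings.
rels = spark.createDataFrame(
    [
        ("50|a", "50|b", ["Crossref"]),
        ("50|c", "50|d", ["Microsoft Academic Graph"]),
        ("50|e", "50|f", ["Some Other Aggregator"]),
    ],
    ["citing", "cited", "collectedfrom"],
)

# Lowercase every datasource name (assumed step), then keep only the trusted citation providers.
rels = rels.withColumn("collected_lower", F.expr("transform(collectedfrom, x -> lower(x))"))

trusted = rels.where(
    F.array_contains(F.col("collected_lower"), "opencitations")
    | F.array_contains(F.col("collected_lower"), "crossref")
    | F.array_contains(F.col("collected_lower"), "microsoft academic graph")
).drop("collected_lower")

trusted.show(truncate=False)  # only the Crossref and Microsoft Academic Graph rows survive
```

Because the match is on whole array elements, abbreviations or substrings such as "mag" never hit; the filter literal has to be the exact lowercased datasource label.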
From 4a905932a3db36c61570c24b9aa54283cd30abba Mon Sep 17 00:00:00 2001 From: ikanellos Date: Mon, 15 May 2023 15:24:22 +0300 Subject: [PATCH 19/41] Spark properties from job.properties --- .../impact_indicators/oozie_app/workflow.xml | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index f07a27244..ec2bb140f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -46,7 +46,7 @@ create_openaire_ranking_graph.py - --executor-memory 20G --executor-cores 4 --driver-memory 20G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -100,7 +100,7 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -141,7 +141,7 @@ TAR.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -189,7 +189,7 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -244,7 +244,7 @@ PageRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -289,7 +289,7 @@ AttRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -381,7 +381,7 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -429,7 +429,7 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -484,7 +484,7 @@ map_openaire_ids_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} --master yarn --deploy-mode 
cluster --conf spark.sql.shuffle.partitions=7680 @@ -526,7 +526,7 @@ map_scores_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 @@ -609,7 +609,7 @@ projects_impact.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G + --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} --master yarn --deploy-mode cluster --conf spark.sql.shuffle.partitions=7680 From b8e8c959fe5a72f3b88610643b5e229371aa687c Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 15:50:23 +0300 Subject: [PATCH 20/41] Update workflow.xml && job.properties --- .../oozie_app/job.properties | 46 ++-- .../impact_indicators/oozie_app/workflow.xml | 260 ++++++++++-------- 2 files changed, 163 insertions(+), 143 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index 7b4bb96cf..08f9b1eac 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -1,18 +1,16 @@ # The following set of properties are defined in https://support.openaire.eu/projects/openaire/wiki/Hadoop_clusters # and concern the parameterization required for running workflows on the @GARR cluster -dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos -dhp.hadoop.frontend.user.name=ilias.kanellos -dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl -dhp.hadoop.frontend.port.ssh=22 -oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie -jobTracker=yarnRM -nameNode=hdfs://nameservice1 -oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log -maven.executable=mvn -sparkDriverMemory=7G -sparkExecutorMemory=7G -sparkExecutorCores=4 +# --- You can override the following properties (if needed) coming from your ~/.dhp/application.properties --- +# dhp.hadoop.frontend.temp.dir=/home/ilias.kanellos +# dhp.hadoop.frontend.user.name=ilias.kanellos +# dhp.hadoop.frontend.host.name=iis-cdh5-test-gw.ocean.icm.edu.pl +# dhp.hadoop.frontend.port.ssh=22 +# oozieServiceLoc=http://iis-cdh5-test-m3:11000/oozie +# jobTracker=yarnRM +# nameNode=hdfs://nameservice1 +# oozie.execution.log.file.location = target/extract-and-run-on-remote-host.log +# maven.executable=mvn # Some memory and driver settings for more demanding tasks sparkHighDriverMemory=20G @@ -21,6 +19,9 @@ sparkNormalDriverMemory=10G sparkHighExecutorMemory=20G sparkNormalExecutorMemory=10G +sparkExecutorCores=4 +sparkShufflePartitions=7680 + # The above is given differently in an example I found online oozie.action.sharelib.for.spark=spark2 oozieActionShareLibForSpark2=spark2 @@ -66,29 +67,26 @@ ramGamma=0.6 convergenceError=0.000000000001 # I think this should be the oozie workflow directory -oozieWorkflowPath=user/ilias.kanellos/workflow_example/ - -# The directory where the workflow data is/should be stored -workflowDataDir=user/ilias.kanellos/ranking_workflow +# oozieWorkflowPath=user/ilias.kanellos/workflow_example/ # Directory where json data 
containing scores will be output -bipScorePath=${workflowDataDir}/openaire_universe_scores/ +bipScorePath=${workingDir}/openaire_universe_scores/ # Directory where dataframes are checkpointed -checkpointDir=${nameNode}/${workflowDataDir}/check/ +checkpointDir=${nameNode}/${workingDir}/check/ # The directory for the doi-based bip graph -bipGraphFilePath=${nameNode}/${workflowDataDir}/bipdbv8_graph +bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph # The folder from which synonyms of openaire-ids are read # openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/ -openaireDataInput=${/tmp/prod_provision/graph/18_graph_blacklisted} +openaireDataInput=/tmp/prod_provision/graph/18_graph_blacklisted # A folder where we will write the openaire to doi mapping -synonymFolder=${nameNode}/${workflowDataDir}/openaireid_to_dois/ +synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/ # This will be where we store the openaire graph input. They told us on GARR to use a directory under /data -openaireGraphInputPath=${nameNode}/${workflowDataDir}/openaire_id_graph +openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph # The workflow application path wfAppPath=${nameNode}/${oozieWorkflowPath} @@ -96,8 +94,8 @@ wfAppPath=${nameNode}/${oozieWorkflowPath} oozie.wf.application.path=${wfAppPath} # Path where the final output should be? -actionSetOutputPath=${workflowDataDir}/bip_actionsets/ +actionSetOutputPath=${workingDir}/bip_actionsets/ # The directory to store project impact indicators -projectImpactIndicatorsOutput=${workflowDataDir}/project_indicators +projectImpactIndicatorsOutput=${workingDir}/project_indicators diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index f07a27244..d930ab774 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -46,21 +46,23 @@ create_openaire_ranking_graph.py - --executor-memory 20G --executor-cores 4 --driver-memory 20G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkHighDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireDataInput} ${currentYear} - 7680 + ${sparkShufflePartitions} ${openaireGraphInputPath} @@ -100,18 +102,20 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - 
--conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} - 7680 + ${sparkShufflePartitions} ${wfAppPath}/CC.py#CC.py @@ -141,21 +145,23 @@ TAR.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} ${ramGamma} ${currentYear} RAM - 7680 + ${sparkShufflePartitions} ${checkpointDir} ${wfAppPath}/TAR.py#TAR.py @@ -189,18 +195,20 @@ CC.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} - 7680 + ${sparkShufflePartitions} 3 ${wfAppPath}/CC.py#CC.py @@ -244,21 +252,23 @@ PageRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf 
spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} ${pageRankAlpha} ${convergenceError} ${checkpointDir} - 7680 + ${sparkShufflePartitions} dfs ${wfAppPath}/PageRank.py#PageRank.py @@ -289,14 +299,16 @@ AttRank.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireGraphInputPath} ${attrankAlpha} @@ -308,7 +320,7 @@ ${convergenceError} ${checkpointDir} - 7680 + ${sparkShufflePartitions} dfs ${wfAppPath}/AttRank.py#AttRank.py @@ -339,7 +351,7 @@ get_ranking_files.sh - /${workflowDataDir} + /${workingDir} ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh @@ -381,24 +393,26 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkNormalExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + json-5-way - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - 7680 + ${sparkShufflePartitions} openaire @@ -429,24 +443,26 @@ format_ranking_results.py - --executor-memory 10G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf 
spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkNormalExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + zenodo - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - 7680 + ${sparkShufflePartitions} openaire @@ -484,14 +500,16 @@ map_openaire_ids_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkHighDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${openaireDataInput} @@ -526,24 +544,26 @@ map_scores_to_dois.py - --executor-memory 18G --executor-cores 4 --driver-memory 15G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkHighDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + ${synonymFolder} - 7680 + ${sparkShufflePartitions} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - 
${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['ram_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py @@ -576,9 +596,9 @@ eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-memory=${sparkExecutorMemory} + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} + --driver-memory=${sparkNormalDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} @@ -609,14 +629,16 @@ projects_impact.py - --executor-memory 18G --executor-cores 4 --driver-memory 10G - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 + + --executor-memory=${sparkHighExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + @@ -624,13 +646,13 @@ ${openaireDataInput}/relations - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['pr_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['attrank_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['cc_file']} - ${nameNode}/${workflowDataDir}/${wf:actionData('get-file-names')['impulse_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['attrank_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['cc_file']} + ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} - 7680 + ${sparkShufflePartitions} ${projectImpactIndicatorsOutput} @@ -654,9 +676,9 @@ eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar - --executor-memory=${sparkExecutorMemory} + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkDriverMemory} + --driver-memory=${sparkNormalDriverMemory} --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 12a57e1f584ffb13a7e1961b9bf79974b6e05a60 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 15:59:51 +0300 Subject: [PATCH 21/41] Resolve conflicts --- .../impact_indicators/oozie_app/workflow.xml | 138 
+++--------------- 1 file changed, 21 insertions(+), 117 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 8b5313c15..f185f2a8a 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -46,7 +46,7 @@ create_openaire_ranking_graph.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -57,16 +57,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireDataInput} @@ -113,7 +104,7 @@ CC.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -124,16 +115,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} @@ -167,7 +149,7 @@ TAR.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -178,16 +160,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} ${ramGamma} @@ -228,7 +201,7 @@ CC.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -239,16 +212,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} 
--driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} @@ -296,7 +260,7 @@ PageRank.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -307,16 +271,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} ${pageRankAlpha} @@ -354,7 +309,7 @@ AttRank.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -365,16 +320,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireGraphInputPath} ${attrankAlpha} @@ -459,7 +405,7 @@ format_ranking_results.py -<<<<<<< HEAD + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -470,16 +416,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + json-5-way @@ -520,7 +457,7 @@ format_ranking_results.py -<<<<<<< HEAD + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -531,16 +468,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkNormalExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - 
--conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + zenodo @@ -588,7 +516,7 @@ map_openaire_ids_to_dois.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -599,16 +527,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${openaireDataInput} @@ -643,7 +562,7 @@ map_scores_to_dois.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -654,16 +573,7 @@ --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkHighDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba + ${synonymFolder} @@ -739,18 +649,12 @@ projects_impact.py -<<<<<<< HEAD + --executor-memory=${sparkHighExecutorMemory} --executor-cores=${sparkExecutorCores} --driver-memory=${sparkNormalDriverMemory} --conf spark.sql.shuffle.partitions=${sparkShufflePartitions} -======= - --executor-memory ${sparkHighExecutorMemory} --executor-cores ${sparkExecutorCores} --driver-memory ${sparkNormalDriverMemory} - --master yarn - --deploy-mode cluster - --conf spark.sql.shuffle.partitions=7680 ->>>>>>> 4a905932a3db36c61570c24b9aa54283cd30abba --conf spark.extraListeners=${spark2ExtraListeners} --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} From 45f2aa0867419093a866fe4686fe3c15400fe7d4 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 17:52:20 +0300 Subject: [PATCH 22/41] Move end node ... 
at the end in workflow.xml --- .../impact_indicators/oozie_app/workflow.xml | 35 +++++++++---------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index f185f2a8a..bc40dfd11 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -3,7 +3,7 @@ - + @@ -714,47 +714,42 @@ - - - - - - + PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - + + CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - + + RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Synonym collection failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Mapping scores to DOIs failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + Deleting output path for actionsets failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - + ActionSet creation for results failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -767,4 +762,8 @@ ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + From b83135c252e1d90e117269ae5b7609009d370c31 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 19:55:35 +0300 Subject: [PATCH 23/41] Add missing kill nodes in workflow.xml --- .../impact_indicators/oozie_app/workflow.xml | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index bc40dfd11..d2933e36f 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -715,6 +715,22 @@ + + Creation of openaire-graph failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + + + Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + PageRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] @@ -723,20 +739,16 @@ AttRank failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - CC failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Error getting key-value pairs for output files, error message[${wf:errorMessage(wf:lastErrorNode())}] - - Impulse failed, error message[${wf:errorMessage(wf:lastErrorNode())}] + + Error formatting json files, error message[${wf:errorMessage(wf:lastErrorNode())}] - - RAM failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - - - Creation of openaire-graph failed, error 
message[${wf:errorMessage(wf:lastErrorNode())}] + + Error formatting BIP files, error message[${wf:errorMessage(wf:lastErrorNode())}] From 4eec3e7052756002f2f3d48561d516a3a5c003b5 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Mon, 15 May 2023 22:28:48 +0300 Subject: [PATCH 24/41] Add jobTracker, nameNode && spark2Lib as global params in oozie wf --- .../oozie_app/job.properties | 1 + .../impact_indicators/oozie_app/workflow.xml | 80 ++++++------------- 2 files changed, 24 insertions(+), 57 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties index 08f9b1eac..fb68a6928 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties @@ -99,3 +99,4 @@ actionSetOutputPath=${workingDir}/bip_actionsets/ # The directory to store project impact indicators projectImpactIndicatorsOutput=${workingDir}/project_indicators +resume=create-openaire-ranking-graph diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index d2933e36f..570dc46f5 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -1,5 +1,17 @@ + + + ${jobTracker} + ${nameNode} + + + oozie.action.sharelib.for.spark + ${oozieActionShareLibForSpark2} + + + + @@ -8,14 +20,14 @@ - ${resume eq "rankings-start"} - ${resume eq "impulse"} - ${resume eq "rankings-iterative"} - ${resume eq "format-results"} - ${resume eq "map-ids"} - ${resume eq "map-scores"} - ${resume eq "start"} - ${resume eq "projects-impact"} + ${wf:conf('resume') eq "rankings-start"} + ${wf:conf('resume') eq "impulse"} + ${wf:conf('resume') eq "rankings-iterative"} + ${wf:conf('resume') eq "format-results"} + ${wf:conf('resume') eq "map-ids"} + ${wf:conf('resume') eq "map-scores"} + ${wf:conf('resume') eq "start"} + ${wf:conf('resume') eq "projects-impact"} @@ -26,10 +38,7 @@ - - ${jobTracker} - - ${nameNode} + - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -135,10 +140,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -187,10 +188,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -238,10 +235,6 @@ - - ${jobTracker} - - ${nameNode} @@ -295,10 +288,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -353,10 +342,6 @@ - - ${jobTracker} - - ${nameNode} /usr/bin/bash @@ -378,7 +363,6 @@ - @@ -391,10 +375,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -443,10 +423,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -498,10 +474,7 @@ - - ${jobTracker} - - ${nameNode} + @@ -548,10 +521,6 @@ - - ${jobTracker} - - ${nameNode} yarn-cluster @@ -636,10 +605,7 @@ - - ${jobTracker} - - ${nameNode} + yarn-cluster cluster From 26328e2a0da67e1469c8781c15750250d915272e Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 16 May 2023 14:39:38 +0300 Subject: [PATCH 25/41] Move job.properties --- 
.../dhp/oa/graph/impact_indicators/{oozie_app => }/job.properties | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/{oozie_app => }/job.properties (100%) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties similarity index 100% rename from dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/job.properties rename to dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties From 8ef718c3635f88358a3e44187be7b1d38b8b2c55 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 16 May 2023 16:28:48 +0300 Subject: [PATCH 26/41] Fix workflow application path --- dhp-workflows/dhp-impact-indicators/README.md | 10 ++++++++++ .../dhp/oa/graph/impact_indicators/job.properties | 9 ++++++--- .../oa/graph/impact_indicators/oozie_app/workflow.xml | 10 +++++----- 3 files changed, 21 insertions(+), 8 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/README.md b/dhp-workflows/dhp-impact-indicators/README.md index aad94ea19..de0ad157c 100644 --- a/dhp-workflows/dhp-impact-indicators/README.md +++ b/dhp-workflows/dhp-impact-indicators/README.md @@ -24,3 +24,13 @@ mvn package -Poozie-package,deploy,run -Dworkflow.source.dir=eu/dnetlib/dhp/oa/g ``` Note: edit the property `bip.ranker.tag` of the `pom.xml` file to specify the tag of [BIP-Ranker](https://github.com/athenarc/Bip-Ranker) that you want to use. + + +Job info and logs: +``` +export OOZIE_URL=http://iis-cdh5-test-m3:11000/oozie +oozie job -info +oozie job -log +``` + +where `jobId` is the id of the job returned by the `run_workflow.sh` script. \ No newline at end of file diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index fb68a6928..a2f3d5828 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -76,7 +76,7 @@ bipScorePath=${workingDir}/openaire_universe_scores/ checkpointDir=${nameNode}/${workingDir}/check/ # The directory for the doi-based bip graph -bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph +# bipGraphFilePath=${nameNode}/${workingDir}/bipdbv8_graph # The folder from which synonyms of openaire-ids are read # openaireDataInput=${nameNode}/tmp/beta_provision/graph/21_graph_cleaned/ @@ -89,9 +89,12 @@ synonymFolder=${nameNode}/${workingDir}/openaireid_to_dois/ openaireGraphInputPath=${nameNode}/${workingDir}/openaire_id_graph # The workflow application path -wfAppPath=${nameNode}/${oozieWorkflowPath} +wfAppPath=${oozieTopWfApplicationPath} + # The following is needed as a property of a workflow -oozie.wf.application.path=${wfAppPath} +#oozie.wf.application.path=${wfAppPath} +oozie.wf.application.path=${oozieTopWfApplicationPath} + # Path where the final output should be? 
actionSetOutputPath=${workingDir}/bip_actionsets/ diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 570dc46f5..285a66382 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -126,7 +126,7 @@ ${sparkShufflePartitions} - ${wfAppPath}/CC.py#CC.py + ${wfAppPath}/bip-ranker/CC.py#CC.py @@ -171,7 +171,7 @@ ${sparkShufflePartitions} ${checkpointDir} - ${wfAppPath}/TAR.py#TAR.py + ${wfAppPath}/bip-ranker/TAR.py#TAR.py @@ -216,7 +216,7 @@ ${sparkShufflePartitions} 3 - ${wfAppPath}/CC.py#CC.py + ${wfAppPath}/bip-ranker/CC.py#CC.py @@ -274,7 +274,7 @@ ${sparkShufflePartitions} dfs - ${wfAppPath}/PageRank.py#PageRank.py + ${wfAppPath}/bip-ranker/PageRank.py#PageRank.py @@ -324,7 +324,7 @@ ${sparkShufflePartitions} dfs - ${wfAppPath}/AttRank.py#AttRank.py + ${wfAppPath}/bip-ranker/AttRank.py#AttRank.py From 3c38f7ba6f53f735e4eb23370cc3d09eedcc808a Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 16 May 2023 17:32:53 +0300 Subject: [PATCH 27/41] Fix selection of columns in graph creation --- .../oozie_app/create_openaire_ranking_graph.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 182fd9309..50d2cd99b 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -132,8 +132,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ .repartition(num_partitions, 'citing').drop('relClass')\ - .withColumn('collected_lower', F.expr('transform(collectedfrom.value, x -> lower(x))'))\ - .drop('collectedfrom.value')\ + .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\ + .drop('value')\ .where( (F.array_contains(F.col('collected_lower'), "opencitations")) | (F.array_contains(F.col('collected_lower'), "crossref")) From 3d69f33c847b39fd9eb8de3a3cf93d5535a2438f Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 16 May 2023 17:34:42 +0300 Subject: [PATCH 28/41] Fix selection of columns in graph creation --- .../oozie_app/create_openaire_ranking_graph.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 50d2cd99b..3d131933d 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ 
b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -131,8 +131,10 @@ cites_df = spark.read.json(graph_folder + "/relation")\ & (F.col('dataInfo.deletedbyinference') == "false")\ & (F.col('dataInfo.invisible') == "false"))\ .drop('dataInfo.deletedbyinference').drop('dataInfo.invisible')\ + .drop('deletedbyinference').drop('invisible')\ .repartition(num_partitions, 'citing').drop('relClass')\ .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\ + .drop('collectedfrom.value')\ .drop('value')\ .where( (F.array_contains(F.col('collected_lower'), "opencitations")) From ec4e01068759a48fdfcd94d4e3854059b61f0d42 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 23 May 2023 16:44:04 +0300 Subject: [PATCH 29/41] End after rankings | Create graph debugged --- .../oozie_app/create_openaire_ranking_graph.py | 10 ++++++++-- .../oa/graph/impact_indicators/oozie_app/workflow.xml | 5 +++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 3d131933d..6dd4427b9 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -114,6 +114,12 @@ print ("Total num of research objects: " + str(oa_objects_df.count())) # Keep only required fields - we still keep resulttype.classname to # filter the citation relationships we consider valid oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache() + +''' +print ("OA objects Schema:") +oa_objects_df.printSchema() +sys.exit(0) +''' ############################################################################################################################ # 2. 
Get the relation objects and filter them based on their existence in the oa_objects_df # NOTE: we are only interested in citations of type "cites" @@ -154,8 +160,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\ # references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache() # print ("References df now has: " + str(references_df.count()) + " entries") -cites_df = cites_df.join(oa_objects_df.select('id'), cites_df.citing == oa_objects_df.id).where( F.col('resulttype.classname').isin(valid_result_types) ).drop('id').drop('resulttype.classname') -cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').drop('resulttype.classname').distinct().repartition(num_partitions, 'citing').cache() +cites_df = cites_df.join(oa_objects_df.select('id', 'classname'), cites_df.citing == oa_objects_df.id).where( F.col('classname').isin(valid_result_types) ).drop('id').drop('classname') +cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).distinct().repartition(num_partitions, 'citing').cache() # TODO: add here a clause filtering out the citations # originating from "other" types of research objects which we consider valid diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 285a66382..78cf92bd2 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -81,7 +81,7 @@ - + @@ -335,7 +335,8 @@ - + + From 6a7e370a21d23ec987291da0faa7994f814106e5 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 23 May 2023 16:48:58 +0300 Subject: [PATCH 30/41] Remove unnecessary counts in graph creation --- .../oozie_app/create_openaire_ranking_graph.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py index 6dd4427b9..2b6b4aae9 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py @@ -196,15 +196,19 @@ oa_objects_df.printSchema() # cited_by_df.unpersist(True) # Show total num of unique citations +''' num_unique_citations = citations_df.count() print ("Total unique citations: " + str(num_unique_citations)) +''' ############################################################################################################################ # 3. 
Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references) dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\ .select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing') # Count dangling nodes +''' dangling_num = dangling_nodes.count() print ("Number of dangling nodes: " + str(dangling_num)) +''' # print ("Dangling nodes sample:") # dangling_nodes.show(10, False) ############################################################################################################################ @@ -213,8 +217,10 @@ graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited') # Free space citations_df.unpersist(True) +''' num_nodes = graph.count() print ("Entries in graph before dangling nodes:" + str(num_nodes)) +''' # print ("Sample in graph: ") # graph.show(10, False) From a1b9187039639d0eaf194b5982591850cf688805 Mon Sep 17 00:00:00 2001 From: ikanellos Date: Tue, 23 May 2023 17:17:12 +0300 Subject: [PATCH 31/41] Fix syntax error on workflow.xml --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 78cf92bd2..9bd582984 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -335,7 +335,7 @@ - + From 60f25b780de1c456762003cbb8b0011c9c82f93d Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 23 Jun 2023 12:51:50 +0300 Subject: [PATCH 32/41] Minor fixes in workflow.xml and job.properties --- .../oa/graph/impact_indicators/job.properties | 2 +- .../impact_indicators/oozie_app/workflow.xml | 95 ++++++++++--------- 2 files changed, 49 insertions(+), 48 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index a2f3d5828..860a14713 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -102,4 +102,4 @@ actionSetOutputPath=${workingDir}/bip_actionsets/ # The directory to store project impact indicators projectImpactIndicatorsOutput=${workingDir}/project_indicators -resume=create-openaire-ranking-graph +resume=entry-point-decision diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 9bd582984..1d49322b6 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -33,7 +33,7 @@ - 
+ @@ -44,12 +44,12 @@ - --> - - - yarn-cluster + --> + + + yarn-cluster cluster - + Openaire Ranking Graph Creation @@ -141,8 +141,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -244,8 +244,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -289,8 +289,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -335,8 +335,8 @@ - - + + @@ -349,7 +349,7 @@ get_ranking_files.sh - /${workingDir} + ${workingDir} ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh @@ -481,8 +481,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -503,7 +503,7 @@ - ${openaireDataInput} + ${openaireDataInput}/ ${synonymFolder} @@ -523,8 +523,8 @@ - - yarn-cluster + + yarn-cluster cluster @@ -561,47 +561,48 @@ + - - + + - - - + + + - - - yarn - cluster - Produces the atomic action with the bip finder scores for publications - eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob - dhp-aggregation-${projectVersion}.jar - - --executor-memory=${sparkNormalExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkNormalDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --inputPath${bipScorePath} - --outputPath${actionSetOutputPath}/results/ + + + yarn + cluster + Produces the atomic action with the bip finder scores for publications + eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob + dhp-aggregation-${projectVersion}.jar + + --executor-memory=${sparkNormalExecutorMemory} + --executor-cores=${sparkExecutorCores} + --driver-memory=${sparkNormalDriverMemory} + --conf spark.extraListeners=${spark2ExtraListeners} + --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} + --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} + --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} + --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + + --inputPath${bipScorePath} + --outputPath${actionSetOutputPath}/results/ --targetEntityresult - - - + + + From 772d5f0aab8b680688ef66bc27b171c64e93d78f Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 6 Jul 2023 13:47:51 +0300 Subject: [PATCH 33/41] Make PR and AttRank serial --- .../impact_indicators/oozie_app/workflow.xml | 21 +++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 9bd582984..8466e03e5 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -22,7 +22,10 @@ ${wf:conf('resume') eq "rankings-start"} ${wf:conf('resume') eq "impulse"} - ${wf:conf('resume') eq "rankings-iterative"} + ${wf:conf('resume') eq "pagerank"} + ${wf:conf('resume') eq "attrank"} + + ${wf:conf('resume') eq "format-results"} ${wf:conf('resume') eq "map-ids"} ${wf:conf('resume') eq "map-scores"} @@ -220,16 +223,19 @@ - + + + @@ -278,7 +284,7 @@ - + @@ -328,15 +334,18 @@ - + - + + + to="get-file-names"/> + --> From 
d5c39a10596f732d9a17fdb9d6c5abe014f88c4c Mon Sep 17 00:00:00 2001 From: ikanellos Date: Thu, 6 Jul 2023 15:04:48 +0300 Subject: [PATCH 34/41] Fix map scores to doi --- .../oa/graph/impact_indicators/job.properties | 2 +- .../oozie_app/map_scores_to_dois.py | 28 ++++++++++++------- 2 files changed, 19 insertions(+), 11 deletions(-) mode change 100644 => 100755 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index a2f3d5828..ea68ade1a 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -47,7 +47,7 @@ spark2SqlQueryExecutionListeners=com.cloudera.spark.lineage.NavigatorQueryListen resourceManager=http://iis-cdh5-test-m2.ocean.icm.edu.pl:8088/cluster # current year used when creating graph / by some ranking methods -currentYear=2024 +currentYear=2023 # Alpha value for pagerank pageRankAlpha=0.5 diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py old mode 100644 new mode 100755 index 0d294e045..0fc67eb53 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py @@ -1,3 +1,4 @@ +#!/usr/bin/python # This program reads the openaire to doi mapping from the ${synonymFolder} of the workflow # and uses this mapping to create doi-based score files in the format required by BiP! DB. 
# This is done by reading each openaire-id based ranking file and joining the openaire based @@ -17,28 +18,35 @@ import pyspark.sql.functions as F # from pyspark.sql.functions import udf ################################################################################################# ################################################################################################# -# Clean up directory name +# Clean up directory name - no longer needed in final workflow version +''' def clean_directory_name(dir_name): # We have a name with the form *_bip_universe_* or *_graph_universe_* # and we need to keep the parts in * + + dir_name_parts = dir_name.split('_') dir_name_parts = [part for part in dir_name_parts if ('bip' not in part and 'graph' not in part and 'universe' not in part and 'from' not in part)] - - clean_name = '_'.join(dir_name_parts) + + dir_name = dir_name.replace("openaire_id_graph", "openaire_ids") + clean_name = dir_name + ".txt.gz" - if '_ids' not in clean_name: - clean_name = clean_name.replace('id_', 'ids_') + # clean_name = '_'.join(dir_name_parts) + + # if '_ids' not in clean_name: + # clean_name = clean_name.replace('id_', 'ids_') # clean_name = clean_name.replace('.txt', '') # clean_name = clean_name.replace('.gz', '') - if 'openaire_ids_' in clean_name: - clean_name = clean_name.replace('openaire_ids_', '') + # if 'openaire_ids_' in clean_name: + # clean_name = clean_name.replace('openaire_ids_', '') # clean_name = clean_name + '.txt.gz' # else: # clean_name = clean_name + '.txt.gz' return clean_name +''' ################################################################################################# if len(sys.argv) < 3: print ("Usage: ./map_scores_to_dois.py <...etc...>") @@ -47,12 +55,12 @@ if len(sys.argv) < 3: # Read arguments synonyms_folder = sys.argv[1] num_partitions = int(sys.argv[2]) -input_file_list = [argument for argument in sys.argv[3:]] -input_file_list = [clean_directory_name(item) for item in input_file_list] +input_file_list = [argument.replace("_openaire_id_graph", "").replace("_openaire_id_graph_", "") + "_openaire_ids.txt.gz" for argument in sys.argv[3:]] +# input_file_list = [clean_directory_name(item) for item in input_file_list] # Prepare output specific variables output_file_list = [item.replace("_openaire_ids", "") for item in input_file_list] -output_file_list = [item + ".gz" if not item.endswith(".gz") else item for item in output_file_list] +output_file_list = [item + ".txt.gz" if not item.endswith(".txt.gz") else item for item in output_file_list] # --- INFO MESSAGES --- # print ("\n\n----------------------------") From db4ca43ee84aa29610bbf6dcbef6f921bf57e13c Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Tue, 18 Jul 2023 18:38:26 +0300 Subject: [PATCH 35/41] Resolve conflict --- .../dhp/oa/graph/impact_indicators/oozie_app/workflow.xml | 7 ------- 1 file changed, 7 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 1f344ba5a..6eb783941 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -340,19 +340,12 @@ -<<<<<<< HEAD -======= - - - ->>>>>>> 60f25b780de1c456762003cbb8b0011c9c82f93d - From 
03da9651620f363ba23720b8f131c084856f28dd Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 21 Jul 2023 13:42:30 +0300 Subject: [PATCH 36/41] Format bip-score based file without doi references --- .../oozie_app/format_ranking_results.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py index e7d62c2f1..8dbbe3ad3 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/format_ranking_results.py @@ -213,7 +213,10 @@ if mode == 'bip': cc_dir = sys.argv[4] impulse_dir = sys.argv[5] ram_dir = sys.argv[6] - refs_dir = sys.argv[7] + + # NOTE: This was used initial, but @Serafeim told me to remove it since we don't get doi-doi referencew anymore + # In case of emergency, bring this back + # refs_dir = sys.argv[7] # Score-specific dataframe pagerank_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header',True).csv(pagerank_dir).repartition(num_partitions, 'id') @@ -221,7 +224,7 @@ if mode == 'bip': cc_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(cc_dir).repartition(num_partitions, 'id') impulse_df = spark.read.schema(int_schema).option('delimiter', '\t').option('header',True).csv(impulse_dir).repartition(num_partitions, 'id') ram_df = spark.read.schema(float_schema).option('delimiter', '\t').option('header', True).csv(ram_dir).repartition(num_partitions, 'id') - refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 'id') + # refs_df = spark.read.schema(refs_schema).option('delimiter', '\t').option('header',True).csv(refs_dir).repartition(num_partitions, 'id') # ----------- TESTING CODE --------------- # # pagerank_entries = pagerank_df.count() @@ -258,9 +261,10 @@ if mode == 'bip': .select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', 'cc', 'cc_normalized',\ '3-cc', '3-cc_normalized', F.col('score').alias('ram')) - # Add references - results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \ - 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs') + # Add references - THIS WAS REMOVED SINCE WE DON't GET DOI REFERENCES + # In case of emergency bring back + # results_df = results_df.join(refs_df, ['id']).select(results_df.id, 'pagerank', 'pagerank_normalized', 'attrank', 'attrank_normalized', \ + # 'cc', 'cc_normalized', '3-cc', '3-cc_normalized', 'ram', 'num_refs') # Write resulting dataframe to file output_dir = "/".join(pagerank_dir.split('/')[:-1]) From 2cc5b1a39b36f6c0bc35a23cd2c76b7e04609eaf Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 21 Jul 2023 15:26:50 +0300 Subject: [PATCH 37/41] Fixes in workflow.xml --- .../bipfinder/SparkAtomicActionScoreJob.java | 20 ++-- .../score/deserializers/BipProjectModel.java | 97 ++++++++++--------- .../score/deserializers/BipResultModel.java | 4 +- .../PrepareBipFinder.java | 2 +- .../SparkAtomicActionScoreJobTest.java | 28 +++--- .../project/PrepareH2020ProgrammeTest.java | 2 +- 
.../project/ReadProjectsTest.java | 2 +- .../actionmanager/project/ReadTopicTest.java | 2 +- .../oa/graph/impact_indicators/job.properties | 2 +- .../impact_indicators/oozie_app/workflow.xml | 33 ++++--- 10 files changed, 100 insertions(+), 92 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index 8b8e05723..fb11e829f 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -9,7 +9,6 @@ import java.util.List; import java.util.Optional; import java.util.stream.Collectors; -import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; import org.apache.hadoop.mapred.SequenceFileOutputFormat; @@ -25,8 +24,9 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipProjectModel; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.action.AtomicAction; @@ -89,8 +89,7 @@ public class SparkAtomicActionScoreJob implements Serializable { default: throw new RuntimeException("Unknown target entity: " + targetEntity); } - } - ); + }); } private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) { @@ -98,17 +97,18 @@ public class SparkAtomicActionScoreJob implements Serializable { // read input bip project scores Dataset projectScores = readPath(spark, inputPath, BipProjectModel.class); - projectScores.map( (MapFunction) bipProjectScores -> { + projectScores.map((MapFunction) bipProjectScores -> { Project project = new Project(); project.setId(bipProjectScores.getProjectId()); project.setMeasures(bipProjectScores.toMeasures()); return project; }, Encoders.bean(Project.class)) - .toJavaRDD() - .map(p -> new AtomicAction(Project.class, p)) - .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + .toJavaRDD() + .map(p -> new AtomicAction(Project.class, p)) + .mapToPair( + aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), + new Text(OBJECT_MAPPER.writeValueAsString(aa)))) + .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java index 77c1567a8..680e12504 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipProjectModel.java @@ -1,69 +1,74 @@ + package 
eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; -import com.opencsv.bean.CsvBindByPosition; -import eu.dnetlib.dhp.schema.common.ModelConstants; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; -import lombok.AllArgsConstructor; -import lombok.Getter; -import lombok.NoArgsConstructor; -import lombok.Setter; -import eu.dnetlib.dhp.schema.oaf.Measure; +import static eu.dnetlib.dhp.actionmanager.Constants.*; import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.List; -import static eu.dnetlib.dhp.actionmanager.Constants.*; +import com.opencsv.bean.CsvBindByPosition; + +import eu.dnetlib.dhp.schema.common.ModelConstants; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Measure; +import eu.dnetlib.dhp.schema.oaf.utils.OafMapperUtils; +import lombok.AllArgsConstructor; +import lombok.Getter; +import lombok.NoArgsConstructor; +import lombok.Setter; @NoArgsConstructor @AllArgsConstructor @Getter @Setter public class BipProjectModel { - String projectId; + String projectId; - String numOfInfluentialResults; + String numOfInfluentialResults; - String numOfPopularResults; + String numOfPopularResults; - String totalImpulse; + String totalImpulse; - String totalCitationCount; + String totalCitationCount; - // each project bip measure has exactly one value, hence one key-value pair - private Measure createMeasure(String measureId, String measureValue) { + // each project bip measure has exactly one value, hence one key-value pair + private Measure createMeasure(String measureId, String measureValue) { - KeyValue kv = new KeyValue(); - kv.setKey("score"); - kv.setValue(measureValue); - kv.setDataInfo( - OafMapperUtils.dataInfo( - false, - UPDATE_DATA_INFO_TYPE, - true, - false, - OafMapperUtils.qualifier( - UPDATE_MEASURE_BIP_CLASS_ID, - UPDATE_CLASS_NAME, - ModelConstants.DNET_PROVENANCE_ACTIONS, - ModelConstants.DNET_PROVENANCE_ACTIONS), - "") - ); + KeyValue kv = new KeyValue(); + kv.setKey("score"); + kv.setValue(measureValue); + kv + .setDataInfo( + OafMapperUtils + .dataInfo( + false, + UPDATE_DATA_INFO_TYPE, + true, + false, + OafMapperUtils + .qualifier( + UPDATE_MEASURE_BIP_CLASS_ID, + UPDATE_CLASS_NAME, + ModelConstants.DNET_PROVENANCE_ACTIONS, + ModelConstants.DNET_PROVENANCE_ACTIONS), + "")); - Measure measure = new Measure(); - measure.setId(measureId); - measure.setUnit(Collections.singletonList(kv)); - return measure; - } - public List toMeasures() { - return Arrays.asList( - createMeasure("numOfInfluentialResults", numOfInfluentialResults), - createMeasure("numOfPopularResults", numOfPopularResults), - createMeasure("totalImpulse", totalImpulse), - createMeasure("totalCitationCount", totalCitationCount) - ); - } + Measure measure = new Measure(); + measure.setId(measureId); + measure.setUnit(Collections.singletonList(kv)); + return measure; + } + + public List toMeasures() { + return Arrays + .asList( + createMeasure("numOfInfluentialResults", numOfInfluentialResults), + createMeasure("numOfPopularResults", numOfPopularResults), + createMeasure("totalImpulse", totalImpulse), + createMeasure("totalCitationCount", totalCitationCount)); + } } diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java index 06a173413..f992dc59f 100644 --- 
a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipmodel/score/deserializers/BipResultModel.java @@ -1,13 +1,13 @@ package eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers; -import eu.dnetlib.dhp.actionmanager.bipmodel.Score; - import java.io.Serializable; import java.util.ArrayList; import java.util.HashMap; import java.util.List; +import eu.dnetlib.dhp.actionmanager.bipmodel.Score; + /** * Class that maps the model of the bipFinder! input data. * Only needed for deserialization purposes diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java index efcb96a85..0507f90e5 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/createunresolvedentities/PrepareBipFinder.java @@ -24,8 +24,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; -import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.actionmanager.bipmodel.BipScore; +import eu.dnetlib.dhp.actionmanager.bipmodel.score.deserializers.BipResultModel; import eu.dnetlib.dhp.application.ArgumentApplicationParser; import eu.dnetlib.dhp.common.HdfsSupport; import eu.dnetlib.dhp.schema.common.ModelConstants; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index aa5a19f11..7752fbc27 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -7,8 +7,6 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; -import eu.dnetlib.dhp.schema.oaf.KeyValue; -import eu.dnetlib.dhp.schema.oaf.Project; import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -28,6 +26,8 @@ import org.slf4j.LoggerFactory; import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; +import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; public class SparkAtomicActionScoreJobTest { @@ -73,15 +73,16 @@ public class SparkAtomicActionScoreJobTest { } private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception { - SparkAtomicActionScoreJob.main( - new String[] { + SparkAtomicActionScoreJob + .main( + new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), "-inputPath", inputPath, "-outputPath", outputPath, "-targetEntity", targetEntity, - } - ); + }); } + @Test void testResultScores() throws Exception { final String targetEntity = RESULT; @@ -149,8 +150,8 @@ public class SparkAtomicActionScoreJobTest { void testProjectScores() throws Exception { String targetEntity = PROJECT; String inputResultScores = getClass() - 
.getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") - .getPath(); + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") + .getPath(); String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; // execute the job to generate the action sets for project scores @@ -159,9 +160,9 @@ public class SparkAtomicActionScoreJobTest { final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); JavaRDD projects = sc - .sequenceFile(outputPath, Text.class, Text.class) - .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(aa -> ((Project) aa.getPayload())); + .sequenceFile(outputPath, Text.class, Text.class) + .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) + .map(aa -> ((Project) aa.getPayload())); // test the number of projects assertEquals(4, projects.count()); @@ -171,7 +172,8 @@ public class SparkAtomicActionScoreJobTest { // count that the project with id testProjectId is present assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count()); - projects.filter(row -> row.getId().equals(testProjectId)) + projects + .filter(row -> row.getId().equals(testProjectId)) .flatMap(r -> r.getMeasures().iterator()) .foreach(m -> { log.info(m.getId() + " " + m.getUnit()); @@ -184,7 +186,7 @@ public class SparkAtomicActionScoreJobTest { // ensure that the correct key is provided, i.e. score assertEquals("score", kv.getKey()); - switch(m.getId()) { + switch (m.getId()) { case "numOfInfluentialResults": assertEquals("0", kv.getValue()); break; diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java index c68bfa13a..b30658feb 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/PrepareH2020ProgrammeTest.java @@ -92,7 +92,7 @@ public class PrepareH2020ProgrammeTest { Assertions.assertEquals(0, verificationDataset.filter("classification = ''").count()); - //tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme))); + // tmp.foreach(csvProgramme -> System.out.println(OBJECT_MAPPER.writeValueAsString(csvProgramme))); Assertions .assertEquals( diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java index 4be09c4b7..0d92c48a8 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadProjectsTest.java @@ -98,7 +98,7 @@ public class ReadProjectsTest { Assertions.assertEquals("H2020-EU.1.3.", project.getLegalBasis()); Assertions.assertEquals("MSCA-IF-2019", project.getTopics()); - //tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); + // tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); } } diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java index 
bdb0cc3a1..82a9e6aed 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/project/ReadTopicTest.java @@ -93,7 +93,7 @@ public class ReadTopicTest { Assertions.assertEquals("Individual Fellowships", topic.getTitle()); Assertions.assertEquals("MSCA-IF-2019", topic.getTopic()); - //tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); + // tmp.foreach(p -> System.out.println(OBJECT_MAPPER.writeValueAsString(p))); } } diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties index 9d6c94ca9..b1598910d 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/job.properties @@ -97,7 +97,7 @@ oozie.wf.application.path=${oozieTopWfApplicationPath} # Path where the final output should be? -actionSetOutputPath=${workingDir}/bip_actionsets/ +actionSetOutputPath=${workingDir}/bip_actionsets # The directory to store project impact indicators projectImpactIndicatorsOutput=${workingDir}/project_indicators diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 6eb783941..65067dace 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -24,20 +24,21 @@ ${wf:conf('resume') eq "impulse"} ${wf:conf('resume') eq "pagerank"} ${wf:conf('resume') eq "attrank"} - ${wf:conf('resume') eq "format-results"} ${wf:conf('resume') eq "map-ids"} ${wf:conf('resume') eq "map-scores"} ${wf:conf('resume') eq "start"} - ${wf:conf('resume') eq "projects-impact"} - + + ${wf:conf('resume') eq "projects-impact"} + ${wf:conf('resume') eq "projects-impact-actionsets"} + - + @@ -479,7 +480,7 @@ - + @@ -526,7 +527,7 @@ - + @@ -568,14 +569,14 @@ - - + + - + @@ -583,13 +584,13 @@ - + - + - yarn + yarn-cluster cluster Produces the atomic action with the bip finder scores for publications eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob @@ -640,7 +641,7 @@ - ${openaireDataInput}/relations + ${openaireDataInput}/relation ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['pr_file']} @@ -658,16 +659,16 @@ - + - + - yarn + yarn-cluster cluster Produces the atomic action with the bip finder scores for projects eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob From cb0f3c50f69bba7c0db137117e973bb8bd865c3f Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 21 Jul 2023 16:07:10 +0300 Subject: [PATCH 38/41] Format workflow.xml --- .../impact_indicators/oozie_app/workflow.xml | 182 +++--------------- 1 file changed, 32 insertions(+), 150 deletions(-) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml 
b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 65067dace..349e054d8 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -13,7 +13,6 @@ - @@ -38,27 +37,14 @@ - + - - - - - yarn-cluster cluster - - - Openaire Ranking Graph Creation - + OpenAIRE Ranking Graph Creation create_openaire_ranking_graph.py - --executor-memory=${sparkHighExecutorMemory} @@ -80,39 +66,30 @@ ${sparkShufflePartitions} ${openaireGraphInputPath} - + ${wfAppPath}/create_openaire_ranking_graph.py#create_openaire_ranking_graph.py - - - - + - - yarn-cluster cluster - - - Spark CC - + Citation Count calculation CC.py - --executor-memory=${sparkHighExecutorMemory} @@ -129,31 +106,23 @@ ${openaireGraphInputPath} ${sparkShufflePartitions} - + ${wfAppPath}/bip-ranker/CC.py#CC.py - - - + - - yarn-cluster cluster - - - Spark RAM - + RAM calculation TAR.py - --executor-memory=${sparkHighExecutorMemory} @@ -171,37 +140,27 @@ ${ramGamma} ${currentYear} RAM - ${sparkShufflePartitions} ${checkpointDir} - + ${wfAppPath}/bip-ranker/TAR.py#TAR.py - - - + - - - yarn-cluster cluster - - - Spark Impulse - + Impulse calculation CC.py - --executor-memory=${sparkHighExecutorMemory} @@ -219,47 +178,22 @@ ${sparkShufflePartitions} 3 - + ${wfAppPath}/bip-ranker/CC.py#CC.py - - - - - - - - - - - - - - - - yarn-cluster cluster - - - Spark Pagerank - + Pagerank calculation PageRank.py - --executor-memory=${sparkHighExecutorMemory} @@ -280,31 +214,22 @@ ${sparkShufflePartitions} dfs - + ${wfAppPath}/bip-ranker/PageRank.py#PageRank.py - - - - - yarn-cluster cluster - - - Spark AttRank - + AttRank calculation AttRank.py - --executor-memory=${sparkHighExecutorMemory} @@ -330,27 +255,16 @@ ${sparkShufflePartitions} dfs - + ${wfAppPath}/bip-ranker/AttRank.py#AttRank.py - - - - - - - @@ -360,15 +274,12 @@ ${workingDir} - ${wfAppPath}/get_ranking_files.sh#get_ranking_files.sh - - @@ -383,18 +294,12 @@ - - yarn-cluster cluster - - Format Ranking Results JSON - format_ranking_results.py - --executor-memory=${sparkNormalExecutorMemory} @@ -419,13 +324,11 @@ ${sparkShufflePartitions} openaire - + ${wfAppPath}/format_ranking_results.py#format_ranking_results.py - - @@ -471,18 +374,15 @@ ${wfAppPath}/format_ranking_results.py#format_ranking_results.py - - - + - + - @@ -490,15 +390,10 @@ - yarn-cluster cluster - - Openaire-DOI synonym collection - map_openaire_ids_to_dois.py - --executor-memory=${sparkHighExecutorMemory} @@ -515,19 +410,16 @@ ${openaireDataInput}/ ${synonymFolder} - + ${wfAppPath}/map_openaire_ids_to_dois.py#map_openaire_ids_to_dois.py - - - - + @@ -535,12 +427,8 @@ yarn-cluster cluster - - Mapping Openaire Scores to DOIs - map_scores_to_dois.py - --executor-memory=${sparkHighExecutorMemory} @@ -564,18 +452,15 @@ ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['impulse_file']} ${nameNode}/${workingDir}/${wf:actionData('get-file-names')['ram_file']} - ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py - - - + @@ -590,11 +475,13 @@ + yarn-cluster cluster Produces the atomic action with the bip finder scores for publications eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -609,23 +496,19 @@ 
--outputPath${actionSetOutputPath}/results/ --targetEntityresult + + - - yarn-cluster cluster - - - Project Impact Indicators - + Project Impact Indicators calculation projects_impact.py - --executor-memory=${sparkHighExecutorMemory} @@ -639,7 +522,6 @@ - ${openaireDataInput}/relation @@ -653,26 +535,23 @@ ${sparkShufflePartitions} ${projectImpactIndicatorsOutput} - - ${wfAppPath}/projects_impact.py#projects_impact.py - - - + yarn-cluster cluster Produces the atomic action with the bip finder scores for projects eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar + --executor-memory=${sparkNormalExecutorMemory} --executor-cores=${sparkExecutorCores} @@ -683,12 +562,15 @@ --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} + --inputPath${projectImpactIndicatorsOutput} --outputPath${actionSetOutputPath}/projects/ --targetEntityproject + + From 2374f445a941a76fe239a95e75a5e491c12a22bf Mon Sep 17 00:00:00 2001 From: ikanellos Date: Fri, 21 Jul 2023 17:42:46 +0300 Subject: [PATCH 39/41] Produce additional bip update specific files --- .../oozie_app/map_scores_to_dois.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py index 0fc67eb53..f6a8e9996 100755 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/map_scores_to_dois.py @@ -15,6 +15,8 @@ from pyspark.sql.types import * # Import sql functions with shorthand alias import pyspark.sql.functions as F + +from pyspark.sql.functions import max # from pyspark.sql.functions import udf ################################################################################################# ################################################################################################# @@ -127,6 +129,10 @@ for offset, input_file in enumerate(input_file_list): # Load file to dataframe ranking_df = spark.read.schema(schema).option('delimiter', '\t').csv(input_file).repartition(num_partitions, 'id') + + # Get max score + max_score = ranking_df.select(max('score').alias('max')).collect()[0]['max'] + print ("Max Score for " + str(input_file) + " is " + str(max_score)) # TESTING # print ("Loaded df sample:") @@ -138,6 +144,15 @@ for offset, input_file in enumerate(input_file_list): output_file = output_file_list[offset] print ("Writing to: " + output_file) doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') + + # Creata another file for the bip update process + ranking_df = ranking_df.select('id', 'score', F.lit(F.col('score')/max_score).alias('normalized_score'), 'class', F.col('class').alias('class_dup')) + doi_score_df = synonym_df.join(ranking_df, ['id']).select('doi', 'score', 'normalized_score', 'class', 'class_dup').repartition(num_partitions, 'doi').cache() + output_file = output_file.replace(".txt.gz", "_for_bip_update.txt.gz") + print ("Writing bip update to: " + output_file) + doi_score_df.write.mode('overwrite').option('delimiter','\t').option('header',False).csv(output_file, compression='gzip') + + # Free 
memory? ranking_df.unpersist(True) From 3a0f09774a941b38e641b2d74ea073a9b6bce187 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 21 Jul 2023 17:55:41 +0300 Subject: [PATCH 40/41] Add script to find score limits --- .../oozie_app/get_score_limits.sh | 63 +++++++++++++++++++ 1 file changed, 63 insertions(+) create mode 100644 dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh new file mode 100644 index 000000000..6d4161d7f --- /dev/null +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/get_score_limits.sh @@ -0,0 +1,63 @@ +#/usr/bin/bash + +# Read log files from ranking scripts and create a two-line file +# with score limits for the various measures. To be used by Kleanthis + +attrank_file=$(ls *attrank*.log); +pr_file=$(ls *pagerank*.log) +ram_file=$(ls *ram*.log); +cc_file=$(ls *cc*.log); +impulse_file=$(ls *impulse*.log); + +echo +echo "-----------------------------" +echo "Attrank file:${attrank_file}"; +echo "PageRank file:${pr_file}"; +echo "RAM file:${ram_file}"; +echo "CC file:${cc_file}"; +echo "Impulse file:${impulse_file}"; +echo "-----------------------------" +echo +echo + +# output file will be called score_limits.csv +echo -e "influence_top001\tinfluence_top01\tinfluence_top1\tinfluence_top10\tpopularity_top001\tpopularity_top01\tpopularity_top1\tpopularity_top10\timpulse_top001\timpulse_top01\timpulse_top1\timpulse_top10\tcc_top001\tcc_top01\tcc_top1\tcc_top10" > score_limits.csv +# ---------------------------------------------------- # +# Get respective score limits (we don't need RAM) +inf_001=$(grep "^0.01%" ${pr_file} | cut -f 2); +inf_01=$(grep "^0.1%" ${pr_file} | cut -f 2); +inf_1=$(grep "^1%" ${pr_file} | cut -f 2); +inf_10=$(grep "^10%" ${pr_file} | cut -f 2); +echo "Influnence limits:" +echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}"; +# ---------------------------------------------------- # +pop_001=$(grep "^0.01%" ${attrank_file} | cut -f 2); +pop_01=$(grep "^0.1%" ${attrank_file} | cut -f 2); +pop_1=$(grep "^1%" ${attrank_file} | cut -f 2); +pop_10=$(grep "^10%" ${attrank_file} | cut -f 2); +echo "Popularity limits:"; +echo -e "${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}"; +# ---------------------------------------------------- # +imp_001=$(grep "^0.01%" ${impulse_file} | cut -f 2); +imp_01=$(grep "^0.1%" ${impulse_file} | cut -f 2); +imp_1=$(grep "^1%" ${impulse_file} | cut -f 2); +imp_10=$(grep "^10%" ${impulse_file} | cut -f 2); +echo "Popularity limits:"; +echo -e "${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}"; +# ---------------------------------------------------- # +cc_001=$(grep "^0.01%" ${cc_file} | cut -f 2); +cc_01=$(grep "^0.1%" ${cc_file} | cut -f 2); +cc_1=$(grep "^1%" ${cc_file} | cut -f 2); +cc_10=$(grep "^10%" ${cc_file} | cut -f 2); +echo "Popularity limits:"; +echo -e "${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}"; +# ---------------------------------------------------- # + +echo -e "${inf_001}\t${inf_01}\t${inf_1}\t${inf_10}\t${pop_001}\t${pop_01}\t${pop_1}\t${pop_10}\t${imp_001}\t${imp_01}\t${imp_1}\t${imp_10}\t${cc_001}\t${cc_01}\t${cc_1}\t${cc_10}" >> score_limits.csv + +echo +echo "score_limits.csv contents:" +cat 
score_limits.csv + +echo; +echo; From 97c1ba89187b5c57b6cac3263dd8c9d855c586d8 Mon Sep 17 00:00:00 2001 From: Serafeim Chatzopoulos Date: Fri, 11 Aug 2023 15:56:53 +0300 Subject: [PATCH 41/41] Merge actionsets of results and projects --- .../bipfinder/SparkAtomicActionScoreJob.java | 62 ++++--- .../bipfinder/input_actionset_parameter.json | 18 +-- .../SparkAtomicActionScoreJobTest.java | 152 +++++++++--------- .../impact_indicators/oozie_app/workflow.xml | 76 +++------ 4 files changed, 130 insertions(+), 178 deletions(-) diff --git a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java index fb11e829f..040c89782 100644 --- a/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java +++ b/dhp-workflows/dhp-aggregation/src/main/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJob.java @@ -6,13 +6,14 @@ import static eu.dnetlib.dhp.common.SparkSessionSupport.runWithSparkSession; import java.io.Serializable; import java.util.List; -import java.util.Optional; import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.GzipCodec; import org.apache.hadoop.mapred.SequenceFileOutputFormat; import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.api.java.function.MapFunction; @@ -41,8 +42,6 @@ import scala.Tuple2; */ public class SparkAtomicActionScoreJob implements Serializable { - private static final String RESULT = "result"; - private static final String PROJECT = "project"; private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJob.class); private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper(); @@ -61,15 +60,15 @@ public class SparkAtomicActionScoreJob implements Serializable { Boolean isSparkSessionManaged = isSparkSessionManaged(parser); log.info("isSparkSessionManaged: {}", isSparkSessionManaged); - final String inputPath = parser.get("inputPath"); - log.info("inputPath: {}", inputPath); + final String resultsInputPath = parser.get("resultsInputPath"); + log.info("resultsInputPath: {}", resultsInputPath); + + final String projectsInputPath = parser.get("projectsInputPath"); + log.info("projectsInputPath: {}", projectsInputPath); final String outputPath = parser.get("outputPath"); log.info("outputPath: {}", outputPath); - final String targetEntity = parser.get("targetEntity"); - log.info("targetEntity: {}", targetEntity); - SparkConf conf = new SparkConf(); runWithSparkSession( @@ -78,26 +77,23 @@ public class SparkAtomicActionScoreJob implements Serializable { spark -> { removeOutputDir(spark, outputPath); - // follow different procedures for different target entities - switch (targetEntity) { - case RESULT: - prepareResults(spark, inputPath, outputPath); - break; - case PROJECT: - prepareProjects(spark, inputPath, outputPath); - break; - default: - throw new RuntimeException("Unknown target entity: " + targetEntity); - } + JavaPairRDD resultsRDD = prepareResults(spark, resultsInputPath, outputPath); + JavaPairRDD projectsRDD = prepareProjects(spark, projectsInputPath, outputPath); + + resultsRDD + .union(projectsRDD) + .saveAsHadoopFile( + outputPath, Text.class, Text.class, 
SequenceFileOutputFormat.class, GzipCodec.class); }); } - private static void prepareProjects(SparkSession spark, String inputPath, String outputPath) { + private static JavaPairRDD prepareProjects(SparkSession spark, String inputPath, + String outputPath) { // read input bip project scores Dataset projectScores = readPath(spark, inputPath, BipProjectModel.class); - projectScores.map((MapFunction) bipProjectScores -> { + return projectScores.map((MapFunction) bipProjectScores -> { Project project = new Project(); project.setId(bipProjectScores.getProjectId()); project.setMeasures(bipProjectScores.toMeasures()); @@ -107,12 +103,12 @@ public class SparkAtomicActionScoreJob implements Serializable { .map(p -> new AtomicAction(Project.class, p)) .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); + new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } - private static void prepareResults(SparkSession spark, String bipScorePath, String outputPath) { + private static JavaPairRDD prepareResults(SparkSession spark, String bipScorePath, + String outputPath) { final JavaSparkContext sc = JavaSparkContext.fromSparkContext(spark.sparkContext()); @@ -128,24 +124,20 @@ public class SparkAtomicActionScoreJob implements Serializable { return bs; }).collect(Collectors.toList()).iterator()).rdd(), Encoders.bean(BipScore.class)); - bipScores + return bipScores.map((MapFunction) bs -> { + Result ret = new Result(); - .map((MapFunction) bs -> { - Result ret = new Result(); + ret.setId(bs.getId()); - ret.setId(bs.getId()); + ret.setMeasures(getMeasure(bs)); - ret.setMeasures(getMeasure(bs)); - - return ret; - }, Encoders.bean(Result.class)) + return ret; + }, Encoders.bean(Result.class)) .toJavaRDD() .map(p -> new AtomicAction(Result.class, p)) .mapToPair( aa -> new Tuple2<>(new Text(aa.getClazz().getCanonicalName()), - new Text(OBJECT_MAPPER.writeValueAsString(aa)))) - .saveAsHadoopFile(outputPath, Text.class, Text.class, SequenceFileOutputFormat.class); - + new Text(OBJECT_MAPPER.writeValueAsString(aa)))); } private static List getMeasure(BipScore value) { diff --git a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json index d6b93c5af..c472eb5e6 100644 --- a/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json +++ b/dhp-workflows/dhp-aggregation/src/main/resources/eu/dnetlib/dhp/actionmanager/bipfinder/input_actionset_parameter.json @@ -6,9 +6,15 @@ "paramRequired": false }, { - "paramName": "ip", - "paramLongName": "inputPath", - "paramDescription": "the URL from where to get the programme file", + "paramName": "rip", + "paramLongName": "resultsInputPath", + "paramDescription": "the URL from where to get the input file for results", + "paramRequired": true + }, + { + "paramName": "pip", + "paramLongName": "projectsInputPath", + "paramDescription": "the URL from where to get the input file for projects", "paramRequired": true }, { @@ -16,11 +22,5 @@ "paramLongName": "outputPath", "paramDescription": "the path of the new ActionSet", "paramRequired": true - }, - { - "paramName": "te", - "paramLongName": "targetEntity", - "paramDescription": "the type of target entity to be enriched; currently supported 
one of { 'result', 'project' }", - "paramRequired": true } ] \ No newline at end of file diff --git a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java index 7752fbc27..542354836 100644 --- a/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java +++ b/dhp-workflows/dhp-aggregation/src/test/java/eu/dnetlib/dhp/actionmanager/bipfinder/SparkAtomicActionScoreJobTest.java @@ -7,6 +7,8 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import javax.xml.crypto.Data; + import org.apache.commons.io.FileUtils; import org.apache.hadoop.io.Text; import org.apache.spark.SparkConf; @@ -27,6 +29,7 @@ import com.fasterxml.jackson.databind.ObjectMapper; import eu.dnetlib.dhp.schema.action.AtomicAction; import eu.dnetlib.dhp.schema.oaf.KeyValue; +import eu.dnetlib.dhp.schema.oaf.OafEntity; import eu.dnetlib.dhp.schema.oaf.Project; import eu.dnetlib.dhp.schema.oaf.Result; @@ -38,9 +41,6 @@ public class SparkAtomicActionScoreJobTest { private static Path workingDir; - private final static String RESULT = "result"; - private final static String PROJECT = "project"; - private static final Logger log = LoggerFactory.getLogger(SparkAtomicActionScoreJobTest.class); @BeforeAll @@ -72,50 +72,64 @@ public class SparkAtomicActionScoreJobTest { spark.stop(); } - private void runJob(String inputPath, String outputPath, String targetEntity) throws Exception { + private void runJob(String resultsInputPath, String projectsInputPath, String outputPath) throws Exception { SparkAtomicActionScoreJob .main( new String[] { "-isSparkSessionManaged", Boolean.FALSE.toString(), - "-inputPath", inputPath, + "-resultsInputPath", resultsInputPath, + "-projectsInputPath", projectsInputPath, "-outputPath", outputPath, - "-targetEntity", targetEntity, }); } @Test - void testResultScores() throws Exception { - final String targetEntity = RESULT; - String inputResultScores = getClass() + void testScores() throws Exception { + + String resultsInputPath = getClass() .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/result_bip_scores.json") .getPath(); - String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; + + String projectsInputPath = getClass() + .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") + .getPath(); + + String outputPath = workingDir.toString() + "/actionSet"; // execute the job to generate the action sets for result scores - runJob(inputResultScores, outputPath, targetEntity); + runJob(resultsInputPath, projectsInputPath, outputPath); final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - JavaRDD tmp = sc + JavaRDD tmp = sc .sequenceFile(outputPath, Text.class, Text.class) .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(aa -> ((Result) aa.getPayload())); + .map(aa -> ((OafEntity) aa.getPayload())); - assertEquals(4, tmp.count()); + assertEquals(8, tmp.count()); - Dataset verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(Result.class)); + Dataset verificationDataset = spark.createDataset(tmp.rdd(), Encoders.bean(OafEntity.class)); verificationDataset.createOrReplaceTempView("result"); - Dataset execVerification = spark + Dataset testDataset = spark .sql( "Select p.id oaid, mes.id, mUnit.value from result p " + 
"lateral view explode(measures) m as mes " + "lateral view explode(mes.unit) u as mUnit "); - Assertions.assertEquals(12, execVerification.count()); +// execVerification.show(); + + Assertions.assertEquals(28, testDataset.count()); + + assertResultImpactScores(testDataset); + assertProjectImpactScores(testDataset); + + } + + void assertResultImpactScores(Dataset testDataset) { Assertions .assertEquals( - "6.63451994567e-09", execVerification + "6.63451994567e-09", testDataset .filter( "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "and id = 'influence'") @@ -125,7 +139,7 @@ public class SparkAtomicActionScoreJobTest { .getString(0)); Assertions .assertEquals( - "0.348694533145", execVerification + "0.348694533145", testDataset .filter( "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "and id = 'popularity_alt'") @@ -135,7 +149,7 @@ public class SparkAtomicActionScoreJobTest { .getString(0)); Assertions .assertEquals( - "2.16094680115e-09", execVerification + "2.16094680115e-09", testDataset .filter( "oaid='50|arXiv_dedup_::4a2d5fd8d71daec016c176ec71d957b1' " + "and id = 'popularity'") @@ -143,65 +157,49 @@ public class SparkAtomicActionScoreJobTest { .collectAsList() .get(0) .getString(0)); - } - @Test - void testProjectScores() throws Exception { - String targetEntity = PROJECT; - String inputResultScores = getClass() - .getResource("/eu/dnetlib/dhp/actionmanager/bipfinder/project_bip_scores.json") - .getPath(); - String outputPath = workingDir.toString() + "/" + targetEntity + "/actionSet"; + void assertProjectImpactScores(Dataset testDataset) throws Exception { - // execute the job to generate the action sets for project scores - runJob(inputResultScores, outputPath, PROJECT); - - final JavaSparkContext sc = new JavaSparkContext(spark.sparkContext()); - - JavaRDD projects = sc - .sequenceFile(outputPath, Text.class, Text.class) - .map(value -> OBJECT_MAPPER.readValue(value._2().toString(), AtomicAction.class)) - .map(aa -> ((Project) aa.getPayload())); - - // test the number of projects - assertEquals(4, projects.count()); - - String testProjectId = "40|nih_________::c02a8233e9b60f05bb418f0c9b714833"; - - // count that the project with id testProjectId is present - assertEquals(1, projects.filter(row -> row.getId().equals(testProjectId)).count()); - - projects - .filter(row -> row.getId().equals(testProjectId)) - .flatMap(r -> r.getMeasures().iterator()) - .foreach(m -> { - log.info(m.getId() + " " + m.getUnit()); - - // ensure that only one score is present for each bip impact measure - assertEquals(1, m.getUnit().size()); - - KeyValue kv = m.getUnit().get(0); - - // ensure that the correct key is provided, i.e. 
score - assertEquals("score", kv.getKey()); - - switch (m.getId()) { - case "numOfInfluentialResults": - assertEquals("0", kv.getValue()); - break; - case "numOfPopularResults": - assertEquals("1", kv.getValue()); - break; - case "totalImpulse": - assertEquals("25", kv.getValue()); - break; - case "totalCitationCount": - assertEquals("43", kv.getValue()); - break; - default: - fail("Unknown measure id in the context of projects"); - } - }); + Assertions + .assertEquals( + "0", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'numOfInfluentialResults'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "1", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'numOfPopularResults'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "25", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'totalImpulse'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); + Assertions + .assertEquals( + "43", testDataset + .filter( + "oaid='40|nih_________::c02a8233e9b60f05bb418f0c9b714833' " + + "and id = 'totalCitationCount'") + .select("value") + .collectAsList() + .get(0) + .getString(0)); } } diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml index 349e054d8..c225fa3e1 100644 --- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml +++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml @@ -31,7 +31,7 @@ ${wf:conf('resume') eq "projects-impact"} - ${wf:conf('resume') eq "projects-impact-actionsets"} + ${wf:conf('resume') eq "create-actionset"} @@ -455,53 +455,11 @@ ${wfAppPath}/map_scores_to_dois.py#map_scores_to_dois.py - + - - - - - - - - - - - - - - - - - yarn-cluster - cluster - Produces the atomic action with the bip finder scores for publications - eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob - dhp-aggregation-${projectVersion}.jar - - - --executor-memory=${sparkNormalExecutorMemory} - --executor-cores=${sparkExecutorCores} - --driver-memory=${sparkNormalDriverMemory} - --conf spark.extraListeners=${spark2ExtraListeners} - --conf spark.sql.queryExecutionListeners=${spark2SqlQueryExecutionListeners} - --conf spark.yarn.historyServer.address=${spark2YarnHistoryServerAddress} - --conf spark.eventLog.dir=${nameNode}${spark2EventLogDir} - --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - - --inputPath${bipScorePath} - --outputPath${actionSetOutputPath}/results/ - --targetEntityresult - - - - - - - @@ -538,17 +496,26 @@ ${wfAppPath}/projects_impact.py#projects_impact.py - + - - + + + + + + + + + + + yarn-cluster cluster - Produces the atomic action with the bip finder scores for projects + Produces the atomic action with the bip finder scores eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob dhp-aggregation-${projectVersion}.jar @@ -563,14 +530,13 @@ --conf spark.sql.warehouse.dir=${sparkSqlWarehouseDir} - --inputPath${projectImpactIndicatorsOutput} - --outputPath${actionSetOutputPath}/projects/ - --targetEntityproject + --resultsInputPath${bipScorePath} + 
--projectsInputPath${projectImpactIndicatorsOutput} + --outputPath${actionSetOutputPath} - - + @@ -630,10 +596,6 @@ Calculating project impact indicators failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - - ActionSet creation for projects failed, error message[${wf:errorMessage(wf:lastErrorNode())}] - -
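
Note (after patch 41/41): result and project scores are now written by a single action-set job. A minimal spark-submit sketch of the merged invocation is shown below — the class name, jar naming and parameter names come from the patches above, while the submit wrapper and the HDFS paths are illustrative placeholders; the Oozie `create-actionset` action remains the reference invocation.

```
spark-submit \
  --class eu.dnetlib.dhp.actionmanager.bipfinder.SparkAtomicActionScoreJob \
  dhp-aggregation-${projectVersion}.jar \
  --resultsInputPath /path/to/bip_scores \
  --projectsInputPath /path/to/project_indicators \
  --outputPath /path/to/bip_actionsets
```

As implemented in patch 41, the job unions the result and project AtomicAction pair RDDs and saves them as a single gzip-compressed SequenceFile under the given output path.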