From ec4e01068759a48fdfcd94d4e3854059b61f0d42 Mon Sep 17 00:00:00 2001
From: ikanellos <ilias.kanellos@gmail.com>
Date: Tue, 23 May 2023 16:44:04 +0300
Subject: [PATCH] End after rankings | Create graph debugged

---
 .../oozie_app/create_openaire_ranking_graph.py         | 10 ++++++++--
 .../oa/graph/impact_indicators/oozie_app/workflow.xml  |  5 +++--
 2 files changed, 11 insertions(+), 4 deletions(-)
diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
index 3d131933d..6dd4427b9 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
@@ -114,6 +114,12 @@ print ("Total num of research objects: " + str(oa_objects_df.count()))
 # Keep only required fields - we still keep resulttype.classname to
 # filter the citation relationships we consider valid
 oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache()
+
+# Debugging aid (disabled): uncomment the three statements below to print
+# the schema of oa_objects_df and stop before the relations are processed.
+# print ("OA objects Schema:")
+# oa_objects_df.printSchema()
+# sys.exit(0)
 ############################################################################################################################
 # 2. Get the relation objects and filter them based on their existence in the oa_objects_df
 #    NOTE: we are only interested in citations of type "cites"
@@ -154,8 +160,8 @@ cites_df  = spark.read.json(graph_folder + "/relation")\
 # references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
 # print ("References df now has: " + str(references_df.count()) +  " entries")
 
-cites_df = cites_df.join(oa_objects_df.select('id'), cites_df.citing == oa_objects_df.id).where( F.col('resulttype.classname').isin(valid_result_types) ).drop('id').drop('resulttype.classname')
-cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').drop('resulttype.classname').distinct().repartition(num_partitions, 'citing').cache()
+cites_df = cites_df.join(oa_objects_df.select('id', 'classname'), cites_df.citing == oa_objects_df.id).where( F.col('classname').isin(valid_result_types) ).drop('id').drop('classname')
+cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()  # joined 'id' equals 'cited'; drop it before caching
 # TODO: add here a clause filtering out the citations 
 # originating from "other" types of research objects which we consider valid
 
diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
index 285a66382..78cf92bd2 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/workflow.xml
@@ -81,7 +81,7 @@
 		</spark>
 
 		<!-- Do this after finishing okay -->
-		<ok to="end" />
+		<ok to="non-iterative-rankings" />
 		<!-- Go there if we have an error -->
 		<error to="openaire-graph-error" />
 
@@ -335,7 +335,8 @@
 	</action>
 
 	<!-- JOIN ITERATIVE METHODS AND THEN END -->
-	<join name="join-iterative-rankings" to="get-file-names"/>
+	<join name="join-iterative-rankings" to="end"/>
+	<!-- Previous target, temporarily bypassed: to="get-file-names" -->
 
 
 	<!-- This will be a shell action that will output key-value pairs for output files -->