From 6a7e370a21d23ec987291da0faa7994f814106e5 Mon Sep 17 00:00:00 2001
From: ikanellos
Date: Tue, 23 May 2023 16:48:58 +0300
Subject: [PATCH] Remove unnecessary counts in graph creation

---
 .../oozie_app/create_openaire_ranking_graph.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
index 6dd4427b9..2b6b4aae9 100644
--- a/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
+++ b/dhp-workflows/dhp-impact-indicators/src/main/resources/eu/dnetlib/dhp/oa/graph/impact_indicators/oozie_app/create_openaire_ranking_graph.py
@@ -196,15 +196,19 @@ oa_objects_df.printSchema()
 # cited_by_df.unpersist(True)
 
 # Show total num of unique citations
+'''
 num_unique_citations = citations_df.count()
 print ("Total unique citations: " + str(num_unique_citations))
+'''
 ############################################################################################################################
 # 3. Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references)
 dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\
     .select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing')
 # Count dangling nodes
+'''
 dangling_num = dangling_nodes.count()
 print ("Number of dangling nodes: " + str(dangling_num))
+'''
 # print ("Dangling nodes sample:")
 # dangling_nodes.show(10, False)
 ############################################################################################################################
@@ -213,8 +217,10 @@ graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited')
 # Free space
 citations_df.unpersist(True)
 
+'''
 num_nodes = graph.count()
 print ("Entries in graph before dangling nodes:" + str(num_nodes))
+'''
 # print ("Sample in graph: ")
 # graph.show(10, False)
 