forked from D-Net/dnet-hadoop
Remove unnecessary counts in graph creation
This commit is contained in:
parent
ec4e010687
commit
6a7e370a21
|
@ -196,15 +196,19 @@ oa_objects_df.printSchema()
|
||||||
# cited_by_df.unpersist(True)
|
# cited_by_df.unpersist(True)
|
||||||
|
|
||||||
# Show total num of unique citations
|
# Show total num of unique citations
|
||||||
|
'''
|
||||||
num_unique_citations = citations_df.count()
|
num_unique_citations = citations_df.count()
|
||||||
print ("Total unique citations: " + str(num_unique_citations))
|
print ("Total unique citations: " + str(num_unique_citations))
|
||||||
|
'''
|
||||||
############################################################################################################################
|
############################################################################################################################
|
||||||
# 3. Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references)
|
# 3. Get any potentially missing 'citing' papers from references (these are dangling nodes w/o any outgoing references)
|
||||||
dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\
|
dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\
|
||||||
.select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing')
|
.select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing')
|
||||||
# Count dangling nodes
|
# Count dangling nodes
|
||||||
|
'''
|
||||||
dangling_num = dangling_nodes.count()
|
dangling_num = dangling_nodes.count()
|
||||||
print ("Number of dangling nodes: " + str(dangling_num))
|
print ("Number of dangling nodes: " + str(dangling_num))
|
||||||
|
'''
|
||||||
# print ("Dangling nodes sample:")
|
# print ("Dangling nodes sample:")
|
||||||
# dangling_nodes.show(10, False)
|
# dangling_nodes.show(10, False)
|
||||||
############################################################################################################################
|
############################################################################################################################
|
||||||
|
@ -213,8 +217,10 @@ graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited')
|
||||||
# Free space
|
# Free space
|
||||||
citations_df.unpersist(True)
|
citations_df.unpersist(True)
|
||||||
|
|
||||||
|
'''
|
||||||
num_nodes = graph.count()
|
num_nodes = graph.count()
|
||||||
print ("Entries in graph before dangling nodes:" + str(num_nodes))
|
print ("Entries in graph before dangling nodes:" + str(num_nodes))
|
||||||
|
'''
|
||||||
# print ("Sample in graph: ")
|
# print ("Sample in graph: ")
|
||||||
# graph.show(10, False)
|
# graph.show(10, False)
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue