forked from D-Net/dnet-hadoop
End after rankings | Create graph debugged
This commit is contained in:
parent
38020e242a
commit
ec4e010687
|
@ -114,6 +114,12 @@ print ("Total num of research objects: " + str(oa_objects_df.count()))
|
||||||
# Keep only required fields - we still keep resulttype.classname to
|
# Keep only required fields - we still keep resulttype.classname to
|
||||||
# filter the citation relationships we consider valid
|
# filter the citation relationships we consider valid
|
||||||
oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache()
|
oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache()
|
||||||
|
|
||||||
|
'''
|
||||||
|
print ("OA objects Schema:")
|
||||||
|
oa_objects_df.printSchema()
|
||||||
|
sys.exit(0)
|
||||||
|
'''
|
||||||
############################################################################################################################
|
############################################################################################################################
|
||||||
# 2. Get the relation objects and filter them based on their existence in the oa_objects_df
|
# 2. Get the relation objects and filter them based on their existence in the oa_objects_df
|
||||||
# NOTE: we are only interested in citations of type "cites"
|
# NOTE: we are only interested in citations of type "cites"
|
||||||
|
@ -154,8 +160,8 @@ cites_df = spark.read.json(graph_folder + "/relation")\
|
||||||
# references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
|
# references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
|
||||||
# print ("References df now has: " + str(references_df.count()) + " entries")
|
# print ("References df now has: " + str(references_df.count()) + " entries")
|
||||||
|
|
||||||
cites_df = cites_df.join(oa_objects_df.select('id'), cites_df.citing == oa_objects_df.id).where( F.col('resulttype.classname').isin(valid_result_types) ).drop('id').drop('resulttype.classname')
|
cites_df = cites_df.join(oa_objects_df.select('id', 'classname'), cites_df.citing == oa_objects_df.id).where( F.col('classname').isin(valid_result_types) ).drop('id').drop('classname')
|
||||||
cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).drop('id').drop('resulttype.classname').distinct().repartition(num_partitions, 'citing').cache()
|
cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).distinct().repartition(num_partitions, 'citing').cache()
|
||||||
# TODO: add here a clause filtering out the citations
|
# TODO: add here a clause filtering out the citations
|
||||||
# originating from "other" types of research objects which we consider valid
|
# originating from "other" types of research objects which we consider valid
|
||||||
|
|
||||||
|
|
|
@ -81,7 +81,7 @@
|
||||||
</spark>
|
</spark>
|
||||||
|
|
||||||
<!-- Do this after finishing okay -->
|
<!-- Do this after finishing okay -->
|
||||||
<ok to="end" />
|
<ok to="non-iterative-rankings" />
|
||||||
<!-- Go there if we have an error -->
|
<!-- Go there if we have an error -->
|
||||||
<error to="openaire-graph-error" />
|
<error to="openaire-graph-error" />
|
||||||
|
|
||||||
|
@ -335,7 +335,8 @@
|
||||||
</action>
|
</action>
|
||||||
|
|
||||||
<!-- JOIN ITERATIVE METHODS AND THEN END -->
|
<!-- JOIN ITERATIVE METHODS AND THEN END -->
|
||||||
<join name="join-iterative-rankings" to="get-file-names"/>
|
<join name="join-iterative-rankings" to="end">
|
||||||
|
<!-- to="get-file-names"/> -->
|
||||||
|
|
||||||
|
|
||||||
<!-- This will be a shell action that will output key-value pairs for output files -->
|
<!-- This will be a shell action that will output key-value pairs for output files -->
|
||||||
|
|
Loading…
Reference in New Issue