#!/usr/bin/python3
# Create openaire id - openaire id graph from openaire data
#############################################################################################################
# Program proceeds as follows:
# 1. We read the input folder provided from hdfs.
# This contains subfolders with openaire graph objects and openaire graph relations
# 2. We select all openaire graph objects of interest. We filter out based on visibility
# and inference criteria. We also filter out based on the availability of publication year
# 3. Get reference type dataframes from openaire. Then filter each one of them based on the
# existence of citing and cited in the above filtered dataset. Get only citations
# produced by publication objects, or otherresearchproducts of types:
# [TBD]
# 4. Get objects that don't appear in the relations (from those gathered in step 1) and add
# them to the graph
# 5. Group relations by citing paper and do graph-specific formatting
#############################################################################################################
# ---------- Imports ------------- #
import sys
# import pyspark
# from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession
# Functions to effectively handle data
# manipulation for DataFrames
import pyspark.sql.functions as F
# Diagnostics
from timeit import default_timer as timer
# from datetime import timedelta, datetime
# -------------------------------- #
if len(sys.argv) < 5:
    print ("Usage: ./create_openaire_ranking_graph.py <openaire_graph_data_folder> <current_year> <num_partitions> <output_folder>")
    sys.exit(0)
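# An illustrative invocation (all paths and values below are hypothetical examples only):
#   spark-submit create_openaire_ranking_graph.py hdfs:///path/to/openaire_graph 2023 7680 hdfs:///path/to/ranking_graph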
# Inputs will be:
# 1. Folder where openaire graph is stored
graph_folder = sys.argv[1]
# 2. Current year (this will be needed for filtering)
current_year = int(sys.argv[2])
# 3. Number of partitions
num_partitions = int(sys.argv[3])
# 4. where to write output
output_folder = sys.argv[4]
# List of result types we want to include in the citations
# valid_result_types = ['publication', 'other']
valid_result_types = ['publication']
# list of types in otherresearchproduct which are considered valid for citations
valid_other = ['']
# Create the spark session
spark = SparkSession.builder.appName('oa ranking graph creation').getOrCreate()
# Set context level logging to WARN
spark.sparkContext.setLogLevel("WARN")
############################################################################################################################
# 1. Get the research objects and filter based on conditions.
# These will also be the unique identifiers we should find in the final graph
# Initialize an empty dataframe
oa_objects_df = None
# There is a directory structure on hdfs under the provided path.
# We need to parse data from the folders: ["publication", "dataset", "software", "otherresearchproduct"]
# which are rankable oa result objects.
# Loop subfolders
for sub_folder in ["publication", "dataset", "software", "otherresearchproduct"]:
    # Read the json data of the graph into a dataframe initially
    if not oa_objects_df:
        oa_objects_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname', 'datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year'))
        oa_objects_df = oa_objects_df.where( 'deletedbyinference = false' ).where( 'invisible = false' ).repartition(num_partitions, 'id').cache()
    # If we already have data, simply add more to it
    else:
        sub_df = spark.read.json(graph_folder + "/" + sub_folder).select('id', 'resulttype.classname', 'datainfo.deletedbyinference', 'datainfo.invisible', F.year('dateofacceptance.value').alias('year'))
        sub_df = sub_df.where( 'deletedbyinference = false' ).where( 'invisible = false' ).cache()
        # Add the data to the openaire objects dataframe
        oa_objects_df = oa_objects_df.union(sub_df).repartition(num_partitions, 'id').cache()
        # Clear memory
        sub_df.unpersist(True)
# Remove those records without year
oa_objects_df = oa_objects_df.where(F.col('year').isNotNull())
# Now replace years where > (current_year+1) with 0
oa_objects_df = oa_objects_df.withColumn('clean_year', F.when(F.col('year').cast('int') > (current_year+1), 0).otherwise(F.col('year')))\
                             .drop('year').withColumnRenamed('clean_year', 'year').repartition(num_partitions, 'id')
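# Example: with current_year = 2023, a record dated 2050 gets year = 0,
# while anything up to 2024 (current_year + 1) keeps its original year.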
# -------------------------------------------------------------------- #
'''
# Some diagnostics
print ("Min and max years:" )
oa_objects_df.select(F.max('year')).show()
oa_objects_df.select(F.min('year')).show()
# This should be slow due to not repartitioning by year
print ("Distinct years:")
oa_objects_df.select('year').distinct().sort(F.col('year')).show(5000, False)
# Show distinct values of deletedbyinference and invisible to ensure we have the correct data
print ("Distinct deleted by inference:")
oa_objects_df.select('deletedbyinference').distinct().show()
print ("Distinct invisible values:")
oa_objects_df.select('invisible').distinct().show()
# Output total count
print ("Total num of research objects: " + str(oa_objects_df.count()))
'''
# -------------------------------------------------------------------- #
# Keep only required fields - we still keep resulttype.classname to
# filter the citation relationships we consider valid
oa_objects_df = oa_objects_df.drop('deletedbyinference').drop('invisible').distinct().cache()
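# At this point oa_objects_df holds one distinct row per result, with columns: id, classname, year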
'''
print ("OA objects Schema:")
oa_objects_df.printSchema()
sys.exit(0)
'''
############################################################################################################################
# 2. Get the relation objects and filter them based on their existence in the oa_objects_df
# NOTE: we are only interested in citations of type "Cites".
# Further, we only keep citations collected from OpenCitations, Crossref or Microsoft Academic Graph
# DEPRECATED
# references_df = spark.read.json(graph_folder + "/relation").select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'relClass')\
# .where( 'relClass = "References"' ).repartition(num_partitions, 'citing').drop('relClass')
# print ("References df has: " + str(references_df.count()) + " entries")
# Collect only valid citations i.e., invisible = false & deletedbyinference = false
cites_df = spark.read.json(graph_folder + "/relation")\
                .select(F.col('source').alias('citing'), F.col('target').alias('cited'), 'collectedfrom.value', 'relClass', 'dataInfo.deletedbyinference', 'dataInfo.invisible')\
                .where( (F.col('relClass') == "Cites") \
                        & (F.col('deletedbyinference') == "false") \
                        & (F.col('invisible') == "false") )\
                .drop('deletedbyinference').drop('invisible')\
                .repartition(num_partitions, 'citing').drop('relClass')\
                .withColumn('collected_lower', F.expr('transform(value, x -> lower(x))'))\
                .drop('value')\
                .where(
                    (F.array_contains(F.col('collected_lower'), "opencitations"))
                    | (F.array_contains(F.col('collected_lower'), "crossref"))
                    | (F.array_contains(F.col('collected_lower'), "microsoft academic graph"))
                ).drop('collected_lower')
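# At this point cites_df holds one row per "Cites" relation, with columns: citing, cited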
# print ("Cited df has: " + str(cites_df.count()) + " entries")
# DEPRECATED
# cited_by_df = spark.read.json(graph_folder + "/relation").select(F.col('target').alias('citing'), F.col('source').alias('cited'), 'relClass')\
# .where( 'relClass = "IsCitedBy"' ).repartition(num_partitions, 'citing').drop('relClass')
# print ("Cited by df has: " + str(cited_by_df.count()) + " entries")
# DEPRECATED
# Keep only relations where citing and cited are in the oa_objects_df
# references_df = references_df.join(oa_objects_df.select('id'), references_df.citing == oa_objects_df.id).drop('id')
# references_df = references_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), references_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
# print ("References df now has: " + str(references_df.count()) + " entries")
cites_df = cites_df.join(oa_objects_df.select('id', 'classname'), cites_df.citing == oa_objects_df.id).where( F.col('classname').isin(valid_result_types) ).drop('id').drop('classname')
cites_df = cites_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cites_df.cited == oa_objects_df.id).distinct().repartition(num_partitions, 'citing').cache()
# TODO: add here a clause filtering out the citations
# originating from "other" types of research objects which we consider valid
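# A possible sketch for that filter (hypothetical): it assumes 'classname' is kept through the
# join above and that a type column for otherresearchproduct records (here called 'othertype')
# has also been selected in step 1; valid_other above is still a placeholder list.
# cites_df = cites_df.where(
#     (F.col('classname').isin(valid_result_types))
#     | ((F.col('classname') == 'other') & (F.col('othertype').isin(valid_other)))
# )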
# print ("Cites df now has: " + str(cites_df.count()) + " entries")
# DEPRECATED
# cited_by_df = cited_by_df.join(oa_objects_df.select('id'), cited_by_df.citing == oa_objects_df.id).drop('id')
# cited_by_df = cited_by_df.repartition(num_partitions, 'cited').join(oa_objects_df.select('id'), cited_by_df.cited == oa_objects_df.id).drop('id').distinct().repartition(num_partitions, 'citing').cache()
# print ("Cited BY df now has: " + str(cited_by_df.count()) + " entries")
# DEPRECATED
# Join all the above into a single set
# citations_df = references_df.union(cites_df).distinct().repartition(num_partitions, 'citing').cache()
# Free space
# references_df.unpersist(True)
# cites_df.unpersist(True)
# citations_df = citations_df.union(cited_by_df).distinct().repartition(num_partitions, 'citing').cache()
# ALL citations we keep are in the cites_df dataframe
citations_df = cites_df
'''
# Show schema
print ("Citation schema:")
citations_df.printSchema()
print ("Objects schema:")
oa_objects_df.printSchema()
'''
# Free space
# cited_by_df.unpersist(True)
# Show total num of unique citations
'''
num_unique_citations = citations_df.count()
print ("Total unique citations: " + str(num_unique_citations))
'''
############################################################################################################################
# 3. Get objects that never appear as 'citing' in the citations (these are dangling nodes w/o any outgoing citations)
dangling_nodes = oa_objects_df.join(citations_df.select('citing').distinct(), citations_df.citing == oa_objects_df.id, 'left_anti')\
                              .select(F.col('id').alias('citing')).withColumn('cited', F.array([F.lit("0")])).repartition(num_partitions, 'citing')
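# Each dangling node becomes a row of the form (citing = <openaire id>, cited = ["0"])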
# Count dangling nodes
'''
dangling_num = dangling_nodes.count()
print ("Number of dangling nodes: " + str(dangling_num))
'''
# print ("Dangling nodes sample:")
# dangling_nodes.show(10, False)
############################################################################################################################
# 4. Group the citation dataframe by citing id and create the list of cited ids. Add dangling nodes to the result
graph = citations_df.groupBy('citing').agg(F.collect_set('cited').alias('cited')).repartition(num_partitions, 'citing').cache()
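# Each row now has the form (citing, [cited_1, cited_2, ...])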
# Free space
citations_df.unpersist(True)
'''
num_nodes = graph.count()
print ("Entries in graph before dangling nodes:" + str(num_nodes))
'''
# print ("Sample in graph: ")
# graph.show(10, False)
# Add dangling nodes
graph = graph.union(dangling_nodes).repartition(num_partitions, 'citing')
# Count current number of results
num_nodes = graph.count()
print ("Num entries after adding dangling nodes: " + str(num_nodes))
# Add publication year
graph = graph.join(oa_objects_df, graph.citing == oa_objects_df.id).select('citing', 'cited', 'year').cache()
num_nodes_final = graph.count()
print ("After adding year: " + str(num_nodes_final))
# print ("Graph sample:")
# graph.show(20, False)
# Calculate initial score of nodes (1/N)
initial_score = float(1)/float(num_nodes_final)
############################################################################################################################
# 5. Write graph to output file!
print("Writing output to: " + output_folder)
graph.select('citing', F.concat_ws("|", F.concat_ws(",", 'cited'), F.when(F.col('cited').getItem(1) != "0", F.size('cited')).otherwise(F.lit("0")), F.lit(str(initial_score))).alias('cited'), 'year')\
     .withColumn('prev_pr', F.lit("0")).select('citing', 'cited', 'prev_pr', 'year')\
     .write.mode("overwrite").option("delimiter", "\t").csv(output_folder, compression="gzip")
if num_nodes_final != num_nodes:
    print ("WARNING: the number of nodes after keeping only nodes where year is available went from: " + str(num_nodes) + " to " + str(num_nodes_final) + "\n")
    print ("Check for any mistakes...")
############################################################################################################################
print ("\nDONE!\n\n")
# Wrap up
spark.stop()