import json
import sys

from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType

from affro_cluster import *

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf

spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()

folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
working_dir = sys.argv[3]

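# Expected invocation (sketch; the script name is a placeholder, the argument order follows the sys.argv indices above):
#   spark-submit <affro_job.py> <input_json_folder> <hdfs_output_path> <working_dir>
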
matchings_schema = ArrayType(
    StructType([
        StructField("Provenance", StringType(), nullable=True),
        StructField("PID", StringType(), nullable=True),
        StructField("Value", StringType(), nullable=True),
        StructField("Confidence", DoubleType(), nullable=True),
        StructField("Status", StringType(), nullable=True)
    ])
)

result_schema = StructType([
    StructField("affiliation", StringType(), nullable=True),
    StructField("match", ArrayType(
        StructType([
            StructField("Provenance", StringType(), nullable=True),
            StructField("PID", StringType(), nullable=True),
            StructField("Value", StringType(), nullable=True),
            StructField("Confidence", DoubleType(), nullable=True),
            StructField("Status", StringType(), nullable=True)
        ])
    ))
])

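# Assumed shape of a single matching returned by affro() (keys inferred from the fields read below):
#   {"Provenance": ..., "PID": ..., "Value": ..., "Confidence": ..., "Status": ...}
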
def oalex_affro_2(aff_string):
    try:
        matchings = affro(aff_string)
        # Ensure matchings is a list, even if affro returns a single dict
        if not isinstance(matchings, list):
            matchings = [matchings]
        # Build tuples that match matchings_schema
        result = []
        for matching in matchings:
            # Each matching is expected to be a dict with 'Provenance', 'PID', 'Value', 'Confidence', 'Status'
            confidence = matching.get("Confidence")
            result.append((
                matching.get("Provenance"),
                matching.get("PID"),
                matching.get("Value"),
                float(confidence) if confidence is not None else None,
                matching.get("Status")
            ))
        return result
    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return []

# Version of affro applied to a single raw_aff_string; returns just the Matchings set
def oalex_affro(aff_string):
    try:
        matchings = affro(aff_string)
        if not isinstance(matchings, list):
            matchings = [matchings]
        return matchings
    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return []

oalex_affro_udf = udf(oalex_affro_2, matchings_schema)

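# Example (sketch) of applying the UDF to a hypothetical DataFrame "some_df" with an "affiliation" column;
# the job below uses the RDD-based oalex_affro path instead:
#   some_df.withColumn("match", oalex_affro_udf(col("affiliation")))
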
exploded_df = spark.read.json(folder_path) \
    .filter(col("doi").isNotNull()) \
    .select(
        col("doi").alias("DOI"),
        col("rors").alias("OAlex"),
        explode(col("raw_aff_string")).alias("affiliation")  # split each raw_aff_string into its own row to parallelize better
    )

rdd = exploded_df \
    .select("affiliation") \
    .distinct() \
    .rdd \
    .flatMap(lambda row: [{"affiliation": row["affiliation"], "match": m} for m in oalex_affro(row["affiliation"])])

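# Each record produced by the flatMap looks roughly like (values depend on the input and on affro()):
#   {"affiliation": "<raw affiliation string>", "match": {"Provenance": ..., "PID": ..., "Value": ..., "Confidence": ..., "Status": ...}}
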
# Materialize the (affiliation, match) pairs as JSON lines, then read them back as a DataFrame
rdd.map(json.dumps).saveAsTextFile(working_dir + "/tmp")
affs = spark.read.json(working_dir + "/tmp")

affs.join(exploded_df, on="affiliation") \
    .select(
        col("DOI"),
        col("OAlex"),
        col("match")
    ) \
    .groupBy("DOI") \
    .agg(
        first("OAlex").alias("OAlex"),            # OAlex is the same for every row of a DOI, so the first value is enough
        collect_list("match").alias("Matchings")  # gather the exploded matches back into a single list per DOI
    ) \
    .write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json(hdfs_output_path)

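# Each output record has roughly the form (field types depend on the input JSON and on affro()):
#   {"DOI": ..., "OAlex": <value of "rors">, "Matchings": [<match>, ...]}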