import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType

from affro_cluster import *

spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()

folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
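
# Minimal argument guard (a sketch; the script otherwise assumes spark-submit
# always passes both positional arguments):
if len(sys.argv) < 3:
    print("Usage: spark-submit <this_script>.py <input_folder> <hdfs_output_path>")
    sys.exit(1)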

matchings_schema = ArrayType(
    StructType([
        StructField("Provenance", StringType(), nullable=True),
        StructField("PID", StringType(), nullable=True),
        StructField("Value", StringType(), nullable=True),
        StructField("Confidence", DoubleType(), nullable=True),
        StructField("Status", StringType(), nullable=True)
    ])
)
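
# Illustrative shape of a single matching (values are hypothetical; the real
# content depends on what affro() returns):
#   {"Provenance": "AffRo", "PID": "ROR", "Value": "https://ror.org/...",
#    "Confidence": 0.91, "Status": "active"}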


# Applies affro to a single raw_aff_string and returns just the matchings list.
def oalex_affro(aff_string):
    try:
        matchings = affro(aff_string)
        if not isinstance(matchings, list):
            matchings = [matchings]
        return matchings
    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return []

oalex_affro_udf = udf(oalex_affro, matchings_schema)
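# Note: on failure oalex_affro returns [], so affiliations that cannot be
# processed contribute no rows once the matchings are exploded below.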

exploded = spark.read.json(folder_path) \
    .filter(col("doi").isNotNull()) \
    .select(
        col("doi").alias("DOI"),
        col("ror").alias("OAlex"),
        # Split raw_aff_string so each affiliation string gets its own row,
        # which lets the matching work parallelize better.
        explode(col("raw_aff_string")).alias("affiliation")
    )
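
# Deduplicate the affiliation strings so affro runs only once per distinct string.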
affs = exploded \
    .select("affiliation") \
    .distinct() \
    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))
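
# Join the matchings back onto the (DOI, OAlex, affiliation) rows,
# then re-aggregate everything per DOI.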
affs.join(exploded, on="affiliation") \
    .select(
        col("DOI"),
        col("OAlex"),
        explode(col("Matchings")).alias("match")
    ) \
    .groupBy("DOI") \
    .agg(
        first("OAlex").alias("OAlex"),            # OAlex is the same for every row of a DOI, so first() suffices
        collect_list("match").alias("Matchings")  # collect the exploded matches back into one list per DOI
    ) \
    .write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json(hdfs_output_path)
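
# Example invocation (hypothetical script name and paths):
#   spark-submit affro_matching.py /data/openalex_works /user/affro/matchings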