import json
import os
import sys

from affro_cluster import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf
from pyspark.sql.types import ArrayType, MapType, StringType

spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()

folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]
working_dir_path = sys.argv[3]
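
# Example invocation (a sketch; the script name is a placeholder):
#   spark-submit affro_matching.py <folder_path> <hdfs_output_path> <working_dir_path>
# Note: working_dir_path is accepted here but not used in this section.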
# Applies affro to a single raw_aff_string and returns just the list of matchings.
def oalex_affro(aff_string):
    try:
        matchings = affro(aff_string)
        # affro may return a single matching or a list; normalise to a list
        if not isinstance(matchings, list):
            matchings = [matchings]
        return matchings
    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return []

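# Wrap oalex_affro as a Spark UDF so it can be applied to DataFrame columns.
# The return schema is an assumption (a list of string-to-string maps); adjust
# it to the structure affro actually produces.
oalex_affro_udf = udf(oalex_affro, ArrayType(MapType(StringType(), StringType())))
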
exploded = spark.read.json(folder_path) \
    .filter(col("doi").isNotNull()) \
    .select(
        col("doi").alias("DOI"),
        col("ror").alias("OAlex"),
        # one row per raw_aff_string, so the matching parallelizes better
        explode(col("raw_aff_string")).alias("affiliation")
    )
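
# After the explode, `exploded` holds one row per (DOI, OAlex, affiliation)
# triple: an input record {"doi": ..., "ror": ..., "raw_aff_string": [s1, s2]}
# (field names taken from the selects above) yields two rows.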

affs = exploded \
    .select("affiliation") \
    .distinct() \
    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))
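
# Running affro only on the distinct affiliation strings avoids recomputing
# duplicates; the join below restores the DOI/OAlex context for each string.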
affs.join(exploded, on="affiliation") \
    .select(col("DOI"),
            col("OAlex"),
            explode(col("Matchings")).alias("match")
    ) \
    .groupBy("DOI") \
    .agg(first("OAlex").alias("OAlex"),  # OAlex is identical within a DOI, so keep the first value
         collect_list("match").alias("Matchings")  # re-collect the exploded matches for each DOI
    ) \
    .write \
    .mode("overwrite") \
    .option("compression", "gzip") \
    .json(hdfs_output_path)
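
# Each output record should then look roughly like this (a sketch, given the
# UDF's assumed map schema):
#   {"DOI": "...", "OAlex": ..., "Matchings": [{...}, {...}]}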