[oalex] avoid redefinition of explode function

This commit is contained in:
Miriam Baglioni 2024-12-05 18:41:16 +01:00
parent e2f8007433
commit a59d0ce9fc
1 changed file with 17 additions and 38 deletions
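
Why this matters: the script bound a DataFrame to the name explode, shadowing pyspark.sql.functions.explode. Python evaluates the right-hand side of an assignment before rebinding the name, so the explode() call inside that very statement succeeds, but any later call to explode(...) hits the DataFrame instead of the function. A minimal sketch of the failure mode (the toy data and column names are illustrative, not taken from the script):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, explode

    spark = SparkSession.builder.appName("shadowing-demo").getOrCreate()
    df = spark.createDataFrame([("10.1/x", ["aff one", "aff two"])],
                               ["doi", "raw_aff_string"])

    # The right-hand side runs first, so this call to explode() still works ...
    explode = df.select(explode(col("raw_aff_string")).alias("affiliation"))

    # ... but the name now refers to a DataFrame, and the next call fails:
    explode(col("affiliation"))  # TypeError: 'DataFrame' object is not callable

Renaming the result to exploded keeps the function callable, which the new code relies on when it calls explode(col("Matchings")) further down.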

@@ -12,7 +12,6 @@ spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
 folder_path = sys.argv[1]
 hdfs_output_path = sys.argv[2]
-working_dir = sys.argv[3]
 
 matchings_schema = ArrayType(
     StructType([
@@ -24,20 +23,6 @@ matchings_schema = ArrayType(
     ])
 )
-result_schema = StructType([
-    StructField("affiliation", StringType(), nullable=True),
-    StructField("match", ArrayType(
-        StructType([
-            StructField("Provenance", StringType(), nullable=True),
-            StructField("PID", StringType(), nullable=True),
-            StructField("Value", StringType(), nullable=True),
-            StructField("Confidence", DoubleType(), nullable=True),
-            StructField("Status", StringType(), nullable=True)
-        ])
-    ))
-])
 
 def oalex_affro_2(aff_string):
     try:
         matchings = affro(aff_string)
@@ -77,34 +62,28 @@ def oalex_affro(aff_string):
 oalex_affro_udf = udf(oalex_affro_2, matchings_schema)
 
-explode = spark.read.json(folder_path) \
+exploded = spark.read.json(folder_path) \
     .filter(col("doi").isNotNull()) \
     .select(
         col("doi").alias("DOI"),
         col("rors").alias("OAlex"),
         explode(col("raw_aff_string")).alias("affiliation")  # splitting every raw_aff_string lets Spark parallelize better
     )
 
-rdd = explode \
+affs = exploded \
     .select("affiliation") \
     .distinct() \
-    .rdd \
-    .flatMap(lambda row: [{"affiliation": row['affiliation'], "match": m} for m in oalex_affro(row['affiliation'])])
+    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))
 
-rdd.map(json.dumps).saveAsTextFile(working_dir + "/ tmp")
-affs = spark.read.json(working_dir + "/ tmp")
-affs.join(explode, on="affiliation") \
+affs.join(exploded, on="affiliation") \
     .select(col("DOI"),
             col("OAlex"),
-            col("match")
+            explode(col("Matchings")).alias("match")
            ) \
     .groupBy("DOI") \
     .agg(first("OAlex").alias("OAlex"),  # OAlex is identical for every row of a DOI, so keep the first
          collect_list("match").alias("Matchings")  # each exploded match is collected again per DOI
         ) \
     .write \
     .mode("overwrite") \
     .option("compression", "gzip") \
     .json(hdfs_output_path)
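
For reference, the pattern the new code adopts (compute matches with a UDF via withColumn, explode them per affiliation, then regroup per DOI with collect_list) can be exercised end to end on toy data. Everything below, including the fake matcher, the sample rows, and the app name, is an illustrative sketch rather than code from this repository:

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col, explode, first, collect_list, udf
    from pyspark.sql.types import ArrayType, StringType

    spark = SparkSession.builder.appName("udf-pattern-demo").getOrCreate()

    # Stand-in for affro(): maps one affiliation string to a list of matches.
    fake_affro_udf = udf(lambda s: [s.upper()], ArrayType(StringType()))

    rows = spark.createDataFrame(
        [("10.1/x", ["ror1"], ["Univ A", "Lab B"])],
        ["doi", "rors", "raw_aff_string"],
    )

    # One row per (DOI, affiliation string), as in the patched script.
    exploded = rows.select(
        col("doi").alias("DOI"),
        col("rors").alias("OAlex"),
        explode(col("raw_aff_string")).alias("affiliation"),
    )

    # Match each distinct affiliation once, staying in the DataFrame API.
    affs = exploded.select("affiliation").distinct() \
        .withColumn("Matchings", fake_affro_udf(col("affiliation")))

    affs.join(exploded, on="affiliation") \
        .select("DOI", "OAlex", explode(col("Matchings")).alias("match")) \
        .groupBy("DOI") \
        .agg(first("OAlex").alias("OAlex"), collect_list("match").alias("Matchings")) \
        .show(truncate=False)

Unlike the removed rdd.flatMap / saveAsTextFile / spark.read.json round trip, this version never touches a working_dir temp path, which is why sys.argv[3] could be dropped.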