Oalex #13

Merged
mkallipo merged 15 commits from openaire-workflow-ready_2 into openaire-workflow-ready on 2024-12-09 18:51:24 +01:00
1 changed file with 24 additions and 16 deletions
Showing only changes of commit ad691c28c2

@@ -1,11 +1,11 @@
-import json
+import time
 from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
+from threading import Thread
 from affro_cluster import *
 from pyspark.sql import SparkSession
-from pyspark.sql.functions import col, explode, first, collect_set, udf
+from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set
 import sys
 
 spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
@@ -23,10 +23,14 @@ matchings_schema = ArrayType(
     ])
 )
 
-def oalex_affro_2(aff_string):
+operation_counter = spark.sparkContext.accumulator(0)
+
+#Version of affro application on a single raw_aff_string and returns just the Matchins set
+def oalex_affro(aff_string):
+    global operation_counter
     try:
         matchings = affro(aff_string)
+        operation_counter += 1
         # Ensure matchings is a list, even if affro returns a single dict
         if not isinstance(matchings, list):
             matchings = [matchings]
@@ -49,18 +53,15 @@ def oalex_affro_2(aff_string):
         print(f"Error processing affiliation string {aff_string}: {str(e)}")
         return ()
 
-#Version of affro application on a single raw_aff_string and returns just the Matchins set
-def oalex_affro(aff_string):
-    try:
-        matchings = affro(aff_string)
-        if not isinstance(matchings, list):
-            matchings = [matchings]
-        return matchings
-    except Exception as e:
-        print(f"Error processing affiliation string {aff_string}: {str(e)}")
-        return []
-
-oalex_affro_udf = udf(oalex_affro_2, matchings_schema)
+oalex_affro_udf = udf(oalex_affro, matchings_schema)
+
+monitor_done = False
+
+def monitor_counter(interval):
+    while True:
+        print(f"Number of calls to AffRo: {operation_counter.value}")
+        time.sleep(interval)
+        if monitor_done:
+            break
 
 exploded = spark.read.json(folder_path) \
     .filter(col("doi").isNotNull()) \
@@ -69,6 +70,10 @@ exploded = spark.read.json(folder_path) \
         col("rors").alias("OAlex"),
         explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
     )
+
+monitor_thread = Thread(target=monitor_counter, args=(600,), daemon=True)
+monitor_thread.start()
+
 affs = exploded \
     .select("affiliation") \
     .distinct() \
@ -87,3 +92,6 @@ affs.join(exploded, on="affiliation") \
.mode("overwrite") \ .mode("overwrite") \
.option("compression","gzip") \ .option("compression","gzip") \
.json(hdfs_output_path) .json(hdfs_output_path)
monitor_done = True
monitor_thread.join()
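The main change in this commit is a lightweight progress monitor: every call to affro inside the UDF bumps a Spark accumulator, and a daemon thread on the driver prints the running total every 600 seconds until the output has been written. Below is a minimal, self-contained sketch of that accumulator-plus-monitor-thread pattern, not the actual workflow: fake_affro, the 5-second interval, the sample DataFrame and the /tmp output path are placeholders.

import time
from threading import Thread

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

spark = SparkSession.builder.appName("AccumulatorMonitorSketch").getOrCreate()

# Driver-side counter: executors can only add to it, the driver reads .value
calls = spark.sparkContext.accumulator(0)

def fake_affro(s):
    # Placeholder for the real affro() matcher from affro_cluster
    return s.upper()

def tag(s):
    global calls
    calls += 1            # incremented on the executors, aggregated on the driver
    return fake_affro(s)

tag_udf = udf(tag, StringType())

done = False

def monitor(interval):
    # Periodically report progress from the driver while the job runs
    while not done:
        print(f"Number of calls so far: {calls.value}")
        time.sleep(interval)

Thread(target=monitor, args=(5,), daemon=True).start()

df = spark.createDataFrame([("aff one",), ("aff two",)], ["affiliation"])
df.withColumn("matched", tag_udf(col("affiliation"))) \
    .write.mode("overwrite").json("/tmp/sketch_out")

done = True

One caveat worth keeping in mind: accumulator updates made inside transformations are approximate, since Spark may re-run failed or speculative tasks and count them again. That is fine for progress reporting, as here, but not for exact metrics.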