2024-12-09 18:51:24 +01:00
1 changed files with 20 additions and 11 deletions
--- a/strings.py
+++ b/strings.py
@@ -9,29 +9,38 @@ spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()

 folder_path = sys.argv[1]
 hdfs_output_path = sys.argv[2]
+working_dir_path = sys.argv[3]

 #Version of affro application on a single raw_aff_string and returns just the Matchins set
-def oalex_affro(doi, aff_string):
+def oalex_affro(aff_string):  # returns affro(aff_string) as a list: a non-list result is wrapped, any exception yields []
    try:
        matchings = affro(aff_string)
        if not isinstance(matchings, list):
            matchings = [matchings]
        return matchings
    except Exception as e:
-        print(f"Error processing record with doi {doi}: {str(e)}")
+        print(f"Error processing affiliation string {aff_string}: {str(e)}")  # best-effort: the error is only printed, not re-raised
        return []

-
-spark.read.json(folder_path) \
+exploded = spark.read.json(folder_path) \
    .filter(col("doi").isNotNull()) \
    .select(
    col("doi").alias("DOI"),
-        col("rors").alias("OAlex"),
-        explode(col("raw_aff_string")).alias("aff_string") #this allows to split all the raw_aff_string and to parallelize better
+    col("ror").alias("OAlex"),
+    explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
+    )
+# Match each distinct affiliation string only once; the result is joined back to the per-DOI rows below.
+affs = exploded \
+    .select("affiliation") \
+    .distinct() \
+    .withColumn("Matchings", oalex_affro(col("affiliation")))
+# NOTE(review): oalex_affro is a plain Python function called on a Column here; it presumably needs wrapping as a Spark UDF with an explicit return type - confirm.
+# Re-attach DOI/OAlex to every affiliation's matchings, then emit one row per individual match.
+affs.join(exploded, on = "affiliation") \
+    .select(col("DOI"),
+            col("OAlex"),
+            explode("Matchings").alias("match")
            ) \
-    .drop(col("aff_string") #removes the aff_string column
-          )  \
-    .select(col("DOI"),col("OAlex"),explode("Matchins").alias("match")) \
    .groupBy("DOI")  \
          .agg(first("OAlex").alias("OAlex"), #for each DOI it says what are the other columns Since OALEX is equal for each doi just select the first, while use the collect_list function to aggregate the Matchings
               collect_list("match").alias("Matchings") #each exploded match is collected again