Merge pull request 'Oalex' (#13 ) from openaire-workflow-ready_2 into openaire-workflow-ready

Reviewed-on: #13
[oalex] change to add a thread to monitor the number of operations done by affro up to a certain point
2024-12-09 18:51:22 +01:00 · 2024-12-06 10:19:53 +01:00 · 2024-12-05 21:26:08 +01:00 · 2024-12-05 18:44:06 +01:00 · 2024-12-05 18:41:16 +01:00 · 2024-12-05 16:50:10 +01:00
4 changed files with 95 additions and 39 deletions
--- a/affro_cluster.py
+++ b/affro_cluster.py
@ -5,15 +5,15 @@ from matching_cluster import *
 from create_input_cluster import *
 import json

-dix_org = load_json('dictionaries/dix_acad.json')
-dix_mult = load_json('dictionaries/dix_mult.json')
-dix_city = load_json('dictionaries/dix_city.json')
-dix_country = load_json('dictionaries/dix_country.json')
-dix_org_oaire = load_json('dictionaries/dix_acad_oaire.json')
-dix_mult_oaire = load_json('dictionaries/dix_mult_oaire.json')
-dix_country_oaire = load_json('dictionaries/dix_country_oaire.json')
-dix_status = load_json('dictionaries/dix_status.json')
-dix_grids = load_json('dictionaries/dix_grids_rors.json')
+dix_org = load_json('dix_acad.json')
+dix_mult = load_json('dix_mult.json')
+dix_city = load_json('dix_city.json')
+dix_country = load_json('dix_country.json')
+dix_org_oaire = load_json('dix_acad_oaire.json')
+dix_mult_oaire = load_json('dix_mult_oaire.json')
+dix_country_oaire = load_json('dix_country_oaire.json')
+dix_status = load_json('dix_status.json')
+dix_grids = load_json('dix_grids_rors.json')

    
 def find_ror(input, simU, simG):
--- a/functions_cluster.py
+++ b/functions_cluster.py
@ -27,12 +27,12 @@ def replace_double_consonants(text):
    result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
    return result

-remove_list = [replace_double_consonants(x) for x in load_txt('txt_files/remove_list.txt')]
-stop_words = load_txt('txt_files/stop_words.txt')
-university_terms = [replace_double_consonants(x) for x in load_txt('txt_files/university_terms.txt')]
-city_names = [replace_double_consonants(x) for x in load_txt('txt_files/city_names.txt')]
+remove_list = [replace_double_consonants(x) for x in load_txt('remove_list.txt')]
+stop_words = load_txt('stop_words.txt')
+university_terms = [replace_double_consonants(x) for x in load_txt('university_terms.txt')]
+city_names = [replace_double_consonants(x) for x in load_txt('city_names.txt')]

-categ_dicts = load_json('dictionaries/dix_categ.json')
+categ_dicts = load_json('dix_categ.json')


 def is_contained(s, w):
--- a/strings.py
+++ b/strings.py
@ -1,8 +1,11 @@
-import json
+import time
+
+from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
+from threading import Thread
 from affro_cluster import *
-import os
+
 from pyspark.sql import SparkSession
-from pyspark.sql.functions import col
+from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set
 import sys

 spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
@ -10,31 +13,85 @@ spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
 folder_path = sys.argv[1]
 hdfs_output_path = sys.argv[2]

-# folder_path = '/user/zeppelin/affiliations/raw_aff_string/2024-08'
-# hdfs_output_path = 'tmp/affro/results_strings'
+matchings_schema = ArrayType(
+    StructType([
+        StructField("Provenance", StringType(), nullable=True),
+        StructField("PID", StringType(), nullable=True),
+        StructField("Value", StringType(), nullable=True),
+        StructField("Confidence", DoubleType(), nullable=True),
+        StructField("Status", StringType(), nullable=True)
+    ])
+)

+operation_counter = spark.sparkContext.accumulator(0)

-def oalex_affro(record):
-    doi = record['doi'][16:]
-    oalex = record['rors']
-    try: 
-        matchings =  [item for sublist in [affro(x) for x in record['raw_aff_string']] for item in (sublist if isinstance(sublist, list) else [sublist])]  
+# Applies affro to a single raw_aff_string and returns just its list of Matchings
+def oalex_affro(aff_string):
+    global operation_counter
+    try:
+        matchings = affro(aff_string)
+        operation_counter += 1
+        # Ensure matchings is a list, even if affro returns a single dict
+        if not isinstance(matchings, list):
+            matchings = [matchings]
+
+        # Build the result as a list of tuples matching matchings_schema
+        result = []
+        for matching in matchings:
+            # Assuming 'matching' is a dictionary that contains 'Provenance', 'PID', 'Value', 'Confidence', 'Status'
+            result.append((
+                matching.get("Provenance", None),
+                matching.get("PID", None),
+                matching.get("Value", None),
+                float(matching.get("Confidence", None)),
+                matching.get("Status", None)
+            ))

-        result = {'DOI' : doi, 'OAlex' : oalex, 'Matchings': matchings}
-      
        return result
-   
+
    except Exception as e:
-        print(f"Error processing record with id {record.get('doi')}: {str(e)}")
-        return None
+        print(f"Error processing affiliation string {aff_string}: {str(e)}")
+        return ()

+oalex_affro_udf = udf(oalex_affro, matchings_schema)
+monitor_done = False

-df = spark.read.json(folder_path)
-filtered_df = df.filter(col("doi").isNotNull())
-updated_rdd = filtered_df.rdd.map(lambda row: oalex_affro(row.asDict()))
+def monitor_counter(interval):
+    while True:
+        print(f"Number of calls to AffRo: {operation_counter.value}")
+        time.sleep(interval)
+        if monitor_done:
+            break

-json_rdd = updated_rdd.map(lambda record: json.dumps(record))
+exploded = spark.read.json(folder_path) \
+    .filter(col("doi").isNotNull()) \
+    .select(
+    col("doi").alias("DOI"),
+    col("rors").alias("OAlex"),
+    explode(col("raw_aff_string")).alias("affiliation") # one row per raw_aff_string entry, so the strings can be split up and parallelized better
+)

+monitor_thread = Thread(target=monitor_counter, args=(600,), daemon=True)
+monitor_thread.start()

-json_rdd.saveAsTextFile(hdfs_output_path)
+affs = exploded \
+    .select("affiliation") \
+    .distinct() \
+    .withColumn("Matchings", oalex_affro_udf(col("affiliation")))

+affs.join(exploded, on="affiliation") \
+    .select(col("DOI"),
+            col("OAlex"),
+            explode(col("Matchings")).alias("match")
+            ) \
+    .groupBy("DOI") \
+    .agg(first("OAlex").alias("OAlex"), # OAlex is the same for every row of a DOI, so just take the first; the Matchings are aggregated with collect_set
+         collect_set("match").alias("Matchings") #each exploded match is collected again
+         ) \
+    .write \
+    .mode("overwrite") \
+    .option("compression","gzip") \
+    .json(hdfs_output_path)
+
+monitor_done = True
+monitor_thread.join()
--- a/update_records.py
+++ b/update_records.py
@ -3,9 +3,10 @@ import os
 from pyspark.sql import SparkSession
 from affro_cluster import *

-folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
+import sys

-#folder_path = 'check'
+folder_path = sys.argv[1]
+hdfs_output_path = sys.argv[2]

 json_file_names = []

@ -106,8 +107,6 @@ def update_record(record):
        print(f"Error processing record with id {record.get('id')}: {str(e)}")
        return None

-
-
 for file in json_file_names:
    print('start processing '+str(file))
    df = spark.read.json(folder_path + '/' + file)
@ -122,7 +121,7 @@ for file in json_file_names:
    json_data = json_rdd.collect()

    # Create a new filename by appending "_output.json" to the original filename (without extension)
-    output_file_name = file+'_output.json'
+    output_file_name = hdfs_output_path + "/" + file+'_output.json'
    print('end processing '+str(file))

    with open(output_file_name, 'w') as f:
Author	SHA1	Message	Date
Myrto Kallipoliti	44f0f9987f	Merge pull request 'Oalex' (#13 ) from openaire-workflow-ready_2 into openaire-workflow-ready Reviewed-on: #13	2024-12-09 18:51:22 +01:00
Miriam Baglioni	ad691c28c2	[oalex] change to add a thread to monitor the number of operations done by affro up to a certain point	2024-12-06 10:19:53 +01:00
Miriam Baglioni	2806511e02	[oalex] change collec_list to collect_set so that the same match will be there just one time	2024-12-05 21:26:08 +01:00
Miriam Baglioni	0043e4051f	[oalex] renaming	2024-12-05 18:44:06 +01:00
Miriam Baglioni	a59d0ce9fc	[oalex] avoid redefinition of explode function	2024-12-05 18:41:16 +01:00
Miriam Baglioni	e2f8007433	[oalex] added fix	2024-12-05 16:50:10 +01:00
Miriam Baglioni	f8479083f2	[oalex] pasing the schema to avoid changing in confidence type	2024-12-05 16:44:17 +01:00
Miriam Baglioni	9440f863c9	[oalex] changed implementation passing throguh rdd to avoi calling udf function	2024-12-05 16:36:38 +01:00
Miriam Baglioni	f78456288c	[oalex] fix issue	2024-12-05 12:54:10 +01:00
Miriam Baglioni	997f2e492f	[oalex] change the call of the function in the dataframe	2024-12-05 12:35:59 +01:00
Miriam Baglioni	982a1b0b9f	[oalex] change the call of the function in the dataframe	2024-12-05 12:21:21 +01:00
Miriam Baglioni	4fe3d31ed5	[oalex] register the UDF oalex_affro and the schema of the output to be used in the dataframe by pyspark	2024-12-05 12:18:45 +01:00
Miriam Baglioni	efa4db4e52	[oalex] execute affRo on distinct affilitaion_strings	2024-12-05 12:02:40 +01:00
Miriam Baglioni	ea2e27a9f4	[oalex] fix python syntax errors	2024-12-05 11:22:10 +01:00
Miriam Baglioni	e33bf4ef14	[oalex] proposal to higher the parallelization	2024-12-05 10:39:00 +01:00
Miriam Baglioni	f4704aef4d	[oalex] proposal to higher the parallelization	2024-12-05 10:27:32 +01:00
Miriam Baglioni	0500fc586f	Added input/output path as parameters	2024-12-04 15:14:58 +01:00
Miriam Baglioni	5568aa92ec	Remove from path	2024-12-03 16:54:47 +01:00
Miriam Baglioni	600ddf8087	Remove directory name Change to make the file discoverable on the cluster	2024-12-03 16:45:57 +01:00