import json
import sys

from pyspark.sql import SparkSession

from affro_cluster import *

folder_path = sys.argv[1]
hdfs_output_path = sys.argv[2]

# Initialize the Spark session
spark = SparkSession.builder.appName("AffRo-Crossref").getOrCreate()


def remove_duplicates(list_of_dicts):
    # Use a set of item tuples to filter out duplicate dictionaries
    seen = set()
    unique_list_of_dicts = []
    for d in list_of_dicts:
        # Convert the dictionary to a hashable tuple of its items
        items = tuple(d.items())
        if items not in seen:
            seen.add(items)
            unique_list_of_dicts.append(d)
    return unique_list_of_dicts


def crossref_affro(record):
    doi = record['DOI']
    try:
        for author in record['author']:
            affiliations = []
            if len(author['affiliation']) > 0:
                # Collect the distinct affiliation names declared for this author
                for organization in author['affiliation']:
                    try:
                        if organization['name'] not in affiliations:
                            affiliations.append(organization['name'])
                    except (KeyError, TypeError):
                        pass
                if len(affiliations) > 0:
                    affiliations = list(set(affiliations))
                    # Run affro on each affiliation string and flatten the resulting ROR links
                    ror_links = [affro(affil) for affil in affiliations]
                    matchings = [inner_ror for outer_ror in ror_links for inner_ror in outer_ror]
                    matchings = remove_duplicates(matchings)
                    if len(matchings) > 0:
                        # Return as soon as matchings are found for an author
                        return {'DOI': doi, 'Matchings': matchings}
    except Exception as e:
        print(f"Error processing record with DOI {doi}: {str(e)}")


df = spark.read.json(folder_path)

# Apply the crossref_affro function to every record
updated_rdd = df.rdd.map(lambda row: crossref_affro(row.asDict()))

# Drop records for which no matching was produced
filtered_rdd = updated_rdd.filter(lambda record: record is not None and record != {})

# Convert the remaining records to JSON strings and write them to HDFS
json_rdd = filtered_rdd.map(lambda record: json.dumps(record))

json_rdd.saveAsTextFile(hdfs_output_path)