initial commit

2024-09-05 12:23:32 +02:00 · 2024-09-05 12:23:32 +02:00 · 0c98ba76a6
parent 530e474d7c
commit 0c98ba76a6
12 changed files with 1851 additions and 0 deletions
--- a/README.md
+++ b/README.md
--- a/affro_cluster.py
+++ b/affro_cluster.py
@ -0,0 +1,40 @@
 import sys 
 ##import functions
 from functions_cluster import *
 from matching_cluster import *
 from create_input_cluster import *
 import json
 # Lookup dictionaries, loaded once at import time and handed to Aff_Ids by affro().
 # NOTE(review): the paths are relative, so the working directory presumably has to
 # be the repository root when this module is imported — confirm.
 dix_org = load_json('dictionaries/dix_acad.json')
 dix_mult = load_json('dictionaries/dix_mult.json')
 dix_city = load_json('dictionaries/dix_city.json')
 dix_country = load_json('dictionaries/dix_country.json')
 def affro(raw_aff_string):
    """Match one raw affiliation string and return the matches as JSON strings.

    The string is prepared with create_df_algorithm and matched with Aff_Ids
    using the module-level dictionaries and the fixed thresholds 0.65 and 0.82.

    Returns:
        A list with one JSON document per match (keys 'Origin', 'RORid' and
        'Confidence'), an empty list when nothing matched, or None when any
        exception was raised (the error and the input are printed instead).
    """
    try:
        matches = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.65, 0.82)
        if len(matches) > 0:
            # Element 2 of a match is emitted as the ROR id, element 1 as the confidence.
            return [
                json.dumps({'Origin': 'affRo', 'RORid': match[2], 'Confidence': match[1]})
                for match in matches
            ]
        return []
    except Exception as e:
        # Best effort: report the failure and the offending input, then yield None.
        print(f"Error: {str(e)}")
        print(raw_aff_string)
        return None
 #raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
 if __name__ == "__main__":
    # The script takes exactly one positional argument: the raw affiliation string.
    # The similarity thresholds are fixed inside affro() and cannot be passed here,
    # so the usage text must not advertise them.
    if len(sys.argv) != 2:
        print("Usage: python affro_cluster.py <string>")
        sys.exit(1)
    string_arg = sys.argv[1]
    print(affro(string_arg))
--- a/affro_test_example.py
+++ b/affro_test_example.py
@ -0,0 +1,28 @@
 from pyspark.sql import SparkSession
 from pyspark.sql.functions import udf
 from pyspark.sql.types import StringType
 import sys
 from affro_cluster import *
 # Initialize SparkSession
 spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()
 # Register the function as a UDF
 # NOTE(review): affro returns a list of JSON strings (or None on error) while the
 # UDF is declared with StringType — confirm whether ArrayType(StringType()) was intended.
 affro_udf = udf(affro, StringType())
 # Input list of strings
 input_data = ["university of athens", "university of vienna", "UCLA"]
 # Convert the list to a single-column Spark DataFrame named "raw_affiliation_string"
 df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")
 # Apply the UDF to every row and store the result in a new "affro_value" column
 df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))
 df_with_custom_value.show(truncate=False)
 # Stop the SparkSession
 spark.stop()
--- a/create_input_cluster.py
+++ b/create_input_cluster.py
@ -0,0 +1,77 @@
 from functions_cluster import *
 def create_df_algorithm(raw_aff_string):
    """Prepare a raw affiliation string for matching.

    Steps:
      1. Strip leading digits and outer parentheses, clean the text and split it
         into fragments (substrings_dict).
      2. Walk over neighbouring fragments and drop a fragment that names a person
         role or a sub-unit when the fragment after it names its parent body.
      3. Join what is left into ``light_aff``.
      4. Drop fragments listed in city_names / remove_list, shorten each remaining
         fragment with shorten_keywords_spark and keep only the fragments that
         contain at least one key of categ_dicts.

    Returns:
        ``[light_aff, filtered_list]`` where ``filtered_list`` holds dicts with the
        keys "index", "keywords" and "category" (always 1 after filtering).
    """
    def has_any(markers, text):
        # True when at least one marker is contained in the fragment.
        return any(is_contained(marker, text) for marker in markers)

    person_markers = ('assistant', 'researcher', 'phd', 'student', 'section', 'prof', 'director')
    # (markers in the current fragment, markers in the next fragment).  When a rule
    # fires, the current fragment is dropped unless it mentions 'univ' itself.
    subunit_rules = (
        (('engineer', 'progr', 'unit', 'lab', 'dep', 'school', 'inst', 'fac'), ('univ',)),
        (('lab',), ('colege', 'inst', 'dep', 'school')),
        (('dep',), ('tech', 'colege', 'inst', 'hosp', 'school', 'fac')),
        (('inst',), ('school', 'dep', 'acad', 'hosp', 'clin', 'klin', 'fak', 'fac', 'cent', 'div')),
        (('school',), ('colege',)),
    )

    cleaned = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
    substring_list = list(substrings_dict(cleaned).values())

    i = 0
    while i < len(substring_list) - 1:
        current, following = substring_list[i], substring_list[i + 1]
        if is_contained('progr', current) and is_contained('dep', following):
            substring_list.pop(i)
        elif has_any(person_markers, current) and (
            # NOTE(review): as written, `not` applies to the 'school' test only; the
            # author may have meant not(school or univ or inst or lab or fac).
            # Behaviour is kept unchanged here — confirm the intent.
            not is_contained('school', following)
            or has_any(('univ', 'inst', 'lab', 'fac'), following)
        ):
            substring_list.pop(i)
        elif any(has_any(cur, current) and has_any(nxt, following) for cur, nxt in subunit_rules):
            if is_contained('univ', current):
                i += 1
            else:
                # After a pop the same index is re-examined against its new neighbour.
                substring_list.pop(i)
        else:
            i += 1

    # light_aff is built before city / remove_list fragments are filtered out.
    light_aff = ', '.join(substring_list)

    # Build a new list instead of calling remove() while iterating: removing during
    # iteration skips the element that follows every removed one.
    excluded = set(city_names + remove_list)
    substring_list = [fragment for fragment in substring_list if fragment not in excluded]
    substring_list = [shorten_keywords_spark([fragment])[0] for fragment in substring_list]

    aff_list = [
        {
            "index": position,
            "keywords": keywords,
            # 1 when any key of categ_dicts occurs inside the keywords, else 0.
            "category": int(any(key in keywords for key in categ_dicts)),
        }
        for position, keywords in enumerate(substring_list)
    ]
    filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
    return [light_aff, filtered_list]
--- a/functions_cluster.py
+++ b/functions_cluster.py
@ -0,0 +1,635 @@
 import re
 import unicodedata
 import html
 from unidecode import unidecode
 import json   
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 #import pandas as pd 
 def load_txt(file_path):
    """Read a UTF-8 text file and return its lines, each stripped of surrounding whitespace."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        stripped_lines = []
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines
 def load_pickled_dict(file_path):
    """Load and return the object stored in a pickle file.

    Args:
        file_path: path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # The module never imports pickle at the top, so import it here; without this
    # every call fails with NameError.
    import pickle
    # SECURITY: unpickling can execute arbitrary code. Only use this on files
    # that come from a trusted source.
    with open(file_path, 'rb') as file:
        return pickle.load(file)
 def load_json(file_path):
    """Parse a JSON file and return the decoded object.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        Whatever json.load produces for the file content.
    """
    # Read as UTF-8 explicitly, like load_txt, so the result does not depend on
    # the platform's default encoding.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)
 # Pipe-separated list of category labels.
 # NOTE(review): not referenced anywhere in the visible part of this module — confirm it is still needed.
 categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'
 def replace_double_consonants(text):
    """Collapse every doubled consonant (e.g. 'll', 'ss') into a single letter.

    Matching ignores case; the first letter of each pair is the one that is kept.
    """
    doubled_consonant = re.compile(r'([bcdfghjklmnpqrstvwxyz])\1', flags=re.IGNORECASE)
    return doubled_consonant.sub(r'\1', text)
 # Word lists and the category dictionary, read once at import time from paths
 # relative to the working directory.
 # remove_list, university_terms and city_names get their doubled consonants
 # collapsed, the same transformation clean_string applies to affiliation text;
 # stop_words is loaded as-is.
 remove_list = [replace_double_consonants(x) for x in load_txt('txt_files/remove_list.txt')]
 stop_words = load_txt('txt_files/stop_words.txt')
 university_terms = [replace_double_consonants(x) for x in load_txt('txt_files/university_terms.txt')]
 city_names = [replace_double_consonants(x) for x in load_txt('txt_files/city_names.txt')]
 categ_dicts = load_json('dictionaries/dix_categ.json')
 def is_contained(s, w):
    """Return True when every whitespace-separated word of `s` occurs in `w`.

    Membership uses the `in` operator, so for a string `w` each word only has to
    be a substring of it. An `s` without any word yields True.
    """
    return all(word in w for word in s.split())
 def starts_with_any(string, prefixes):
    """Find the first prefix in `prefixes` that `string` starts with.

    Returns:
        ``[True, prefix]`` for the first matching prefix, or ``False`` when none matches.
    """
    first_match = next((candidate for candidate in prefixes if string.startswith(candidate)), None)
    if first_match is None:
        return False
    return [True, first_match]
 def remove_leading_numbers(s):
    """Strip the run of digits at the very start of `s`, if there is one."""
    leading_digits = re.match(r'^\d+', s)
    if leading_digits is None:
        return s
    return s[leading_digits.end():]
 def remove_outer_parentheses(string):
    """Drop the first and last character when they are '(' and ')', then trim whitespace.

    Only the first and last characters are inspected; whether they actually form
    a matching pair is not checked.
    """
    if not (string.startswith('(') and string.endswith(')')):
        return string
    inner = string[1:-1]
    return inner.strip()
 def insert_space_between_lower_and_upper(s):
    """
    Put a space wherever an ASCII lowercase letter is directly followed by an
    ASCII uppercase letter.
    Parameters:
    s (str): The input string.
    Returns:
    str: The string with the spaces inserted.
    """
    lower_then_upper = re.compile(r'([a-z])([A-Z])')
    return lower_then_upper.sub(r'\1 \2', s)
 def index_multiple_matchings(pairs):
    """Count how often each key (element 0 of a record) appears in `pairs`.

    Element 3 of every record is an iterable of hashable items. Repeated keys are
    only counted up when more than one distinct item exists across all records;
    otherwise every key keeps the count 1.

    Returns:
        dict mapping key -> count.
    """
    distinct_items = {item for record in pairs for item in record[3]}
    several_items = len(distinct_items) > 1
    counts = {}
    for record in pairs:
        key = record[0]
        if several_items and key in counts:
            counts[key] += 1
        else:
            counts[key] = 1
    return counts
 def avg_string(df, col):
    """Return the mean over all rows of the average item length found in df[col].

    Each cell of the column is read through ``.iloc`` and must be a non-empty
    collection of sized items; an empty frame or cell raises ZeroDivisionError.
    """
    row_means = [
        sum(len(item) for item in df[col].iloc[row]) / len(df[col].iloc[row])
        for row in range(len(df))
    ]
    return sum(row_means) / len(row_means)
 #stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und']
 def remove_stop_words(text):
    """Drop every word listed in the module-level stop_words and re-join with single spaces."""
    return ' '.join(word for word in text.split() if word not in stop_words)
 def remove_parentheses(text):
    """Delete every innermost parenthesised group, including its parentheses.

    Only groups without nested parentheses are matched, so an enclosing pair of
    a nested group is left behind (empty).
    """
    innermost_group = re.compile(r'\([^()]*\)')
    return innermost_group.sub('', text)
 def replace_umlauts(text):
    """Strip diacritics: NFKD-decompose the text and drop all combining characters."""
    decomposed = unicodedata.normalize('NFKD', text)
    base_characters = [char for char in decomposed if unicodedata.combining(char) == 0]
    return ''.join(base_characters)
 def protect_phrases(input_string, phrases):
    """Swap every phrase in the string for a numbered placeholder token.

    Returns:
        ``(string_with_placeholders, placeholder_map)``; the map goes from
        placeholder to original phrase and has an entry for every phrase, whether
        or not it occurred in the string.
    """
    placeholder_map = {
        f"__PLACEHOLDER_{i}__": phrase for i, phrase in enumerate(phrases)
    }
    protected = input_string
    for placeholder, phrase in placeholder_map.items():
        protected = protected.replace(phrase, placeholder)
    return protected, placeholder_map
 def restore_phrases(split_strings, placeholder_map):
    """Return a new list in which every placeholder is replaced by its original phrase."""
    def restore(fragment):
        # Substitute the placeholders one after the other, in map order.
        for placeholder, phrase in placeholder_map.items():
            fragment = fragment.replace(placeholder, phrase)
        return fragment

    return [restore(fragment) for fragment in split_strings]
 def replace_comma_spaces(text):
    """Turn double spaces into single ones (one pass), then ' , ' into ', '."""
    single_spaced = text.replace('  ', ' ')
    return single_spaced.replace(' , ', ', ')
 def split_string_with_protection(input_string, protected_phrases):
    """Split on ',', ';', '/' or ' – ' while keeping the protected phrases in one piece.

    Protected phrases are hidden behind placeholders before splitting and put
    back afterwards. Fragments are trimmed and empty ones are discarded.
    """
    shielded, placeholder_map = protect_phrases(input_string, protected_phrases)
    fragments = []
    for piece in re.split(r'[,;/]| – ', shielded):
        trimmed = piece.strip()
        if trimmed:
            fragments.append(trimmed)
    return restore_phrases(fragments, placeholder_map)
 # Comma-containing names, one per template and per entry of city_names, that must
 # survive splitting in one piece; substrings_dict passes this list to
 # split_string_with_protection as the protected phrases.
 # The commented-out entries are the variants with a space before the comma.
 protected_phrases1 =  [
    phrase.format(x=x)
    for x in city_names
    for phrase in [
        'university california, {x}',
    #    'university california , {x}',
        'university colege hospital, {x}',
    #    'university colege hospital , {x}',
        'national univ ireland, {x}',
    #    'national univ ireland , {x}',
        'national university ireland, {x}',
    #    'national university ireland , {x}',
        'university colege, {x}',
    #    'university colege , {x}',
        'university hospital, {x}', 
    #    'university hospital , {x}', 
        'imperial colege, {x}',
    #    'imperial colege , {x}'
        'city university, {x}', 
    #    'city university , {x}'
    ]
 ]
 # Literal substitutions that substrings_dict applies, in this order, before the
 # string is split: spelling fixes for "university", and rewrites that remove the
 # comma from (or otherwise canonicalize) specific names so they are not split apart.
 # Keys are written with doubled consonants already collapsed (e.g. 'colege',
 # 'holoway'), matching the cleaned text that create_df_algorithm passes in.
 replacements = {'uni versity':'university',
                'univ ':'university ',
                'univercity':'university', 
                'universtiy':'university', 
                'univeristy':'university',
                'universirty':'university', 
                'universiti':'university', 
                'universitiy':'university',
                'universty' :'university',
                'univ col': 'university colege',
                'belfield, dublin': 'dublin',
                'balsbridge, dublin': 'dublin', #ballsbridge
                'earlsfort terrace, dublin': 'dublin',
                'bon secours hospital, cork' : 'bon secours hospital cork',
                'bon secours hospital, dublin' : 'bon secours hospital dublin',
                'bon secours hospital, galway' : 'bon secours hospital galway',
                'bon secours hospital, tralee' : 'bon secours hospital tralee',
                'bon secours health system' : 'bon secours hospital dublin',
                'bon secours hospital, glasnevin' : 'bon secours hospital dublin',
                'imperial colege science, technology medicine' : 'imperial colege science technology medicine',
                'ucl queen square institute neurology' : 'ucl, london',
                'ucl institute neurology' : 'ucl, london',
                'royal holoway, university london' : 'royal holoway universi london', #holloway
                'city, university london' : 'city universi london',
                'city university, london' : 'city universi london',
                'aeginition':'eginition',
                'national technical university, athens' : 'national technical university athens' 
            # 'harvard medical school' : 'harvard university'
 }
 def substrings_dict(string):
    """Split an affiliation string into normalized fragments keyed by position.

    The literal substitutions from `replacements` are applied first, then the
    string is split with split_string_with_protection (protecting
    protected_phrases1). A fragment that contains none of the university_terms is
    additionally rewritten with a fixed set of case-insensitive regexes. Every
    fragment is lowercased and stripped.

    Returns:
        dict mapping 0..n-1 to the processed fragments, in split order.
    """
    for old, new in replacements.items():
        string = string.replace(old, new)

    # (pattern, replacement) pairs, applied in this order with re.IGNORECASE.
    # NOTE(review): 'centre*' matches 'centr' followed by any number of 'e', so it
    # also rewrites e.g. 'central' to 'centeral' — confirm this is intended.
    rewrites = (
        (r'universi\w*', 'universi'),
        (r'institu\w*', 'institu'),
        (r'centre*', 'center'),
        (r'\bsaint\b', 'st'),
        (r'\btrinity col\b', 'trinity colege'),
        (r'\btechnische\b', 'technological'),
    )

    fragments = {}
    for position, fragment in enumerate(split_string_with_protection(string, protected_phrases1)):
        lowered = fragment.lower()
        if not any(term in lowered for term in university_terms):
            for pattern, substitute in rewrites:
                fragment = re.sub(pattern, substitute, fragment, flags=re.IGNORECASE)
        fragments[position] = fragment.lower().strip()
    return fragments
 def clean_string(input_string):
    """Normalize a raw affiliation string for splitting and matching.

    The steps run in a fixed order; see the inline comments. The result is
    lowercase, free of stop words and contains only ASCII letters, whitespace,
    ',', ';' and the ' – ' separator.
    """
    # The ' – ' separator would not survive the ASCII filtering below, so it is
    # parked behind a marker word. A literal word "placeholder" in the input is
    # therefore turned into ' – ' as well.
    marker = "placeholder"
    text = input_string.replace(" – ", marker)

    # Drop apostrophes, decode HTML entities, remove innermost parenthesised groups.
    text = text.replace("'", "")
    text = html.unescape(text)
    text = remove_parentheses(text)

    # Transliterate to ASCII, strip remaining diacritics, collapse doubled consonants.
    text = unidecode(text)
    text = replace_umlauts(text)
    text = replace_double_consonants(text)
    text = replace_comma_spaces(text.strip())
    text = unidecode(text)

    # '/' and '-' become spaces.
    text = re.sub(r'[/\-]', ' ', text)
    # Case-sensitive spelling normalizations (the text is not lowercased yet).
    text = re.sub(r'\bSaint\b', 'St', text)
    text = re.sub(r'\bAghia\b', 'Agia', text)
    # Keep only ASCII letters, whitespace, ',', ';' and '/'.
    text = re.sub(r'[^a-zA-Z\s,;/]', '', text).strip()
    text = replace_comma_spaces(text)

    # Bring the ' – ' separator back, then collapse whitespace runs.
    text = text.replace(marker, " – ")
    text = re.sub(r'\s+', ' ', text)

    # Split camelCase joins, lowercase, and drop stop words.
    text = insert_space_between_lower_and_upper(text).lower()
    text = remove_stop_words(text)
    return text.strip()
 def clean_string_facts(input_string):
    """Normalize a string: lowercase, ASCII-only, digits kept, no stop words.

    Unlike clean_string, commas are turned into spaces and digits survive. The
    result is not stripped, so a leading or trailing space may remain.
    """
    # Lowercase, decode HTML entities, remove innermost parenthesised groups.
    text = html.unescape(input_string.lower())
    text = remove_parentheses(text)
    # Transliterate to ASCII, strip remaining diacritics, drop stop words.
    text = unidecode(text)
    text = replace_umlauts(text)
    text = remove_stop_words(text)
    # '/', '-' and ',' become spaces.
    text = re.sub(r'[/\-,]', ' ', text)
    text = re.sub(r'\bsaint\b', 'st', text)
    # Keep only ASCII letters, digits, whitespace, ';', '/' and '-'.
    text = re.sub(r'[^a-zA-Z0-9\s;/-]', '', text)
    # Collapse whitespace runs into a single space.
    return re.sub(r'\s+', ' ', text)
 def str_radius_u(string):
    """Return a word window around every word that contains 'univers'.

    The string is lowercased and split on whitespace. Each window spans up to
    3 words before the hit, the hit itself and up to 3 words after it, joined
    with single spaces.
    """
    radius = 3
    words = string.lower().split()
    windows = []
    for position, word in enumerate(words):
        if not is_contained('univers', word):
            continue
        first = max(0, position - radius)
        last = min(position + radius, len(words))
        # The upper bound is inclusive here, hence the +1.
        windows.append(' '.join(words[first:last + 1]))
    return windows
 def str_radius_coll(string):
    """Return a word window around every word that contains 'col'.

    The string is lowercased and split on whitespace. Each window holds the word
    before the hit (if any) and the hit itself; the upper slice bound is
    exclusive, so the word after the hit is not included.
    """
    radius = 1
    words = string.lower().split()
    windows = []
    for position, word in enumerate(words):
        if not is_contained('col', word):
            continue
        first = max(0, position - radius)
        stop = min(position + radius, len(words))
        windows.append(' '.join(words[first:stop]))
    return windows
 def str_radius_h(string):
    """Return a word window around every word that contains 'hospital' or 'hopita'.

    The string is lowercased and split on whitespace. Each window starts up to
    4 words before the hit and ends 2 words after it (exclusive upper bound at
    hit + 3).
    """
    radius = 3
    words = string.lower().split()
    windows = []
    for position, word in enumerate(words):
        if not (is_contained('hospital', word) or is_contained('hopita', word)):
            continue
        first = max(0, position - radius - 1)
        stop = min(position + radius, len(words))
        windows.append(' '.join(words[first:stop]))
    return windows
 def str_radius_c(string):
    """Return a word window around every word that contains 'clinic' or 'klinik'.

    The string is lowercased and split on whitespace. Each window starts up to
    3 words before the hit and ends 1 word after it (exclusive upper bound at
    hit + 2).
    """
    radius = 2
    words = string.lower().split()
    windows = []
    for position, word in enumerate(words):
        if not (is_contained('clinic', word) or is_contained('klinik', word)):
            continue
        first = max(0, position - radius - 1)
        stop = min(position + radius, len(words))
        windows.append(' '.join(words[first:stop]))
    return windows
 def str_radius_r(string):
    """Return a word window around every word that contains 'research'.

    The string is lowercased and split on whitespace. Each window starts up to
    3 words before the hit and ends 1 word after it (exclusive upper bound at
    hit + 2).
    """
    radius = 2
    words = string.lower().split()
    windows = []
    for position, word in enumerate(words):
        if not is_contained('research', word):
            continue
        first = max(0, position - radius - 1)
        stop = min(position + radius, len(words))
        windows.append(' '.join(words[first:stop]))
    return windows
 def str_radius_spec(string):
    """Reduce a string to its first word that categ_dicts labels 'Specific'.

    Args:
        string: whitespace-separated text.

    Returns:
        The first word whose categ_dicts entry equals 'Specific', or the
        unchanged input string when no word qualifies.
    """
    for word in string.split():
        # dict.get treats unknown words as "no category" without a bare
        # `except:`, which would also hide unrelated errors.
        if categ_dicts.get(word) == 'Specific':
            return word
    return string
 def avg_string(df, col):
    """Return the mean over all rows of the average item length found in df[col].

    This repeats an identical definition earlier in the module and, being later,
    is the one that stays bound to the name. Each cell is read through ``.iloc``
    and must be a non-empty collection of sized items.
    """
    row_count = len(df)
    total_of_row_means = 0
    for row in range(row_count):
        cell = df[col].iloc[row]
        total_of_row_means += sum(len(item) for item in cell) / len(cell)
    return total_of_row_means / row_count
 def shorten_keywords(affiliations_simple):
    """Shorten the keywords of several affiliations at once.

    Args:
        affiliations_simple: iterable of keyword lists, one list per affiliation.

    Returns:
        A list with one shortened keyword list per input list, in the same order.
    """
    # The per-keyword dispatch is exactly what shorten_keywords_spark does for one
    # list, so delegate instead of repeating the if/elif chain (the old copy also
    # used `str` as a loop variable, shadowing the builtin).
    return [shorten_keywords_spark(keywords) for keywords in affiliations_simple]
 def shorten_keywords_spark(affiliations_simple):
    """Shorten each keyword string to the word window(s) around its key term.

    The first applicable rule wins: 'universi', 'col' together with 'trinity',
    'hospital'/'hopita', 'clinic'/'klinik', 'research council'; anything else
    goes through str_radius_spec. A keyword may contribute several windows, so
    the result can be longer than the input.

    Returns:
        Flat list of shortened keyword strings.
    """
    shortened = []
    for aff in affiliations_simple:
        if 'universi' in aff:
            pieces = str_radius_u(aff)
        elif 'col' in aff and 'trinity' in aff:
            pieces = str_radius_coll(aff)
        elif 'hospital' in aff or 'hopita' in aff:
            pieces = str_radius_h(aff)
        elif 'clinic' in aff or 'klinik' in aff:
            pieces = str_radius_c(aff)
        elif 'research council' in aff:
            pieces = str_radius_r(aff)
        else:
            # str_radius_spec returns a single string, not a list.
            pieces = [str_radius_spec(aff)]
        shortened.extend(pieces)
    return shortened
 def refine(list_, affil):
    """Resolve every matched organization name to an id, using city/country hints.

    For a name flagged 'unique' in dix_mult the id comes straight from dix_acad.
    Otherwise the first city of dix_city[name] found in the affiliation decides;
    failing that, the first country of dix_country[name] found in it (also tried
    under the first alias in country_mapping); failing that, dix_acad[name].

    NOTE(review): dix_mult, dix_acad, dix_city, dix_country and country_mapping
    are read as module globals but are not defined in the visible part of this
    module — confirm where they are supposed to come from.

    Args:
        list_: iterable of lists of matched organization names.
        affil: the affiliation string (compared in lowercase).

    Returns:
        A list with one id list per entry of `list_` (empty for empty input).
    """
    affil = affil.lower()
    ids = []
    for matched_org_list in list_:
        id_list = []
        for matched_org in matched_org_list:
            if dix_mult[matched_org] == 'unique':
                id_list.append(dix_acad[matched_org])
                continue
            for city in dix_city[matched_org]:
                if city[0] in affil:
                    id_list.append(city[1])
                    break
            else:
                # No city matched: fall back to the country hints.
                for country in dix_country[matched_org]:
                    if country[0] in country_mapping:
                        print(country[0])
                        # The original tested the same alias twice; once is enough.
                        if country[0] in affil or country_mapping[country[0]][0] in affil:
                            id_list.append(country[1])
                            break
                    elif country[0] in affil:
                        print('country found',country[0])
                        id_list.append(country[1])
                        break
                else:
                    # Neither a city nor a country matched: use the default id.
                    id_list.append(dix_acad[matched_org])
        ids.append(id_list)
    # Return only after all groups are processed; the old return sat inside the
    # loop, so only the first group was handled and empty input gave None.
    return ids
 def compute_cos(x,s):
    """Return the cosine similarity between the word-count vectors of `x` and `s`.

    The vocabulary is fitted on `s` only, so words of `x` that do not occur in
    `s` are ignored when `x` is vectorized.
    """
    vectorizer = CountVectorizer()
    reference_counts = vectorizer.fit_transform([s]).toarray()
    candidate_counts = vectorizer.transform([x]).toarray()
    similarity_matrix = cosine_similarity(candidate_counts, reference_counts)
    return similarity_matrix[0][0]
 # def find_ror(string, simU, simG):
 #     df = pd.DataFrame()
 #     df['Unique affiliations'] = [[string.lower()]]
 #     academia = create_df_algorithm(df)
 #     result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG)
 #     if len(result)>0:
 #         dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])}
 #         dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])}
 #         dict_aff_score = {}
 #         for i in range(len(result)):
 #             if type(result['Similarity score'].iloc[i]) == list:
 #                 dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i]
 #             else:
 #                 dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]]
 #         pids = []
 #         for i in range(len(df)):
 #             pidsi = []
 #             for aff in df['Unique affiliations'].iloc[i]:
 #                 if aff in list(dict_aff_id.keys()):
 #                     pidsi = pidsi + dict_aff_id[aff]
 #             # elif 'unmatched organization(s)' not in pidsi:
 #             #     pidsi = pidsi + ['unmatched organization(s)']
 #             pids.append(pidsi)
 #         names = []
 #         for i in range(len(df)):
 #             namesi = []
 #             for aff in df['Unique affiliations'].iloc[i]:
 #                 if aff in list(dict_aff_open.keys()):
 #                     try:
 #                         namesi = namesi + dict_aff_open[aff]
 #                     except TypeError:
 #                         namesi = namesi + [dict_aff_open[aff]]
 #             names.append(namesi)
 #         scores = []
 #         for i in range(len(df)):
 #             scoresi = []
 #             for aff in df['Unique affiliations'].iloc[i]:
 #                 if aff in list(dict_aff_score.keys()):
 #                     scoresi = scoresi +  dict_aff_score[aff]
 #             scores.append(scoresi)
 #         df['Matched organizations'] = names
 #         df['ROR'] = pids
 #         df['Scores'] = scores
 #         def update_Z(row):
 #             if len(row['ROR']) == 0 or len(row['Scores']) == 0:
 #                 return []
 #             new_Z = []
 #             for ror, score in zip(row['ROR'], row['Scores']):
 #                 entry = {'ROR_ID': ror, 'Confidence': score}
 #                 new_Z.append(entry)
 #             return new_Z
 #         matching = df.apply(update_Z, axis=1)
 #         df['Matchings'] = matching
 #         return df['Matchings'].iloc[0]
 #     else: 
 #         return 'no result'
--- a/matching_cluster.py
+++ b/matching_cluster.py
@ -0,0 +1,319 @@
 from collections import defaultdict
 from collections import Counter
 import Levenshtein
 from sklearn.feature_extraction.text import CountVectorizer
 from sklearn.metrics.pairwise import cosine_similarity
 from functions_cluster import *
 from create_input_cluster import *
 def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG):
    """
    Select the best candidate organization names for one affiliation string.
    Candidates whose keyword has a single matching are kept when their score passes the
    relevant threshold; near-exact candidates (score >= 0.98) are always kept; the remaining
    'univ' candidates are re-scored against the whole affiliation string and ranked.
    Args:
        light_raw (str): The affiliation string the candidates are compared against.
        candidate_num (int): Upper bound on the number of 'univ' candidates returned
            (together with the number of 'univ' occurrences in light_raw).
        pairs_list (list): Candidate tuples whose first three items are
            (keyword, candidate name, similarity score).
        m (dict): Looked up by keyword; a value of 1 selects the threshold branch.
            Presumably the number of matchings per keyword -- confirm against
            index_multiple_matchings.
        simU (float): Score threshold for candidate names containing 'univ'.
        simG (float): Score threshold for every other candidate name.
    Returns:
        list: [candidate name, score] pairs, where the score is the one stored in
        pairs_list for that name (the last occurrence wins if a name repeats).
    """
    vectorizer = CountVectorizer()
    univ_num = light_raw.lower().count('univ')
    result = []
    best = []
    # The affiliation vector is the same for every candidate, so fit it only once.
    try:
        s_vector = vectorizer.fit_transform([light_raw]).toarray()
    except Exception:
        # Fitting can fail (e.g. no usable token); re-scored candidates are then skipped.
        s_vector = None
    for pair in pairs_list:
        key, x, score = pair[0], pair[1], pair[2]
        # NOTE(review): entries stored below as [x, 1] are not recognised by this check
        # unless their score is exactly 1 -- confirm whether that is intended.
        if [x, score] in result:
            continue
        if m[key] == 1:
            if is_contained('univ', x.lower()) and score > simU:
                result.append([x, score])
            elif score > simG:
                result.append([x, score])
        elif score >= 0.98:
            # Near-exact match: keep it with a score of 1
            result.append([x, 1])
        else:
            try:
                if not is_contained("univ", x.lower()):
                    continue  # only 'univ' candidates are re-scored
                if s_vector is None:
                    continue
                x_vector = vectorizer.transform([x]).toarray()
                similarity = cosine_similarity(x_vector, s_vector)[0][0]
                if similarity > 0.1:
                    similarity_l = 1 - Levenshtein.distance(x, key) / max(len(x), len(key))
                    best.append([x, similarity, similarity_l])
            except Exception:
                # Best effort: a candidate that cannot be scored is skipped, not fatal.
                continue
    if best:
        # Keep, for every name, only the entries reaching that name's highest cosine score
        max_numbers = defaultdict(float)
        for name, cosine_score, _ in best:
            max_numbers[name] = max(max_numbers[name], cosine_score)
        reduced_best = [item for item in best if item[1] == max_numbers[item[0]]]
        # Highest cosine score first, ties broken by the Levenshtein-based score
        reduced_best.sort(key=lambda item: (item[1], item[2]), reverse=True)
        result = result + reduced_best
    univ_list = []
    other_list = []
    for r in result:
        if is_contained('univ', r[0]):
            univ_list.append(r)
        else:
            other_list.append(r)
    # Never return more 'univ' names than the affiliation mentions or than there are keywords
    limit = min(univ_num, candidate_num)
    if len(univ_list) > limit:
        result = univ_list[:limit] + other_list
    # Report the score recorded in pairs_list rather than the re-computed one
    pairs_dict = {pair[1]: pair[2] for pair in pairs_list}
    return [[r[0], pairs_dict[r[0]]] for r in result]
 _NO_MATCH = object()  # sentinel: no location-specific id was found
 def _cosine_score(vectorizer, fit_text, other_text):
    """Cosine similarity of two strings, counted over the vocabulary of fit_text only."""
    fitted = vectorizer.fit_transform([fit_text]).toarray()
    projected = vectorizer.transform([other_text]).toarray()
    return cosine_similarity(fitted, projected)[0][0]
 def _city_ror_id(name, light_aff, cities):
    """Id of the first city found in the affiliation but not in the organization name."""
    for city in cities:
        if city[0] in light_aff and city[0] not in name:
            return city[1]
    return _NO_MATCH
 def _country_ror_id(name, light_aff, countries):
    """Id of the first country matching the affiliation; countries absent from the name win."""
    for country in countries:
        country_name = country[0]
        # NOTE(review): 'usa' and 'uk' are plain substring tests, so they also hit
        # inside longer words -- confirm this is acceptable.
        if country_name == 'united states' and (country_name in light_aff or 'usa' in light_aff):
            return country[1]
        if country_name == 'united kingdom' and (country_name in light_aff or 'uk' in light_aff):
            return country[1]
        if country_name in light_aff and country_name not in name:
            return country[1]
    # Fallback: accept a country that is also part of the organization name
    for country in countries:
        if country[0] in light_aff and country[0] in name:
            return country[1]
    return _NO_MATCH
 def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
    """
    Match the keywords of one affiliation against the names in dix_org and return their ids.
    Args:
        input (sequence): input[0] is the affiliation string, input[1] a list of dicts
            that each carry a "keywords" entry.
        dix_org (dict): Organization name -> id.
        dix_mult (dict): Organization name -> marker; names whose marker is not 'unique'
            go through the city/country disambiguation.
        dix_city_ror (dict): Organization name -> list of (city, id) entries.
        dix_country_ror (dict): Organization name -> list of (country, id) entries.
        simU (float): Similarity threshold when keyword and name both contain 'univ'.
        simG (float): Similarity threshold when neither of them contains 'univ'.
    Returns:
        list: [organization name, score, id] triples, in the order produced by best_sim_score.
    """
    light_aff = input[0]
    df_list = input[1]
    vectorizer = CountVectorizer()
    keywords = [entry["keywords"] for entry in df_list]
    pairs = []
    for s in keywords:
        if s in dix_org:
            # Exact name: similarity 1, no need to scan the dictionary
            pairs.append((s, s, 1, dix_org[s]))
            continue
        s_is_univ = is_contained('univ', s)
        for x in dix_org:
            if is_contained(s, x):
                similarity = _cosine_score(vectorizer, x, s)
                x_is_univ = is_contained('univ', x)
            elif is_contained(x, s):
                x_is_univ = is_contained('univ', x)
                if bool(s_is_univ) != bool(x_is_univ):
                    continue
                similarity = _cosine_score(vectorizer, s, x)
            else:
                continue
            # A 'univ' keyword is only ever paired with a 'univ' name, and vice versa
            if s_is_univ and x_is_univ:
                threshold = simU
            elif not s_is_univ and not x_is_univ:
                threshold = simG
            else:
                continue
            if similarity > threshold:
                pairs.append((s, x, similarity, dix_org[x]))
    multi = index_multiple_matchings(list(set(pairs)))
    best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)
    ids = [dix_org[match[0]] for match in best]
    for i, match in enumerate(best):
        name = match[0]
        if dix_mult[name] == 'unique' or name not in dix_city_ror:
            continue
        # Ambiguous name: prefer the id whose city, then country, appears in the affiliation.
        # NOTE(review): the original repeated the city search with the very same condition
        # (dead code); it may have been meant to accept cities contained in the name -- confirm.
        located = _city_ror_id(name, light_aff, dix_city_ror[name])
        if located is _NO_MATCH:
            located = _country_ror_id(name, light_aff, dix_country_ror[name])
        if located is not _NO_MATCH:
            ids[i] = located
    return [[match[0], match[1], ids[i]] for i, match in enumerate(best)]
--- a/txt_files/.DS_Store
+++ b/txt_files/.DS_Store
--- a/txt_files/city_names.txt
+++ b/txt_files/city_names.txt
@ -0,0 +1,584 @@
 galway
 maynooth
 duluth
 port arthur
 new orleans
 paterson
 santa barbara
 thornton
 westminster
 north las vegas
 stockton
 marysville
 fitchburg
 tallinn
 fargo
 seaside
 manaus
 porto
 quebec city
 hialeah
 normal
 kansas city
 delhi
 fort worth
 palermo
 olathe
 madison
 santa maria
 youngstown
 allentown
 santa clara
 charlotte
 agra
 palmdale
 kraków
 bendigo
 high point
 washington
 dallas
 grand prairie
 plano
 leipzig
 bratislava
 seville
 puebla
 lucknow
 toowoomba
 santa rosa
 sioux falls
 flint
 kissimmee
 lacey
 brownsville
 palm springs
 tyler
 minsk
 san diego
 los angeles
 edmonton
 college station
 toulouse
 garland
 florence
 saskatoon
 albury-wodonga
 newburgh
 danbury
 deltona
 south bend
 nagpur
 pomona
 memphis
 london
 lincoln
 chandler
 adelaide
 salt lake city
 edinburgh
 suzhou
 grayslake
 new york city
 kanpur
 brussels
 okayama
 tuscaloosa
 clarksville
 jackson
 boise city
 canton
 louisville
 varanasi
 columbus
 lorain
 vadodara
 orem
 chennai
 townsville
 eindhoven
 toronto
 wuhan
 norman
 winter haven
 eugene
 riga
 hamamatsu
 fresno
 lake charles
 budapest
 mobile
 lowell
 vienna
 tallahassee
 nanjing
 new haven
 sacramento
 leeds
 harlingen
 springdale
 perth
 sendai
 utica
 orange
 baltimore
 rochester
 rancho cucamonga
 bellevue
 fort wayne
 modesto
 pristina
 nuremberg
 stuttgart
 indore
 murfreesboro
 nottingham
 scranton
 lancaster
 abilene
 monterey
 sioux city
 bari
 chula vista
 ahmedabad
 north port
 helsinki
 leominster
 ocala
 sarajevo
 hangzhou
 roanoke
 new york
 bethlehem
 dublin
 sunshine coast
 pune
 billings
 changchun
 sydney
 garden grove
 port orange
 pittsburgh
 new bedford
 hiroshima
 san francisco
 sheffield
 chongqing
 layton
 pueblo
 chengdu
 cincinnati
 erie
 lansing
 ljubljana
 st louis
 rio de janeiro
 philadelphia
 tacoma
 bel air
 chesapeake
 davenport
 las vegas
 nagasaki
 kitchener
 boulder
 roseville
 evansville
 victoria
 burbank
 sofia
 santa clarita
 san buenaventura
 savannah
 apple valley
 brighton
 coral springs
 huntsville
 fort lauderdale
 warsaw
 antioch
 medford
 visalia
 frankfurt
 joliet
 curitiba
 mcallen
 seattle
 alexandria
 bryan
 moreno valley
 berlin
 olympia
 caracas
 tianjin
 cleveland
 des moines
 prague
 fukuoka
 burlington
 bhopal
 nara
 hampton
 jefferson
 chicago
 temecula
 paris
 gilbert
 bradenton
 champaign
 munich
 amsterdam
 raleigh
 atlanta
 lakeland
 denver
 round lake beach
 richmond
 buffalo
 phoenix
 antwerp
 greenbay
 milwaukee
 south lyon
 concord
 vero beach
 newcastle
 podgorica
 monterrey
 shantou
 costa mesa
 copenhagen
 vilnius
 dalian
 bristol
 salinas
 belgrade
 waterloo
 henderson
 hayward
 hickory
 el monte
 lima
 redding
 mexico city
 cary
 kennewick
 guayaquil
 tirana
 kawasaki
 greensboro
 west covina
 amarillo
 saitama
 new london
 recife
 manchester
 rockford
 kelowna
 hagerstown
 bordeaux
 york
 kaneohe
 tucson
 gainesville
 kalamazoo
 bogotá
 reading
 virginia beach
 guadalajara
 albany
 durham
 green bay
 oceanside
 montreal
 turin
 malaga
 oshawa
 mesa
 pensacola
 boise
 bonita springs
 fort walton beach
 port saint lucie
 reykjavik
 north charleston
 newark
 reno
 knoxville
 bakersfield
 oslo
 omaha
 milan
 cambridge
 norwich
 shanghai
 naples
 victorville
 zagreb
 norwalk
 huntington beach
 clarke county
 lubbock
 yakima
 warren
 bucharest
 simi valley
 greenville
 racine
 salvador
 elk grove
 orlando
 windsor
 santa cruz
 saginaw
 ballarat
 muskegon
 shreveport
 clearwater
 merced
 boston
 basel
 elizabeth
 panama city
 okinawa
 sarasota
 zurich
 glendale
 wilmington
 pompano beach
 guangzhou
 fairfield
 hyderabad
 santiago
 nashville
 mchenry
 ann arbor
 carrollton
 hollywood
 laredo
 rome
 san bernardino
 bergen
 springfield
 winnipeg
 corona
 surat
 long beach
 nagoya
 toledo
 geelong
 kenosha
 sterling heights
 lisbon
 myrtle beach
 nashua
 riverside
 tampa
 bangalore
 richland
 rotterdam
 lyon
 scottsdale
 berkeley
 bologna
 cedar rapids
 syracuse
 tulsa
 ludhiana
 hemet
 portland
 mission viejo
 salem
 overland park
 detroit
 jinan
 osaka
 grand rapids
 jersey city
 kailua
 venice
 darwin
 miramar
 gulfport-biloxi
 huntington
 portsmouth
 worcester
 sunnyvale
 escondido
 college park
 thousand oaks
 harbin
 belfast
 yonkers
 alicante
 barnstable
 kitakyushu
 sapporo
 ogden
 aurora
 palm bay
 düsseldorf
 hobart
 irvine
 st johns
 hamburg
 provo
 melbourne
 madrid
 zhengzhou
 asheville
 patna
 inglewood
 houston
 newport news
 west valley city
 oklahoma city
 brisbane
 valencia
 pasadena
 aberdeen
 st petersburg
 lakewood
 irving
 naperville
 miami
 topeka
 downey
 genoa
 lewisville
 birmingham
 xian
 saint paul
 bremerton
 corpus christi
 daytona beach
 st paul
 oxnard
 murrieta
 lafayette
 montgomery
 baton rouge
 skopje
 cathedral city
 spartanburg
 canberra
 arvada
 hesperia
 port st lucie
 saint louis
 bridgeport
 tempe
 quito
 chattanooga
 bremen
 gold coast
 cairns
 beaumont
 elkhart
 peoria
 calgary
 honolulu
 havre de grace
 hamilton
 fullerton
 daly city
 dresden
 belem
 ottawa
 regina
 chiba
 fort collins
 indianapolis
 mumbai
 killeen
 sao paulo
 jaipur
 fremont
 zaragoza
 charleston
 waco
 kobe
 odessa
 monroe
 vallejo
 marseille
 qingdao
 frederick
 marina
 sebastian
 oakland
 pembroke pines
 san antonio
 kyoto
 colorado springs
 el paso
 shenyang
 punta gorda
 fort smith
 richmond county
 waterbury
 shenzhen
 albuquerque
 jacksonville
 minneapolis
 fortaleza
 denton
 gastonia
 fayetteville
 bloomington
 houma
 santa ana
 kolkata
 las cruces
 barcelona
 arlington
 niigata
 norfolk
 fontana
 providence
 santo domingo
 vancouver
 appleton
 san jose
 hartford
 winston
 barrie
 glasgow
 davidson county
 yokohama
 independence
 athens
 harrisburg
 macon
 torrance
 launceston
 cape coral
 austin
 little rock
 cologne
 mesquite
 catania
 stockholm
 nice
 stamford
 buenos aires
 columbia
 anchorage
 dayton
 wollongong
 halifax
 verona
 anaheim
 kiev
 augusta
 tokyo
 akron
 lexington
 wichita
 saint petersburg
 beijing
 johnson city
 spokane
 liverpool
 howell
 poughkeepsie
 ontario
 atlantic city
 trenton
--- a/txt_files/remove_list.txt
+++ b/txt_files/remove_list.txt
@ -0,0 +1,28 @@
 universi
 research institu
 laboratory
 gmbh
 inc
 universi of
 research center
 foundation
 faculty
 national institu
 school medicine
 universi school
 graduate school
 graduate school engineering
 institu tropical medicine
 institu virology
 faculty medicine
 laboratory
 universi park
 institu science
 polytechnic universi
 universi 1
 ciudad universi
 universi campus
 universi hospitals
 colege
 universi road
 universitetska str
--- a/txt_files/stop_words.txt
+++ b/txt_files/stop_words.txt
@ -0,0 +1,16 @@
 from
 the
 of
 at
 de
 for
 et
 für
 des
 in
 as
 a
 and
 fur
 for
 und
--- a/txt_files/university_terms.txt
+++ b/txt_files/university_terms.txt
@ -0,0 +1,8 @@
 universitetskaya
 universitatsklinikum
 universitatskinderklinik
 universitatskliniken
 universitetshospital
 universitatsmedizin
 universitatsbibliothek
 universitatspital
--- a/update_records.py
+++ b/update_records.py
@ -0,0 +1,116 @@
 import json
 import os 
 from pyspark.sql import SparkSession
 from affro_cluster import *
 folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
 #folder_path = 'check'
 # Every entry of the folder except '_SUCCESS' is treated as an input file
 # (no check on the file extension is performed)
 json_file_names = [entry for entry in os.listdir(folder_path) if entry != '_SUCCESS']
 # Spark session used further down to read the input files
 spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
 def remove_duplicates(list_of_dicts):
    """
    Return the dictionaries of list_of_dicts without repeats, keeping first occurrences.
    Two dictionaries are duplicates when they hold the same key/value pairs, whatever
    the order in which their keys were inserted.
    Args:
        list_of_dicts (list): Dictionaries whose values are all hashable.
    Returns:
        list: The first occurrence of every distinct dictionary, in input order.
    Raises:
        TypeError: If a dictionary holds an unhashable value.
    """
    seen = set()
    unique_list_of_dicts = []
    for d in list_of_dicts:
        # frozenset ignores key insertion order, which tuple(d.items()) does not
        fingerprint = frozenset(d.items())
        if fingerprint not in seen:
            seen.add(fingerprint)
            unique_list_of_dicts.append(d)
    return unique_list_of_dicts
 def update_record(record):
    """
    Build the author/organization view of one input record.
    For every author the name and ORCID are extracted and the raw affiliation strings are
    collected. Organization PIDs come from the affiliation strings that already contain
    'ror.org'; when the author has none of those, they come from the affro() matches of
    all the author's affiliations.
    Args:
        record (dict): Row with an 'id' and an 'authors' list; every author has a 'fullName'
            and an 'affiliations' list of dicts carrying 'raw_affiliation_string'.
    Returns:
        dict or None: {'ID', 'Authors', 'Organizations'}, or None when processing the
        record failed (the error is printed).
    Raises:
        KeyError: If the record has no 'id' (it is read outside the try block).
    """
    record_id = record['id']
    authors = []
    try:
        for author in record['authors']:
            full_name = author['fullName']
            if 'orcid.org/0' in full_name:
                # fullName is expected to look like '<orcid url>,<name>'
                parts = full_name.split(',')
                name = parts[1]
                orcid = parts[0][:36]
            else:
                name = full_name
                orcid = None
            raw_affiliations = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
            ror_pids = []
            affro_pids = []
            for raw in raw_affiliations:
                if 'ORCID: 0' in raw:
                    orcid = 'https://orcid.org/' + raw.split('ORCID: ')[1]
                elif 'ORCID 0' in raw:
                    orcid = 'https://orcid.org/' + raw.split('ORCID ')[1]
                if 'ror.org' in raw:
                    # The string starts with the ROR link: keep its first 25 characters
                    ror_pids.append({
                        'Origin': 'data',
                        'RORid': raw[0:25],
                        'Confidence': None
                    })
                else:
                    # Single affro() call per affiliation; it yields None when it failed
                    matches = affro(raw)
                    if matches:
                        affro_pids.extend(json.loads(match) for match in matches)
            # PIDs supplied with the data take precedence over the computed ones;
            # computed PIDs are gathered over all affiliations, not just the last one.
            if ror_pids:
                organization_pids = ror_pids
            else:
                organization_pids = remove_duplicates(affro_pids)
            authors.append({
                'Name': {'Full': name, 'First': None, 'Last': None},
                'Raw_affiliations': raw_affiliations,
                'Organization_PIDs': organization_pids,
                'ORCID': orcid,
            })
        organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
        return {'ID': record_id, 'Authors': authors, 'Organizations': organizations}
    except Exception as e:
        print(f"Error processing record with id {record.get('id')}: {str(e)}")
        return None
 for file in json_file_names:
    print(f'start processing {file}')
    df = spark.read.json(f'{folder_path}/{file}')
    # Run update_record on every row of the file
    updated_rdd = df.rdd.map(lambda row: update_record(row.asDict()))
    # Serialise every updated record as one JSON string
    json_rdd = updated_rdd.map(json.dumps)
    # Bring all the strings back to the driver
    json_data = json_rdd.collect()
    # Output name: the input file name followed by "_output.json"
    output_file_name = f'{file}_output.json'
    print(f'end processing {file}')
    # One JSON document per line
    with open(output_file_name, 'w') as f:
        for i, item in enumerate(json_data):
            print(f'write {i}')
            f.write(item + '\n')