From 0c98ba76a6297a20fcc21de693bd997977097c85 Mon Sep 17 00:00:00 2001
From: mkallipo <95910739+mkallipo@users.noreply.github.com>
Date: Thu, 5 Sep 2024 12:23:32 +0200
Subject: [PATCH] initial commit

---
 README.md                      |   0
 affro_cluster.py               |  40 +++
 affro_test_example.py          |  28 ++
 create_input_cluster.py        |  77 ++++
 functions_cluster.py           | 635 +++++++++++++++++++++++++++++++++
 matching_cluster.py            | 319 +++++++++++++++++
 txt_files/.DS_Store            | Bin 0 -> 8196 bytes
 txt_files/city_names.txt       | 584 ++++++++++++++++++++++++++++++
 txt_files/remove_list.txt      |  28 ++
 txt_files/stop_words.txt       |  16 +
 txt_files/university_terms.txt |   8 +
 update_records.py              | 116 ++++++
 12 files changed, 1851 insertions(+)
 create mode 100644 README.md
 create mode 100644 affro_cluster.py
 create mode 100644 affro_test_example.py
 create mode 100644 create_input_cluster.py
 create mode 100644 functions_cluster.py
 create mode 100644 matching_cluster.py
 create mode 100644 txt_files/.DS_Store
 create mode 100644 txt_files/city_names.txt
 create mode 100644 txt_files/remove_list.txt
 create mode 100644 txt_files/stop_words.txt
 create mode 100644 txt_files/university_terms.txt
 create mode 100644 update_records.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/affro_cluster.py b/affro_cluster.py
new file mode 100644
index 0000000..7d59a2b
--- /dev/null
+++ b/affro_cluster.py
@@ -0,0 +1,40 @@
+import sys
+import json
+
+# import helper functions from the sibling modules
+from functions_cluster import *
+from matching_cluster import *
+from create_input_cluster import *
+
+dix_org = load_json('dictionaries/dix_acad.json')
+dix_mult = load_json('dictionaries/dix_mult.json')
+dix_city = load_json('dictionaries/dix_city.json')
+dix_country = load_json('dictionaries/dix_country.json')
+
+
+def affro(raw_aff_string):
+    try:
+        result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.65, 0.82)
+        if len(result) > 0:
+            result_dict = [json.dumps({'Origin': 'affRo', 'RORid': x[2], 'Confidence': x[1]}) for x in result]
+        else:
+            result_dict = []
+
+        return result_dict
+    except Exception as e:
+        # Log the failing input and return an empty list instead of None
+        print(f"Error: {str(e)}")
+        print(raw_aff_string)
+        return []
+
+# raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python affro_cluster.py <raw_affiliation_string>")
+        sys.exit(1)
+
+    string_arg = sys.argv[1]
+    # float_arg1 = float(sys.argv[2])
+    # float_arg2 = float(sys.argv[3])
+
+    print(affro(string_arg))
diff --git a/affro_test_example.py b/affro_test_example.py
new file mode 100644
index 0000000..7cb3363
--- /dev/null
+++ b/affro_test_example.py
@@ -0,0 +1,28 @@
+import sys
+
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import udf
+from pyspark.sql.types import StringType
+
+from affro_cluster import *
+
+# Initialize SparkSession
+spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()
+
+# Register the function as a UDF
+affro_udf = udf(affro, StringType())
+
+# Input list of strings
+input_data = ["university of athens", "university of vienna", "UCLA"]
+
+# Convert the list to a Spark DataFrame
+df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")
+
+# Apply the affro UDF to the DataFrame
+df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))
+
+df_with_custom_value.show(truncate=False)
+
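+# Illustrative check (an assumption, not part of the original patch): affro
+# returns a list of JSON strings, so a sample result can be parsed back into
+# dicts on the driver for inspection, e.g.
+#
+#   parsed = [json.loads(x) for x in affro("university of athens, greece")]
+#   # each dict has the keys 'Origin', 'RORid' and 'Confidence'
+#
+# Stop the SparkSession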
+spark.stop()
diff --git a/create_input_cluster.py b/create_input_cluster.py
new file mode 100644
index 0000000..880fb87
--- /dev/null
+++ b/create_input_cluster.py
@@ -0,0 +1,77 @@
+from functions_cluster import *
+
+def create_df_algorithm(raw_aff_string):
+    aff_no_symbols_d = substrings_dict(clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string))))
+    substring_list = list(aff_no_symbols_d.values())
+
+    i = 0
+
+    # Drop substrings that are subsumed by a more specific neighbour
+    # (e.g. a department or lab followed by its university).
+    while i < len(substring_list) - 1:
+        if is_contained('progr', substring_list[i]) and is_contained('dep', substring_list[i+1]):
+            substring_list.pop(i)
+
+        elif (is_contained('assistant', substring_list[i]) or is_contained('researcher', substring_list[i]) or is_contained('phd', substring_list[i]) or is_contained('student', substring_list[i]) or is_contained('section', substring_list[i]) or is_contained('prof', substring_list[i]) or is_contained('director', substring_list[i])) and (not is_contained('school', substring_list[i+1]) or is_contained('univ', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('lab', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
+            substring_list.pop(i)
+
+        elif (is_contained('engineer', substring_list[i]) or is_contained('progr', substring_list[i]) or is_contained('unit', substring_list[i]) or is_contained('lab', substring_list[i]) or is_contained('dep', substring_list[i]) or is_contained('school', substring_list[i]) or is_contained('inst', substring_list[i]) #or is_contained('hosp', substring_list[i])
+              or is_contained('fac', substring_list[i])) and is_contained('univ', substring_list[i+1]):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('lab', substring_list[i]) and (is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('school', substring_list[i+1])):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('dep', substring_list[i]) and (is_contained('tech', substring_list[i+1]) or is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('school', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('inst', substring_list[i]) and (is_contained('school', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('acad', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('clin', substring_list[i+1]) or is_contained('klin', substring_list[i+1]) or is_contained('fak', substring_list[i+1]) or is_contained('fac', substring_list[i+1]) or is_contained('cent', substring_list[i+1]) or is_contained('div', substring_list[i+1])):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('school', substring_list[i]) and is_contained('colege', substring_list[i+1]):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        else:
+            i += 1
+
+    light_aff = ', '.join(substring_list)
+
+    # Filter out bare city names and generic terms. Iterate over a copy so that
+    # removing items does not skip elements of the list being mutated.
+    for x in substring_list[:]:
+        if x in city_names + remove_list:
+            substring_list.remove(x)
+
+    substring_list = [shorten_keywords_spark([x])[0] for x in substring_list]
+
+    def valueToCategory(value):
+        flag = 0
+        for k in categ_dicts:
+            if k in value:
+                flag = 1
+        return flag
+
+    aff_list = [{"index": i, "keywords": substring_list[i], "category": valueToCategory(substring_list[i])} for i in range(len(substring_list))]
+
+    filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
+
+    return [light_aff, filtered_list]
\ No newline at end of file
diff --git a/functions_cluster.py b/functions_cluster.py
new file mode 100644
index 0000000..9562755
--- /dev/null
+++ b/functions_cluster.py
@@ -0,0 +1,635 @@
+import re
+import unicodedata
+import html
+import json
+import pickle  # needed by load_pickled_dict below
+from unidecode import unidecode
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+#import pandas as pd
+
+def load_txt(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        list_ = [line.strip() for line in file]
+    return list_
+
+def load_pickled_dict(file_path):
+    with open(file_path, 'rb') as file:
+        pickled_dict = pickle.load(file)
+    return pickled_dict
+
+
+def load_json(file_path):
+    with open(file_path, 'r') as json_file:
+        json_dict = json.load(json_file)
+    return json_dict
+
+categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'
+
+def replace_double_consonants(text):
+    # This regex pattern matches any double consonant
+    pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
+    # The replacement is the first captured group (the single consonant)
+    result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
+    return result
+
+remove_list = [replace_double_consonants(x) for x in load_txt('txt_files/remove_list.txt')]
+stop_words = load_txt('txt_files/stop_words.txt')
+university_terms = [replace_double_consonants(x) for x in load_txt('txt_files/university_terms.txt')]
+city_names = [replace_double_consonants(x) for x in load_txt('txt_files/city_names.txt')]
+
+categ_dicts = load_json('dictionaries/dix_categ.json')
+
+
+def is_contained(s, w):
+    words = s.split()  # Split the string 's' into a list of words
+    for word in words:
+        if word not in w:  # If a word from 's' is not found in 'w'
+            return False   # Return False immediately
+    return True  # If all words from 's' are found in 'w', return True
+
+def starts_with_any(string, prefixes):
+    # Note: returns [True, prefix] on a hit and False otherwise
+    for prefix in prefixes:
+        if string.startswith(prefix):
+            return [True, prefix]
+    return False
+
+def remove_leading_numbers(s):
+    return re.sub(r'^\d+', '', s)
+
+def remove_outer_parentheses(string):
+    """Remove outer parentheses from the string if they enclose the entire string."""
+    if string.startswith('(') and string.endswith(')'):
+        return string[1:-1].strip()
+    return string
+
+
+def insert_space_between_lower_and_upper(s):
+    """
+    Inserts a space between a lowercase letter followed by an uppercase letter in a string.
+
+    Parameters:
+    s (str): The input string.
+
+    Returns:
+    str: The modified string with spaces inserted.
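+
+    Example (illustrative):
+    insert_space_between_lower_and_upper('universityHospital') -> 'university Hospital'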
+ """ + # Use regex to insert space between lowercase and uppercase letters + modified_string = re.sub(r'([a-z])([A-Z])', r'\1 \2', s) + return modified_string + + +def index_multiple_matchings(pairs): + result_dict = {} + + r_list = [pair[3] for pair in pairs] + modified_list = [item for sublist in r_list for item in sublist] + r = len(list(set(modified_list))) + + for t in [pair[0] for pair in pairs]: + key = t + if key in result_dict and r>1: + result_dict[key] += 1 + + else: + result_dict[key] = 1 + + return result_dict + +def avg_string(df, col): + avg = [] + for i in range(len(df)): + avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i])) + return sum(avg)/len(avg) + +#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und'] + + +def remove_stop_words(text): + words = text.split() + filtered_words = [word for word in words if word not in stop_words] + return ' '.join(filtered_words) + + +def remove_parentheses(text): + return re.sub(r'\([^()]*\)', '', text) + + +def replace_umlauts(text): + normalized_text = unicodedata.normalize('NFKD', text) + replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c)) + return replaced_text + +def protect_phrases(input_string, phrases): + # Replace phrases with placeholders + placeholder_map = {} + for i, phrase in enumerate(phrases): + placeholder = f"__PLACEHOLDER_{i}__" + placeholder_map[placeholder] = phrase + input_string = input_string.replace(phrase, placeholder) + return input_string, placeholder_map + +def restore_phrases(split_strings, placeholder_map): + # Restore placeholders with original phrases + restored_strings = [] + for s in split_strings: + for placeholder, phrase in placeholder_map.items(): + s = s.replace(placeholder, phrase) + restored_strings.append(s) + return restored_strings + +def replace_comma_spaces(text): + return text.replace(' ', ' ').replace(' , ', ', ') + +def split_string_with_protection(input_string, protected_phrases): + # Step 1: Protect specific phrases + input_string, placeholder_map = protect_phrases(input_string, protected_phrases) + + # Step 2: Split the string on specified delimiters + split_strings = [s.strip() for s in re.split(r'[,;/]| – ', input_string) if s.strip()] + + # Step 3: Restore protected phrases + split_strings = restore_phrases(split_strings, placeholder_map) + + return split_strings + +protected_phrases1 = [ + phrase.format(x=x) + for x in city_names + for phrase in [ + 'university california, {x}', + # 'university california , {x}', + + 'university colege hospital, {x}', + # 'university colege hospital , {x}', + + 'national univ ireland, {x}', + # 'national univ ireland , {x}', + + 'national university ireland, {x}', + # 'national university ireland , {x}', + + 'university colege, {x}', + # 'university colege , {x}', + + 'university hospital, {x}', + # 'university hospital , {x}', + + 'imperial colege, {x}', + # 'imperial colege , {x}' + + 'city university, {x}', + # 'city university , {x}' + + + ] +] + + + +replacements = {'uni versity':'university', + 'univ ':'university ', + 'univercity':'university', + 'universtiy':'university', + 'univeristy':'university', + 'universirty':'university', + 'universiti':'university', + 'universitiy':'university', + 'universty' :'university', + 'univ col': 'university colege', + 'belfield, dublin': 'dublin', + 'balsbridge, dublin': 'dublin', #ballsbridge + 'earlsfort terrace, dublin': 'dublin', + 'bon secours hospital, cork' : 'bon secours hospital cork', + 'bon 
secours hospital, dublin' : 'bon secours hospital dublin', + 'bon secours hospital, galway' : 'bon secours hospital galway', + 'bon secours hospital, tralee' : 'bon secours hospital tralee', + 'bon secours health system' : 'bon secours hospital dublin', + 'bon secours hospital, glasnevin' : 'bon secours hospital dublin', + 'imperial colege science, technology medicine' : 'imperial colege science technology medicine', + 'ucl queen square institute neurology' : 'ucl, london', + 'ucl institute neurology' : 'ucl, london', + 'royal holoway, university london' : 'royal holoway universi london', #holloway + 'city, university london' : 'city universi london', + 'city university, london' : 'city universi london', + 'aeginition':'eginition', + 'national technical university, athens' : 'national technical university athens' + # 'harvard medical school' : 'harvard university' + + + +} + + +def substrings_dict(string): + # Split the input string and clean each substring + # split_strings = split_string_with_protection(string.replace('univ coll', 'university college').replace('belfield, dublin', 'dublin').replace('ballsbridge, dublin', 'dublin').replace('earlsfort Terrace, dublin', 'dublin'), protected_phrases1) + + for old, new in replacements.items(): + string = string.replace(old, new) + split_strings = split_string_with_protection(string, protected_phrases1) + + # Define a set of university-related terms for later use + + + dict_string = {} + index = 0 + for value in split_strings: + + # Check if the substring contains any university-related terms + if not any(term in value.lower() for term in university_terms): + # Apply regex substitutions for common patterns + + modified_value = re.sub(r'universi\w*', 'universi', value, flags=re.IGNORECASE) + modified_value = re.sub(r'institu\w*', 'institu', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'centre*', 'center', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'\bsaint\b', 'st', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE) + + + + # Add the modified substring to the dictionary + + dict_string[index] = modified_value.lower().strip() + index += 1 + # elif 'universitetskaya' in value.lower(): + # index += 1 + + + # Add the original substring to the dictionary + else: + dict_string[index] = value.lower().strip() + index += 1 + + return dict_string + + + +def clean_string(input_string): + # Temporarily replace " - " with a unique placeholder + placeholder = "placeholder" + # input_string = input_string.replace(" - ", placeholder) + input_string = input_string.replace(" – ", placeholder) + + # Unescape HTML entities and convert to lowercase + input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip()) + + # Normalize unicode characters (optional, e.g., replace umlauts) + input_string = unidecode(input_string) + + # Replace `/` and `–` with space (do not replace hyphen `-`) + result = re.sub(r'[/\-]', ' ', input_string) + + # Replace "saint" with "st" + result = re.sub(r'\bSaint\b', 'St', result) + result = re.sub(r'\bAghia\b', 'Agia', result) + + + # Remove characters that are not from the Latin alphabet, or allowed punctuation + result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/]', '', result).strip()) + + # Restore the " - " 
sequence from the placeholder + result = result.replace(placeholder, " – ") + + # Replace consecutive whitespace with a single space + result = re.sub(r'\s+', ' ', result) + #result = result.replace('ss', 's') + + result = insert_space_between_lower_and_upper(result).lower() + result = remove_stop_words(result) + + return result.strip() # Strip leading/trailing spaces + + +def clean_string_facts(input_string): + # Replace specified characters with space + input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower()))))) + result = re.sub(r'[/\-,]', ' ', input_string) + result = re.sub(r'\bsaint\b', 'st', result) + + # Remove characters that are not from the Latin alphabet or numbers + result = re.sub(r'[^a-zA-Z0-9\s;/-]', '', result) + + # Replace consecutive whitespace with a single space + result = re.sub(r'\s+', ' ', result) + + return result + + +def str_radius_u(string): + string = string.lower() + radius = 3 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('univers',x): + indices.append(i) + # elif is_contained('coll',x): + # indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax+1] + + result.append(' '.join(s)) + + return result + + +def str_radius_coll(string): + string = string.lower() + radius = 1 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('col',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + + +def str_radius_h(string): + string = string.lower() + radius = 3 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('hospital',x) or is_contained('hopita',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius-1) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + + +def str_radius_c(string): + string = string.lower() + radius = 2 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('clinic',x) or is_contained('klinik',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius-1) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + +def str_radius_r(string): + string = string.lower() + radius = 2 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('research',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius-1) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + +def str_radius_spec(string): + spec = False + for x in string.split(): + try: + if categ_dicts[x] == 'Specific': + spec = True + return x + except: + pass + if spec == False: + return string + + +def avg_string(df, col): + avg = [] + for i in range(len(df)): + avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i])) + return sum(avg)/len(avg) + + + + + +def shorten_keywords(affiliations_simple): + affiliations_simple_n = [] + + for aff in affiliations_simple: + inner = [] + for str in aff: + if 'universi' in str: + inner.extend(str_radius_u(str)) + elif 'col' in str and 'trinity' in str: + 
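+                # this branch only fires when the substring mentions both 'col'
+                # and 'trinity'; str_radius_coll keeps each token containing
+                # 'col' together with the word just before it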
inner.extend(str_radius_coll(str)) + elif 'hospital' in str or 'hopita' in str: + inner.extend(str_radius_h(str)) + elif 'clinic' in str or 'klinik' in str: + inner.extend(str_radius_c(str)) + elif 'research council' in str: + inner.extend(str_radius_r(str)) + else: + inner.append(str_radius_spec(str)) + + affiliations_simple_n.append(inner) + + return affiliations_simple_n + +def shorten_keywords_spark(affiliations_simple): + affiliations_simple_n = [] + + for aff in affiliations_simple: + + if 'universi' in aff: + affiliations_simple_n.extend(str_radius_u(aff)) + elif 'col' in aff and 'trinity' in aff: + affiliations_simple_n.extend(str_radius_coll(aff)) + elif 'hospital' in aff or 'hopita' in aff: + affiliations_simple_n.extend(str_radius_h(aff)) + elif 'clinic' in aff or 'klinik' in aff: + affiliations_simple_n.extend(str_radius_c(aff)) + elif 'research council' in aff: + affiliations_simple_n.extend(str_radius_r(aff)) + else: + affiliations_simple_n.append(str_radius_spec(aff)) + + + return affiliations_simple_n + + +def refine(list_, affil): + affil = affil.lower() + + ids = [] + + for matched_org_list in list_: + + id_list = [] + + for matched_org in matched_org_list: + + if dix_mult[matched_org] == 'unique': + id_list.append(dix_acad[matched_org]) + else: + city_found = False + for city in dix_city[matched_org]: + if city[0] in affil: + id_list.append(city[1]) + city_found = True + break + + if not city_found: + country_found = False + + for country in dix_country[matched_org]: + if country[0] in list(country_mapping.keys()): + print(country[0]) + if country[0] in affil or country_mapping[country[0]][0] in affil or country_mapping[country[0]][0] in affil: + id_list.append(country[1]) + country_found = True + break + + + + elif country[0] in affil: + print('country found',country[0]) + + id_list.append(country[1]) + country_found = True + break + + + + if not country_found: + id_list.append(dix_acad[matched_org]) + + + + ids.append(id_list) + return ids + +def compute_cos(x,s): + vectorizer = CountVectorizer() + + s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + return cosine_similarity(x_vector, s_vector)[0][0] + + +# def find_ror(string, simU, simG): +# df = pd.DataFrame() + +# df['Unique affiliations'] = [[string.lower()]] +# academia = create_df_algorithm(df) + + +# result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG) +# if len(result)>0: + +# dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])} +# dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])} + +# dict_aff_score = {} +# for i in range(len(result)): +# if type(result['Similarity score'].iloc[i]) == list: +# dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i] +# else: +# dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]] + + +# pids = [] +# for i in range(len(df)): +# pidsi = [] +# for aff in df['Unique affiliations'].iloc[i]: +# if aff in list(dict_aff_id.keys()): +# pidsi = pidsi + dict_aff_id[aff] +# # elif 'unmatched organization(s)' not in pidsi: +# # pidsi = pidsi + ['unmatched organization(s)'] +# pids.append(pidsi) + + +# names = [] +# for i in range(len(df)): +# namesi = [] +# for aff in df['Unique affiliations'].iloc[i]: +# if aff in 
list(dict_aff_open.keys()): +# try: +# namesi = namesi + dict_aff_open[aff] +# except TypeError: +# namesi = namesi + [dict_aff_open[aff]] + +# names.append(namesi) + +# scores = [] +# for i in range(len(df)): +# scoresi = [] +# for aff in df['Unique affiliations'].iloc[i]: +# if aff in list(dict_aff_score.keys()): +# scoresi = scoresi + dict_aff_score[aff] + +# scores.append(scoresi) + + +# df['Matched organizations'] = names +# df['ROR'] = pids +# df['Scores'] = scores + + + +# def update_Z(row): +# if len(row['ROR']) == 0 or len(row['Scores']) == 0: +# return [] + +# new_Z = [] +# for ror, score in zip(row['ROR'], row['Scores']): +# entry = {'ROR_ID': ror, 'Confidence': score} +# new_Z.append(entry) +# return new_Z + +# matching = df.apply(update_Z, axis=1) + +# df['Matchings'] = matching + + +# return df['Matchings'].iloc[0] +# else: +# return 'no result' \ No newline at end of file diff --git a/matching_cluster.py b/matching_cluster.py new file mode 100644 index 0000000..0c1ec92 --- /dev/null +++ b/matching_cluster.py @@ -0,0 +1,319 @@ +from collections import defaultdict +from collections import Counter + +import Levenshtein + +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.metrics.pairwise import cosine_similarity + +from functions_cluster import * +from create_input_cluster import * + +def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG): + """ + Finds the best match between a 'key word' and several legal names from the OpenAIRE database. + ---> corrects special cases in the main map that follows + + Args: + light_raw + l2 candidate_num: number of candidates. + l3 pairs_list: List of pairs. (s, x, score) + l4 m: mult + + Returns: + List: Resulting list containing OpenAIRE names and their similarity scores. 
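+
+    Each element of the returned list has the shape
+    [matched_openaire_name, similarity_score] (e.g., illustratively,
+    ['universi athens', 0.91]).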
+ """ + + vectorizer = CountVectorizer() + univ_num = light_raw.lower().count('univ') + result = [] + best = [] + s = light_raw + + for j in range(len(pairs_list)): + x = pairs_list[j][1] + + if [x, pairs_list[j][2]] in result: + continue + + if m[pairs_list[j][0]] == 1: + + if is_contained('univ', x.lower()) and pairs_list[j][2] > simU: + result.append([x, pairs_list[j][2]]) + elif pairs_list[j][2] > simG: + result.append([x, pairs_list[j][2]]) + + elif pairs_list[j][2] >= 0.98: # and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or is_contained("center", x.lower()) or is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list + result.append([pairs_list[j][1], 1]) + + else: + try: + if not is_contained("univ", x.lower()): + continue # Skip if x does not contain "university" or "univ" + + # if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)) or (is_contained('hopital', x.lower()) and not is_contained('hopital', s)) or (not is_contained('hopital', x.lower()) and is_contained('hopital', s)): + # continue + s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(x_vector, s_vector)[0][0] + if similarity > 0.1: + similarity_l = 1 - Levenshtein.distance(x, pairs_list[j][0]) / max(len(x), len(pairs_list[j][0])) + + best.append([x, similarity, similarity_l]) #(similarity+similarity2)/2]) + except: + KeyError + + if best: + # max_numbers = defaultdict(float) + + +# Assuming best is a list of three-element lists +# Each element is (string, number1, number2) + max_numbers = defaultdict(float) + for item in best: + string, number1, number2 = item # Unpack the three elements + max_numbers[string] = max(max_numbers[string], number1) + + reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]] + +# Sort by number1 decreasingly and then by number2 in descending order + reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True) + + result = result + reduced_best + + univ_list = [] + other_list = [] + + for r in result: + if is_contained('univ', r[0]): + univ_list.append(r) + else: + other_list.append(r) + + limit = min(univ_num, candidate_num) + + if len(univ_list) > limit: + result = univ_list[:limit] + other_list + + result_dict = {} + pairs_dict = {} + + + for l in pairs_list: + pairs_dict[l[1]] = l[2] + + + for p in result: + result_dict[p[0]] = pairs_dict[p[0]] + + + result_dict_list = [[y[0], result_dict[y[0]]] for y in result] + + return result_dict_list + + + +def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG): + + """ + Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores. + + Args: + m (int): The number of DOIs to check. + DF (DataFrame): The input DataFrame containing affiliation data. + dix_org (dict): A dictionary of names of organizations and their ROR_ids. + simU (float): Similarity threshold for universities. + simG (float): Similarity threshold for non-universities. + + Returns: + DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores. 
+ """ + df_list = input[1] + light_aff = input[0] + vectorizer = CountVectorizer() + + lnamelist = list(dix_org.keys()) + dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]} + #pairs = [] + result = {} + pairs = [] + + + def get_keywords(filtered_list): + # Extract the "keywords" values from the dictionaries in filtered_list + keywords_list = [entry["keywords"] for entry in filtered_list] + + return keywords_list + keywords = get_keywords(df_list) + + + for k,s in enumerate(keywords): + similar_k = [] + pairs_k = [] + + if s in lnamelist: + similarity = 1 + similar_k.append(similarity) + + pairs_k.append((s,s,similarity,dix_org[s])) + pairs.append((s,s,similarity,dix_org[s])) + + + if k not in dix: + dix[k] = [s] + else: + dix[k].append(s) + else: + + for x in lnamelist: + if is_contained(s, x): + + x_vector = vectorizer.fit_transform([x]).toarray() + s_vector = vectorizer.transform([s]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(x_vector, s_vector)[0][0] + if similarity > min(simU, simG): + if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU: + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG: + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + + elif is_contained(x, s): + if (is_contained('univ', s) and is_contained('univ', x)): + + s_vector = vectorizer.fit_transform([s]).toarray() + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(s_vector, x_vector)[0][0] + if similarity > simU: #max(0.82,sim): + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + elif not is_contained('univ', s) and not is_contained('univ', x): + + s_vector = vectorizer.fit_transform([s]).toarray() + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(s_vector, x_vector)[0][0] + if similarity > simG: #max(0.82,sim): + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + + result[k] = pairs_k + + multi = index_multiple_matchings(list(set(pairs))) + # need_check = list(set([i for i in range(len(multi)) if list(multi.values())[i]>1])) + # print('here', multi) + # need_check_keys = [keywords[i] for i in range(len(keywords)) if multi[keywords[i]]>1] + need_check_keys = [] + for i in range(len(keywords)): + try: + if multi[keywords[i]]>1: + need_check_keys.append(keywords[i]) + except: + pass + + best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG) + matched_org = [x[0] for x in best] + # best_o = [] + # best_s = [] + # best_result = [] + # for x in best: + # best_o.append([x[i][0] for i in range(len(x))]) + # best_s.append([round(x[i][1],2) for i in range(len(x))]) + # num_mathced = [len(best_s[i]) for i in range(len(need_check))] + ids = [dix_org[x[0]] for x in best] + for i,x in enumerate(matched_org): + # id_list 
= [] + if dix_mult[x] != 'unique': + if x in list(dix_city_ror.keys()): + match_found0 = False + match_found = False + + for city in dix_city_ror[x]: + if city[0] in light_aff: + if city[0] not in x: + ids[i] = city[1] + + match_found0 = True + match_found = True + break + if not match_found: + for city in dix_city_ror[x]: + if city[0] in light_aff and city[0] not in x: + ids[i] = city[1] + match_found0 = True + print('ok') + break + + if not match_found: + match_found2 = False + match_found3 = False + + for country in dix_country_ror[x]: + if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff): + ids[i] = country[1] + match_found2 = True + match_found3 = True + break + + if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff): + ids[i] = country[1] + match_found2 = True + match_found3 = True + break + + elif country[0] in light_aff: + + if country[0] not in x: + ids[i] = country[1] + match_found2 = True + match_found3 = True + break + + if not match_found3: + for country in dix_country_ror[x]: + if country[0] in light_aff and country[0] in x: + ids[i] = country[1] + match_found2 = True + break + + + + + + results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)] + + return results #[[result[to_check[i]] for i in ready] + [to_check[2]], best[0]] \ No newline at end of file diff --git a/txt_files/.DS_Store b/txt_files/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..37cf8fae3c3837fbac5dd32715df22c6f1041ffc GIT binary patch literal 8196 zcmeHM!A`+=)}ll;n2CmICmJovhfv)v8&TBuymNnp^mD6Do|BG zj!g?=ydcNsOZ|R9%RBOA{@h=$@{GJb2IQ=0k6Gh~ctnah*1p~k(PizQvxj&fD@MjK z7j(Q=Iop8kc6HpYzimS9Tzh^tM>&+OgK$Qa@k zSDdNO;m+rs@*aA{^w57rV8x40>|zK+fx7c zwtxTM_cZkWQ~_1s7Ydk0chK!HfUm79p`_Lh*q_*?39oQyT{zfsoM_8&;^!ZRybeee WpU7k3kQO%m5MX7{K^6E@1wH_!2k8(1 literal 0 HcmV?d00001 diff --git a/txt_files/city_names.txt b/txt_files/city_names.txt new file mode 100644 index 0000000..e0d7bd4 --- /dev/null +++ b/txt_files/city_names.txt @@ -0,0 +1,584 @@ +galway +maynooth +duluth +port arthur +new orleans +paterson +santa barbara +thornton +westminster +north las vegas +stockton +marysville +fitchburg +tallinn +fargo +seaside +manaus +porto +quebec city +hialeah +normal +kansas city +delhi +fort worth +palermo +olathe +madison +santa maria +youngstown +allentown +santa clara +charlotte +agra +palmdale +kraków +bendigo +high point +washington +dallas +grand prairie +plano +leipzig +bratislava +seville +puebla +lucknow +toowoomba +santa rosa +sioux falls +flint +kissimmee +lacey +brownsville +palm springs +tyler +minsk +san diego +los angeles +edmonton +college station +toulouse +garland +florence +saskatoon +albury-wodonga +newburgh +danbury +deltona +south bend +nagpur +pomona +memphis +london +lincoln +chandler +adelaide +salt lake city +edinburgh +suzhou +grayslake +new york city +kanpur +brussels +okayama +tuscaloosa +clarksville +jackson +boise city +canton +louisville +varanasi +columbus +lorain +vadodara +orem +chennai +townsville +eindhoventoronto +wuhan +norman +winter haven +eugene +riga +hamamatsu +fresno +lake charles +budapest +mobile +lowell +vienna +tallahassee +nanjing +new haven +sacramento +leeds +harlingen +springdale +perth +sendai +utica +orange +baltimore +rochester +rancho cucamonga +bellevue +fort wayne +modesto +pristina +nuremberg +stuttgart +indore +murfreesboro +nottingham +scranton +lancaster +abilene +monterey +sioux city +bari +chula vista +ahmedabad +north port +helsinki +leominster +ocala +sarajevo +hangzhou 
+roanoke +new york +bethlehem +dublin +sunshine coast +pune +billings +changchunsydney +garden grove +port orange +pittsburgh +new bedford +hiroshima +san francisco +sheffield +chongqing +layton +pueblo +chengdu +cincinnati +erie +lansing +ljubljana +st louis +rio de janeiro +philadelphia +tacoma +bel air +chesapeake +davenport +las vegas +nagasaki +kitchener +boulder +roseville +evansville +victoria +burbank +sofia +santa clarita +san buenaventura +savannah +apple valley +brighton +coral springs +huntsville +fort lauderdale +warsaw +antioch +medford +visalia +frankfurt +joliet +curitiba +mcallen +seattle +alexandria +bryan +moreno valley +berlin +olympia +caracas +tianjin +cleveland +des moines +prague +fukuoka +burlington +bhopal +nara +hampton +jefferson +chicago +temecula +paris +gilbert +bradenton +champaign +munich +amsterdam +raleigh +atlanta +lakeland +denver +round lake beach +richmond +buffalo +phoenix +antwerp +greenbay +milwaukee +south lyon +concord +vero beach +newcastle +podgorica +monterrey +shantou +costa mesa +copenhagen +vilnius +dalian +bristol +salinas +belgrade +waterloo +henderson +hayward +hickory +el monte +lima +redding +mexico city +cary +kennewick +guayaquil +tirana +kawasaki +greensboro +west covina +amarillo +saitama +new london +recife +manchester +rockford +kelowna +hagerstown +bordeaux +york +kaneohe +tucson +gainesville +kalamazoo +bogotá +reading +virginia beach +guadalajara +albany +durham +green bay +oceanside +montreal +turin +malaga +oshawa +mesa +pensacola +boise +bonita springs +fort walton beach +port saint lucie +reykjavik +north charleston +newark +reno +knoxville +bakersfield +oslo +omaha +milan +cambridge +norwich +shanghai +naples +victorville +zagreb +norwalk +huntington beach +clarke county +lubbock +yakima +warren +bucharest +simi valley +greenville +racine +salvador +elk grove +orlando +windsor +santa cruz +saginaw +ballarat +muskegon +shreveport +clearwater +merced +boston +basel +elizabeth +panama city +okinawa +sarasota +zurich +glendale +wilmington +pompano beach +guangzhou +fairfield +hyderabad +santiago +nashville +mchenry +ann arbor +carrollton +hollywood +laredo +rome +san bernardino +bergen +springfield +winnipeg +corona +surat +long beach +nagoya +toledo +geelong +kenosha +sterling heights +lisbon +myrtle beach +nashua +riverside +tampa +bangalore +richland +rotterdam +lyon +scottsdale +berkeley +bologna +cedar rapids +syracuse +tulsa +ludhiana +hemet +portland +mission viejo +salem +overland park +detroit +jinan +osaka +grand rapids +jersey city +kailua +venice +darwin +miramar +gulfport-biloxi +huntington +portsmouth +worcester +sunnyvale +escondido +college park +thousand oaks +harbin +belfast +yonkers +alicante +barnstable +kitakyushu +sapporo +ogden +aurora +palm bay +düsseldorf +hobart +irvine +st johns +hamburg +provo +melbourne +madrid +zhengzhou +asheville +patna +inglewood +houston +newport news +west valley city +oklahoma city +brisbane +valencia +pasadena +aberdeen +st petersburg +lakewood +irving +naperville +miami +topeka +downey +genoa +lewisville +birmingham +xian +saint paul +bremerton +corpus christi +daytona beach +st paul +oxnard +murrieta +lafayette +montgomery +baton rouge +skopje +cathedral city +spartanburg +canberra +arvada +hesperia +port st lucie +saint louis +bridgeport +tempe +quito +chattanooga +bremen +gold coast +cairns +beaumont +elkhart +peoria +calgary +honolulu +havre de grace +hamilton +fullerton +daly city +dresden +belem +ottawa +regina +chiba +fort collins +indianapolis +mumbai +killeen 
+sao paulo +jaipur +fremont +zaragoza +charleston +waco +kobe +odessa +monroe +vallejo +marseille +qingdao +frederick +marina +sebastian +oakland +pembroke pines +san antonio +kyoto +colorado springs +el paso +shenyang +punta gorda +fort smith +richmond county +waterbury +shenzhen +albuquerque +jacksonville +minneapolis +fortaleza +denton +gastonia +fayetteville +bloomington +houma +santa ana +kolkata +las cruces +barcelona +arlington +niigata +norfolk +fontana +providence +santo domingo +vancouver +appleton +san jose +hartford +winston +barrie +glasgow +davidson county +yokohama +independence +athens +harrisburg +macon +torrance +launceston +cape coral +austin +little rock +cologne +mesquite +catania +stockholm +nice +stamford +buenos aires +columbia +anchorage +dayton +wollongong +halifax +verona +anaheim +kiev +augusta +tokyo +akron +lexington +wichita +saint petersburg +beijing +johnson city +spokane +liverpool +howell +poughkeepsie +ontario +atlantic city +trenton diff --git a/txt_files/remove_list.txt b/txt_files/remove_list.txt new file mode 100644 index 0000000..5023a5c --- /dev/null +++ b/txt_files/remove_list.txt @@ -0,0 +1,28 @@ +universi +research institu +laboratory +gmbh +inc +universi of +research center +foundation +faculty +national institu +school medicine +universi school +graduate school +graduate school engineering +institu tropical medicine +institu virology +faculty medicine +laboratory +universi park +institu science +polytechnic universi +universi 1 +ciudad universi +universi campus +universi hospitals +colege +universi road +universitetska str diff --git a/txt_files/stop_words.txt b/txt_files/stop_words.txt new file mode 100644 index 0000000..808ddf1 --- /dev/null +++ b/txt_files/stop_words.txt @@ -0,0 +1,16 @@ +from +the +of +at +de +for +et +für +des +in +as +a +and +fur +for +und diff --git a/txt_files/university_terms.txt b/txt_files/university_terms.txt new file mode 100644 index 0000000..abfaef4 --- /dev/null +++ b/txt_files/university_terms.txt @@ -0,0 +1,8 @@ +universitetskaya +universitatsklinikum +universitatskinderklinik +universitatskliniken +universitetshospital +universitatsmedizin +universitatsbibliothek +universitatspital \ No newline at end of file diff --git a/update_records.py b/update_records.py new file mode 100644 index 0000000..04278ab --- /dev/null +++ b/update_records.py @@ -0,0 +1,116 @@ +import json +import os +from pyspark.sql import SparkSession +from affro_cluster import * + +folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2' +#folder_path = 'check' + +json_file_names = [] + +# Loop through all files in the directory +for file_name in os.listdir(folder_path): + # Check if the file is a JSON file (you can adjust the check as needed) + if file_name != '_SUCCESS': + json_file_names.append(file_name) + +# json_file_names now contains the names of all JSON files in the folder + +# Initialize Spark session +spark = SparkSession.builder.appName("JSONProcessing").getOrCreate() + +def remove_duplicates(list_of_dicts): + # Use a set to store tuples of dictionary items to filter out duplicates + seen = set() + unique_list_of_dicts = [] + + for d in list_of_dicts: + # Convert the dictionary to a tuple of items + items = tuple(d.items()) + if items not in seen: + seen.add(items) + unique_list_of_dicts.append(d) + + return unique_list_of_dicts + +def update_record(record): + id = record['id'] + authors = [] + try: + for author in record['authors']: + author_object = {} + if 'orcid.org/0' in author['fullName']: + 
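+                # fullName sometimes embeds an ORCID iD before the display name,
+                # e.g. (hypothetical) "https://orcid.org/0000-0001-2345-6789, Doe, Jane";
+                # the next two lines split the iD off from the name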
+                author_object['Name'] = {'Full': author['fullName'].split(',')[1], 'First': None, 'Last': None}
+                author_object['ORCID'] = author['fullName'].split(',')[0][:36]
+            else:
+                author_object['Name'] = {'Full': author['fullName'], 'First': None, 'Last': None}
+                author_object['ORCID'] = None
+            author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
+            author_object['Organization_PIDs'] = []  # default, so the key always exists
+            all_affs_with_ror = []
+            have_ror = False
+            for affiliation in author['affiliations']:
+                if 'ORCID: 0' in affiliation['raw_affiliation_string']:
+                    x = affiliation['raw_affiliation_string']
+                    author_object['ORCID'] = 'https://orcid.org/' + x.split('ORCID: ')[1]
+                elif 'ORCID 0' in affiliation['raw_affiliation_string']:
+                    x = affiliation['raw_affiliation_string']
+                    author_object['ORCID'] = 'https://orcid.org/' + x.split('ORCID ')[1]
+                if 'ror.org' in affiliation['raw_affiliation_string']:
+                    have_ror = True
+                    all_affs_with_ror.append({
+                        'Origin': 'data',
+                        'RORid': affiliation['raw_affiliation_string'][0:25],
+                        'Confidence': None
+                    })
+                else:
+                    if len(affro(affiliation['raw_affiliation_string'])) > 0:
+                        author_object['Organization_PIDs'] = affro(affiliation['raw_affiliation_string'])
+                        author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in author_object['Organization_PIDs']])
+                    else:
+                        author_object['Organization_PIDs'] = []
+
+            if have_ror:
+                author_object['Organization_PIDs'] = all_affs_with_ror
+            order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
+
+            reordered_data = {k: author_object[k] for k in order}
+
+            authors.append(reordered_data)
+
+        organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
+
+        updt = {'ID': id, 'Authors': authors, 'Organizations': organizations}
+        return updt
+    except Exception as e:
+        print(f"Error processing record with id {record.get('id')}: {str(e)}")
+        return None
+
+
+for file in json_file_names:
+    print('start processing ' + str(file))
+    df = spark.read.json(folder_path + '/' + file)
+
+    # Apply the update_record function
+    updated_rdd = df.rdd.map(lambda row: update_record(row.asDict()))
+
+    # Convert updated RDD to JSON strings
+    json_rdd = updated_rdd.map(lambda record: json.dumps(record))
+
+    # Collect the data and write to an output file with a unique name
+    json_data = json_rdd.collect()
+
+    # Create a new filename by appending "_output.json" to the original filename
+    output_file_name = file + '_output.json'
+    print('end processing ' + str(file))
+
+    with open(output_file_name, 'w') as f:
+        for i, item in enumerate(json_data):
+            print('write ' + str(i))
+            f.write(item + '\n')
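+
+# Illustrative note (an assumption about downstream use, not part of the original
+# pipeline): each line written above is a standalone JSON object roughly of the form
+#   {"ID": "...",
+#    "Authors": [{"Name": {...}, "Raw_affiliations": [...],
+#                 "Organization_PIDs": [{"Origin": "affRo", "RORid": "...", "Confidence": 0.9}],
+#                 "ORCID": null}],
+#    "Organizations": [...]}
+# so consumers can reload the file line by line with json.loads.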