From 0c98ba76a6297a20fcc21de693bd997977097c85 Mon Sep 17 00:00:00 2001
From: mkallipo <95910739+mkallipo@users.noreply.github.com>
Date: Thu, 5 Sep 2024 12:23:32 +0200
Subject: [PATCH] initial commit

---
 README.md                      |   0
 affro_cluster.py               |  40 +++
 affro_test_example.py          |  28 ++
 create_input_cluster.py        |  77 ++++
 functions_cluster.py           | 635 +++++++++++++++++++++++++++++++++
 matching_cluster.py            | 319 +++++++++++++++++
 txt_files/.DS_Store            | Bin 0 -> 8196 bytes
 txt_files/city_names.txt       | 584 ++++++++++++++++++++++++++++++
 txt_files/remove_list.txt      |  28 ++
 txt_files/stop_words.txt       |  16 +
 txt_files/university_terms.txt |   8 +
 update_records.py              | 116 ++++++
 12 files changed, 1851 insertions(+)
 create mode 100644 README.md
 create mode 100644 affro_cluster.py
 create mode 100644 affro_test_example.py
 create mode 100644 create_input_cluster.py
 create mode 100644 functions_cluster.py
 create mode 100644 matching_cluster.py
 create mode 100644 txt_files/.DS_Store
 create mode 100644 txt_files/city_names.txt
 create mode 100644 txt_files/remove_list.txt
 create mode 100644 txt_files/stop_words.txt
 create mode 100644 txt_files/university_terms.txt
 create mode 100644 update_records.py

diff --git a/README.md b/README.md
new file mode 100644
index 0000000..e69de29
diff --git a/affro_cluster.py b/affro_cluster.py
new file mode 100644
index 0000000..7d59a2b
--- /dev/null
+++ b/affro_cluster.py
@@ -0,0 +1,40 @@
+import sys
+import json
+
+# import helper functions from the sibling modules
+from functions_cluster import *
+from matching_cluster import *
+from create_input_cluster import *
+
+dix_org = load_json('dictionaries/dix_acad.json')
+dix_mult = load_json('dictionaries/dix_mult.json')
+dix_city = load_json('dictionaries/dix_city.json')
+dix_country = load_json('dictionaries/dix_country.json')
+
+
+def affro(raw_aff_string):
+    try:
+        result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.65, 0.82)
+        if len(result) > 0:
+            result_dict = [json.dumps({'Origin': 'affRo', 'RORid': x[2], 'Confidence': x[1]}) for x in result]
+        else:
+            result_dict = []
+
+        return result_dict
+    except Exception as e:
+        # Log the failing input and return an empty list instead of None
+        print(f"Error: {str(e)}")
+        print(raw_aff_string)
+        return []
+
+# raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
+
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python affro_cluster.py <raw_affiliation_string>")
+        sys.exit(1)
+
+    string_arg = sys.argv[1]
+    # float_arg1 = float(sys.argv[2])
+    # float_arg2 = float(sys.argv[3])
+
+    print(affro(string_arg))
diff --git a/affro_test_example.py b/affro_test_example.py
new file mode 100644
index 0000000..7cb3363
--- /dev/null
+++ b/affro_test_example.py
@@ -0,0 +1,28 @@
+import sys
+
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import udf
+from pyspark.sql.types import StringType
+
+from affro_cluster import *
+
+# Initialize SparkSession
+spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()
+
+# Register the function as a UDF
+affro_udf = udf(affro, StringType())
+
+# Input list of strings
+input_data = ["university of athens", "university of vienna", "UCLA"]
+
+# Convert the list to a Spark DataFrame
+df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")
+
+# Apply the affro UDF to the DataFrame
+df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))
+
+df_with_custom_value.show(truncate=False)
+
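+# Illustrative check (an assumption, not part of the original patch): affro
+# returns a list of JSON strings, so a sample result can be parsed back into
+# dicts on the driver for inspection, e.g.
+#
+#   parsed = [json.loads(x) for x in affro("university of athens, greece")]
+#   # each dict has the keys 'Origin', 'RORid' and 'Confidence'
+#
+# Stop the SparkSession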
+spark.stop()
diff --git a/create_input_cluster.py b/create_input_cluster.py
new file mode 100644
index 0000000..880fb87
--- /dev/null
+++ b/create_input_cluster.py
@@ -0,0 +1,77 @@
+from functions_cluster import *
+
+def create_df_algorithm(raw_aff_string):
+    aff_no_symbols_d = substrings_dict(clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string))))
+    substring_list = list(aff_no_symbols_d.values())
+
+    i = 0
+
+    # Drop substrings that are subsumed by a more specific neighbour
+    # (e.g. a department or lab followed by its university).
+    while i < len(substring_list) - 1:
+        if is_contained('progr', substring_list[i]) and is_contained('dep', substring_list[i+1]):
+            substring_list.pop(i)
+
+        elif (is_contained('assistant', substring_list[i]) or is_contained('researcher', substring_list[i]) or is_contained('phd', substring_list[i]) or is_contained('student', substring_list[i]) or is_contained('section', substring_list[i]) or is_contained('prof', substring_list[i]) or is_contained('director', substring_list[i])) and (not is_contained('school', substring_list[i+1]) or is_contained('univ', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('lab', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
+            substring_list.pop(i)
+
+        elif (is_contained('engineer', substring_list[i]) or is_contained('progr', substring_list[i]) or is_contained('unit', substring_list[i]) or is_contained('lab', substring_list[i]) or is_contained('dep', substring_list[i]) or is_contained('school', substring_list[i]) or is_contained('inst', substring_list[i]) #or is_contained('hosp', substring_list[i])
+              or is_contained('fac', substring_list[i])) and is_contained('univ', substring_list[i+1]):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('lab', substring_list[i]) and (is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('school', substring_list[i+1])):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('dep', substring_list[i]) and (is_contained('tech', substring_list[i+1]) or is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('school', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('inst', substring_list[i]) and (is_contained('school', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('acad', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('clin', substring_list[i+1]) or is_contained('klin', substring_list[i+1]) or is_contained('fak', substring_list[i+1]) or is_contained('fac', substring_list[i+1]) or is_contained('cent', substring_list[i+1]) or is_contained('div', substring_list[i+1])):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        elif is_contained('school', substring_list[i]) and is_contained('colege', substring_list[i+1]):
+            if not is_contained('univ', substring_list[i]):
+                substring_list.pop(i)
+            else:
+                i = i+1
+                continue
+
+        else:
+            i += 1
+
+    light_aff = ', '.join(substring_list)
+
+    # Filter out bare city names and generic terms. Iterate over a copy so that
+    # removing items does not skip elements of the list being mutated.
+    for x in substring_list[:]:
+        if x in city_names + remove_list:
+            substring_list.remove(x)
+
+    substring_list = [shorten_keywords_spark([x])[0] for x in substring_list]
+
+    def valueToCategory(value):
+        flag = 0
+        for k in categ_dicts:
+            if k in value:
+                flag = 1
+        return flag
+
+    aff_list = [{"index": i, "keywords": substring_list[i], "category": valueToCategory(substring_list[i])} for i in range(len(substring_list))]
+
+    filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
+
+    return [light_aff, filtered_list]
\ No newline at end of file
diff --git a/functions_cluster.py b/functions_cluster.py
new file mode 100644
index 0000000..9562755
--- /dev/null
+++ b/functions_cluster.py
@@ -0,0 +1,635 @@
+import re
+import unicodedata
+import html
+import json
+import pickle  # needed by load_pickled_dict below
+from unidecode import unidecode
+from sklearn.feature_extraction.text import CountVectorizer
+from sklearn.metrics.pairwise import cosine_similarity
+#import pandas as pd
+
+def load_txt(file_path):
+    with open(file_path, 'r', encoding='utf-8') as file:
+        list_ = [line.strip() for line in file]
+    return list_
+
+def load_pickled_dict(file_path):
+    with open(file_path, 'rb') as file:
+        pickled_dict = pickle.load(file)
+    return pickled_dict
+
+
+def load_json(file_path):
+    with open(file_path, 'r') as json_file:
+        json_dict = json.load(json_file)
+    return json_dict
+
+categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'
+
+def replace_double_consonants(text):
+    # This regex pattern matches any double consonant
+    pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
+    # The replacement is the first captured group (the single consonant)
+    result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
+    return result
+
+remove_list = [replace_double_consonants(x) for x in load_txt('txt_files/remove_list.txt')]
+stop_words = load_txt('txt_files/stop_words.txt')
+university_terms = [replace_double_consonants(x) for x in load_txt('txt_files/university_terms.txt')]
+city_names = [replace_double_consonants(x) for x in load_txt('txt_files/city_names.txt')]
+
+categ_dicts = load_json('dictionaries/dix_categ.json')
+
+
+def is_contained(s, w):
+    words = s.split()  # Split the string 's' into a list of words
+    for word in words:
+        if word not in w:  # If a word from 's' is not found in 'w'
+            return False   # Return False immediately
+    return True  # If all words from 's' are found in 'w', return True
+
+def starts_with_any(string, prefixes):
+    # Note: returns [True, prefix] on a hit and False otherwise
+    for prefix in prefixes:
+        if string.startswith(prefix):
+            return [True, prefix]
+    return False
+
+def remove_leading_numbers(s):
+    return re.sub(r'^\d+', '', s)
+
+def remove_outer_parentheses(string):
+    """Remove outer parentheses from the string if they enclose the entire string."""
+    if string.startswith('(') and string.endswith(')'):
+        return string[1:-1].strip()
+    return string
+
+
+def insert_space_between_lower_and_upper(s):
+    """
+    Inserts a space between a lowercase letter followed by an uppercase letter in a string.
+
+    Parameters:
+    s (str): The input string.
+
+    Returns:
+    str: The modified string with spaces inserted.
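+
+    Example (illustrative):
+    insert_space_between_lower_and_upper('universityHospital') -> 'university Hospital'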
+ """ + # Use regex to insert space between lowercase and uppercase letters + modified_string = re.sub(r'([a-z])([A-Z])', r'\1 \2', s) + return modified_string + + +def index_multiple_matchings(pairs): + result_dict = {} + + r_list = [pair[3] for pair in pairs] + modified_list = [item for sublist in r_list for item in sublist] + r = len(list(set(modified_list))) + + for t in [pair[0] for pair in pairs]: + key = t + if key in result_dict and r>1: + result_dict[key] += 1 + + else: + result_dict[key] = 1 + + return result_dict + +def avg_string(df, col): + avg = [] + for i in range(len(df)): + avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i])) + return sum(avg)/len(avg) + +#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und'] + + +def remove_stop_words(text): + words = text.split() + filtered_words = [word for word in words if word not in stop_words] + return ' '.join(filtered_words) + + +def remove_parentheses(text): + return re.sub(r'\([^()]*\)', '', text) + + +def replace_umlauts(text): + normalized_text = unicodedata.normalize('NFKD', text) + replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c)) + return replaced_text + +def protect_phrases(input_string, phrases): + # Replace phrases with placeholders + placeholder_map = {} + for i, phrase in enumerate(phrases): + placeholder = f"__PLACEHOLDER_{i}__" + placeholder_map[placeholder] = phrase + input_string = input_string.replace(phrase, placeholder) + return input_string, placeholder_map + +def restore_phrases(split_strings, placeholder_map): + # Restore placeholders with original phrases + restored_strings = [] + for s in split_strings: + for placeholder, phrase in placeholder_map.items(): + s = s.replace(placeholder, phrase) + restored_strings.append(s) + return restored_strings + +def replace_comma_spaces(text): + return text.replace(' ', ' ').replace(' , ', ', ') + +def split_string_with_protection(input_string, protected_phrases): + # Step 1: Protect specific phrases + input_string, placeholder_map = protect_phrases(input_string, protected_phrases) + + # Step 2: Split the string on specified delimiters + split_strings = [s.strip() for s in re.split(r'[,;/]| – ', input_string) if s.strip()] + + # Step 3: Restore protected phrases + split_strings = restore_phrases(split_strings, placeholder_map) + + return split_strings + +protected_phrases1 = [ + phrase.format(x=x) + for x in city_names + for phrase in [ + 'university california, {x}', + # 'university california , {x}', + + 'university colege hospital, {x}', + # 'university colege hospital , {x}', + + 'national univ ireland, {x}', + # 'national univ ireland , {x}', + + 'national university ireland, {x}', + # 'national university ireland , {x}', + + 'university colege, {x}', + # 'university colege , {x}', + + 'university hospital, {x}', + # 'university hospital , {x}', + + 'imperial colege, {x}', + # 'imperial colege , {x}' + + 'city university, {x}', + # 'city university , {x}' + + + ] +] + + + +replacements = {'uni versity':'university', + 'univ ':'university ', + 'univercity':'university', + 'universtiy':'university', + 'univeristy':'university', + 'universirty':'university', + 'universiti':'university', + 'universitiy':'university', + 'universty' :'university', + 'univ col': 'university colege', + 'belfield, dublin': 'dublin', + 'balsbridge, dublin': 'dublin', #ballsbridge + 'earlsfort terrace, dublin': 'dublin', + 'bon secours hospital, cork' : 'bon secours hospital cork', + 'bon 
secours hospital, dublin' : 'bon secours hospital dublin', + 'bon secours hospital, galway' : 'bon secours hospital galway', + 'bon secours hospital, tralee' : 'bon secours hospital tralee', + 'bon secours health system' : 'bon secours hospital dublin', + 'bon secours hospital, glasnevin' : 'bon secours hospital dublin', + 'imperial colege science, technology medicine' : 'imperial colege science technology medicine', + 'ucl queen square institute neurology' : 'ucl, london', + 'ucl institute neurology' : 'ucl, london', + 'royal holoway, university london' : 'royal holoway universi london', #holloway + 'city, university london' : 'city universi london', + 'city university, london' : 'city universi london', + 'aeginition':'eginition', + 'national technical university, athens' : 'national technical university athens' + # 'harvard medical school' : 'harvard university' + + + +} + + +def substrings_dict(string): + # Split the input string and clean each substring + # split_strings = split_string_with_protection(string.replace('univ coll', 'university college').replace('belfield, dublin', 'dublin').replace('ballsbridge, dublin', 'dublin').replace('earlsfort Terrace, dublin', 'dublin'), protected_phrases1) + + for old, new in replacements.items(): + string = string.replace(old, new) + split_strings = split_string_with_protection(string, protected_phrases1) + + # Define a set of university-related terms for later use + + + dict_string = {} + index = 0 + for value in split_strings: + + # Check if the substring contains any university-related terms + if not any(term in value.lower() for term in university_terms): + # Apply regex substitutions for common patterns + + modified_value = re.sub(r'universi\w*', 'universi', value, flags=re.IGNORECASE) + modified_value = re.sub(r'institu\w*', 'institu', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'centre*', 'center', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'\bsaint\b', 'st', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE) + modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE) + + + + # Add the modified substring to the dictionary + + dict_string[index] = modified_value.lower().strip() + index += 1 + # elif 'universitetskaya' in value.lower(): + # index += 1 + + + # Add the original substring to the dictionary + else: + dict_string[index] = value.lower().strip() + index += 1 + + return dict_string + + + +def clean_string(input_string): + # Temporarily replace " - " with a unique placeholder + placeholder = "placeholder" + # input_string = input_string.replace(" - ", placeholder) + input_string = input_string.replace(" – ", placeholder) + + # Unescape HTML entities and convert to lowercase + input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip()) + + # Normalize unicode characters (optional, e.g., replace umlauts) + input_string = unidecode(input_string) + + # Replace `/` and `–` with space (do not replace hyphen `-`) + result = re.sub(r'[/\-]', ' ', input_string) + + # Replace "saint" with "st" + result = re.sub(r'\bSaint\b', 'St', result) + result = re.sub(r'\bAghia\b', 'Agia', result) + + + # Remove characters that are not from the Latin alphabet, or allowed punctuation + result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/]', '', result).strip()) + + # Restore the " - " 
sequence from the placeholder + result = result.replace(placeholder, " – ") + + # Replace consecutive whitespace with a single space + result = re.sub(r'\s+', ' ', result) + #result = result.replace('ss', 's') + + result = insert_space_between_lower_and_upper(result).lower() + result = remove_stop_words(result) + + return result.strip() # Strip leading/trailing spaces + + +def clean_string_facts(input_string): + # Replace specified characters with space + input_string = remove_stop_words(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.lower()))))) + result = re.sub(r'[/\-,]', ' ', input_string) + result = re.sub(r'\bsaint\b', 'st', result) + + # Remove characters that are not from the Latin alphabet or numbers + result = re.sub(r'[^a-zA-Z0-9\s;/-]', '', result) + + # Replace consecutive whitespace with a single space + result = re.sub(r'\s+', ' ', result) + + return result + + +def str_radius_u(string): + string = string.lower() + radius = 3 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('univers',x): + indices.append(i) + # elif is_contained('coll',x): + # indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax+1] + + result.append(' '.join(s)) + + return result + + +def str_radius_coll(string): + string = string.lower() + radius = 1 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('col',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + + +def str_radius_h(string): + string = string.lower() + radius = 3 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('hospital',x) or is_contained('hopita',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius-1) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + + +def str_radius_c(string): + string = string.lower() + radius = 2 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('clinic',x) or is_contained('klinik',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius-1) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + +def str_radius_r(string): + string = string.lower() + radius = 2 + + str_list = string.split() + indices = [] + result = [] + + for i, x in enumerate(str_list): + if is_contained('research',x): + indices.append(i) + + for r0 in indices: + lmin =max(0,r0-radius-1) + lmax =min(r0+radius, len(str_list)) + s = str_list[lmin:lmax] + + result.append(' '.join(s)) + + return result + +def str_radius_spec(string): + spec = False + for x in string.split(): + try: + if categ_dicts[x] == 'Specific': + spec = True + return x + except: + pass + if spec == False: + return string + + +def avg_string(df, col): + avg = [] + for i in range(len(df)): + avg.append(sum(len(s) for s in df[col].iloc[i])/len(df[col].iloc[i])) + return sum(avg)/len(avg) + + + + + +def shorten_keywords(affiliations_simple): + affiliations_simple_n = [] + + for aff in affiliations_simple: + inner = [] + for str in aff: + if 'universi' in str: + inner.extend(str_radius_u(str)) + elif 'col' in str and 'trinity' in str: + 
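+                # this branch only fires when the substring mentions both 'col'
+                # and 'trinity'; str_radius_coll keeps each token containing
+                # 'col' together with the word just before it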
inner.extend(str_radius_coll(str)) + elif 'hospital' in str or 'hopita' in str: + inner.extend(str_radius_h(str)) + elif 'clinic' in str or 'klinik' in str: + inner.extend(str_radius_c(str)) + elif 'research council' in str: + inner.extend(str_radius_r(str)) + else: + inner.append(str_radius_spec(str)) + + affiliations_simple_n.append(inner) + + return affiliations_simple_n + +def shorten_keywords_spark(affiliations_simple): + affiliations_simple_n = [] + + for aff in affiliations_simple: + + if 'universi' in aff: + affiliations_simple_n.extend(str_radius_u(aff)) + elif 'col' in aff and 'trinity' in aff: + affiliations_simple_n.extend(str_radius_coll(aff)) + elif 'hospital' in aff or 'hopita' in aff: + affiliations_simple_n.extend(str_radius_h(aff)) + elif 'clinic' in aff or 'klinik' in aff: + affiliations_simple_n.extend(str_radius_c(aff)) + elif 'research council' in aff: + affiliations_simple_n.extend(str_radius_r(aff)) + else: + affiliations_simple_n.append(str_radius_spec(aff)) + + + return affiliations_simple_n + + +def refine(list_, affil): + affil = affil.lower() + + ids = [] + + for matched_org_list in list_: + + id_list = [] + + for matched_org in matched_org_list: + + if dix_mult[matched_org] == 'unique': + id_list.append(dix_acad[matched_org]) + else: + city_found = False + for city in dix_city[matched_org]: + if city[0] in affil: + id_list.append(city[1]) + city_found = True + break + + if not city_found: + country_found = False + + for country in dix_country[matched_org]: + if country[0] in list(country_mapping.keys()): + print(country[0]) + if country[0] in affil or country_mapping[country[0]][0] in affil or country_mapping[country[0]][0] in affil: + id_list.append(country[1]) + country_found = True + break + + + + elif country[0] in affil: + print('country found',country[0]) + + id_list.append(country[1]) + country_found = True + break + + + + if not country_found: + id_list.append(dix_acad[matched_org]) + + + + ids.append(id_list) + return ids + +def compute_cos(x,s): + vectorizer = CountVectorizer() + + s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + return cosine_similarity(x_vector, s_vector)[0][0] + + +# def find_ror(string, simU, simG): +# df = pd.DataFrame() + +# df['Unique affiliations'] = [[string.lower()]] +# academia = create_df_algorithm(df) + + +# result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG) +# if len(result)>0: + +# dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])} +# dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])} + +# dict_aff_score = {} +# for i in range(len(result)): +# if type(result['Similarity score'].iloc[i]) == list: +# dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i] +# else: +# dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]] + + +# pids = [] +# for i in range(len(df)): +# pidsi = [] +# for aff in df['Unique affiliations'].iloc[i]: +# if aff in list(dict_aff_id.keys()): +# pidsi = pidsi + dict_aff_id[aff] +# # elif 'unmatched organization(s)' not in pidsi: +# # pidsi = pidsi + ['unmatched organization(s)'] +# pids.append(pidsi) + + +# names = [] +# for i in range(len(df)): +# namesi = [] +# for aff in df['Unique affiliations'].iloc[i]: +# if aff in 
list(dict_aff_open.keys()): +# try: +# namesi = namesi + dict_aff_open[aff] +# except TypeError: +# namesi = namesi + [dict_aff_open[aff]] + +# names.append(namesi) + +# scores = [] +# for i in range(len(df)): +# scoresi = [] +# for aff in df['Unique affiliations'].iloc[i]: +# if aff in list(dict_aff_score.keys()): +# scoresi = scoresi + dict_aff_score[aff] + +# scores.append(scoresi) + + +# df['Matched organizations'] = names +# df['ROR'] = pids +# df['Scores'] = scores + + + +# def update_Z(row): +# if len(row['ROR']) == 0 or len(row['Scores']) == 0: +# return [] + +# new_Z = [] +# for ror, score in zip(row['ROR'], row['Scores']): +# entry = {'ROR_ID': ror, 'Confidence': score} +# new_Z.append(entry) +# return new_Z + +# matching = df.apply(update_Z, axis=1) + +# df['Matchings'] = matching + + +# return df['Matchings'].iloc[0] +# else: +# return 'no result' \ No newline at end of file diff --git a/matching_cluster.py b/matching_cluster.py new file mode 100644 index 0000000..0c1ec92 --- /dev/null +++ b/matching_cluster.py @@ -0,0 +1,319 @@ +from collections import defaultdict +from collections import Counter + +import Levenshtein + +from sklearn.feature_extraction.text import CountVectorizer +from sklearn.metrics.pairwise import cosine_similarity + +from functions_cluster import * +from create_input_cluster import * + +def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG): + """ + Finds the best match between a 'key word' and several legal names from the OpenAIRE database. + ---> corrects special cases in the main map that follows + + Args: + light_raw + l2 candidate_num: number of candidates. + l3 pairs_list: List of pairs. (s, x, score) + l4 m: mult + + Returns: + List: Resulting list containing OpenAIRE names and their similarity scores. 
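+
+    Each element of the returned list has the shape
+    [matched_openaire_name, similarity_score] (e.g., illustratively,
+    ['universi athens', 0.91]).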
+ """ + + vectorizer = CountVectorizer() + univ_num = light_raw.lower().count('univ') + result = [] + best = [] + s = light_raw + + for j in range(len(pairs_list)): + x = pairs_list[j][1] + + if [x, pairs_list[j][2]] in result: + continue + + if m[pairs_list[j][0]] == 1: + + if is_contained('univ', x.lower()) and pairs_list[j][2] > simU: + result.append([x, pairs_list[j][2]]) + elif pairs_list[j][2] > simG: + result.append([x, pairs_list[j][2]]) + + elif pairs_list[j][2] >= 0.98: # and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or is_contained("center", x.lower()) or is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list + result.append([pairs_list[j][1], 1]) + + else: + try: + if not is_contained("univ", x.lower()): + continue # Skip if x does not contain "university" or "univ" + + # if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)) or (is_contained('hopital', x.lower()) and not is_contained('hopital', s)) or (not is_contained('hopital', x.lower()) and is_contained('hopital', s)): + # continue + s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(x_vector, s_vector)[0][0] + if similarity > 0.1: + similarity_l = 1 - Levenshtein.distance(x, pairs_list[j][0]) / max(len(x), len(pairs_list[j][0])) + + best.append([x, similarity, similarity_l]) #(similarity+similarity2)/2]) + except: + KeyError + + if best: + # max_numbers = defaultdict(float) + + +# Assuming best is a list of three-element lists +# Each element is (string, number1, number2) + max_numbers = defaultdict(float) + for item in best: + string, number1, number2 = item # Unpack the three elements + max_numbers[string] = max(max_numbers[string], number1) + + reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]] + +# Sort by number1 decreasingly and then by number2 in descending order + reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True) + + result = result + reduced_best + + univ_list = [] + other_list = [] + + for r in result: + if is_contained('univ', r[0]): + univ_list.append(r) + else: + other_list.append(r) + + limit = min(univ_num, candidate_num) + + if len(univ_list) > limit: + result = univ_list[:limit] + other_list + + result_dict = {} + pairs_dict = {} + + + for l in pairs_list: + pairs_dict[l[1]] = l[2] + + + for p in result: + result_dict[p[0]] = pairs_dict[p[0]] + + + result_dict_list = [[y[0], result_dict[y[0]]] for y in result] + + return result_dict_list + + + +def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG): + + """ + Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores. + + Args: + m (int): The number of DOIs to check. + DF (DataFrame): The input DataFrame containing affiliation data. + dix_org (dict): A dictionary of names of organizations and their ROR_ids. + simU (float): Similarity threshold for universities. + simG (float): Similarity threshold for non-universities. + + Returns: + DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores. 
+ """ + df_list = input[1] + light_aff = input[0] + vectorizer = CountVectorizer() + + lnamelist = list(dix_org.keys()) + dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]} + #pairs = [] + result = {} + pairs = [] + + + def get_keywords(filtered_list): + # Extract the "keywords" values from the dictionaries in filtered_list + keywords_list = [entry["keywords"] for entry in filtered_list] + + return keywords_list + keywords = get_keywords(df_list) + + + for k,s in enumerate(keywords): + similar_k = [] + pairs_k = [] + + if s in lnamelist: + similarity = 1 + similar_k.append(similarity) + + pairs_k.append((s,s,similarity,dix_org[s])) + pairs.append((s,s,similarity,dix_org[s])) + + + if k not in dix: + dix[k] = [s] + else: + dix[k].append(s) + else: + + for x in lnamelist: + if is_contained(s, x): + + x_vector = vectorizer.fit_transform([x]).toarray() + s_vector = vectorizer.transform([s]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(x_vector, s_vector)[0][0] + if similarity > min(simU, simG): + if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU: + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG: + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + + elif is_contained(x, s): + if (is_contained('univ', s) and is_contained('univ', x)): + + s_vector = vectorizer.fit_transform([s]).toarray() + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(s_vector, x_vector)[0][0] + if similarity > simU: #max(0.82,sim): + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + elif not is_contained('univ', s) and not is_contained('univ', x): + + s_vector = vectorizer.fit_transform([s]).toarray() + x_vector = vectorizer.transform([x]).toarray() + + # Compute similarity between the vectors + similarity = cosine_similarity(s_vector, x_vector)[0][0] + if similarity > simG: #max(0.82,sim): + similar_k.append(similarity) + pairs_k.append((s,x,similarity,dix_org[x])) + pairs.append((s,x,similarity,dix_org[x])) + + if k not in dix: + dix[k] = [x] + else: + dix[k].append(x) + + result[k] = pairs_k + + multi = index_multiple_matchings(list(set(pairs))) + # need_check = list(set([i for i in range(len(multi)) if list(multi.values())[i]>1])) + # print('here', multi) + # need_check_keys = [keywords[i] for i in range(len(keywords)) if multi[keywords[i]]>1] + need_check_keys = [] + for i in range(len(keywords)): + try: + if multi[keywords[i]]>1: + need_check_keys.append(keywords[i]) + except: + pass + + best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG) + matched_org = [x[0] for x in best] + # best_o = [] + # best_s = [] + # best_result = [] + # for x in best: + # best_o.append([x[i][0] for i in range(len(x))]) + # best_s.append([round(x[i][1],2) for i in range(len(x))]) + # num_mathced = [len(best_s[i]) for i in range(len(need_check))] + ids = [dix_org[x[0]] for x in best] + for i,x in enumerate(matched_org): + # id_list 
= [] + if dix_mult[x] != 'unique': + if x in list(dix_city_ror.keys()): + match_found0 = False + match_found = False + + for city in dix_city_ror[x]: + if city[0] in light_aff: + if city[0] not in x: + ids[i] = city[1] + + match_found0 = True + match_found = True + break + if not match_found: + for city in dix_city_ror[x]: + if city[0] in light_aff and city[0] not in x: + ids[i] = city[1] + match_found0 = True + print('ok') + break + + if not match_found: + match_found2 = False + match_found3 = False + + for country in dix_country_ror[x]: + if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff): + ids[i] = country[1] + match_found2 = True + match_found3 = True + break + + if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff): + ids[i] = country[1] + match_found2 = True + match_found3 = True + break + + elif country[0] in light_aff: + + if country[0] not in x: + ids[i] = country[1] + match_found2 = True + match_found3 = True + break + + if not match_found3: + for country in dix_country_ror[x]: + if country[0] in light_aff and country[0] in x: + ids[i] = country[1] + match_found2 = True + break + + + + + + results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)] + + return results #[[result[to_check[i]] for i in ready] + [to_check[2]], best[0]] \ No newline at end of file diff --git a/txt_files/.DS_Store b/txt_files/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..37cf8fae3c3837fbac5dd32715df22c6f1041ffc GIT binary patch literal 8196 zcmeHM!A`+=)}ll;n2CmICmJovhfv)v8&TBuymNnp^mD6Do|BG zj!g?=ydcNsOZ|R9%RBOA{@h=$@{GJb2IQ=0k6Gh~ctnah*1p~k(PizQvxj&fD@MjK z7j(Q=Iop8kc6HpYzimS9Tzh^tM>&+OgK$Qa@k zSDdNO;m+rs@*aA{^w57rV8x40>|zK+fx7c zwtxTM_cZkWQ~_1s7Ydk0chK!HfUm79p`_Lh*q_*?39oQyT{zfsoM_8&;^!ZRybeee WpU7k3kQO%m5MX7{K^6E@1wH_!2k8(1 literal 0 HcmV?d00001 diff --git a/txt_files/city_names.txt b/txt_files/city_names.txt new file mode 100644 index 0000000..e0d7bd4 --- /dev/null +++ b/txt_files/city_names.txt @@ -0,0 +1,584 @@ +galway +maynooth +duluth +port arthur +new orleans +paterson +santa barbara +thornton +westminster +north las vegas +stockton +marysville +fitchburg +tallinn +fargo +seaside +manaus +porto +quebec city +hialeah +normal +kansas city +delhi +fort worth +palermo +olathe +madison +santa maria +youngstown +allentown +santa clara +charlotte +agra +palmdale +kraków +bendigo +high point +washington +dallas +grand prairie +plano +leipzig +bratislava +seville +puebla +lucknow +toowoomba +santa rosa +sioux falls +flint +kissimmee +lacey +brownsville +palm springs +tyler +minsk +san diego +los angeles +edmonton +college station +toulouse +garland +florence +saskatoon +albury-wodonga +newburgh +danbury +deltona +south bend +nagpur +pomona +memphis +london +lincoln +chandler +adelaide +salt lake city +edinburgh +suzhou +grayslake +new york city +kanpur +brussels +okayama +tuscaloosa +clarksville +jackson +boise city +canton +louisville +varanasi +columbus +lorain +vadodara +orem +chennai +townsville +eindhoventoronto +wuhan +norman +winter haven +eugene +riga +hamamatsu +fresno +lake charles +budapest +mobile +lowell +vienna +tallahassee +nanjing +new haven +sacramento +leeds +harlingen +springdale +perth +sendai +utica +orange +baltimore +rochester +rancho cucamonga +bellevue +fort wayne +modesto +pristina +nuremberg +stuttgart +indore +murfreesboro +nottingham +scranton +lancaster +abilene +monterey +sioux city +bari +chula vista +ahmedabad +north port +helsinki +leominster +ocala +sarajevo +hangzhou 
+roanoke +new york +bethlehem +dublin +sunshine coast +pune +billings +changchunsydney +garden grove +port orange +pittsburgh +new bedford +hiroshima +san francisco +sheffield +chongqing +layton +pueblo +chengdu +cincinnati +erie +lansing +ljubljana +st louis +rio de janeiro +philadelphia +tacoma +bel air +chesapeake +davenport +las vegas +nagasaki +kitchener +boulder +roseville +evansville +victoria +burbank +sofia +santa clarita +san buenaventura +savannah +apple valley +brighton +coral springs +huntsville +fort lauderdale +warsaw +antioch +medford +visalia +frankfurt +joliet +curitiba +mcallen +seattle +alexandria +bryan +moreno valley +berlin +olympia +caracas +tianjin +cleveland +des moines +prague +fukuoka +burlington +bhopal +nara +hampton +jefferson +chicago +temecula +paris +gilbert +bradenton +champaign +munich +amsterdam +raleigh +atlanta +lakeland +denver +round lake beach +richmond +buffalo +phoenix +antwerp +greenbay +milwaukee +south lyon +concord +vero beach +newcastle +podgorica +monterrey +shantou +costa mesa +copenhagen +vilnius +dalian +bristol +salinas +belgrade +waterloo +henderson +hayward +hickory +el monte +lima +redding +mexico city +cary +kennewick +guayaquil +tirana +kawasaki +greensboro +west covina +amarillo +saitama +new london +recife +manchester +rockford +kelowna +hagerstown +bordeaux +york +kaneohe +tucson +gainesville +kalamazoo +bogotá +reading +virginia beach +guadalajara +albany +durham +green bay +oceanside +montreal +turin +malaga +oshawa +mesa +pensacola +boise +bonita springs +fort walton beach +port saint lucie +reykjavik +north charleston +newark +reno +knoxville +bakersfield +oslo +omaha +milan +cambridge +norwich +shanghai +naples +victorville +zagreb +norwalk +huntington beach +clarke county +lubbock +yakima +warren +bucharest +simi valley +greenville +racine +salvador +elk grove +orlando +windsor +santa cruz +saginaw +ballarat +muskegon +shreveport +clearwater +merced +boston +basel +elizabeth +panama city +okinawa +sarasota +zurich +glendale +wilmington +pompano beach +guangzhou +fairfield +hyderabad +santiago +nashville +mchenry +ann arbor +carrollton +hollywood +laredo +rome +san bernardino +bergen +springfield +winnipeg +corona +surat +long beach +nagoya +toledo +geelong +kenosha +sterling heights +lisbon +myrtle beach +nashua +riverside +tampa +bangalore +richland +rotterdam +lyon +scottsdale +berkeley +bologna +cedar rapids +syracuse +tulsa +ludhiana +hemet +portland +mission viejo +salem +overland park +detroit +jinan +osaka +grand rapids +jersey city +kailua +venice +darwin +miramar +gulfport-biloxi +huntington +portsmouth +worcester +sunnyvale +escondido +college park +thousand oaks +harbin +belfast +yonkers +alicante +barnstable +kitakyushu +sapporo +ogden +aurora +palm bay +düsseldorf +hobart +irvine +st johns +hamburg +provo +melbourne +madrid +zhengzhou +asheville +patna +inglewood +houston +newport news +west valley city +oklahoma city +brisbane +valencia +pasadena +aberdeen +st petersburg +lakewood +irving +naperville +miami +topeka +downey +genoa +lewisville +birmingham +xian +saint paul +bremerton +corpus christi +daytona beach +st paul +oxnard +murrieta +lafayette +montgomery +baton rouge +skopje +cathedral city +spartanburg +canberra +arvada +hesperia +port st lucie +saint louis +bridgeport +tempe +quito +chattanooga +bremen +gold coast +cairns +beaumont +elkhart +peoria +calgary +honolulu +havre de grace +hamilton +fullerton +daly city +dresden +belem +ottawa +regina +chiba +fort collins +indianapolis +mumbai +killeen 
+sao paulo +jaipur +fremont +zaragoza +charleston +waco +kobe +odessa +monroe +vallejo +marseille +qingdao +frederick +marina +sebastian +oakland +pembroke pines +san antonio +kyoto +colorado springs +el paso +shenyang +punta gorda +fort smith +richmond county +waterbury +shenzhen +albuquerque +jacksonville +minneapolis +fortaleza +denton +gastonia +fayetteville +bloomington +houma +santa ana +kolkata +las cruces +barcelona +arlington +niigata +norfolk +fontana +providence +santo domingo +vancouver +appleton +san jose +hartford +winston +barrie +glasgow +davidson county +yokohama +independence +athens +harrisburg +macon +torrance +launceston +cape coral +austin +little rock +cologne +mesquite +catania +stockholm +nice +stamford +buenos aires +columbia +anchorage +dayton +wollongong +halifax +verona +anaheim +kiev +augusta +tokyo +akron +lexington +wichita +saint petersburg +beijing +johnson city +spokane +liverpool +howell +poughkeepsie +ontario +atlantic city +trenton diff --git a/txt_files/remove_list.txt b/txt_files/remove_list.txt new file mode 100644 index 0000000..5023a5c --- /dev/null +++ b/txt_files/remove_list.txt @@ -0,0 +1,28 @@ +universi +research institu +laboratory +gmbh +inc +universi of +research center +foundation +faculty +national institu +school medicine +universi school +graduate school +graduate school engineering +institu tropical medicine +institu virology +faculty medicine +laboratory +universi park +institu science +polytechnic universi +universi 1 +ciudad universi +universi campus +universi hospitals +colege +universi road +universitetska str diff --git a/txt_files/stop_words.txt b/txt_files/stop_words.txt new file mode 100644 index 0000000..808ddf1 --- /dev/null +++ b/txt_files/stop_words.txt @@ -0,0 +1,16 @@ +from +the +of +at +de +for +et +für +des +in +as +a +and +fur +for +und diff --git a/txt_files/university_terms.txt b/txt_files/university_terms.txt new file mode 100644 index 0000000..abfaef4 --- /dev/null +++ b/txt_files/university_terms.txt @@ -0,0 +1,8 @@ +universitetskaya +universitatsklinikum +universitatskinderklinik +universitatskliniken +universitetshospital +universitatsmedizin +universitatsbibliothek +universitatspital \ No newline at end of file diff --git a/update_records.py b/update_records.py new file mode 100644 index 0000000..04278ab --- /dev/null +++ b/update_records.py @@ -0,0 +1,116 @@ +import json +import os +from pyspark.sql import SparkSession +from affro_cluster import * + +folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2' +#folder_path = 'check' + +json_file_names = [] + +# Loop through all files in the directory +for file_name in os.listdir(folder_path): + # Check if the file is a JSON file (you can adjust the check as needed) + if file_name != '_SUCCESS': + json_file_names.append(file_name) + +# json_file_names now contains the names of all JSON files in the folder + +# Initialize Spark session +spark = SparkSession.builder.appName("JSONProcessing").getOrCreate() + +def remove_duplicates(list_of_dicts): + # Use a set to store tuples of dictionary items to filter out duplicates + seen = set() + unique_list_of_dicts = [] + + for d in list_of_dicts: + # Convert the dictionary to a tuple of items + items = tuple(d.items()) + if items not in seen: + seen.add(items) + unique_list_of_dicts.append(d) + + return unique_list_of_dicts + +def update_record(record): + id = record['id'] + authors = [] + try: + for author in record['authors']: + author_object = {} + if 'orcid.org/0' in author['fullName']: + 
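+                # fullName sometimes embeds an ORCID iD before the display name,
+                # e.g. (hypothetical) "https://orcid.org/0000-0001-2345-6789, Doe, Jane";
+                # the next two lines split the iD off from the name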
+                author_object['Name'] = {'Full': author['fullName'].split(',')[1], 'First': None, 'Last': None}
+                author_object['ORCID'] = author['fullName'].split(',')[0][:36]
+            else:
+                author_object['Name'] = {'Full': author['fullName'], 'First': None, 'Last': None}
+                author_object['ORCID'] = None
+            author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
+            author_object['Organization_PIDs'] = []  # default, so the key always exists
+            all_affs_with_ror = []
+            have_ror = False
+            for affiliation in author['affiliations']:
+                if 'ORCID: 0' in affiliation['raw_affiliation_string']:
+                    x = affiliation['raw_affiliation_string']
+                    author_object['ORCID'] = 'https://orcid.org/' + x.split('ORCID: ')[1]
+                elif 'ORCID 0' in affiliation['raw_affiliation_string']:
+                    x = affiliation['raw_affiliation_string']
+                    author_object['ORCID'] = 'https://orcid.org/' + x.split('ORCID ')[1]
+                if 'ror.org' in affiliation['raw_affiliation_string']:
+                    have_ror = True
+                    all_affs_with_ror.append({
+                        'Origin': 'data',
+                        'RORid': affiliation['raw_affiliation_string'][0:25],
+                        'Confidence': None
+                    })
+                else:
+                    if len(affro(affiliation['raw_affiliation_string'])) > 0:
+                        author_object['Organization_PIDs'] = affro(affiliation['raw_affiliation_string'])
+                        author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in author_object['Organization_PIDs']])
+                    else:
+                        author_object['Organization_PIDs'] = []
+
+            if have_ror:
+                author_object['Organization_PIDs'] = all_affs_with_ror
+            order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
+
+            reordered_data = {k: author_object[k] for k in order}
+
+            authors.append(reordered_data)
+
+        organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
+
+        updt = {'ID': id, 'Authors': authors, 'Organizations': organizations}
+        return updt
+    except Exception as e:
+        print(f"Error processing record with id {record.get('id')}: {str(e)}")
+        return None
+
+
+for file in json_file_names:
+    print('start processing ' + str(file))
+    df = spark.read.json(folder_path + '/' + file)
+
+    # Apply the update_record function
+    updated_rdd = df.rdd.map(lambda row: update_record(row.asDict()))
+
+    # Convert updated RDD to JSON strings
+    json_rdd = updated_rdd.map(lambda record: json.dumps(record))
+
+    # Collect the data and write to an output file with a unique name
+    json_data = json_rdd.collect()
+
+    # Create a new filename by appending "_output.json" to the original filename
+    output_file_name = file + '_output.json'
+    print('end processing ' + str(file))
+
+    with open(output_file_name, 'w') as f:
+        for i, item in enumerate(json_data):
+            print('write ' + str(i))
+            f.write(item + '\n')
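+
+# Illustrative note (an assumption about downstream use, not part of the original
+# pipeline): each line written above is a standalone JSON object roughly of the form
+#   {"ID": "...",
+#    "Authors": [{"Name": {...}, "Raw_affiliations": [...],
+#                 "Organization_PIDs": [{"Origin": "affRo", "RORid": "...", "Confidence": 0.9}],
+#                 "ORCID": null}],
+#    "Organizations": [...]}
+# so consumers can reload the file line by line with json.loads.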