"""String-cleaning and keyword-window helpers for affiliation strings.

Importing this module reads several word lists from ``txt_files/`` and a
category dictionary from ``dictionaries/`` (paths relative to the current
working directory).
"""

import html
import json
import re
import unicodedata

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from unidecode import unidecode
#import pandas as pd


def load_txt(file_path):
    """Return the lines of a UTF-8 text file, each stripped of whitespace."""
    with open(file_path, 'r', encoding='utf-8') as file:
        return [line.strip() for line in file]


def load_json(file_path):
    """Load and return the JSON document stored at *file_path*.

    The file is read as UTF-8 (consistent with ``load_txt``) instead of the
    platform's default encoding.
    """
    with open(file_path, 'r', encoding='utf-8') as json_file:
        return json.load(json_file)


categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'


def replace_double_consonants(text):
    """Collapse every doubled consonant (case-insensitive) to a single one.

    Example: ``'college'`` becomes ``'colege'``.
    """
    # The back-reference \1 matches a repeat of the captured consonant.
    pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
    return re.sub(pattern, r'\1', text, flags=re.IGNORECASE)


# Word lists loaded at import time.  Three of them get the same
# double-consonant normalisation that ``clean_string`` applies to its input.
remove_list = [replace_double_consonants(x) for x in load_txt('txt_files/remove_list.txt')]
stop_words = load_txt('txt_files/stop_words.txt')
university_terms = [replace_double_consonants(x) for x in load_txt('txt_files/university_terms.txt')]
city_names = [replace_double_consonants(x) for x in load_txt('txt_files/city_names.txt')]

categ_dicts = load_json('dictionaries/dix_categ.json')


def is_contained(s, w):
    """Return True if every whitespace-separated word of *s* is ``in`` *w*.

    *w* may be a string (substring test) or any container.  An empty *s*
    yields True.
    """
    return all(word in w for word in s.split())


def starts_with_any(string, prefixes):
    """Find the first prefix in *prefixes* that *string* starts with.

    Returns:
        ``[True, prefix]`` for the first matching prefix, otherwise ``False``.
    """
    for prefix in prefixes:
        if string.startswith(prefix):
            return [True, prefix]
    return False


def remove_leading_numbers(s):
    """Strip a leading run of digits from *s*."""
    return re.sub(r'^\d+', '', s)


def remove_outer_parentheses(string):
    """Remove outer parentheses from the string if they enclose the entire string."""
    if string.startswith('(') and string.endswith(')'):
        return string[1:-1].strip()
    return string


def insert_space_between_lower_and_upper(s):
    """
    Inserts a space between a lowercase letter followed by an uppercase letter
    in a string.

    Parameters:
        s (str): The input string.

    Returns:
        str: The modified string with spaces inserted.
    """
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', s)


def index_multiple_matchings(pairs):
    """Count how often each ``pair[0]`` occurs in *pairs*.

    Counting only accumulates when the ``pair[3]`` sequences, taken together,
    contain more than one distinct item; otherwise every key maps to 1.

    Returns:
        dict mapping ``pair[0]`` to its count.
    """
    distinct_items = {item for pair in pairs for item in pair[3]}
    may_accumulate = len(distinct_items) > 1

    result_dict = {}
    for pair in pairs:
        key = pair[0]
        if key in result_dict and may_accumulate:
            result_dict[key] += 1
        else:
            result_dict[key] = 1
    return result_dict


def avg_string(df, col):
    """Return the mean, over rows, of the mean string length in ``df[col]``.

    Each cell of ``df[col]`` is iterated as a collection of sized items
    (positional access via ``.iloc``).  Raises ``ZeroDivisionError`` when
    *df* has no rows or a cell is empty.
    """
    column = df[col]
    per_row = []
    for i in range(len(df)):
        cell = column.iloc[i]
        per_row.append(sum(len(s) for s in cell) / len(cell))
    return sum(per_row) / len(per_row)


#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und']


def remove_stop_words(text):
    """Drop every whitespace-separated word of *text* found in ``stop_words``."""
    return ' '.join(word for word in text.split() if word not in stop_words)


def remove_parentheses(text):
    """Delete every innermost ``(...)`` group from *text* (single pass)."""
    return re.sub(r'\([^()]*\)', '', text)


# def replace_umlauts(text):
#     normalized_text = unicodedata.normalize('NFKD', text)
#     replaced_text = ''.join(c for c in normalized_text if not unicodedata.combining(c))
#     return replaced_text


def protect_phrases(input_string, phrases):
    """Replace each phrase with a unique placeholder token.

    Returns:
        ``(protected_string, placeholder_map)`` where *placeholder_map* maps
        every placeholder (one per phrase, found or not) to its phrase.
    """
    placeholder_map = {}
    for i, phrase in enumerate(phrases):
        placeholder = f"__PLACEHOLDER_{i}__"
        placeholder_map[placeholder] = phrase
        input_string = input_string.replace(phrase, placeholder)
    return input_string, placeholder_map


def restore_phrases(split_strings, placeholder_map):
    """Substitute the original phrases back for their placeholders."""
    restored_strings = []
    for s in split_strings:
        for placeholder, phrase in placeholder_map.items():
            s = s.replace(placeholder, phrase)
        restored_strings.append(s)
    return restored_strings


def replace_comma_spaces(text):
    """Collapse double spaces and remove the space before a comma."""
    # NOTE(review): the first replacement appeared as a no-op (' ' -> ' ') in
    # the whitespace-mangled source; it is restored here as a double-space
    # collapse so that "a  , b" normalises to "a, b".
    return text.replace('  ', ' ').replace(' , ', ', ')


def split_string_with_protection(input_string, protected_phrases):
    """Split on ``,`` ``;`` ``/`` or `` – `` while keeping protected phrases whole.

    Returns:
        list of non-empty, stripped substrings.
    """
    # Step 1: hide the protected phrases behind placeholders.
    input_string, placeholder_map = protect_phrases(input_string, protected_phrases)

    # Step 2: split on the delimiters, discarding empty pieces.
    pieces = (s.strip() for s in re.split(r'[,;/]| – ', input_string))
    split_strings = [s for s in pieces if s]

    # Step 3: put the protected phrases back.
    return restore_phrases(split_strings, placeholder_map)


# Phrases of the form "<institution>, <city>" that must survive the comma
# split.  (Variants with a space before the comma are intentionally disabled.)
protected_phrases1 = [
    phrase.format(x=x)
    for x in city_names
    for phrase in [
        'university california, {x}',
        'university colege hospital, {x}',
        'national univ ireland, {x}',
        'national university ireland, {x}',
        'university colege, {x}',
        'university hospital, {x}',
        'imperial colege, {x}',
        'city university, {x}',
    ]
]

# Literal substitutions applied, in insertion order, by ``substrings_dict``.
# NOTE(review): 'Universitätsklinik' is the only capitalised / non-ASCII key;
# it can only match input that has not been lowercased or transliterated.
replacements = {
    'czechoslovak': 'czech',
    'saint': 'st',
    'aghia': 'agia',
    'universitatsklinikum': 'universi hospital',
    'universitetshospital': 'universi hospital',
    'universitatskinderklinik': 'universi childrens hospital',
    'universitatskliniken': 'universi hospital',
    'Universitätsklinik': 'universi hospital',
    'universitatsmedizin': 'universi medicine',
    'universitatsbibliothek': 'universi library',
    'nat.': 'national',
    'uni versity': 'university',
    'unive rsity': 'university',
    'univ ersity': 'university',
    'inst ': 'institute ',
    'adv ': 'advanced ',
    'univ ': 'university ',
    'stud ': 'studies ',
    'inst.': 'institute',
    'adv.': 'advanced',
    'univ.': 'university',
    'stud.': 'studies',
    'univercity': 'university',
    'univerisity': 'university',
    'universtiy': 'university',
    'univeristy': 'university',
    'universirty': 'university',
    'universiti': 'university',
    'universitiy': 'university',
    'universty': 'university',
    'techniche': 'technological',
    'univ col': 'university colege',
    'univ. col.': 'university colege',
    'univ. coll.': 'university colege',
    'col.': 'colege',
    'hipokration': 'hipocration',
    'belfield, dublin': 'dublin',
    'balsbridge, dublin': 'dublin',  # ballsbridge
    'earlsfort terrace, dublin': 'dublin',
    'bon secours hospital, cork': 'bon secours hospital cork',
    'bon secours hospital, dublin': 'bon secours hospital dublin',
    'bon secours hospital, galway': 'bon secours hospital galway',
    'bon secours hospital, tralee': 'bon secours hospital tralee',
    'bon secours health system': 'bon secours hospital dublin',
    'bon secours hospital, glasnevin': 'bon secours hospital dublin',
    'imperial colege science, technology medicine': 'imperial colege science technology medicine',
    'ucl queen square institute neurology': 'ucl, london',
    'ucl institute neurology': 'ucl, london',
    'royal holoway, university london': 'royal holoway universi london',  # holloway
    'city, university london': 'city universi london',
    'city university, london': 'city universi london',
    'aeginition': 'eginition',
    'national technical university, athens': 'national technical university athens',
    # 'harvard medical school' : 'harvard university'
}


def substrings_dict(string):
    """Split an affiliation string into normalised, indexed substrings.

    The string first goes through the literal ``replacements``, then is split
    with ``split_string_with_protection``.  Substrings that contain none of
    the ``university_terms`` additionally get a set of regex normalisations;
    the others are kept as they are.  Every substring is lowercased and
    stripped.

    Returns:
        dict mapping 0, 1, 2, ... to the substrings in order.
    """
    for old, new in replacements.items():
        string = string.replace(old, new)
    string = string.replace('hospitalum', 'hospital').replace('hospitalen', 'hospital')

    split_strings = split_string_with_protection(string, protected_phrases1)

    # (pattern, replacement) pairs applied in order, case-insensitively.
    substitutions = (
        (r'universi\w*', 'universi'),
        (r'institu\w*', 'institu'),
        (r'centre\b', 'center'),
        (r'\bsaint\b', 'st'),
        (r'\btrinity col\b', 'trinity colege'),
        (r'\btechnische\b', 'technological'),
        (r'\bteknologi\b', 'technology'),
        (r'\bpolitehnica\b', 'polytechnic'),
    )

    dict_string = {}
    for index, value in enumerate(split_strings):
        value = value.replace('.', ' ')
        lowered = value.lower()
        if not any(term in lowered for term in university_terms):
            for pattern, replacement in substitutions:
                value = re.sub(pattern, replacement, value, flags=re.IGNORECASE)
        # elif 'universitetskaya' in value.lower():
        #     index += 1
        dict_string[index] = value.lower().strip()
    return dict_string


def clean_string(input_string):
    """Normalise a raw affiliation string.

    Steps, in order: protect `` – `` separators, drop apostrophes, unescape
    HTML, remove parenthesised text, transliterate with ``unidecode``,
    collapse double consonants, turn hyphens into spaces, keep only ASCII
    letters, whitespace and ``, ; / .``, restore `` – ``, collapse whitespace,
    split camelCase, lowercase, and remove stop words.
    """
    # Temporarily replace " – " with a letters-only token so it survives the
    # character filtering below.
    placeholder = "placeholder"
    # input_string = input_string.replace(" - ", placeholder)
    input_string = input_string.replace(" – ", placeholder)

    # Unescape HTML entities, drop parentheses, transliterate to ASCII and
    # collapse double consonants.
    input_string = input_string.replace(" ́e", "e").replace("'", "")
    input_string = remove_parentheses(html.unescape(input_string))
    input_string = replace_double_consonants(unidecode(input_string))
    input_string = replace_comma_spaces(input_string.strip())

    # Replace hyphens with spaces.
    result = re.sub(r'[\-]', ' ', input_string)

    # Abbreviate / normalise a few capitalised words.
    result = re.sub(r'\bSaint\b', 'St', result)
    result = re.sub(r'\bAghia\b', 'Agia', result)
    result = re.sub(r'\bAghios\b', 'Agios', result)

    # Remove characters that are neither ASCII letters nor allowed punctuation.
    result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/.]', '', result).strip())

    # Restore the " – " sequence from the placeholder.
    result = result.replace(placeholder, " – ")

    # Replace consecutive whitespace with a single space.
    result = re.sub(r'\s+', ' ', result)
    #result = result.replace('ss', 's')

    result = insert_space_between_lower_and_upper(result).lower()
    result = remove_stop_words(result)
    return result.strip()


def clean_string_facts(input_string):
    """Lowercase and normalise a string, keeping ASCII letters and digits.

    ``/``, ``-`` and ``,`` become spaces; afterwards only letters, digits,
    whitespace and ``; / - .`` are kept and whitespace runs are collapsed.
    """
    input_string = remove_stop_words(
        unidecode(remove_parentheses(html.unescape(input_string.lower())))
    )
    # Replace specified characters with space.
    result = re.sub(r'[/\-,]', ' ', input_string)
    result = re.sub(r'\bsaint\b', 'st', result)
    # The hyphen must be escaped: the original class ended in "/-." which is
    # a reversed range and made re.sub raise "bad character range".
    result = re.sub(r'[^a-zA-Z0-9\s;/\-.]', '', result)
    # Replace consecutive whitespace with a single space.
    result = re.sub(r'\s+', ' ', result)
    return result


def _keyword_windows(string, keywords, before, after):
    """Return word windows around every token containing one of *keywords*.

    The string is lowercased and split on whitespace.  For a matching token
    at position ``i`` the window is ``tokens[max(0, i - before):i + after]``
    joined with single spaces.
    """
    tokens = string.lower().split()
    windows = []
    for position, token in enumerate(tokens):
        if any(keyword in token for keyword in keywords):
            start = max(0, position - before)
            windows.append(' '.join(tokens[start:position + after]))
    return windows


def str_radius_u(string):
    """Windows around 'univers*' tokens: 3 words before through 3 words after."""
    # elif is_contained('coll',x):
    #     indices.append(i)
    return _keyword_windows(string, ('univers',), before=3, after=4)


def str_radius_coll(string):
    """Windows around 'col*' tokens: the preceding word and the token itself."""
    return _keyword_windows(string, ('col',), before=1, after=1)


def str_radius_h(string):
    """Windows around 'hospital'/'hopita' tokens: 4 words before through 2 after."""
    return _keyword_windows(string, ('hospital', 'hopita'), before=4, after=3)


def str_radius_c(string):
    """Windows around 'clinic'/'klinik' tokens: 3 words before through 1 after."""
    return _keyword_windows(string, ('clinic', 'klinik'), before=3, after=2)


def str_radius_r(string):
    """Windows around 'research' tokens: 3 words before through 1 after."""
    return _keyword_windows(string, ('research',), before=3, after=2)


def str_radius_spec(string):
    """Return the first word whose ``categ_dicts`` category is 'Specific'.

    Falls back to the unchanged *string* when no word qualifies.
    """
    for word in string.split():
        # .get() treats unknown words as "not Specific" without a bare except.
        if categ_dicts.get(word) == 'Specific':
            return word
    return string


def _shorten_affiliation(aff):
    """Return the list of shortened keywords for one affiliation substring."""
    if 'universi' in aff:
        return str_radius_u(aff)
    if 'col' in aff and 'trinity' in aff:
        return str_radius_coll(aff)
    if 'hospital' in aff or 'hopita' in aff:
        return str_radius_h(aff)
    if 'clinic' in aff or 'klinik' in aff:
        return str_radius_c(aff)
    if 'research council' in aff:
        return str_radius_r(aff)
    return [str_radius_spec(aff)]


def shorten_keywords(affiliations_simple):
    """Shorten every substring of every affiliation.

    Parameters:
        affiliations_simple: iterable of iterables of strings.

    Returns:
        list of lists; each inner list holds the shortened keywords of the
        corresponding input group, in order.
    """
    affiliations_simple_n = []
    for aff in affiliations_simple:
        inner = []
        for substring in aff:
            inner.extend(_shorten_affiliation(substring))
        affiliations_simple_n.append(inner)
    return affiliations_simple_n


def shorten_keywords_spark(affiliations_simple):
    """Flat variant of ``shorten_keywords`` for a single iterable of strings."""
    affiliations_simple_n = []
    for aff in affiliations_simple:
        affiliations_simple_n.extend(_shorten_affiliation(aff))
    return affiliations_simple_n


def refine(list_, affil):
    """Pick one id per matched organisation, disambiguating by city/country.

    For each organisation name in each sub-list of *list_*: if ``dix_mult``
    marks it ``'unique'`` its ``dix_acad`` id is used.  Otherwise the first
    ``(city, id)`` entry of ``dix_city`` whose city occurs in the lowercased
    *affil* wins, then the first matching ``(country, id)`` entry of
    ``dix_country``, and finally the ``dix_acad`` id as fallback.

    NOTE(review): ``dix_mult``, ``dix_acad``, ``dix_city``, ``dix_country``
    and ``country_mapping`` are not defined in the visible part of this
    module; they must exist as module globals before this is called.

    Returns:
        list of id lists, parallel to *list_*.
    """
    affil = affil.lower()
    ids = []
    for matched_org_list in list_:
        id_list = []
        for matched_org in matched_org_list:
            if dix_mult[matched_org] == 'unique':
                id_list.append(dix_acad[matched_org])
                continue

            city_found = False
            for city in dix_city[matched_org]:
                if city[0] in affil:
                    id_list.append(city[1])
                    city_found = True
                    break
            if city_found:
                continue

            country_found = False
            for country in dix_country[matched_org]:
                if country[0] in country_mapping:
                    print(country[0])
                    # NOTE(review): the original tested
                    # country_mapping[...][0] twice; the second operand was
                    # possibly meant to be another alias -- confirm.
                    if country[0] in affil or country_mapping[country[0]][0] in affil:
                        id_list.append(country[1])
                        country_found = True
                        break
                elif country[0] in affil:
                    print('country found', country[0])
                    id_list.append(country[1])
                    country_found = True
                    break
            if not country_found:
                id_list.append(dix_acad[matched_org])
        ids.append(id_list)
    return ids


def compute_cos(x, s):
    """Return the cosine similarity of the word-count vectors of *x* and *s*.

    The vocabulary is fitted on *s* only; *x* is then projected onto that
    vocabulary.
    """
    vectorizer = CountVectorizer()
    s_vector = vectorizer.fit_transform([s]).toarray()
    x_vector = vectorizer.transform([x]).toarray()
    return cosine_similarity(x_vector, s_vector)[0][0]


# def find_ror(string, simU, simG):
#     df = pd.DataFrame()
#     df['Unique affiliations'] = [[string.lower()]]
#     academia = create_df_algorithm(df)
#     result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG)
#     if len(result)>0:
#         dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])}
#         dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])}
#         dict_aff_score = {}
#         for i in range(len(result)):
#             if type(result['Similarity score'].iloc[i]) == list:
#                 dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i]
#             else:
#                 dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]]
#         pids = []
#         for i in range(len(df)):
#             pidsi = []
#             for aff in df['Unique affiliations'].iloc[i]:
#                 if aff in list(dict_aff_id.keys()):
#                     pidsi = pidsi + dict_aff_id[aff]
#                 # elif 'unmatched organization(s)' not in pidsi:
#                 #     pidsi = pidsi + ['unmatched organization(s)']
#             pids.append(pidsi)
#         names = []
#         for i in range(len(df)):
#             namesi = []
#             for aff in df['Unique affiliations'].iloc[i]:
#                 if aff in list(dict_aff_open.keys()):
#                     try:
#                         namesi = namesi + dict_aff_open[aff]
#                     except TypeError:
#                         namesi = namesi + [dict_aff_open[aff]]
#             names.append(namesi)
#         scores = []
#         for i in range(len(df)):
#             scoresi = []
#             for aff in df['Unique affiliations'].iloc[i]:
#                 if aff in list(dict_aff_score.keys()):
#                     scoresi = scoresi + dict_aff_score[aff]
#             scores.append(scoresi)
#         df['Matched organizations'] = names
#         df['ROR'] = pids
#         df['Scores'] = scores
#         def update_Z(row):
#             if len(row['ROR']) == 0 or len(row['Scores']) == 0:
#                 return []
#             new_Z = []
#             for ror, score in zip(row['ROR'], row['Scores']):
#                 entry = {'ROR_ID': ror, 'Confidence': score}
#                 new_Z.append(entry)
#             return new_Z
#         matching = df.apply(update_Z, axis=1)
#         df['Matchings'] = matching
#         return df['Matchings'].iloc[0]
#     else:
#         return 'no result'