initial commit

This commit is contained in:
mkallipo 2024-09-05 12:23:32 +02:00
parent 530e474d7c
commit 0c98ba76a6
12 changed files with 1851 additions and 0 deletions

0
README.md Normal file
View File

40
affro_cluster.py Normal file
View File

@ -0,0 +1,40 @@
import sys
##import functions
from functions_cluster import *
from matching_cluster import *
from create_input_cluster import *
import json
# Lookup tables handed to Aff_Ids by affro() below.  They are read once, at
# import time, from paths relative to the current working directory.
dix_org = load_json('dictionaries/dix_acad.json')  # passed as Aff_Ids' dix_org
dix_mult = load_json('dictionaries/dix_mult.json')  # passed as Aff_Ids' dix_mult
dix_city = load_json('dictionaries/dix_city.json')  # passed as Aff_Ids' dix_city_ror
dix_country = load_json('dictionaries/dix_country.json')  # passed as Aff_Ids' dix_country_ror
def affro(raw_aff_string):
    """Match one raw affiliation string and return the matches as JSON strings.

    The string is prepared with ``create_df_algorithm`` and matched with
    ``Aff_Ids`` using the fixed thresholds 0.65 and 0.82.

    Args:
        raw_aff_string: The affiliation text to match.

    Returns:
        A list with one JSON-encoded string per match, each holding the keys
        ``Origin``, ``RORid`` and ``Confidence``; an empty list when nothing
        matched; ``None`` when any exception was raised (the error message and
        the offending input are printed in that case).
    """
    try:
        matches = Aff_Ids(
            create_df_algorithm(raw_aff_string),
            dix_org,
            dix_mult,
            dix_city,
            dix_country,
            0.65,
            0.82,
        )
        if len(matches) == 0:
            return []
        encoded = []
        for match in matches:
            # Each match is indexed positionally: [1] is the score, [2] the id.
            payload = {'Origin': 'affRo', 'RORid': match[2], 'Confidence': match[1]}
            encoded.append(json.dumps(payload))
        return encoded
    except Exception as e:
        # Best effort: report the failure and the input, then give up on it.
        print(f"Error: {str(e)}")
        print(raw_aff_string)
        return None
#raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the raw affiliation
    # string.  The similarity thresholds are fixed inside affro(), so no
    # numeric arguments are read here.
    if len(sys.argv) != 2:
        # The message now matches this script's name and the single argument
        # that is actually accepted.
        print("Usage: python affro_cluster.py <string>")
        sys.exit(1)
    string_arg = sys.argv[1]
    print(affro(string_arg))

28
affro_test_example.py Normal file
View File

@ -0,0 +1,28 @@
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import sys
from affro_cluster import *
# Obtain (or reuse) the Spark session used by this demo.
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()

# Expose affro as a column-level function.
# NOTE(review): affro returns a list of JSON strings (or None on error) while
# the UDF is declared with StringType -- confirm this is the intended type.
affro_udf = udf(affro, StringType())

# A few sample affiliation strings, one per row.
input_data = ["university of athens", "university of vienna", "UCLA"]
df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")

# Add the matcher output as a new column and print the untruncated table.
raw_column = df["raw_affiliation_string"]
df_with_custom_value = df.withColumn("affro_value", affro_udf(raw_column))
df_with_custom_value.show(truncate=False)

# Release the session's resources.
spark.stop()

77
create_input_cluster.py Normal file
View File

@ -0,0 +1,77 @@
from functions_cluster import *
def create_df_algorithm(raw_aff_string):
    """Turn a raw affiliation string into the two-element input of ``Aff_Ids``.

    Steps, in order:

    1. Strip leading digits and outer parentheses, clean the string and split
       it into segments (``substrings_dict``).
    2. Walk over neighbouring segments and drop a segment when the keyword
       rules below classify it as subordinate to the segment that follows it.
    3. Join the surviving segments into ``light_aff``.
    4. Drop segments that are exactly a known city name or an entry of
       ``remove_list``, then shorten each remaining segment with
       ``shorten_keywords_spark``.
    5. Keep only the segments containing at least one key of ``categ_dicts``.

    Args:
        raw_aff_string (str): The affiliation text to prepare.

    Returns:
        list: ``[light_aff, filtered_list]`` where ``light_aff`` is a string
        and ``filtered_list`` is a list of dicts with the keys ``"index"``
        (position among the segments left after step 4), ``"keywords"`` and
        ``"category"`` (always 1 for the entries returned).
    """

    def contains_any(terms, text):
        # True when at least one of the terms passes is_contained for text.
        return any(is_contained(term, text) for term in terms)

    cleaned = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
    substring_list = list(substrings_dict(cleaned).values())

    i = 0
    while i < len(substring_list) - 1:
        current = substring_list[i]
        following = substring_list[i + 1]

        if is_contained('progr', current) and is_contained('dep', following):
            substring_list.pop(i)
        # NOTE(review): `not` applies only to the 'school' test, exactly as in
        # the original expression -- confirm that this precedence is intended.
        elif contains_any(
            ('assistant', 'researcher', 'phd', 'student', 'section', 'prof', 'director'),
            current,
        ) and (
            not is_contained('school', following)
            or contains_any(('univ', 'inst', 'lab', 'fac'), following)
        ):
            substring_list.pop(i)
        # The five rules below share one action, so they are tested together:
        # the current segment is dropped unless it mentions 'univ' itself.
        elif (
            (
                contains_any(
                    ('engineer', 'progr', 'unit', 'lab', 'dep', 'school', 'inst', 'fac'),
                    current,
                )
                and is_contained('univ', following)
            )
            or (
                is_contained('lab', current)
                and contains_any(('colege', 'inst', 'dep', 'school'), following)
            )
            or (
                is_contained('dep', current)
                and contains_any(
                    ('tech', 'colege', 'inst', 'hosp', 'school', 'fac'), following
                )
            )
            or (
                is_contained('inst', current)
                and contains_any(
                    ('school', 'dep', 'acad', 'hosp', 'clin', 'klin', 'fak', 'fac', 'cent', 'div'),
                    following,
                )
            )
            or (is_contained('school', current) and is_contained('colege', following))
        ):
            if not is_contained('univ', current):
                substring_list.pop(i)
            else:
                i += 1
        else:
            i += 1

    light_aff = ', '.join(substring_list)

    # Build the filtered list in one pass.  Removing items from the list while
    # looping over it (as the previous code did) skips the element that
    # follows every removal, so consecutive city names were not all dropped.
    excluded = set(city_names).union(remove_list)
    keywords = [
        shorten_keywords_spark([segment])[0]
        for segment in substring_list
        if segment not in excluded
    ]

    # An entry is kept when its keywords contain at least one category key.
    filtered_list = [
        {"index": position, "keywords": keyword, "category": 1}
        for position, keyword in enumerate(keywords)
        if any(key in keyword for key in categ_dicts)
    ]
    return [light_aff, filtered_list]

635
functions_cluster.py Normal file
View File

@ -0,0 +1,635 @@
import re
import unicodedata
import html
from unidecode import unidecode
import json
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
#import pandas as pd
def load_txt(file_path):
    """Return the lines of a UTF-8 text file, each stripped of surrounding whitespace."""
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines
def load_pickled_dict(file_path):
    """Load and return the object stored in a pickle file.

    Args:
        file_path: Path of the pickle file to read.

    Returns:
        The unpickled object.
    """
    # Imported here because this module's import block does not bring in
    # pickle; without it the call below fails with NameError.
    import pickle

    # WARNING: unpickling can execute arbitrary code -- only use this on
    # files from a trusted source.
    with open(file_path, 'rb') as file:
        pickled_dict = pickle.load(file)
    return pickled_dict
def load_json(file_path):
    """Parse a JSON file and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON value.
    """
    # Read as UTF-8 explicitly, like load_txt does, instead of relying on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        json_dict = json.load(json_file)
    return json_dict
# Category labels joined with '|'.
# NOTE(review): not referenced by any code visible in this file -- confirm it
# is still needed.
categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Specific|Museum'
def replace_double_consonants(text):
    """Collapse each doubled consonant into one letter, e.g. ``college`` -> ``colege``.

    Matching ignores case; the first letter of the pair is the one kept.
    """
    return re.sub(r'([bcdfghjklmnpqrstvwxyz])\1', r'\1', text, flags=re.IGNORECASE)
# Word lists and the category dictionary, read once at import time from paths
# relative to the current working directory.  Every list except stop_words is
# passed through replace_double_consonants as it is loaded.
remove_list = [replace_double_consonants(x) for x in load_txt('txt_files/remove_list.txt')]
stop_words = load_txt('txt_files/stop_words.txt')
university_terms = [replace_double_consonants(x) for x in load_txt('txt_files/university_terms.txt')]
city_names = [replace_double_consonants(x) for x in load_txt('txt_files/city_names.txt')]
categ_dicts = load_json('dictionaries/dix_categ.json')
def is_contained(s, w):
    """Return True when every whitespace-separated word of ``s`` occurs in ``w``.

    Each word is tested with the ``in`` operator, so for a string ``w`` this is
    a substring test.  An ``s`` without any words yields True.
    """
    return all(word in w for word in s.split())
def starts_with_any(string, prefixes):
    """Find the first prefix that ``string`` starts with.

    Returns:
        ``[True, prefix]`` for the first matching prefix in iteration order,
        or ``False`` when none matches.
    """
    first_hit = next((p for p in prefixes if string.startswith(p)), None)
    if first_hit is None:
        return False
    return [True, first_hit]
def remove_leading_numbers(s):
    """Strip the run of digits, if any, at the very start of ``s``."""
    leading_digits = re.compile(r'^\d+')
    return leading_digits.sub('', s)
def remove_outer_parentheses(string):
    """Remove outer parentheses from the string if they enclose the entire string.

    The first character must be ``(`` and the last ``)``, and that opening
    parenthesis must be closed by the final one.  A string such as
    ``"(a) and (b)"`` is therefore returned unchanged rather than being cut
    down to ``"a) and (b"``.

    Args:
        string (str): The text to inspect.

    Returns:
        str: The inner text, stripped of surrounding whitespace, when the
        outer pair encloses everything; otherwise ``string`` unchanged.
    """
    if len(string) < 2 or not (string.startswith('(') and string.endswith(')')):
        return string

    depth = 0
    last = len(string) - 1
    for position, char in enumerate(string):
        if char == '(':
            depth += 1
        elif char == ')':
            depth -= 1
            # The opening parenthesis closed before the end of the string, so
            # the first and last characters do not form a pair.
            if depth == 0 and position != last:
                return string

    # depth != 0 means the parentheses are unbalanced; leave the text alone.
    return string[1:-1].strip() if depth == 0 else string
def insert_space_between_lower_and_upper(s):
    """
    Put a space wherever an ASCII lowercase letter is directly followed by an
    ASCII uppercase letter.
    Parameters:
    s (str): The input string.
    Returns:
    str: A copy of ``s`` with the spaces inserted.
    """
    lower_then_upper = re.compile(r'([a-z])([A-Z])')
    return lower_then_upper.sub(r'\1 \2', s)
def index_multiple_matchings(pairs):
    """Count how many entries of ``pairs`` share each first element.

    ``pair[3]`` of every entry is iterated and its items are pooled.  Only
    when the pool holds more than one distinct item are repeated first
    elements actually counted; otherwise every first element maps to 1.

    Args:
        pairs: Sequence of indexable entries; ``[0]`` is the key that is
            counted and ``[3]`` is an iterable.

    Returns:
        dict: first element -> count, in order of first appearance.
    """
    distinct_items = {item for pair in pairs for item in pair[3]}
    keys = [pair[0] for pair in pairs]

    if len(distinct_items) > 1:
        tally = {}
        for key in keys:
            tally[key] = tally.get(key, 0) + 1
        return tally

    return dict.fromkeys(keys, 1)
def avg_string(df, col):
    """Average, over all rows, of the mean string length inside ``df[col]``.

    Each cell of the column is read with ``.iloc`` and is expected to be a
    non-empty collection of strings.  An identical function with the same name
    appears again later in this file.
    """
    cells = (df[col].iloc[row] for row in range(len(df)))
    row_means = [sum(map(len, cell)) / len(cell) for cell in cells]
    return sum(row_means) / len(row_means)
#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und']
def remove_stop_words(text):
    """Drop every whitespace-separated word of ``text`` found in ``stop_words``.

    The remaining words are joined with single spaces.
    """
    return ' '.join(token for token in text.split() if token not in stop_words)
def remove_parentheses(text):
    """Delete every parenthesised group that has no parentheses inside it.

    Only one level is removed per call, so ``((a))`` becomes ``()``.
    """
    innermost_group = re.compile(r'\([^()]*\)')
    return innermost_group.sub('', text)
def replace_umlauts(text):
    """Strip combining marks from ``text`` after NFKD normalisation.

    For example ``für`` becomes ``fur``.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return ''.join(base_chars)
def protect_phrases(input_string, phrases):
    """Swap each phrase found in ``input_string`` for a numbered placeholder.

    Returns:
        A tuple ``(text, placeholder_map)``.  ``placeholder_map`` maps every
        placeholder to its phrase, including phrases that did not occur.
    """
    placeholder_map = {
        f"__PLACEHOLDER_{i}__": phrase for i, phrase in enumerate(phrases)
    }
    # Replacements run in the order the phrases were given.
    for placeholder, phrase in placeholder_map.items():
        input_string = input_string.replace(phrase, placeholder)
    return input_string, placeholder_map
def restore_phrases(split_strings, placeholder_map):
    """Put the original phrases back in place of their placeholders.

    Args:
        split_strings: Iterable of strings that may contain placeholders.
        placeholder_map: Mapping placeholder -> phrase.

    Returns:
        list: One restored string per input string, in the same order.
    """

    def restore_one(fragment):
        for placeholder, phrase in placeholder_map.items():
            fragment = fragment.replace(placeholder, phrase)
        return fragment

    return [restore_one(fragment) for fragment in split_strings]
def replace_comma_spaces(text):
    """Tidy the spacing around commas: ``' , '`` becomes ``', '``."""
    # NOTE(review): as displayed, the first replace() swaps a single space for
    # a single space, which changes nothing.  Whitespace may have been
    # collapsed in this view (e.g. a double space) -- confirm against the
    # repository before relying on it.
    return text.replace(' ', ' ').replace(' , ', ', ')
def split_string_with_protection(input_string, protected_phrases):
    """Split ``input_string`` on delimiters while keeping protected phrases whole.

    Args:
        input_string: The text to split.
        protected_phrases: Phrases that must not be broken up even though they
            contain a delimiter.

    Returns:
        list: The non-empty, stripped pieces with the protected phrases
        restored.
    """
    # Step 1: Protect specific phrases
    input_string, placeholder_map = protect_phrases(input_string, protected_phrases)
    # Step 2: Split the string on specified delimiters
    # NOTE(review): as displayed, the second alternative of the pattern is a
    # single space, which would split on every word boundary.  Whitespace may
    # have been collapsed in this view -- confirm the real delimiter.
    split_strings = [s.strip() for s in re.split(r'[,;/]| ', input_string) if s.strip()]
    # Step 3: Restore protected phrases
    split_strings = restore_phrases(split_strings, placeholder_map)
    return split_strings
# Phrases of the form "<institution>, <city>" that must survive the comma
# split in split_string_with_protection.  One phrase is generated for every
# template below and every entry of city_names.
protected_phrases1 = [
    phrase.format(x=x)
    for x in city_names
    for phrase in [
        'university california, {x}',
        # 'university california , {x}',
        'university colege hospital, {x}',
        # 'university colege hospital , {x}',
        'national univ ireland, {x}',
        # 'national univ ireland , {x}',
        'national university ireland, {x}',
        # 'national university ireland , {x}',
        'university colege, {x}',
        # 'university colege , {x}',
        'university hospital, {x}',
        # 'university hospital , {x}',
        'imperial colege, {x}',
        # 'imperial colege , {x}'
        'city university, {x}',
        # 'city university , {x}'
    ]
]
# Literal substring replacements applied, in this order, by substrings_dict
# before the string is split.  They fix common misspellings of "university"
# and rewrite a few specific names and addresses.  Spellings with a single
# consonant (e.g. 'colege') are deliberate: clean_string has already applied
# replace_double_consonants when substrings_dict is called from
# create_df_algorithm.
replacements = {'uni versity':'university',
                'univ ':'university ',
                'univercity':'university',
                'universtiy':'university',
                'univeristy':'university',
                'universirty':'university',
                'universiti':'university',
                'universitiy':'university',
                'universty' :'university',
                'univ col': 'university colege',
                'belfield, dublin': 'dublin',
                'balsbridge, dublin': 'dublin', #ballsbridge
                'earlsfort terrace, dublin': 'dublin',
                'bon secours hospital, cork' : 'bon secours hospital cork',
                'bon secours hospital, dublin' : 'bon secours hospital dublin',
                'bon secours hospital, galway' : 'bon secours hospital galway',
                'bon secours hospital, tralee' : 'bon secours hospital tralee',
                'bon secours health system' : 'bon secours hospital dublin',
                'bon secours hospital, glasnevin' : 'bon secours hospital dublin',
                'imperial colege science, technology medicine' : 'imperial colege science technology medicine',
                'ucl queen square institute neurology' : 'ucl, london',
                'ucl institute neurology' : 'ucl, london',
                'royal holoway, university london' : 'royal holoway universi london', #holloway
                'city, university london' : 'city universi london',
                'city university, london' : 'city universi london',
                'aeginition':'eginition',
                'national technical university, athens' : 'national technical university athens'
                # 'harvard medical school' : 'harvard university'
                }
def substrings_dict(string):
    """Split an affiliation string into numbered, normalised segments.

    The literal ``replacements`` are applied first, then the string is split
    with ``split_string_with_protection``.  A segment containing none of the
    ``university_terms`` additionally goes through the regex substitutions
    below; every segment is lower-cased and stripped.

    Args:
        string (str): The cleaned affiliation text.

    Returns:
        dict: ``{0: first segment, 1: second segment, ...}``.
    """
    for old, new in replacements.items():
        string = string.replace(old, new)

    # (pattern, replacement) pairs, applied in order and ignoring case.
    # NOTE(review): 'centre*' also matches a bare 'centr' (zero or more 'e'),
    # so e.g. 'central' is rewritten too -- confirm that this is intended.
    substitutions = (
        (r'universi\w*', 'universi'),
        (r'institu\w*', 'institu'),
        (r'centre*', 'center'),
        (r'\bsaint\b', 'st'),
        (r'\btrinity col\b', 'trinity colege'),
        (r'\btechnische\b', 'technological'),
    )

    segments = []
    for piece in split_string_with_protection(string, protected_phrases1):
        lowered = piece.lower()
        if not any(term in lowered for term in university_terms):
            for pattern, replacement in substitutions:
                piece = re.sub(pattern, replacement, piece, flags=re.IGNORECASE)
        segments.append(piece.lower().strip())

    return dict(enumerate(segments))
def clean_string(input_string):
    """Normalise a raw affiliation string for matching.

    Apostrophes, parenthesised groups, accents, doubled consonants, hyphens,
    slashes and every character other than ASCII letters, whitespace, ``,``
    and ``;`` are removed or replaced; the result is lower-cased and the stop
    words are dropped.

    Args:
        input_string (str): The raw affiliation text.

    Returns:
        str: The cleaned text, without leading or trailing whitespace.
    """
    # Token that stands in for a character sequence while the string is cleaned.
    placeholder = "placeholder"
    # input_string = input_string.replace(" - ", placeholder)
    # NOTE(review): as displayed, this swaps every single space for the
    # placeholder.  Whitespace or a special character may have been lost in
    # this view -- confirm the real literal against the repository.
    input_string = input_string.replace(" ", placeholder)
    # In order: drop apostrophes, unescape HTML entities, remove parenthesised
    # groups, transliterate to ASCII, strip combining marks, collapse doubled
    # consonants, strip, tidy the spacing around commas.
    input_string = replace_comma_spaces(replace_double_consonants(replace_umlauts(unidecode(remove_parentheses(html.unescape(input_string.replace("'", "")))))).strip())
    # Transliterate to ASCII once more.
    input_string = unidecode(input_string)
    # Replace `/` and `-` with a space (hyphens ARE replaced by this pattern).
    result = re.sub(r'[/\-]', ' ', input_string)
    # Abbreviate "Saint" and normalise "Aghia"; both patterns are case-sensitive.
    result = re.sub(r'\bSaint\b', 'St', result)
    result = re.sub(r'\bAghia\b', 'Agia', result)
    # Keep only ASCII letters, whitespace, ',', ';' and '/'.
    result = replace_comma_spaces(re.sub(r'[^a-zA-Z\s,;/]', '', result).strip())
    # Turn the placeholder back into a single space.
    result = result.replace(placeholder, " ")
    # Replace consecutive whitespace with a single space
    result = re.sub(r'\s+', ' ', result)
    #result = result.replace('ss', 's')
    # Split lower/upper boundaries (e.g. "aB" -> "a B") before lower-casing.
    result = insert_space_between_lower_and_upper(result).lower()
    result = remove_stop_words(result)
    return result.strip() # Strip leading/trailing spaces
def clean_string_facts(input_string):
    """Lower-case and simplify a string, keeping ASCII letters and digits.

    Parenthesised groups, accents and stop words are removed; ``/``, ``-`` and
    ``,`` become spaces; ``saint`` becomes ``st``; runs of whitespace collapse
    to one space.  The result is not stripped.
    """
    text = html.unescape(input_string.lower())
    text = remove_parentheses(text)
    text = unidecode(text)
    text = replace_umlauts(text)
    text = remove_stop_words(text)

    text = re.sub(r'[/\-,]', ' ', text)
    text = re.sub(r'\bsaint\b', 'st', text)
    # Drop everything except ASCII letters, digits, whitespace, ';', '/', '-'.
    text = re.sub(r'[^a-zA-Z0-9\s;/-]', '', text)
    return re.sub(r'\s+', ' ', text)
def str_radius_u(string):
    """Return a word window around every word containing ``univers``.

    Each window holds up to three words before the hit, the hit itself and up
    to three words after it, joined with spaces.  The input is lower-cased
    first.

    Returns:
        list: One window per hit, in order of appearance.
    """
    radius = 3
    tokens = string.lower().split()
    windows = []
    for position, token in enumerate(tokens):
        if not is_contained('univers', token):
            continue
        start = max(0, position - radius)
        stop = min(position + radius, len(tokens))
        windows.append(' '.join(tokens[start:stop + 1]))
    return windows
def str_radius_coll(string):
    """Return a word window around every word containing ``col``.

    Each window is the word before the hit (when there is one) followed by
    the hit itself; no word after the hit is included.  The input is
    lower-cased first.

    Returns:
        list: One window per hit, in order of appearance.
    """
    radius = 1
    tokens = string.lower().split()
    hits = [pos for pos, token in enumerate(tokens) if is_contained('col', token)]
    return [
        ' '.join(tokens[max(0, pos - radius):min(pos + radius, len(tokens))])
        for pos in hits
    ]
def str_radius_h(string):
    """Return a word window around every word containing ``hospital`` or ``hopita``.

    Each window holds up to four words before the hit, the hit itself and up
    to two words after it.  The input is lower-cased first.

    Returns:
        list: One window per hit, in order of appearance.
    """
    radius = 3
    tokens = string.lower().split()
    total = len(tokens)

    windows = []
    for position, token in enumerate(tokens):
        if is_contained('hospital', token) or is_contained('hopita', token):
            first = max(0, position - radius - 1)
            end = min(position + radius, total)
            windows.append(' '.join(tokens[first:end]))
    return windows
def str_radius_c(string):
    """Return a word window around every word containing ``clinic`` or ``klinik``.

    Each window holds up to three words before the hit, the hit itself and up
    to one word after it.  The input is lower-cased first.

    Returns:
        list: One window per hit, in order of appearance.
    """
    radius = 2
    tokens = string.lower().split()

    def window_at(position):
        first = max(0, position - radius - 1)
        end = min(position + radius, len(tokens))
        return ' '.join(tokens[first:end])

    return [
        window_at(position)
        for position, token in enumerate(tokens)
        if is_contained('clinic', token) or is_contained('klinik', token)
    ]
def str_radius_r(string):
    """Return a word window around every word containing ``research``.

    Each window holds up to three words before the hit, the hit itself and up
    to one word after it.  The input is lower-cased first.

    Returns:
        list: One window per hit, in order of appearance.
    """
    radius = 2
    tokens = string.lower().split()
    windows = []
    for position in range(len(tokens)):
        if is_contained('research', tokens[position]):
            lower_bound = max(0, position - radius - 1)
            upper_bound = min(position + radius, len(tokens))
            windows.append(' '.join(tokens[lower_bound:upper_bound]))
    return windows
def str_radius_spec(string):
    """Reduce ``string`` to its first word categorised as ``'Specific'``.

    Args:
        string (str): The keyword string to inspect.

    Returns:
        str: The first whitespace-separated word whose entry in
        ``categ_dicts`` equals ``'Specific'``, or ``string`` unchanged when
        no word qualifies.
    """
    for word in string.split():
        # .get() covers words absent from categ_dicts without a bare
        # ``except`` that would also hide unrelated errors.
        if categ_dicts.get(word) == 'Specific':
            return word
    return string
def avg_string(df, col):
    """Average, over all rows, of the mean string length inside ``df[col]``.

    This repeats a definition that appears earlier in the file; being the
    later one, it is the definition bound to the name after import.
    """
    running_total = 0
    for row in range(len(df)):
        strings = df[col].iloc[row]
        running_total += sum(len(s) for s in strings) / len(strings)
    return running_total / len(df)
def shorten_keywords(affiliations_simple):
    """Shorten every keyword string of every affiliation.

    The first rule that applies to a keyword decides which ``str_radius_*``
    helper shortens it; a keyword matching no rule goes to
    ``str_radius_spec``.

    Args:
        affiliations_simple: Iterable of iterables of keyword strings.

    Returns:
        list: One list of shortened keywords per affiliation.
    """

    def shortened(keyword):
        if 'universi' in keyword:
            return str_radius_u(keyword)
        if 'col' in keyword and 'trinity' in keyword:
            return str_radius_coll(keyword)
        if 'hospital' in keyword or 'hopita' in keyword:
            return str_radius_h(keyword)
        if 'clinic' in keyword or 'klinik' in keyword:
            return str_radius_c(keyword)
        if 'research council' in keyword:
            return str_radius_r(keyword)
        return [str_radius_spec(keyword)]

    per_affiliation = []
    for aff in affiliations_simple:
        collected = []
        for keyword in aff:
            collected.extend(shortened(keyword))
        per_affiliation.append(collected)
    return per_affiliation
def shorten_keywords_spark(affiliations_simple):
    """Shorten each keyword string and return all results in one flat list.

    The first rule that applies to a keyword decides which ``str_radius_*``
    helper shortens it; a keyword matching no rule goes to
    ``str_radius_spec``.

    Args:
        affiliations_simple: Iterable of keyword strings.

    Returns:
        list: The shortened keywords, in input order.
    """
    flat = []
    for keyword in affiliations_simple:
        if 'universi' in keyword:
            pieces = str_radius_u(keyword)
        elif 'col' in keyword and 'trinity' in keyword:
            pieces = str_radius_coll(keyword)
        elif 'hospital' in keyword or 'hopita' in keyword:
            pieces = str_radius_h(keyword)
        elif 'clinic' in keyword or 'klinik' in keyword:
            pieces = str_radius_c(keyword)
        elif 'research council' in keyword:
            pieces = str_radius_r(keyword)
        else:
            pieces = [str_radius_spec(keyword)]
        flat.extend(pieces)
    return flat
def refine(list_, affil):
    """Pick one id per matched organisation, using city and country hints.

    NOTE(review): dix_mult, dix_acad, dix_city, dix_country and
    country_mapping are neither defined nor imported in the visible part of
    this module -- confirm where they are expected to come from.

    Args:
        list_: Iterable of lists of matched organisation names.
        affil (str): The affiliation text; it is lower-cased before use.

    Returns:
        list: One list of ids per inner list of ``list_``.
    """
    affil = affil.lower()
    ids = []
    for matched_org_list in list_:
        id_list = []
        for matched_org in matched_org_list:
            if dix_mult[matched_org] == 'unique':
                # Only one candidate: take its id directly.
                id_list.append(dix_acad[matched_org])
            else:
                # Several candidates: prefer the first one whose city
                # appears in the affiliation text.
                city_found = False
                for city in dix_city[matched_org]:
                    if city[0] in affil:
                        id_list.append(city[1])
                        city_found = True
                        break
                if not city_found:
                    # No city matched: fall back to the country.
                    country_found = False
                    for country in dix_country[matched_org]:
                        if country[0] in list(country_mapping.keys()):
                            print(country[0])
                            # NOTE(review): the last two operands of this test
                            # are identical -- possibly a different index was
                            # meant for one of them.
                            if country[0] in affil or country_mapping[country[0]][0] in affil or country_mapping[country[0]][0] in affil:
                                id_list.append(country[1])
                                country_found = True
                                break
                        elif country[0] in affil:
                            print('country found',country[0])
                            id_list.append(country[1])
                            country_found = True
                            break
                    if not country_found:
                        # Neither city nor country matched: use the default id.
                        id_list.append(dix_acad[matched_org])
        ids.append(id_list)
    return ids
def compute_cos(x,s):
    """Cosine similarity between the word counts of ``x`` and ``s``.

    The vocabulary is taken from ``s`` only, so words of ``x`` that do not
    occur in ``s`` are ignored.
    """
    bag_of_words = CountVectorizer()
    reference = bag_of_words.fit_transform([s]).toarray()
    candidate = bag_of_words.transform([x]).toarray()
    similarity_matrix = cosine_similarity(candidate, reference)
    return similarity_matrix[0][0]
# def find_ror(string, simU, simG):
# df = pd.DataFrame()
# df['Unique affiliations'] = [[string.lower()]]
# academia = create_df_algorithm(df)
# result = Aff_Ids(len(academia), academia,dix_acad, dix_mult, dix_city, dix_country, simU,simG)
# if len(result)>0:
# dict_aff_open = {x: y for x, y in zip(result['Original affiliations'], result['Matched organizations'])}
# dict_aff_id = {x: y for x, y in zip(result['Original affiliations'], result['unique ROR'])}
# dict_aff_score = {}
# for i in range(len(result)):
# if type(result['Similarity score'].iloc[i]) == list:
# dict_aff_score[result['Original affiliations'].iloc[i]] = result['Similarity score'].iloc[i]
# else:
# dict_aff_score[result['Original affiliations'].iloc[i]] = [result['Similarity score'].iloc[i]]
# pids = []
# for i in range(len(df)):
# pidsi = []
# for aff in df['Unique affiliations'].iloc[i]:
# if aff in list(dict_aff_id.keys()):
# pidsi = pidsi + dict_aff_id[aff]
# # elif 'unmatched organization(s)' not in pidsi:
# # pidsi = pidsi + ['unmatched organization(s)']
# pids.append(pidsi)
# names = []
# for i in range(len(df)):
# namesi = []
# for aff in df['Unique affiliations'].iloc[i]:
# if aff in list(dict_aff_open.keys()):
# try:
# namesi = namesi + dict_aff_open[aff]
# except TypeError:
# namesi = namesi + [dict_aff_open[aff]]
# names.append(namesi)
# scores = []
# for i in range(len(df)):
# scoresi = []
# for aff in df['Unique affiliations'].iloc[i]:
# if aff in list(dict_aff_score.keys()):
# scoresi = scoresi + dict_aff_score[aff]
# scores.append(scoresi)
# df['Matched organizations'] = names
# df['ROR'] = pids
# df['Scores'] = scores
# def update_Z(row):
# if len(row['ROR']) == 0 or len(row['Scores']) == 0:
# return []
# new_Z = []
# for ror, score in zip(row['ROR'], row['Scores']):
# entry = {'ROR_ID': ror, 'Confidence': score}
# new_Z.append(entry)
# return new_Z
# matching = df.apply(update_Z, axis=1)
# df['Matchings'] = matching
# return df['Matchings'].iloc[0]
# else:
# return 'no result'

319
matching_cluster.py Normal file
View File

@ -0,0 +1,319 @@
from collections import defaultdict
from collections import Counter
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functions_cluster import *
from create_input_cluster import *
def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG):
    """
    Select the candidate names to keep for one affiliation.

    Args:
        light_raw (str): The affiliation text; the number of times 'univ'
            occurs in it caps how many university candidates are kept.
        candidate_num (int): Second cap on the number of university candidates.
        pairs_list: Sequence of entries indexed positionally:
            [0] keyword, [1] candidate name, [2] similarity score.
        m: Mapping keyword -> count, looked up with each entry's keyword.
        simU (float): Score threshold for candidates containing 'univ'.
        simG (float): Score threshold for all other candidates.
    Returns:
        list: ``[name, score]`` pairs, where score is the value stored for
        that name by its last entry in ``pairs_list``.
    """
    vectorizer = CountVectorizer()
    univ_num = light_raw.lower().count('univ')
    result = []
    best = []
    s = light_raw
    for j in range(len(pairs_list)):
        x = pairs_list[j][1]
        # Skip a (name, score) combination that was already accepted.
        if [x, pairs_list[j][2]] in result:
            continue
        if m[pairs_list[j][0]] == 1:
            # The keyword has a count of 1: accept on the thresholds alone.
            if is_contained('univ', x.lower()) and pairs_list[j][2] > simU:
                result.append([x, pairs_list[j][2]])
            elif pairs_list[j][2] > simG:
                result.append([x, pairs_list[j][2]])
        elif pairs_list[j][2] >= 0.98:
            # Near-exact match: accept and record the score as 1.
            result.append([pairs_list[j][1], 1])
        else:
            try:
                if not is_contained("univ", x.lower()):
                    continue # Skip if x does not contain "univ"
                # Compare the candidate with the whole affiliation text; the
                # vocabulary is fitted on the affiliation text only.
                s_vector = vectorizer.fit_transform([s]).toarray()
                x_vector = vectorizer.transform([x]).toarray()
                # Compute similarity between the vectors
                similarity = cosine_similarity(x_vector, s_vector)[0][0]
                if similarity > 0.1:
                    # Normalised edit similarity between candidate and keyword.
                    similarity_l = 1 - Levenshtein.distance(x, pairs_list[j][0]) / max(len(x), len(pairs_list[j][0]))
                    best.append([x, similarity, similarity_l])
            # NOTE(review): this bare except swallows every exception; the
            # expression `KeyError` below is a no-op, not a filter.
            except:
                KeyError
    if best:
        # For every name keep only the entries with its highest cosine score.
        # Each element of best is [string, number1, number2].
        max_numbers = defaultdict(float)
        for item in best:
            string, number1, number2 = item # Unpack the three elements
            max_numbers[string] = max(max_numbers[string], number1)
        reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]]
        # Sort by number1, then number2, both in descending order
        reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
        result = result + reduced_best
    # Separate university candidates from the rest so the former can be capped.
    univ_list = []
    other_list = []
    for r in result:
        if is_contained('univ', r[0]):
            univ_list.append(r)
        else:
            other_list.append(r)
    limit = min(univ_num, candidate_num)
    if len(univ_list) > limit:
        result = univ_list[:limit] + other_list
    # Report each kept name with the score stored in pairs_list (for a name
    # occurring several times, the last entry wins).
    result_dict = {}
    pairs_dict = {}
    for l in pairs_list:
        pairs_dict[l[1]] = l[2]
    for p in result:
        result_dict[p[0]] = pairs_dict[p[0]]
    result_dict_list = [[y[0], result_dict[y[0]]] for y in result]
    return result_dict_list
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
    """
    Match the keywords of one affiliation against the names in ``dix_org``.

    Args:
        input: Two-element sequence; ``input[0]`` is the affiliation text and
            ``input[1]`` a list of dicts that each have a "keywords" entry.
        dix_org (dict): Organisation name -> id.
        dix_mult (dict): Organisation name -> marker; 'unique' means the id
            from ``dix_org`` is used as is.
        dix_city_ror (dict): Organisation name -> entries indexed as
            [0] city, [1] id.
        dix_country_ror (dict): Organisation name -> entries indexed as
            [0] country, [1] id.
        simU (float): Similarity threshold for universities.
        simG (float): Similarity threshold for non-universities.
    Returns:
        list: One ``[name, score, id]`` entry per candidate returned by
        ``best_sim_score``.
    """
    df_list = input[1]
    light_aff = input[0]
    vectorizer = CountVectorizer()
    lnamelist = list(dix_org.keys())
    # keyword position -> matched names; filled below but not read again here.
    dix = {}
    # keyword position -> its pairs; filled below but not read again here.
    result = {}
    # Every (keyword, name, similarity, id) tuple that passed a threshold.
    pairs = []
    def get_keywords(filtered_list):
        # Extract the "keywords" values from the dictionaries in filtered_list
        keywords_list = [entry["keywords"] for entry in filtered_list]
        return keywords_list
    keywords = get_keywords(df_list)
    for k,s in enumerate(keywords):
        similar_k = []
        pairs_k = []
        if s in lnamelist:
            # The keyword is itself an organisation name: exact match.
            similarity = 1
            similar_k.append(similarity)
            pairs_k.append((s,s,similarity,dix_org[s]))
            pairs.append((s,s,similarity,dix_org[s]))
            if k not in dix:
                dix[k] = [s]
            else:
                dix[k].append(s)
        else:
            for x in lnamelist:
                if is_contained(s, x):
                    # Every word of the keyword occurs in the name: compare
                    # word counts using the name's vocabulary.
                    x_vector = vectorizer.fit_transform([x]).toarray()
                    s_vector = vectorizer.transform([s]).toarray()
                    # Compute similarity between the vectors
                    similarity = cosine_similarity(x_vector, s_vector)[0][0]
                    if similarity > min(simU, simG):
                        if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
                            similar_k.append(similarity)
                            pairs_k.append((s,x,similarity,dix_org[x]))
                            pairs.append((s,x,similarity,dix_org[x]))
                            if k not in dix:
                                dix[k] = [x]
                            else:
                                dix[k].append(x)
                        elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
                            similar_k.append(similarity)
                            pairs_k.append((s,x,similarity,dix_org[x]))
                            pairs.append((s,x,similarity,dix_org[x]))
                            if k not in dix:
                                dix[k] = [x]
                            else:
                                dix[k].append(x)
                elif is_contained(x, s):
                    # Every word of the name occurs in the keyword: compare
                    # word counts using the keyword's vocabulary.
                    if (is_contained('univ', s) and is_contained('univ', x)):
                        s_vector = vectorizer.fit_transform([s]).toarray()
                        x_vector = vectorizer.transform([x]).toarray()
                        # Compute similarity between the vectors
                        similarity = cosine_similarity(s_vector, x_vector)[0][0]
                        if similarity > simU:
                            similar_k.append(similarity)
                            pairs_k.append((s,x,similarity,dix_org[x]))
                            pairs.append((s,x,similarity,dix_org[x]))
                            if k not in dix:
                                dix[k] = [x]
                            else:
                                dix[k].append(x)
                    elif not is_contained('univ', s) and not is_contained('univ', x):
                        s_vector = vectorizer.fit_transform([s]).toarray()
                        x_vector = vectorizer.transform([x]).toarray()
                        # Compute similarity between the vectors
                        similarity = cosine_similarity(s_vector, x_vector)[0][0]
                        if similarity > simG:
                            similar_k.append(similarity)
                            pairs_k.append((s,x,similarity,dix_org[x]))
                            pairs.append((s,x,similarity,dix_org[x]))
                            if k not in dix:
                                dix[k] = [x]
                            else:
                                dix[k].append(x)
        result[k] = pairs_k
    # Count, per keyword, how many distinct pairs it produced.
    multi = index_multiple_matchings(list(set(pairs)))
    # Keywords with more than one match; collected but not read again here.
    need_check_keys = []
    for i in range(len(keywords)):
        try:
            if multi[keywords[i]]>1:
                need_check_keys.append(keywords[i])
        # NOTE(review): bare except; a keyword without any pair is missing
        # from multi, which is presumably the case being skipped -- confirm.
        except:
            pass
    best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)
    matched_org = [x[0] for x in best]
    # Default id for every kept name; overridden below for names that are not
    # marked 'unique' when a city or country hint is found in the text.
    ids = [dix_org[x[0]] for x in best]
    for i,x in enumerate(matched_org):
        if dix_mult[x] != 'unique':
            if x in list(dix_city_ror.keys()):
                match_found0 = False
                match_found = False
                # Take the first city that appears in the affiliation text
                # and is not already part of the organisation name.
                for city in dix_city_ror[x]:
                    if city[0] in light_aff:
                        if city[0] not in x:
                            ids[i] = city[1]
                            match_found0 = True
                            match_found = True
                            break
                if not match_found:
                    # NOTE(review): this loop tests the same condition as the
                    # loop above, so it cannot find anything new.
                    for city in dix_city_ror[x]:
                        if city[0] in light_aff and city[0] not in x:
                            ids[i] = city[1]
                            match_found0 = True
                            print('ok')
                            break
                if not match_found:
                    # No city hint: look for a country instead.
                    match_found2 = False
                    match_found3 = False
                    for country in dix_country_ror[x]:
                        if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
                            ids[i] = country[1]
                            match_found2 = True
                            match_found3 = True
                            break
                        if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
                            ids[i] = country[1]
                            match_found2 = True
                            match_found3 = True
                            break
                        elif country[0] in light_aff:
                            if country[0] not in x:
                                ids[i] = country[1]
                                match_found2 = True
                                match_found3 = True
                                break
                    if not match_found3:
                        # Last resort: a country that is also part of the
                        # organisation name.
                        for country in dix_country_ror[x]:
                            if country[0] in light_aff and country[0] in x:
                                ids[i] = country[1]
                                match_found2 = True
                                break
    results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)]
    return results

BIN
txt_files/.DS_Store vendored Normal file

Binary file not shown.

584
txt_files/city_names.txt Normal file
View File

@ -0,0 +1,584 @@
galway
maynooth
duluth
port arthur
new orleans
paterson
santa barbara
thornton
westminster
north las vegas
stockton
marysville
fitchburg
tallinn
fargo
seaside
manaus
porto
quebec city
hialeah
normal
kansas city
delhi
fort worth
palermo
olathe
madison
santa maria
youngstown
allentown
santa clara
charlotte
agra
palmdale
kraków
bendigo
high point
washington
dallas
grand prairie
plano
leipzig
bratislava
seville
puebla
lucknow
toowoomba
santa rosa
sioux falls
flint
kissimmee
lacey
brownsville
palm springs
tyler
minsk
san diego
los angeles
edmonton
college station
toulouse
garland
florence
saskatoon
albury-wodonga
newburgh
danbury
deltona
south bend
nagpur
pomona
memphis
london
lincoln
chandler
adelaide
salt lake city
edinburgh
suzhou
grayslake
new york city
kanpur
brussels
okayama
tuscaloosa
clarksville
jackson
boise city
canton
louisville
varanasi
columbus
lorain
vadodara
orem
chennai
townsville
eindhoven
toronto
wuhan
norman
winter haven
eugene
riga
hamamatsu
fresno
lake charles
budapest
mobile
lowell
vienna
tallahassee
nanjing
new haven
sacramento
leeds
harlingen
springdale
perth
sendai
utica
orange
baltimore
rochester
rancho cucamonga
bellevue
fort wayne
modesto
pristina
nuremberg
stuttgart
indore
murfreesboro
nottingham
scranton
lancaster
abilene
monterey
sioux city
bari
chula vista
ahmedabad
north port
helsinki
leominster
ocala
sarajevo
hangzhou
roanoke
new york
bethlehem
dublin
sunshine coast
pune
billings
changchun
sydney
garden grove
port orange
pittsburgh
new bedford
hiroshima
san francisco
sheffield
chongqing
layton
pueblo
chengdu
cincinnati
erie
lansing
ljubljana
st louis
rio de janeiro
philadelphia
tacoma
bel air
chesapeake
davenport
las vegas
nagasaki
kitchener
boulder
roseville
evansville
victoria
burbank
sofia
santa clarita
san buenaventura
savannah
apple valley
brighton
coral springs
huntsville
fort lauderdale
warsaw
antioch
medford
visalia
frankfurt
joliet
curitiba
mcallen
seattle
alexandria
bryan
moreno valley
berlin
olympia
caracas
tianjin
cleveland
des moines
prague
fukuoka
burlington
bhopal
nara
hampton
jefferson
chicago
temecula
paris
gilbert
bradenton
champaign
munich
amsterdam
raleigh
atlanta
lakeland
denver
round lake beach
richmond
buffalo
phoenix
antwerp
greenbay
milwaukee
south lyon
concord
vero beach
newcastle
podgorica
monterrey
shantou
costa mesa
copenhagen
vilnius
dalian
bristol
salinas
belgrade
waterloo
henderson
hayward
hickory
el monte
lima
redding
mexico city
cary
kennewick
guayaquil
tirana
kawasaki
greensboro
west covina
amarillo
saitama
new london
recife
manchester
rockford
kelowna
hagerstown
bordeaux
york
kaneohe
tucson
gainesville
kalamazoo
bogotá
reading
virginia beach
guadalajara
albany
durham
green bay
oceanside
montreal
turin
malaga
oshawa
mesa
pensacola
boise
bonita springs
fort walton beach
port saint lucie
reykjavik
north charleston
newark
reno
knoxville
bakersfield
oslo
omaha
milan
cambridge
norwich
shanghai
naples
victorville
zagreb
norwalk
huntington beach
clarke county
lubbock
yakima
warren
bucharest
simi valley
greenville
racine
salvador
elk grove
orlando
windsor
santa cruz
saginaw
ballarat
muskegon
shreveport
clearwater
merced
boston
basel
elizabeth
panama city
okinawa
sarasota
zurich
glendale
wilmington
pompano beach
guangzhou
fairfield
hyderabad
santiago
nashville
mchenry
ann arbor
carrollton
hollywood
laredo
rome
san bernardino
bergen
springfield
winnipeg
corona
surat
long beach
nagoya
toledo
geelong
kenosha
sterling heights
lisbon
myrtle beach
nashua
riverside
tampa
bangalore
richland
rotterdam
lyon
scottsdale
berkeley
bologna
cedar rapids
syracuse
tulsa
ludhiana
hemet
portland
mission viejo
salem
overland park
detroit
jinan
osaka
grand rapids
jersey city
kailua
venice
darwin
miramar
gulfport-biloxi
huntington
portsmouth
worcester
sunnyvale
escondido
college park
thousand oaks
harbin
belfast
yonkers
alicante
barnstable
kitakyushu
sapporo
ogden
aurora
palm bay
düsseldorf
hobart
irvine
st johns
hamburg
provo
melbourne
madrid
zhengzhou
asheville
patna
inglewood
houston
newport news
west valley city
oklahoma city
brisbane
valencia
pasadena
aberdeen
st petersburg
lakewood
irving
naperville
miami
topeka
downey
genoa
lewisville
birmingham
xian
saint paul
bremerton
corpus christi
daytona beach
st paul
oxnard
murrieta
lafayette
montgomery
baton rouge
skopje
cathedral city
spartanburg
canberra
arvada
hesperia
port st lucie
saint louis
bridgeport
tempe
quito
chattanooga
bremen
gold coast
cairns
beaumont
elkhart
peoria
calgary
honolulu
havre de grace
hamilton
fullerton
daly city
dresden
belem
ottawa
regina
chiba
fort collins
indianapolis
mumbai
killeen
sao paulo
jaipur
fremont
zaragoza
charleston
waco
kobe
odessa
monroe
vallejo
marseille
qingdao
frederick
marina
sebastian
oakland
pembroke pines
san antonio
kyoto
colorado springs
el paso
shenyang
punta gorda
fort smith
richmond county
waterbury
shenzhen
albuquerque
jacksonville
minneapolis
fortaleza
denton
gastonia
fayetteville
bloomington
houma
santa ana
kolkata
las cruces
barcelona
arlington
niigata
norfolk
fontana
providence
santo domingo
vancouver
appleton
san jose
hartford
winston
barrie
glasgow
davidson county
yokohama
independence
athens
harrisburg
macon
torrance
launceston
cape coral
austin
little rock
cologne
mesquite
catania
stockholm
nice
stamford
buenos aires
columbia
anchorage
dayton
wollongong
halifax
verona
anaheim
kiev
augusta
tokyo
akron
lexington
wichita
saint petersburg
beijing
johnson city
spokane
liverpool
howell
poughkeepsie
ontario
atlantic city
trenton

28
txt_files/remove_list.txt Normal file
View File

@ -0,0 +1,28 @@
universi
research institu
laboratory
gmbh
inc
universi of
research center
foundation
faculty
national institu
school medicine
universi school
graduate school
graduate school engineering
institu tropical medicine
institu virology
faculty medicine
laboratory
universi park
institu science
polytechnic universi
universi 1
ciudad universi
universi campus
universi hospitals
colege
universi road
universitetska str

16
txt_files/stop_words.txt Normal file
View File

@ -0,0 +1,16 @@
from
the
of
at
de
for
et
für
des
in
as
a
and
fur
for
und

View File

@ -0,0 +1,8 @@
universitetskaya
universitatsklinikum
universitatskinderklinik
universitatskliniken
universitetshospital
universitatsmedizin
universitatsbibliothek
universitatspital

116
update_records.py Normal file
View File

@ -0,0 +1,116 @@
import json
import os
from pyspark.sql import SparkSession
from affro_cluster import *
# Directory whose entries are the input files to process.
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
#folder_path = 'check'
# Every directory entry except the '_SUCCESS' marker is treated as an input
# file; no check on the file extension or content is made here.
json_file_names = [
    entry for entry in os.listdir(folder_path) if entry != '_SUCCESS'
]
# Initialize Spark session
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
def remove_duplicates(list_of_dicts):
    """Return the dictionaries of *list_of_dicts* without repetitions.

    Two dictionaries are duplicates when they compare equal, i.e. they hold
    the same key/value pairs regardless of key insertion order. The first
    occurrence of each dictionary is kept and the input order is preserved.

    Args:
        list_of_dicts: Iterable of dictionaries.

    Returns:
        A new list holding the first occurrence of every distinct dictionary.
    """
    seen = set()
    unique_list_of_dicts = []
    for d in list_of_dicts:
        try:
            # A frozenset of the items is hashable and ignores key order, so
            # {'a': 1, 'b': 2} and {'b': 2, 'a': 1} map to the same key.
            key = frozenset(d.items())
        except TypeError:
            # Some value is unhashable (e.g. a list): fall back to a linear
            # equality scan instead of failing.
            if d not in unique_list_of_dicts:
                unique_list_of_dicts.append(d)
            continue
        if key not in seen:
            seen.add(key)
            unique_list_of_dicts.append(d)
    return unique_list_of_dicts
def update_record(record):
    """Build the enriched output record for one input record.

    For every author in ``record['authors']`` this collects the name, an
    ORCID (when one is embedded in ``fullName`` or in an affiliation string),
    the raw affiliation strings and the organization identifiers.
    Affiliation strings containing ``ror.org`` are taken to carry their ROR
    id in the data itself; every other string is passed to ``affro``.

    Args:
        record: Mapping with an ``'id'`` entry and an ``'authors'`` list.
            Each author exposes ``'fullName'`` and ``'affiliations'``; each
            affiliation exposes ``'raw_affiliation_string'``.

    Returns:
        ``{'ID': ..., 'Authors': [...], 'Organizations': [...]}``, or
        ``None`` when the record cannot be processed (the error is printed).
    """
    try:
        # Read inside the ``try`` so that a record without an id is reported
        # and skipped like any other malformed record.
        record_id = record['id']
        authors = []
        for author in record['authors']:
            full_name = author['fullName']
            if 'orcid.org/0' in full_name:
                # The first comma-separated field (cut to 36 characters) is
                # kept as the ORCID, the second field as the name.
                fields = full_name.split(',')
                name = {'Full': fields[1], 'First': None, 'Last': None}
                orcid = fields[0][:36]
            else:
                name = {'Full': full_name, 'First': None, 'Last': None}
                orcid = None

            raw_affiliations = [
                affiliation['raw_affiliation_string']
                for affiliation in author['affiliations']
            ]

            ror_from_data = []   # ROR ids found directly in the input strings
            ror_from_affro = []  # matches produced by affro, all affiliations
            for raw in raw_affiliations:
                if 'ORCID: 0' in raw:
                    orcid = 'https://orcid.org/' + raw.split('ORCID: ')[1]
                elif 'ORCID 0' in raw:
                    orcid = 'https://orcid.org/' + raw.split('ORCID ')[1]

                if 'ror.org' in raw:
                    ror_from_data.append({
                        'Origin': 'data',
                        'RORid': raw[0:25],
                        'Confidence': None
                    })
                else:
                    # affro returns a list of JSON strings, or None when it
                    # failed internally; call it once and treat None as
                    # "no match" so one bad string does not drop the record.
                    matches = affro(raw) or []
                    # Accumulate so that a later affiliation does not
                    # overwrite the matches of an earlier one.
                    ror_from_affro.extend(json.loads(m) for m in matches)

            # ROR ids supplied by the data take precedence over affro matches.
            if ror_from_data:
                organization_pids = ror_from_data
            else:
                organization_pids = remove_duplicates(ror_from_affro)

            authors.append({
                'Name': name,
                'Raw_affiliations': raw_affiliations,
                'Organization_PIDs': organization_pids,
                'ORCID': orcid,
            })

        organizations = remove_duplicates(
            [pid for author in authors for pid in author['Organization_PIDs']]
        )
        return {'ID': record_id, 'Authors': authors, 'Organizations': organizations}
    except Exception as e:
        print(f"Error processing record with id {record.get('id')}: {str(e)}")
        return None
# Process each input file on its own and write one output file per input.
for file in json_file_names:
    print('start processing ' + str(file))
    records_df = spark.read.json(folder_path + '/' + file)
    # Enrich every row with update_record, serialise each result as a JSON
    # string, and collect the strings.
    serialized_records = (
        records_df.rdd
        .map(lambda row: update_record(row.asDict()))
        .map(lambda record: json.dumps(record))
        .collect()
    )
    # The output name is the full input file name with '_output.json' appended.
    output_file_name = file + '_output.json'
    print('end processing ' + str(file))
    with open(output_file_name, 'w') as out:
        # One JSON document per line.
        for i, item in enumerate(serialized_records):
            print('write ' + str(i))
            out.write(item + '\n')