# affRo/matching_cluster.py
# (web history-view residue: 361 lines, 14 KiB, Python; revision timestamp 2024-09-05 12:23:32 +02:00)
from collections import defaultdict
from collections import Counter
import Levenshtein
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from functions_cluster import *
from create_input_cluster import *
# (history-view revision timestamp: 2024-09-12 15:56:26 +02:00)
# Status table, read once when the module is imported. Aff_Ids indexes each
# value as value[0] (status string, compared against 'active') and value[1]
# (replacement id, compared against '').
# NOTE(review): the path below is an absolute, machine-specific location, so
# importing this module fails on any other machine; consider making it
# configurable. `json` is not imported explicitly in this file and is
# presumably provided by one of the star imports above -- confirm.
with open('/Users/myrto/Documents/openAIRE/3. ror/dictionaries/dix_status.json', 'rb') as f:
    dix_status = json.load(f)

# Keys of `categ_dicts` (not defined in this file; presumably star-imported)
# whose value is exactly 'Specific'.
specific = [k for k, category in categ_dicts.items() if category == 'Specific']
# (history-view revision timestamp: 2024-09-05 12:23:32 +02:00)
def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG):
    """
    Pick the candidate organisation names to keep for one affiliation string.

    Each entry of ``pairs_list`` is handled by the first rule that applies:

    1. The keyword has exactly one matching (``m[keyword] == 1``): the name is
       kept when it contains 'univ' and its score exceeds ``simU``, or when
       its score exceeds ``simG``.
    2. Otherwise, a score of at least 0.98 keeps the name.
    3. Otherwise, only names containing 'univ' are considered: the name is
       compared with the whole affiliation string (bag-of-words cosine
       similarity) and kept as a fallback when that similarity exceeds 0.1.
       Fallbacks are ordered by cosine similarity, then by normalised
       Levenshtein similarity between the name and its keyword.

    Finally, if more 'univ' names were kept than
    ``min(count of 'univ' in light_raw, candidate_num)``, only the first
    ``limit`` of them are retained (followed by all non-'univ' names).

    Args:
        light_raw (str): The affiliation string being matched.
        candidate_num (int): Upper bound on the number of 'univ' names kept
            (the caller in this file passes the number of keywords).
        pairs_list (list): Sequences whose first three items are
            ``(keyword, candidate_name, score)``.
        m: Mapping from keyword to its number of matchings; indexed with the
            first item of every pair (a missing key raises ``KeyError``).
        simU (float): Similarity threshold for names containing 'univ'.
        simG (float): Similarity threshold applied otherwise.

    Returns:
        list: ``[name, score]`` lists. The score is always the one recorded
        in ``pairs_list`` for that name (the last occurrence wins when a name
        appears more than once), not the intermediate fallback similarities.
    """
    univ_num = light_raw.lower().count('univ')
    accepted = []   # [name, score] entries kept by rules 1 and 2
    fallback = []   # [name, cosine, levenshtein_similarity] entries from rule 3

    # The affiliation string never changes inside the loop, so the vectoriser
    # is built and fitted on it at most once, and only if rule 3 is reached.
    vectorizer = None
    raw_vector = None

    for pair in pairs_list:
        keyword, name, score = pair[0], pair[1], pair[2]

        if [name, score] in accepted:
            continue

        if m[keyword] == 1:
            # NOTE(review): is_contained comes from functions_cluster; it is
            # used here as "does the name contain 'univ'" -- confirm its
            # exact semantics there.
            if is_contained('univ', name.lower()) and score > simU:
                accepted.append([name, score])
            elif score > simG:
                accepted.append([name, score])
        elif score >= 0.98:
            accepted.append([name, 1])
        else:
            try:
                if not is_contained('univ', name.lower()):
                    continue  # rule 3 only considers names containing 'univ'

                if raw_vector is None:
                    vectorizer = CountVectorizer()
                    raw_vector = vectorizer.fit_transform([light_raw]).toarray()
                name_vector = vectorizer.transform([name]).toarray()

                similarity = cosine_similarity(name_vector, raw_vector)[0][0]
                if similarity > 0.1:
                    similarity_l = 1 - Levenshtein.distance(name, keyword) / max(len(name), len(keyword))
                    fallback.append([name, similarity, similarity_l])
            except (KeyError, ValueError):
                # Best effort, as in the original (which swallowed every
                # exception): a candidate that cannot be scored -- e.g. the
                # vectoriser rejects a string with no usable tokens -- is
                # skipped instead of aborting the whole affiliation.
                continue

    if fallback:
        # Keep, for every name, only the entries reaching that name's best
        # cosine similarity.
        best_cosine = defaultdict(float)
        for name, cosine, _ in fallback:
            best_cosine[name] = max(best_cosine[name], cosine)

        reduced = [entry for entry in fallback if entry[1] == best_cosine[entry[0]]]
        # Highest cosine first; ties broken by higher Levenshtein similarity.
        reduced.sort(key=lambda entry: (entry[1], entry[2]), reverse=True)
        accepted = accepted + reduced

    univ_list = []
    other_list = []
    for entry in accepted:
        # NOTE(review): unlike the checks above, the name is not lower-cased
        # here; kept as in the original.
        if is_contained('univ', entry[0]):
            univ_list.append(entry)
        else:
            other_list.append(entry)

    limit = min(univ_num, candidate_num)
    if len(univ_list) > limit:
        accepted = univ_list[:limit] + other_list

    # Report the score stored in pairs_list for each kept name.
    score_by_name = {pair[1]: pair[2] for pair in pairs_list}
    return [[entry[0], score_by_name[entry[0]]] for entry in accepted]
def _candidate_pairs(keyword, dix_org, vectorizer, simU, simG):
    """Return the (keyword, name, similarity, id) tuples matching one keyword."""
    if keyword in dix_org:
        # Exact hit: similarity is fixed at 1 and no other name is examined.
        return [(keyword, keyword, 1, dix_org[keyword])]

    # NOTE(review): is_contained comes from functions_cluster; presumably
    # is_contained(a, b) tests whether a occurs in b -- confirm there.
    keyword_is_univ = is_contained('univ', keyword)
    found = []
    for name in dix_org:
        # One string must be contained in the other; the vectoriser is fitted
        # on the containing string and the contained one is projected onto it.
        if is_contained(keyword, name):
            fitted, other = name, keyword
        elif is_contained(name, keyword):
            fitted, other = keyword, name
        else:
            continue

        # Both strings must agree on containing 'univ'; the threshold depends
        # on which kind of pair it is. Mixed pairs are never accepted.
        name_is_univ = is_contained('univ', name)
        if keyword_is_univ and name_is_univ:
            threshold = simU
        elif not keyword_is_univ and not name_is_univ:
            threshold = simG
        else:
            continue

        fitted_vector = vectorizer.fit_transform([fitted]).toarray()
        other_vector = vectorizer.transform([other]).toarray()
        similarity = cosine_similarity(fitted_vector, other_vector)[0][0]
        if similarity > threshold:
            found.append((keyword, name, similarity, dix_org[name]))
    return found


def _disambiguate_id(name, default_id, light_aff, dix_city_ror, dix_country_ror):
    """Choose the id of a non-unique name from cities/countries found in the affiliation."""
    # City pass: the city must occur in the affiliation and either not be
    # part of the organisation name, or occur more than once in the
    # affiliation.
    for city in dix_city_ror[name]:
        if city[0] in light_aff:
            if city[0] not in name or light_aff.count(city[0]) > 1:
                return city[1]

    countries = dix_country_ror[name]

    # Country pass 1: US/UK also match through their abbreviations; any other
    # country must occur in the affiliation without being part of the name.
    # NOTE(review): 'usa' and 'uk' are plain substring tests, so they also
    # match inside longer words (e.g. 'ukraine'); kept as in the original.
    for country in countries:
        if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
            return country[1]
        if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
            return country[1]
        if country[0] in light_aff and country[0] not in name:
            return country[1]

    # Country pass 2: accept a country that is also part of the name.
    for country in countries:
        if country[0] in light_aff and country[0] in name:
            return country[1]

    # Nothing matched: keep the id from dix_org. (The original re-assigned
    # that same id here for 'univ' names and names containing an entry of
    # `specific`, and left it untouched otherwise, so the outcome is the same
    # for every name.)
    return default_id


def _status_rows(name, score, org_id):
    """Build the output rows for one matched id using the module-level dix_status table."""
    status = dix_status[org_id]
    rows = [[name, score, 'ROR', org_id, status[0]]]
    # A non-active id with a recorded replacement also yields a row for the
    # replacement, reported as active.
    if status[0] != 'active' and status[1] != '':
        rows.append([name, score, 'ROR', status[1], 'active'])
    return rows


def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
    """
    Match the keywords of one affiliation against organisation names and return their ids.

    Args:
        input: Two-item sequence ``(light_aff, df_list)``. ``light_aff`` is
            the affiliation string; ``df_list`` is a list of dicts, each with
            a "keywords" entry holding one keyword string. (The parameter
            name shadows the builtin; it is kept for backward compatibility.)
        dix_org: Mapping from organisation name to its id.
        dix_mult: Mapping from organisation name to a multiplicity marker;
            only the value 'unique' is treated specially.
        dix_city_ror: Mapping from organisation name to an iterable of
            ``(city, id)`` pairs.
        dix_country_ror: Mapping from organisation name to an iterable of
            ``(country, id)`` pairs.
        simU (float): Similarity threshold when keyword and name both contain 'univ'.
        simG (float): Similarity threshold when neither contains 'univ'.

    Returns:
        list: Rows ``[name, score, 'ROR', id, status]``. Ids that do not
        contain 'ror.org' are dropped. An id whose status is not 'active' and
        that has a replacement recorded in ``dix_status`` produces a second
        row for the replacement with status 'active'.
    """
    light_aff = input[0]
    df_list = input[1]
    keywords = [entry["keywords"] for entry in df_list]

    vectorizer = CountVectorizer()
    pairs = []
    for keyword in keywords:
        pairs.extend(_candidate_pairs(keyword, dix_org, vectorizer, simU, simG))

    # Multiplicities are computed on the de-duplicated pairs, while
    # best_sim_score receives the full list (as in the original).
    multi = index_multiple_matchings(list(set(pairs)))
    best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)

    results_upd = []
    for name, score in best:
        org_id = dix_org[name]
        # Names marked 'unique' keep the id from dix_org.
        # TODO(review): the original had a branch for unique names containing
        # both 'institu' and 'univ' that checked city/country presence but
        # took no action; the intended behaviour needs confirming.
        if dix_mult[name] != 'unique' and name in dix_city_ror:
            org_id = _disambiguate_id(name, org_id, light_aff, dix_city_ror, dix_country_ror)

        if 'ror.org' in org_id:
            results_upd.extend(_status_rows(name, score, org_id))

    return results_upd