331 lines
12 KiB
Python
331 lines
12 KiB
Python
from collections import defaultdict
|
|
from collections import Counter
|
|
|
|
import Levenshtein
|
|
|
|
from sklearn.feature_extraction.text import CountVectorizer
|
|
from sklearn.metrics.pairwise import cosine_similarity
|
|
|
|
from functions_cluster import *
|
|
from create_input_cluster import *
|
|
|
|
with open('/Users/myrto/Documents/openAIRE/3. ror/dictionaries/dix_status.json', 'rb') as f:
|
|
dix_status = json.load(f)
|
|
|
|
def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG):
|
|
"""
|
|
Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
|
|
---> corrects special cases in the main map that follows
|
|
|
|
Args:
|
|
light_raw
|
|
l2 candidate_num: number of candidates.
|
|
l3 pairs_list: List of pairs. (s, x, score)
|
|
l4 m: mult
|
|
|
|
Returns:
|
|
List: Resulting list containing OpenAIRE names and their similarity scores.
|
|
"""
|
|
|
|
vectorizer = CountVectorizer()
|
|
univ_num = light_raw.lower().count('univ')
|
|
result = []
|
|
best = []
|
|
s = light_raw
|
|
|
|
for j in range(len(pairs_list)):
|
|
x = pairs_list[j][1]
|
|
|
|
if [x, pairs_list[j][2]] in result:
|
|
continue
|
|
|
|
if m[pairs_list[j][0]] == 1:
|
|
|
|
if is_contained('univ', x.lower()) and pairs_list[j][2] > simU:
|
|
result.append([x, pairs_list[j][2]])
|
|
elif pairs_list[j][2] > simG:
|
|
result.append([x, pairs_list[j][2]])
|
|
|
|
elif pairs_list[j][2] >= 0.98: # and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or is_contained("center", x.lower()) or is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list
|
|
result.append([pairs_list[j][1], 1])
|
|
|
|
else:
|
|
try:
|
|
if not is_contained("univ", x.lower()):
|
|
continue # Skip if x does not contain "university" or "univ"
|
|
|
|
# if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)) or (is_contained('hopital', x.lower()) and not is_contained('hopital', s)) or (not is_contained('hopital', x.lower()) and is_contained('hopital', s)):
|
|
# continue
|
|
s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
|
|
x_vector = vectorizer.transform([x]).toarray()
|
|
|
|
# Compute similarity between the vectors
|
|
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
|
if similarity > 0.1:
|
|
similarity_l = 1 - Levenshtein.distance(x, pairs_list[j][0]) / max(len(x), len(pairs_list[j][0]))
|
|
|
|
best.append([x, similarity, similarity_l]) #(similarity+similarity2)/2])
|
|
except:
|
|
KeyError
|
|
|
|
if best:
|
|
# max_numbers = defaultdict(float)
|
|
|
|
|
|
# Assuming best is a list of three-element lists
|
|
# Each element is (string, number1, number2)
|
|
max_numbers = defaultdict(float)
|
|
for item in best:
|
|
string, number1, number2 = item # Unpack the three elements
|
|
max_numbers[string] = max(max_numbers[string], number1)
|
|
|
|
reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]]
|
|
|
|
# Sort by number1 decreasingly and then by number2 in descending order
|
|
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
|
|
|
result = result + reduced_best
|
|
|
|
univ_list = []
|
|
other_list = []
|
|
|
|
for r in result:
|
|
if is_contained('univ', r[0]):
|
|
univ_list.append(r)
|
|
else:
|
|
other_list.append(r)
|
|
|
|
limit = min(univ_num, candidate_num)
|
|
|
|
if len(univ_list) > limit:
|
|
result = univ_list[:limit] + other_list
|
|
|
|
result_dict = {}
|
|
pairs_dict = {}
|
|
|
|
|
|
for l in pairs_list:
|
|
pairs_dict[l[1]] = l[2]
|
|
|
|
|
|
for p in result:
|
|
result_dict[p[0]] = pairs_dict[p[0]]
|
|
|
|
|
|
result_dict_list = [[y[0], result_dict[y[0]]] for y in result]
|
|
|
|
return result_dict_list
|
|
|
|
|
|
|
|
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
|
|
|
|
"""
|
|
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
|
|
|
|
Args:
|
|
m (int): The number of DOIs to check.
|
|
DF (DataFrame): The input DataFrame containing affiliation data.
|
|
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
|
|
simU (float): Similarity threshold for universities.
|
|
simG (float): Similarity threshold for non-universities.
|
|
|
|
Returns:
|
|
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
|
|
"""
|
|
df_list = input[1]
|
|
light_aff = input[0]
|
|
vectorizer = CountVectorizer()
|
|
|
|
lnamelist = list(dix_org.keys())
|
|
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
|
#pairs = []
|
|
result = {}
|
|
pairs = []
|
|
|
|
|
|
def get_keywords(filtered_list):
|
|
# Extract the "keywords" values from the dictionaries in filtered_list
|
|
keywords_list = [entry["keywords"] for entry in filtered_list]
|
|
|
|
return keywords_list
|
|
keywords = get_keywords(df_list)
|
|
|
|
|
|
for k,s in enumerate(keywords):
|
|
similar_k = []
|
|
pairs_k = []
|
|
|
|
if s in lnamelist:
|
|
similarity = 1
|
|
similar_k.append(similarity)
|
|
|
|
pairs_k.append((s,s,similarity,dix_org[s]))
|
|
pairs.append((s,s,similarity,dix_org[s]))
|
|
|
|
|
|
if k not in dix:
|
|
dix[k] = [s]
|
|
else:
|
|
dix[k].append(s)
|
|
else:
|
|
|
|
for x in lnamelist:
|
|
if is_contained(s, x):
|
|
|
|
x_vector = vectorizer.fit_transform([x]).toarray()
|
|
s_vector = vectorizer.transform([s]).toarray()
|
|
|
|
# Compute similarity between the vectors
|
|
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
|
if similarity > min(simU, simG):
|
|
if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
|
|
similar_k.append(similarity)
|
|
pairs_k.append((s,x,similarity,dix_org[x]))
|
|
pairs.append((s,x,similarity,dix_org[x]))
|
|
|
|
|
|
if k not in dix:
|
|
dix[k] = [x]
|
|
else:
|
|
dix[k].append(x)
|
|
elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
|
|
similar_k.append(similarity)
|
|
pairs_k.append((s,x,similarity,dix_org[x]))
|
|
pairs.append((s,x,similarity,dix_org[x]))
|
|
|
|
|
|
if k not in dix:
|
|
dix[k] = [x]
|
|
else:
|
|
dix[k].append(x)
|
|
|
|
elif is_contained(x, s):
|
|
if (is_contained('univ', s) and is_contained('univ', x)):
|
|
|
|
s_vector = vectorizer.fit_transform([s]).toarray()
|
|
x_vector = vectorizer.transform([x]).toarray()
|
|
|
|
# Compute similarity between the vectors
|
|
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
|
if similarity > simU: #max(0.82,sim):
|
|
similar_k.append(similarity)
|
|
pairs_k.append((s,x,similarity,dix_org[x]))
|
|
pairs.append((s,x,similarity,dix_org[x]))
|
|
|
|
if k not in dix:
|
|
dix[k] = [x]
|
|
else:
|
|
dix[k].append(x)
|
|
elif not is_contained('univ', s) and not is_contained('univ', x):
|
|
|
|
s_vector = vectorizer.fit_transform([s]).toarray()
|
|
x_vector = vectorizer.transform([x]).toarray()
|
|
|
|
# Compute similarity between the vectors
|
|
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
|
if similarity > simG: #max(0.82,sim):
|
|
similar_k.append(similarity)
|
|
pairs_k.append((s,x,similarity,dix_org[x]))
|
|
pairs.append((s,x,similarity,dix_org[x]))
|
|
|
|
if k not in dix:
|
|
dix[k] = [x]
|
|
else:
|
|
dix[k].append(x)
|
|
|
|
result[k] = pairs_k
|
|
|
|
multi = index_multiple_matchings(list(set(pairs)))
|
|
# need_check = list(set([i for i in range(len(multi)) if list(multi.values())[i]>1]))
|
|
# print('here', multi)
|
|
# need_check_keys = [keywords[i] for i in range(len(keywords)) if multi[keywords[i]]>1]
|
|
need_check_keys = []
|
|
for i in range(len(keywords)):
|
|
try:
|
|
if multi[keywords[i]]>1:
|
|
need_check_keys.append(keywords[i])
|
|
except:
|
|
pass
|
|
|
|
best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)
|
|
matched_org = [x[0] for x in best]
|
|
# best_o = []
|
|
# best_s = []
|
|
# best_result = []
|
|
# for x in best:
|
|
# best_o.append([x[i][0] for i in range(len(x))])
|
|
# best_s.append([round(x[i][1],2) for i in range(len(x))])
|
|
# num_mathced = [len(best_s[i]) for i in range(len(need_check))]
|
|
ids = [dix_org[x[0]] for x in best]
|
|
for i,x in enumerate(matched_org):
|
|
# id_list = []
|
|
if dix_mult[x] != 'unique':
|
|
if x in list(dix_city_ror.keys()):
|
|
match_found = False
|
|
|
|
for city in dix_city_ror[x]:
|
|
if city[0] in light_aff:
|
|
if city[0] not in x:
|
|
ids[i] = city[1]
|
|
match_found = True
|
|
break
|
|
else:
|
|
if light_aff.count(city[0]) >1:
|
|
ids[i] = city[1]
|
|
match_found = True
|
|
break
|
|
|
|
if not match_found:
|
|
for city in dix_city_ror[x]:
|
|
if city[0] in light_aff and city[0] not in x:
|
|
ids[i] = city[1]
|
|
break
|
|
|
|
if not match_found:
|
|
match_found3 = False
|
|
|
|
for country in dix_country_ror[x]:
|
|
if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
|
|
ids[i] = country[1]
|
|
match_found3 = True
|
|
break
|
|
|
|
if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
|
|
ids[i] = country[1]
|
|
match_found3 = True
|
|
break
|
|
|
|
elif country[0] in light_aff:
|
|
|
|
if country[0] not in x:
|
|
ids[i] = country[1]
|
|
match_found3 = True
|
|
break
|
|
|
|
if not match_found3:
|
|
for country in dix_country_ror[x]:
|
|
if country[0] in light_aff and country[0] in x:
|
|
ids[i] = country[1]
|
|
break
|
|
|
|
|
|
|
|
results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)]
|
|
results_upd = []
|
|
|
|
for r in results:
|
|
if dix_status[r[2]][0] == 'active':
|
|
r.append('active')
|
|
results_upd.append(r)
|
|
else:
|
|
if dix_status[r[2]][1] == '':
|
|
r.append(dix_status[r[2]][0])
|
|
results_upd.append(r)
|
|
else:
|
|
r.append(dix_status[r[2]][0])
|
|
results_upd.append(r)
|
|
results_upd.append([r[0],r[1], dix_status[r[2]][1], 'active'])
|
|
|
|
|
|
return results_upd |