Compare commits
7 Commits
| Author | SHA1 | Date |
|---|---|---|
|
|
30e88e87f0 | |
|
|
abd11209b9 | |
|
|
b274291443 | |
|
|
baceddf91f | |
|
|
1ecee1372f | |
|
|
2da4348884 | |
|
|
c9914a47d0 |
|
|
@ -1,6 +1,6 @@
|
|||
Metadata-Version: 2.4
|
||||
Name: affro
|
||||
Version: 2.2.2
|
||||
Version: 3.1.1
|
||||
Summary: A tool to resolve organization names to ROR or OpenOrgs IDs
|
||||
Home-page: https://code-repo.d4science.org/mkallipo/affRo
|
||||
Author: Myrto Kallipoliti
|
||||
|
|
|
|||
|
|
@ -10,17 +10,15 @@ affro.egg-info/dependency_links.txt
|
|||
affro.egg-info/top_level.txt
|
||||
affro/helpers/__init__.py
|
||||
affro/helpers/create_input.py
|
||||
affro/helpers/disambiguation.py
|
||||
affro/helpers/find_id.py
|
||||
affro/helpers/find_name.py
|
||||
affro/helpers/functions.py
|
||||
affro/helpers/matching.py
|
||||
affro/jsons/dix_categ.json
|
||||
affro/jsons/dix_city.json
|
||||
affro/jsons/dix_country.json
|
||||
affro/jsons/dix_country_legalnames.json
|
||||
affro/jsons/dix_id_country.json
|
||||
affro/jsons/dix_id_name.json
|
||||
affro/jsons/dix_mult.json
|
||||
affro/jsons/dix_org.json
|
||||
affro/jsons/dix_status.json
|
||||
affro/jsons/dix_id.json
|
||||
affro/jsons/dix_name.json
|
||||
affro/jsons/replacements.json
|
||||
affro/txts/city_names.txt
|
||||
affro/txts/country_names.txt
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
__version__ = "2.2.2"
|
||||
__version__ = "3.1.1"
|
||||
|
|
|
|||
250
affro/core.py
250
affro/core.py
|
|
@ -3,156 +3,150 @@ import sys
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.matching import *
|
||||
from affro.helpers.create_input import *
|
||||
import json
|
||||
from affro.helpers.find_name import *
|
||||
from affro.helpers.find_id import *
|
||||
from affro.helpers.disambiguation import *
|
||||
|
||||
|
||||
from . import __version__
|
||||
|
||||
VERSION = __version__
|
||||
|
||||
dix_org = load_json('jsons/dix_org.json')
|
||||
dix_mult = load_json('jsons/dix_mult.json')
|
||||
dix_city = load_json('jsons/dix_city.json')
|
||||
dix_country = load_json('jsons/dix_country.json')
|
||||
dix_status = load_json('jsons/dix_status.json')
|
||||
dix_id_name = load_json('jsons/dix_id_name.json')
|
||||
dix_id_country = load_json('jsons/dix_id_country.json')
|
||||
dix_id_name = load_json('jsons/dix_id_name.json')
|
||||
|
||||
dix_id = load_json('jsons/dix_id.json')
|
||||
dix_name = load_json('jsons/dix_name.json')
|
||||
|
||||
|
||||
dix_status_new = {k :[dix_status[k][0], dix_status[k][1].split(', ')] for k in dix_status}
|
||||
us_states = [
|
||||
"alabama", "alaska", "arizona", "arkansas", "california",
|
||||
"colorado", "conecticut", "delaware", "florida", "georgia",
|
||||
"hawaii", "idaho", "ilinois", "indiana", "iowa",
|
||||
"kansas", "kentucky", "louisiana", "maine", "maryland",
|
||||
"masachusets", "michigan", "minesota", "misisipi", "misouri",
|
||||
"montana", "nebraska", "nevada", "new hampshire", "new jersey",
|
||||
"new mexico", "new york", "north carolina", "north dakota", "ohio",
|
||||
"oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
|
||||
"south dakota", "tenesee", "texas", "utah", "vermont",
|
||||
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
|
||||
]
|
||||
def produce_result(input, simU, simG, limit):
|
||||
best_name = find_name(input, dix_name, simU, simG, limit)
|
||||
id_result = find_id(input, best_name, dix_name)
|
||||
result = disamb(input, id_result, dix_id)
|
||||
|
||||
|
||||
|
||||
def contains_us_state(text):
|
||||
text = text.lower()
|
||||
return any(state in text for state in us_states)
|
||||
|
||||
def find_ror(input, simU, simG, limit):
|
||||
light_aff = input[0]
|
||||
result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
|
||||
results_upd = []
|
||||
|
||||
for r in result:
|
||||
|
||||
if "openorgs" in r[2]:
|
||||
results_upd.append([r[1], 'openorgs', r[2], 'active', dix_id_country[r[2]]])
|
||||
|
||||
else:
|
||||
if dix_status_new[r[2]][0] == 'active':
|
||||
results_upd.append([r[1], 'ror', r[2], 'active', dix_id_country[r[2]]])
|
||||
else:
|
||||
if dix_status_new[r[2]][1][0] == '':
|
||||
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
|
||||
|
||||
|
||||
else:
|
||||
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0],dix_id_country[r[2]]])
|
||||
for link in (dix_status_new[r[2]][1]):
|
||||
results_upd.append([r[1], 'ror', link, 'active',dix_id_country[r[2]],dix_id_country[link]])
|
||||
|
||||
if len(results_upd) > len(set(description(light_aff)[1])):
|
||||
|
||||
|
||||
final_matching = []
|
||||
light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
|
||||
for id_ in results_upd:
|
||||
country = dix_id_country[id_[2]]
|
||||
if country == 'united states':
|
||||
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country == 'united kingdom':
|
||||
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif 'korea' in country:
|
||||
|
||||
if 'korea' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country in light_aff:
|
||||
final_matching.append(id_)
|
||||
|
||||
|
||||
if len(final_matching)>0:
|
||||
result_dict = [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION,'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in final_matching]
|
||||
return result_dict
|
||||
else:
|
||||
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]],'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif len(results_upd)>0:
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
|
||||
else:
|
||||
result_dict = []
|
||||
|
||||
return result_dict
|
||||
return result
|
||||
|
||||
|
||||
def run_affro(raw_aff_string):
|
||||
lucky_guess = clean_string_ror(raw_aff_string)
|
||||
lucky_guess = clean_string_lucky(raw_aff_string)
|
||||
# print(lucky_guess)
|
||||
try:
|
||||
if lucky_guess in dix_org:
|
||||
if dix_mult[lucky_guess] == "unique":
|
||||
if 'openorgs' in dix_org[lucky_guess]:
|
||||
if lucky_guess in dix_name:
|
||||
# print('lucky guess hit', lucky_guess)
|
||||
# print('lucky guess found', dix_name[lucky_guess])
|
||||
if len(dix_name[lucky_guess]) == 1:
|
||||
id_ = dix_name[lucky_guess][0]['id']
|
||||
name_ = dix_id[id_]['name']
|
||||
country_ = dix_id[id_]['country']
|
||||
status_ = dix_id[id_]['status']
|
||||
if 'openorgs' in id_:
|
||||
|
||||
return[{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
else:
|
||||
if dix_status_new[dix_org[lucky_guess]][0] == 'active':
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
elif dix_status_new[dix_org[lucky_guess]][1][0]== '':
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
if status_[0] == 'active':
|
||||
# print('active')
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
elif status_[0]== '':
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
else:
|
||||
res = [{'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
for successor in dix_status_new[dix_org[lucky_guess]][1]:
|
||||
res.append({'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]})
|
||||
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
for successor in status_[1]:
|
||||
if successor != '':
|
||||
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
|
||||
return res
|
||||
else:
|
||||
cand_ids = [
|
||||
key
|
||||
for _, key in dix_city[lucky_guess]
|
||||
if ("ror" in key and dix_status_new[key][0] == "active") or ("openorgs" in key)
|
||||
]
|
||||
num_countries = len(
|
||||
set(
|
||||
dix_id_country[x[1]]
|
||||
for x in dix_city[lucky_guess]
|
||||
if ("ror" in x[1] and dix_status_new[x[1]][0] == "active") or ("openorgs" in x[1])
|
||||
)
|
||||
)
|
||||
# print('multiple candidates')
|
||||
ids = [x['id'] for x in dix_name[lucky_guess]]
|
||||
cand_ids = [id for id in ids if is_first(id, lucky_guess) == 'y']
|
||||
# print('cand_ids', cand_ids)
|
||||
# pick the ror id where 'first' == 'y' (None if not found)
|
||||
if len(cand_ids) !=1:
|
||||
# print('secondary conditions')
|
||||
conditions = [
|
||||
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
|
||||
and dix_id[key]['top_level'][0] == 'y') \
|
||||
or ("openorgs" in key),
|
||||
|
||||
if len(cand_ids) == 1 or num_countries == 1:
|
||||
if 'openorgs' in dix_org[lucky_guess]:
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
|
||||
and dix_id[key]['parent'][0] == 'y') \
|
||||
or ("openorgs" in key),
|
||||
|
||||
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active") \
|
||||
or ("openorgs" in key)
|
||||
]
|
||||
|
||||
for cond in conditions:
|
||||
cand_ids = [key for key in ids if cond(key)]
|
||||
if cand_ids:
|
||||
# print('break')
|
||||
break
|
||||
|
||||
if len(cand_ids) == 0:
|
||||
# print('check result')
|
||||
result = produce_result(create_df_algorithm(raw_aff_string, 10), 0.42, 0.82, 500)
|
||||
|
||||
return result
|
||||
|
||||
# print('cand_ids',cand_ids)
|
||||
if len(cand_ids) == 1:# or num_countries == 1:
|
||||
id_ = cand_ids[0]
|
||||
# print('id',id_)
|
||||
name_ = dix_id[id_]['name']
|
||||
country_ = dix_id[id_]['country']
|
||||
status_ = dix_id[id_]['status']
|
||||
if 'openorgs' in id_:
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
else:
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
|
||||
else:
|
||||
return []
|
||||
if status_[0] == 'active':
|
||||
# print('active')
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
elif status_[0]== '':
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
else:
|
||||
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
for successor in status_[1]:
|
||||
if successor != '':
|
||||
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
|
||||
return res
|
||||
# return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country':country_}]
|
||||
|
||||
else:
|
||||
found = False
|
||||
for triplet in dix_name[lucky_guess]:
|
||||
if triplet['first'] == 'y':
|
||||
found = True
|
||||
id_ = triplet['id']
|
||||
name_ = dix_id[id_]['name']
|
||||
country_ = dix_id[id_]['country']
|
||||
status_ = dix_id[id_]['status']
|
||||
if 'openorgs' in id_:
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
else:
|
||||
if status_[0] == 'active':
|
||||
# print('active')
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
elif status_[0]== '':
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
else:
|
||||
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
for successor in status_[1]:
|
||||
if successor != '':
|
||||
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
|
||||
return res
|
||||
|
||||
if found == False:
|
||||
return []
|
||||
else:
|
||||
# print('No lucky guess, running algorithm...')
|
||||
result = find_ror(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
|
||||
|
||||
# print('lucky guess miss')
|
||||
result = produce_result(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
# Return some indication of an error, or log the row
|
||||
print(f"Error: {str(e)}")
|
||||
print(f"Error end: {str(e)}")
|
||||
print(raw_aff_string)
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def matchings_affro(aff_string):
|
||||
|
|
@ -168,7 +162,7 @@ def matchings_affro(aff_string):
|
|||
# Create the result as a tuple that matches matchings_schema
|
||||
result = []
|
||||
for matching in matchings:
|
||||
# Assuming 'matching' is a dictionary that contains 'provenance', 'version', 'pid', 'value', 'name', 'confidence', 'status', 'country'
|
||||
# Assuming 'matching' is a dictionary that contains 'provenance', 'affro', 'value', 'confidence', 'status'
|
||||
result.append((
|
||||
matching.get("provenance", None),
|
||||
matching.get("version", None),
|
||||
|
|
@ -178,7 +172,6 @@ def matchings_affro(aff_string):
|
|||
float(matching.get("confidence", None)),
|
||||
matching.get("status", None),
|
||||
matching.get("country", None)
|
||||
|
||||
))
|
||||
if len(result)>0:
|
||||
return result
|
||||
|
|
@ -189,5 +182,4 @@ def matchings_affro(aff_string):
|
|||
return ()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,9 @@
|
|||
from affro.helpers.functions import *
|
||||
|
||||
def valueToCategory(value):
|
||||
flag = 0
|
||||
|
||||
for k in categ_dicts:
|
||||
if k in value and categ_dicts[k] in categ_string.split('|'):
|
||||
flag = 1
|
||||
return flag
|
||||
return categ_dicts[k]
|
||||
|
||||
|
||||
# tokenization
|
||||
|
|
@ -28,21 +25,30 @@ protect = ['national univer ireland',
|
|||
'state univer',
|
||||
'rijksuniver',
|
||||
'rijks univer',
|
||||
'univer medical center'
|
||||
'univer medical center',
|
||||
'royal colege surgeons',
|
||||
'st patricks colege',
|
||||
'institu techn',
|
||||
'trinity colege',
|
||||
'st johns colege',
|
||||
'wiliam beaumont hospital'
|
||||
]
|
||||
|
||||
def create_df_algorithm(raw_aff_string, radius_u):
|
||||
clean_aff = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
|
||||
#print(0, clean_aff)
|
||||
countries_list = description(clean_aff)[1]
|
||||
aff_no_symbols_d = substrings_dict(reduce(clean_aff))
|
||||
#print(0.5, aff_no_symbols_d)
|
||||
substring_list = [replace_abbr_univ(x) for x in list(aff_no_symbols_d.values())]
|
||||
#print(1, substring_list)
|
||||
# for k, word in enumerate(substring_list):
|
||||
# print(word)
|
||||
# if word in protect and substring_list[k+1] in city_names:
|
||||
# print('y')
|
||||
# word = word + ', ' + substring_list[k+1]
|
||||
# substring_list[k] = word
|
||||
i = 0
|
||||
|
||||
# print(substring_list,'substring_list')
|
||||
while i < len(substring_list) - 1:
|
||||
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names): #substring_list[i+1] in city_names:
|
||||
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names+countries): #substring_list[i+1] in city_names:
|
||||
substring_list[i] = substring_list[i] + ' ' + substring_list[i+1]
|
||||
i = i+2
|
||||
continue
|
||||
|
|
@ -61,31 +67,18 @@ def create_df_algorithm(raw_aff_string, radius_u):
|
|||
i = i+1
|
||||
continue
|
||||
|
||||
# elif 'lab' in substring_list[i] and ('colege' in substring_list[i+1] or 'dep' in substring_list[i+1] or 'school' in substring_list[i+1]):
|
||||
# if not 'univ' in substring_list[i]: #'inst' in substring_list[i+1] or
|
||||
# substring_list.pop(i)
|
||||
# else:
|
||||
# i = i+1
|
||||
# continue
|
||||
|
||||
else:
|
||||
i += 1
|
||||
# print(1.4, substring_list)
|
||||
|
||||
|
||||
light_aff = (', '.join((substring_list)))
|
||||
# print(1.5, light_aff)
|
||||
|
||||
|
||||
substring_list = [x for x in substring_list if x.replace(' gmbh','') not in city_names+remove_list]
|
||||
# print(1.7,substring_list)
|
||||
|
||||
|
||||
substring_list0 = [shorten_keywords([x], radius_u) for x in substring_list if len(shorten_keywords([x],radius_u))>0]
|
||||
# print(2,substring_list0 )
|
||||
|
||||
substring_list1 = [inner for outer in substring_list0 for inner in outer]
|
||||
# print(3,substring_list1 )
|
||||
|
||||
aff_list = [{"index": i, "keywords": substring_list1[i], "category": valueToCategory(substring_list1[i])} for i in range(len(substring_list1))]
|
||||
|
||||
filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
|
||||
filtered_list = [entry for entry in aff_list if type(entry.get("category")) == str]
|
||||
|
||||
return [clean_aff, light_aff, filtered_list, countries_list]
|
||||
|
|
@ -0,0 +1,155 @@
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
from .. import __version__
|
||||
|
||||
VERSION = __version__
|
||||
|
||||
|
||||
# Lower-case names of the 50 US states, used for substring checks against
# affiliation text.  Doubled consonants are written collapsed
# ("conecticut", "misisipi", "tenesee").
# NOTE(review): presumably this mirrors the normalisation applied to the
# affiliation string before it reaches this module -- confirm against the
# cleaning helpers.
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "conecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "ilinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "masachusets", "michigan", "minesota", "misisipi", "misouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
    "south dakota", "tenesee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming",
]
|
||||
|
||||
|
||||
def contains_us_state(text):
    """Return True if any entry of ``us_states`` occurs in ``text``.

    The comparison is a plain substring test on the lower-cased text.
    """
    lowered = text.lower()
    for state_name in us_states:
        if state_name in lowered:
            return True
    return False
|
||||
|
||||
# def get_city(name, dix_name):
|
||||
# return {x['city'] : x['id'] for x in dix_name[name]}
|
||||
|
||||
|
||||
def convert_to_result(id_list_, dix_id):
    """Turn matched id rows into affro result dictionaries.

    Args:
        id_list_: rows of the form ``[label, score, id]``; only the score
            (index 1) and the id (index 2) are read.
        dix_id: mapping id -> record.  ``'name'``, ``'country'`` and
            ``'status'`` (``[primary, successors]``) are read with ``.get``.

    Returns:
        A list of dicts with the keys provenance, version, pid, value,
        name, confidence, status and country.  Rows whose id has no record
        in ``dix_id`` are dropped.  A non-active ROR record is followed by
        one 'active' entry per non-empty successor id, unless its first
        successor is the empty string.
    """

    def entry_for(kind, identifier, label, confidence, state, nation):
        return {
            'provenance': 'affro',
            'version': VERSION,
            'pid': kind,
            'value': identifier,
            'name': label,
            'confidence': confidence,
            'status': state,
            'country': nation
        }

    entries = []
    for row in id_list_:
        # The score is capped at 1.0 before it is reported as confidence.
        confidence = min(row[1], 1.0)
        identifier = row[2]

        record = dix_id.get(identifier)
        if record is None:
            # No metadata for this id: nothing to report.
            continue

        label = record.get('name')
        nation = record.get('country')
        status_pair = record.get('status', [])
        primary = status_pair[0] if len(status_pair) > 0 else None
        successors = status_pair[1] if len(status_pair) > 1 else []

        # OpenOrgs ids are always reported as active; ROR ids only when
        # their primary status says so.
        is_openorgs = "openorgs" in identifier
        if is_openorgs or primary == 'active':
            kind = 'openorgs' if is_openorgs else 'ror'
            entries.append(entry_for(kind, identifier, label, confidence, 'active', nation))
            continue

        # Non-active ROR record: report it with its own status ...
        entries.append(entry_for('ror', identifier, label, confidence, primary, nation))

        # ... and stop there when the successor list starts with ''.
        if successors and successors[0] == '':
            continue

        # Otherwise add every non-empty successor with its own metadata.
        for successor in successors:
            if not successor:
                continue
            linked = dix_id.get(successor, {})
            entries.append(
                entry_for('ror', successor, linked.get('name'), confidence,
                          'active', linked.get('country'))
            )

    return entries
|
||||
|
||||
def count_active(items):
    """Return how many entries of ``items`` have ``status`` equal to "active"."""
    total = 0
    for entry in items:
        if entry.get("status") == "active":
            total += 1
    return total
|
||||
|
||||
def disamb(input, id_list_, dix_id):
    """Filter matched ids by the country evidence in the affiliation.

    Args:
        input: sequence whose first item is the cleaned affiliation string.
        id_list_: rows ``[label, score, id]`` as accepted by
            ``convert_to_result``.
        dix_id: mapping id -> record, passed through to
            ``convert_to_result``.

    Returns:
        A list of result dicts.  With a single candidate the converted
        result is returned as is.  With no country detected in the
        affiliation, results are restricted to the countries of entries
        whose name contains 'Uni' (if any).  When there are more active
        results than detected countries, only results whose country is
        supported by the affiliation text are kept; if that leaves nothing,
        all results are returned.
    """
    if not id_list_:
        return []

    clean_aff = input[0]
    result_dict = convert_to_result(id_list_, dix_id)

    # A single candidate needs no disambiguation.
    if len(id_list_) == 1:
        return result_dict

    # Index 1 of description() holds the countries found in the affiliation.
    countries_in_aff = description(clean_aff)[1]

    if len(countries_in_aff) == 0:
        # convert_to_result reads names with .get(), so a name may be None.
        countries_uni = [
            res['country'] for res in result_dict
            if 'Uni' in (res['name'] or '')
        ]
        if countries_uni:
            return [res for res in result_dict if res['country'] in countries_uni]
        return result_dict

    if count_active(result_dict) > len(set(countries_in_aff)):
        aff_tokens = [clean_string_ror(x) for x in set(clean_aff.split())]
        final_matching = []
        for res in result_dict:
            country = res['country']
            if country is None:
                # No country metadata: nothing in the text can support it.
                continue
            if country == 'united states':
                supported = (
                    'united states' in clean_aff
                    or 'usa' in aff_tokens
                    or contains_us_state(clean_aff)
                )
            elif country == 'united kingdom':
                supported = 'united kingdom' in clean_aff or 'uk' in aff_tokens
            elif 'korea' in country:
                supported = 'korea' in aff_tokens
            else:
                supported = country in clean_aff
            if supported:
                final_matching.append(res)

        if final_matching:
            return final_matching

    return result_dict
|
||||
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
|
||||
# Keywords whose category is 'Specific' or 'Acronyms'.
specific = [kw for kw in categ_dicts if categ_dicts[kw] in ('Specific', 'Acronyms')]

# Every country starts with itself as its only synonym; the entries below
# replace (or add) the synonym lists for a handful of countries.
country_synonyms = dict((name, [name]) for name in countries)
country_synonyms.update({
    "united states": ["united states", "u.s.a.", "usa", "usa.","states"],
    "germany": ["germany","deutschland"],
    "united kingdom": ["united kingdom", "u.k.", "uk", "uk.","kingdom","england"],
    "turkey": ["turkey","turkiye", "cyprus"],
    "china": ["china", "prc","chinese"],
    "ireland": ["eire", "ireland"],
    "south korea": ["south korea", "korea"],
})
|
||||
|
||||
# Countries for which find_id looks for explicit synonyms in the affiliation
# text (the names must match the 'country' values compared there).
special_countries = {'united states', 'united kingdom', 'germany', 'china', 'turkey'}
|
||||
|
||||
|
||||
|
||||
|
||||
def keep_highest_score(data):
    """Keep one row per distinct last element, preferring the highest score.

    Rows are grouped by their last item; within a group the row with the
    greatest second item (index 1) wins.  On ties the row seen first is
    kept.  Groups are returned in the order their key first appeared.
    """
    winners = {}
    for row in data:
        incumbent = winners.get(row[-1])
        if incumbent is None or row[1] > incumbent[1]:
            winners[row[-1]] = row
    return [*winners.values()]
|
||||
|
||||
|
||||
def find_id(input, best_names, dix_name):
    """Resolve each candidate organisation name to one or more ids.

    Args:
        input: sequence whose first two items are the cleaned affiliation
            string (``input[0]``) and the "light" affiliation string
            (``input[1]``).
        best_names: rows whose first item is a key of ``dix_name`` and whose
            second item is the confidence of that name.
        dix_name: mapping name -> list of records; each record is read
            through the keys ``'id'``, ``'city'``, ``'country'`` and
            ``'first'``.

    Returns:
        A list of ``[name, confidence, id]`` rows, reduced by
        ``keep_highest_score`` to one row per id.

    NOTE(review): the nesting of this function was reconstructed from a
    flattened diff view -- confirm against the repository.  ``dix_id`` is
    read as a module-level name, it is not a parameter; confirm that it is
    in scope in this module.
    """
    clean_aff = input[0]
    light_aff = input[1]
    id_list = []

    for org_list in best_names:
        org = org_list[0]
        conf = org_list[1]
        records = dix_name[org]

        if len(records) == 1:
            id_ = records[0]['id']
            city_ = records[0]['city']
            country_ = records[0]['country']
            # A unique name is dropped only when nothing supports it: no
            # city or country synonym in the light affiliation, no
            # 'univ'/'inst'/'national' in the name, and a category other
            # than Company/Acronyms/Specific.
            unsupported = (
                city_ not in light_aff
                and not set(country_synonyms[country_]) & set(light_aff.split())
                and 'univ' not in org
                and 'inst' not in org
                and 'national' not in org
                and valueToCategory(org) not in ['Company', 'Acronyms', 'Specific']
            )
            if not unsupported:
                id_list.append([org, conf, id_])
            continue

        match_found = False

        # 1. City evidence.  Every record whose city occurs in the
        #    affiliation is kept; when the city is also part of the name it
        #    must occur more than once in the affiliation.
        for rec in records:
            city_ = rec['city']
            id_ = rec['id']
            if city_ in clean_aff and (city_ not in org or clean_aff.count(city_) > 1):
                id_list.append([org, conf, id_])
                match_found = True

        # 2. Explicit synonym of one of the special countries.
        if not match_found:
            countries_ids = {rec['country'] for rec in records}
            if countries_ids & special_countries:
                # Same for every record, so built once per name.
                text = clean_aff.lower()
                tokens = {x.replace(',', '') for x in text.split()}
                for rec in records:
                    country_ = rec['country']
                    id_ = rec['id']
                    if ((country_ == 'united states' and ('united states' in text or {'usa', 'usa.'} & tokens or 'u.s.a.' in text)) or
                            (country_ == 'germany' and ('deutschland' in text)) or
                            (country_ == 'united kingdom' and ('united kingdom' in text or ({'uk', 'uk.'} & tokens) or 'u.k.' in text)) or
                            (country_ == 'turkey' and ('turkiye' in text)) or
                            (country_ == 'china' and ('chinese' in text or 'prc' in text))):
                        id_list.append([org, conf, id_])
                        match_found = True
                        break

        # 3. First word of the country occurs in the affiliation and the
        #    country is not part of the name itself.
        if not match_found:
            for rec in records:
                country_ = rec['country']
                id_ = rec['id']
                if country_.split()[0] in clean_aff:
                    if country_ not in org:
                        id_list.append([org, conf, id_])
                        match_found = True
                        break

        # 4. Country occurs both in the affiliation and in the name
        #    (all such records are kept).
        if not match_found:
            for rec in records:
                country_ = rec['country']
                id_ = rec['id']
                if country_ in clean_aff and country_ in org:
                    id_list.append([org, conf, id_])
                    match_found = True

        # 5. Names containing a 'Specific'/'Acronyms' keyword: take the
        #    first record flagged 'top_level', else the first flagged
        #    'parent'.  Both flags are real conditions: a record is only
        #    appended when its flag equals 'y'.
        if not match_found and any(sp in org for sp in specific):
            for flag in ('top_level', 'parent'):
                hit = next(
                    (rec for rec in records if dix_id[rec['id']][flag] == 'y'),
                    None,
                )
                if hit is not None:
                    id_list.append([org, conf, hit['id']])
                    match_found = True
                    break

        # 6. Last resort: the record marked first == 'y', but never for
        #    department / laboratory names.
        if not match_found and 'department' not in org and 'labora' not in org:
            for rec in records:
                if rec['first'] == 'y':
                    id_list.append([org, conf, rec['id']])
                    break

    return keep_highest_score(id_list)
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
from affro.helpers.matching import *
|
||||
|
||||
def find_name(input, dix_name, simU, simG, limit):
    """Pick the best-matching organization names for one affiliation.

    Each keyword of the affiliation is first looked up verbatim in
    ``dix_name``; when that fails, ``find_candidate`` is asked for similar
    names. Keywords with a single match are accepted as they are, keywords
    with several matches are ranked with ``best_sim_score`` and de-duplicated
    with ``unique_subset``.

    Args:
        input: Indexable with four items: ``input[0]`` is the cleaned
            affiliation string, ``input[1]`` a lighter version of it,
            ``input[2]`` a list of dicts each holding a ``"keywords"``
            string, and ``input[3]`` the country list given to
            ``get_candidates``.
        dix_name: Mapping from organization name to a list of records; the
            first record's ``'id'`` and ``'country'`` are read here.
        simU: Similarity threshold forwarded for university names.
        simG: Similarity threshold forwarded for all other names.
        limit: Forwarded unchanged to ``find_candidate``.

    Returns:
        list: ``[name, score]`` entries; empty when nothing matched.
    """
    clean_aff = input[0]
    light_aff = input[1].replace(' gmbh', ' ').strip()
    df_list = input[2]
    countries_list = input[3]

    # {keyword index: [matched names]}; also mutated by find_candidate.
    dix = {}
    pairs = []

    keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
    candidates = get_candidates(countries_list)

    # The per-keyword length guard already covers the "single, one-character
    # keyword" and "no keywords" cases, so no outer length check is needed.
    for k, s in enumerate(keywords):
        if len(s) <= 1 or s in countries:
            continue

        try:
            # Exact hit: the keyword is itself a known organization name.
            first_record = dix_name[s][0]
            pairs_k = [(s, s, 1, first_record['id'], first_record['country'])]
        except (KeyError, IndexError, TypeError):
            # No usable exact entry: fall back to similarity search.
            try:
                pairs_k = find_candidate(s, k, dix, simU, simG, candidates, limit)
            except Exception:
                # Best effort: a failing search means "no match" for this keyword.
                pairs_k = []
        else:
            dix.setdefault(k, []).append(s)

        if len(pairs_k) > 0:
            pairs.append(pairs_k)

    multi = index_multiple_matchings(pairs)

    need_check_keys = []
    ready_keys = []
    ready_best = []
    for keyword in multi:
        try:
            if multi[keyword] > 1:
                # Several candidates: defer to the scoring step below.
                need_check_keys.append(keyword)
            else:
                for p in pairs:
                    if keyword in p[0]:
                        if p[0][1] not in ready_keys:
                            ready_keys.append(p[0][1])
                            ready_best.append([p[0][1], p[0][2]])
        except Exception as e:
            print('ERROR, find_name', e)

    pairs_check = [pair for pair in pairs if pair[0][0] in need_check_keys]

    if len(need_check_keys) > 0:
        best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
        # Map every ranked name to the id of its first record so that
        # unique_subset can drop names resolving to the same organization.
        best1 = {x[0]: dix_name[x[0]][0]['id'] for x in best0}
        best01 = unique_subset(best0, best1)
        best = best01 + ready_best
    else:
        best = ready_best

    return best
|
||||
|
|
@ -28,17 +28,10 @@ def load_txt(relative_path, package="affro"):
|
|||
with full_path.open("r", encoding="utf-8") as file:
|
||||
return [line.strip() for line in file]
|
||||
|
||||
|
||||
|
||||
#categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Museum|Government|Company'
|
||||
categ_string = 'Academia|Hospitals|Foundations|Specific|Government|Company|Acronyms'
|
||||
|
||||
dix_org = load_json('jsons/dix_org.json')
|
||||
dix_city = load_json('jsons/dix_city.json')
|
||||
dix_country = load_json('jsons/dix_country.json')
|
||||
dix_mult = load_json('jsons/dix_mult.json')
|
||||
|
||||
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
|
||||
|
||||
us_states = [
|
||||
"alabama", "alaska", "arizona", "arkansas", "california",
|
||||
"colorado", "conecticut", "delaware", "florida", "georgia",
|
||||
|
|
@ -52,6 +45,10 @@ us_states = [
|
|||
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
|
||||
]
|
||||
|
||||
dix_name = load_json('jsons/dix_name.json')
|
||||
|
||||
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
|
||||
|
||||
def replace_double_consonants(text):
|
||||
# This regex pattern matches any double consonant
|
||||
pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
|
||||
|
|
@ -59,18 +56,32 @@ def replace_double_consonants(text):
|
|||
result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
|
||||
return result
|
||||
|
||||
|
||||
#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und','di']
|
||||
|
||||
|
||||
def remove_stop_words(text):
    """Drop stop words from ``text`` while keeping comma separators.

    The text is split on whitespace. A word found in the module-level
    ``stop_words`` list is removed; when such a word carries a trailing
    comma, only the comma is kept so the segment boundary is not lost.

    Args:
        text (str): The input string.

    Returns:
        str: The remaining words joined by single spaces, with any space
        directly before a comma removed.
    """
    kept = []

    for word in text.split():
        if word.endswith(","):
            core = word[:-1]  # the word without its trailing comma
            # A stop word followed by a comma leaves just the comma behind.
            kept.append(core + "," if core not in stop_words else ",")
        elif word not in stop_words:
            kept.append(word)

    # A bare "," token would otherwise be rendered as " ," after joining.
    return " ".join(kept).replace(" ,", ",")
|
||||
|
||||
|
||||
stop_words = load_txt('txts/stop_words.txt')
|
||||
|
||||
dix_id_country = load_json('jsons/dix_id_country.json')
|
||||
dix_id = load_json('jsons/dix_id.json')
|
||||
|
||||
categ_dicts = load_json('jsons/dix_categ.json')
|
||||
replacements = load_json('jsons/replacements.json')
|
||||
|
|
@ -82,6 +93,10 @@ stop_words.remove('at')
|
|||
university_terms = [replace_double_consonants(x) for x in load_txt('txts/university_terms.txt')]
|
||||
city_names = [replace_double_consonants(x) for x in load_txt('txts/city_names.txt')]
|
||||
|
||||
def is_first(id, name):
    """Return the ``'first'`` field of the ``dix_name[name]`` record with this id.

    Records are scanned in order and the first one whose ``'id'`` equals
    ``id`` wins. ``None`` is returned when no record matches.
    """
    return next(
        (record['first'] for record in dix_name[name] if record['id'] == id),
        None,
    )
|
||||
|
||||
|
||||
def get_candidates(country_list):
|
||||
|
|
@ -89,7 +104,7 @@ def get_candidates(country_list):
|
|||
cand = [dix_country_legalnames[country] for country in country_list if country in dix_country_legalnames]
|
||||
return list(set([item for sublist in cand for item in sublist]))
|
||||
else:
|
||||
return list(dix_org.keys())
|
||||
return list(dix_name.keys())
|
||||
|
||||
|
||||
def is_contained(s, w):
|
||||
|
|
@ -109,6 +124,11 @@ def is_contained(s, w):
|
|||
return False # Return False immediately
|
||||
return True # If all words from 's' are found in 'w', return True
|
||||
|
||||
def split_sub(s: str) -> str:
    """Separate 'univer' from a directly following sub-unit word with a comma.

    Whenever the whole word ``univer`` is followed by whitespace and one of
    the whole words ``department``, ``faculty`` or ``institu`` (matched
    case-insensitively), the whitespace between them is replaced by ``', '``.
    """
    parent_then_unit = re.compile(
        r'\b((?:univer))\s+(department|faculty|institu)\b',
        flags=re.IGNORECASE,
    )

    def _join_with_comma(match):
        return match.group(1) + ', ' + match.group(2)

    return parent_then_unit.sub(_join_with_comma, s)
|
||||
|
||||
|
||||
def starts_with_any(string, prefixes):
|
||||
"""
|
||||
|
|
@ -158,39 +178,37 @@ def replace_roman_numerals(text):
|
|||
|
||||
def insert_space_between_lower_and_upper(s):
    """Insert a space wherever a lowercase letter is directly followed by an uppercase one.

    Known mixed-case names (matched case-sensitively) are shielded from the
    split and come back in lowercase. A capital letter right after ``Mc`` is
    never separated from it.

    Parameters:
        s (str): The input string.

    Returns:
        str: The string with spaces inserted and protected names lowercased.
    """
    protected = [
        'DePaul',
        'AstraZeneca',
        'BioNTech',
        'GlaxoSmithKline',
        'LifeWatch',
        'SoBigData',
        'GmbH',
        'gGmbH',
        'gmbH',
    ]

    # Swap each protected name for a placeholder that contains no
    # lowercase-then-uppercase pair, remembering its lowercase form.
    placeholders = {}
    for i, word in enumerate(protected):
        key = f"__PROT_{i}__"
        s = s.replace(word, key)
        placeholders[key] = word.lower()

    # Split at every lowercase/uppercase boundary ...
    s = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
    # ... then re-join names such as "Mc Donald" that the split pulled apart.
    s = re.sub(r'(Mc) ([A-Z])', r'\1\2', s)

    # Put the protected names back, in lowercase.
    for key, lower_word in placeholders.items():
        s = s.replace(key, lower_word)

    return s
|
||||
|
||||
|
||||
|
||||
|
|
@ -216,7 +234,7 @@ def replace_abbr_univ(token):
|
|||
elif token == "u " + city:
|
||||
return "univer " + city
|
||||
elif token == "tu " + city:
|
||||
return "technical univer " + city
|
||||
return "techn univer " + city
|
||||
else:
|
||||
return token
|
||||
|
||||
|
|
@ -224,7 +242,8 @@ def replace_abbr_univ(token):
|
|||
def remove_parentheses(text):
    """Delete every innermost ``(...)`` group, brackets included, from ``text``.

    Only groups that contain no further parentheses are removed, so the
    outer pair of a nested group is left in place.
    """
    innermost_group = re.compile(r'\([^()]*\)')
    return innermost_group.sub('', text)
|
||||
|
||||
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik']
|
||||
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik'] + [s.title() for s in countries] + countries
|
||||
|
||||
word_pattern = "|".join(map(re.escape, L))
|
||||
|
||||
def process_parentheses(text):
|
||||
|
|
@ -239,16 +258,15 @@ def process_parentheses(text):
|
|||
Returns:
|
||||
str: The modified string after processing parentheses.
|
||||
"""
|
||||
|
||||
text = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text)
|
||||
text_lower = text.lower()
|
||||
text_lower = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text_lower)
|
||||
|
||||
# Replace `(` with `,` and `)` with `,` if a word from L is inside
|
||||
text = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text)
|
||||
text_lower = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text_lower)
|
||||
|
||||
return text
|
||||
return text_lower
|
||||
|
||||
|
||||
|
||||
|
||||
def replace_comma_spaces(text):
|
||||
return text.replace(' ', ' ').replace(' , ', ', ')
|
||||
|
|
@ -313,6 +331,7 @@ def replace_newlines_with_space(text: str, repl: str = " ") -> str:
|
|||
|
||||
return cleaned
|
||||
|
||||
|
||||
def substrings_dict(string):
|
||||
"""
|
||||
Processes a given string by performing the following transformations:
|
||||
|
|
@ -361,7 +380,8 @@ def substrings_dict(string):
|
|||
modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bteknologi\b', 'technology', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bpolitehnica\b', 'polytechnic', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bpolite\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bpolyte\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
|
||||
|
||||
modified_value = re.sub(r'\btechn\w*', 'techn', modified_value, flags=re.IGNORECASE)
|
||||
#modified_value = re.sub(r'techno\w*', 'techno', modified_value, flags=re.IGNORECASE)
|
||||
|
|
@ -375,16 +395,22 @@ def substrings_dict(string):
|
|||
index += 1
|
||||
|
||||
# Add the original substring to the dictionary
|
||||
# else:
|
||||
# dict_string[index] = value.lower().strip()
|
||||
# index += 1
|
||||
|
||||
|
||||
return dict_string
|
||||
|
||||
def split_country(text, country_names=None):
    """Set a trailing country name off from the rest of ``text`` with a comma.

    When the last space-separated word of ``text`` is a known country and the
    word before it does not start with ``univ``, the text is returned as
    ``"<everything before>, <country in lowercase>"``. In every other case
    the text is returned unchanged.

    Args:
        text: The string to inspect. Non-string values are returned as is.
        country_names: Optional collection of lowercase country names to
            test against; defaults to the module-level ``countries``.

    Returns:
        The possibly comma-separated string, or ``text`` itself.
    """
    if country_names is None:
        country_names = countries

    if not isinstance(text, str):
        return text

    words = text.split(' ')
    if len(words) < 2:
        # A lone word has nothing in front of it to split from.
        return text

    last_word = words[-1].lower()
    # Keep names like "univer <country>" intact: there the country is part
    # of the organization name rather than a location suffix.
    if last_word in country_names and not words[-2].lower().startswith('univ'):
        return " ".join(words[:-1]) + ", " + last_word

    return text
|
||||
|
||||
|
||||
def clean_string_ror(input_string):
|
||||
def clean_string_lucky(input_string):
|
||||
|
||||
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
|
||||
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
|
||||
|
||||
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
|
||||
result = result.replace(' and ',' ')
|
||||
|
|
@ -402,7 +428,8 @@ def clean_string_ror(input_string):
|
|||
|
||||
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
|
||||
'universitatspital', 'universitatskliniken', 'universitetshospital',
|
||||
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik'
|
||||
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
|
||||
'universiteitsmuseum'
|
||||
}
|
||||
|
||||
result = replace_acronyms(result).replace('.', ' ')
|
||||
|
|
@ -425,7 +452,65 @@ def clean_string_ror(input_string):
|
|||
|
||||
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolitehnica\b', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
|
||||
# result = re.sub(r'techno\w*', 'techno', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'scien\w*', 'scien', result, flags=re.IGNORECASE)
|
||||
# result = re.sub(r'\bsaint\b', 'st', result, flags=re.IGNORECASE)
|
||||
|
||||
return result.strip()
|
||||
|
||||
|
||||
|
||||
def clean_string_ror(input_string):
|
||||
|
||||
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(fully_unescape(input_string.replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
|
||||
|
||||
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
|
||||
result = result.replace(' and ',' ')
|
||||
|
||||
|
||||
# Remove characters that are not from the Latin alphabet, or allowed punctuation
|
||||
result = remove_multi_digit_numbers(replace_comma_spaces(re.sub(r'[^a-zA-Z0-9\s,;/:.\-\—]', '', result).strip()))
|
||||
|
||||
# Restore the " - " sequence from the placeholder
|
||||
#result = result.replace(placeholder, " – ")
|
||||
result = result.replace(':',' ').replace(';',' ').replace('-',' ').replace('—',' ').replace(',',' ')
|
||||
# Replace consecutive whitespace with a single space
|
||||
|
||||
|
||||
|
||||
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
|
||||
'universitatspital', 'universitatskliniken', 'universitetshospital',
|
||||
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
|
||||
'universiteitsmuseum'
|
||||
}
|
||||
|
||||
result = replace_acronyms(result).replace('.', ' ')
|
||||
result = re.sub(r'\s+', ' ', result)
|
||||
|
||||
# Replace consecutive whitespace with a single space
|
||||
if not any(term in result.lower() for term in university_terms):
|
||||
|
||||
result = re.sub(r'universi\w*', 'univer', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bsaint\b', 'st', result,flags=re.IGNORECASE)
|
||||
result = re.sub(r'institu\w*', 'institu', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'labora\w*', 'labora', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'centre\b', 'center', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'centrum\b', 'center', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'hopital\b', 'hospital', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'hospital(?!s)\w*', 'hospital', result, flags=re.IGNORECASE)
|
||||
|
||||
#result = re.sub(r'centro\b', 'center', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
|
||||
|
|
@ -436,13 +521,13 @@ def clean_string_ror(input_string):
|
|||
return result.strip()
|
||||
|
||||
def clean_string(input_string):
|
||||
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", " ")))))).strip()
|
||||
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(insert_space_between_lower_and_upper(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
|
||||
|
||||
# result = re.sub(r'(?<! )[–—-](?! )', ' ', input_string)
|
||||
|
||||
# print('h',input_string)
|
||||
|
||||
result = remove_stop_words(replace_double_consonants(replace_roman_numerals(insert_space_between_lower_and_upper(input_string).lower())))
|
||||
result = remove_stop_words(replace_double_consonants(replace_roman_numerals((input_string).lower())))
|
||||
|
||||
|
||||
# Remove characters that are not from the Latin alphabet, or allowed punctuation
|
||||
|
|
@ -458,12 +543,12 @@ def clean_string(input_string):
|
|||
#result = replace_roman_numerals(remove_stop_words(insert_space_between_lower_and_upper(result).lower()))
|
||||
|
||||
|
||||
return result.strip() # Strip leading/trailing spaces
|
||||
return split_country(result.strip()) # Strip leading/trailing spaces
|
||||
|
||||
def description(aff_string):
|
||||
aff_string = aff_string.replace('turkiye', 'turkey')
|
||||
aff_string = aff_string.replace('turkiye', 'turkey').lower()
|
||||
aff_string = aff_string.replace('kirgizistan', 'kyrgyzstan')
|
||||
|
||||
|
||||
descr = []
|
||||
countries_ = []
|
||||
words = re.split(r'[ ,;:/]+', aff_string)
|
||||
|
|
@ -473,16 +558,15 @@ def description(aff_string):
|
|||
# if w in city_names:
|
||||
# descr.append('city')
|
||||
w = re.sub(r'[^A-Za-z\s]', '', w)
|
||||
|
||||
if replace_acronyms(w) in countries:
|
||||
descr.append('country')
|
||||
countries_.append(w)
|
||||
|
||||
|
||||
if replace_acronyms(w) in us_states:
|
||||
descr.append('country')
|
||||
countries_.append('usa')
|
||||
|
||||
elif w in ['univer', 'institu', 'hospital', 'labora']:
|
||||
|
||||
elif w in ['univer', 'institu', 'hospital', 'labora', 'colege']:
|
||||
|
||||
descr.append('basic_key')
|
||||
elif w == 'and':
|
||||
|
|
@ -531,12 +615,12 @@ def split_and(string):
|
|||
tok_no_sl1 = ' '.join(token.replace('-', ' ').split())
|
||||
tok_no_sl2 = ' '.join(token.replace('—', ' ').split())
|
||||
tok_no = ' '.join(token.replace(' and ', ' ').replace(' at ', ' ').replace(' an ', ' ').replace('-', ' ').replace('—', ' ').split())
|
||||
if tok_no in dix_org:
|
||||
if tok_no in dix_name:
|
||||
token = tok_no
|
||||
|
||||
|
||||
else:
|
||||
if tok_no_and not in dix_org:
|
||||
if tok_no_and not in dix_name:
|
||||
# Store once instead of calling multiple times
|
||||
|
||||
if is_subsequence(replace_sequence, token_description):# and token.split(' and ', ' ') not in dix_org:
|
||||
|
|
@ -547,20 +631,20 @@ def split_and(string):
|
|||
else:
|
||||
token = tok_no_and
|
||||
|
||||
if tok_no_at not in dix_org:
|
||||
if tok_no_at not in dix_name:
|
||||
token = ' '.join(token.replace(' at ', ', ').split())
|
||||
else:
|
||||
token = tok_no_at
|
||||
|
||||
if tok_no_an not in dix_org:
|
||||
if tok_no_an not in dix_name:
|
||||
token = ' '.join(token.replace(' an ', ', ').split())
|
||||
else:
|
||||
token = tok_no_an
|
||||
if tok_no_sl1 not in dix_org:
|
||||
if tok_no_sl1 not in dix_name:
|
||||
token = ' '.join(token.replace('-', ',').split())
|
||||
else:
|
||||
token = tok_no_sl1
|
||||
if tok_no_sl2 not in dix_org:
|
||||
if tok_no_sl2 not in dix_name:
|
||||
token = ' '.join(token.replace('—', ',').split())
|
||||
else:
|
||||
token = tok_no_sl2
|
||||
|
|
@ -577,10 +661,10 @@ def reduce(light_aff):
|
|||
aff_no_symbols_d = substrings_dict(light_aff)
|
||||
substring_list = list(aff_no_symbols_d.values())
|
||||
#light_aff_final = ', '.join((substring_list))
|
||||
# print('h', substring_list)
|
||||
# print('h', substring_list)
|
||||
light_aff_final = split_and(', '.join((substring_list)))
|
||||
# print('th', light_aff_final)
|
||||
return light_aff_final
|
||||
# print('th', light_aff_final)
|
||||
return split_sub(light_aff_final)
|
||||
|
||||
|
||||
def unique_subset(L, D):
|
||||
|
|
@ -615,6 +699,7 @@ def str_radius_u(string, radius_u):
|
|||
return result
|
||||
|
||||
|
||||
sp_specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' and ' ' in k]
|
||||
|
||||
def str_radius_spec(string):
|
||||
spec = False
|
||||
|
|
@ -626,16 +711,32 @@ def str_radius_spec(string):
|
|||
except:
|
||||
pass
|
||||
if spec == False:
|
||||
return string
|
||||
|
||||
for x in sp_specific:
|
||||
if x in string:# or categ_dicts[x] == 'Acronyms':
|
||||
spec = True
|
||||
# print('CHECK',x)
|
||||
return x
|
||||
if spec ==False:
|
||||
return string
|
||||
#
|
||||
# def str_radius_spec(string):
|
||||
# spec = False
|
||||
# for x in only_specific:
|
||||
# if x in string:# or categ_dicts[x] == 'Acronyms':
|
||||
# spec = True
|
||||
# return x
|
||||
# if spec ==False:
|
||||
# return string
|
||||
|
||||
|
||||
|
||||
def shorten_keywords(affiliations_simple, radius_u):
|
||||
affiliations_simple_n = []
|
||||
|
||||
for aff in affiliations_simple:
|
||||
# print('check aff', aff)
|
||||
if aff in dix_org:
|
||||
# print('check aff', aff)
|
||||
if aff in dix_name:
|
||||
# print('in dix')
|
||||
affiliations_simple_n.append(aff)
|
||||
|
||||
elif 'univer' in aff:
|
||||
|
|
|
|||
|
|
@ -2,59 +2,33 @@ import Levenshtein
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
|
||||
|
||||
|
||||
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']
|
||||
|
||||
|
||||
# print('HERE', len(dix_org))
|
||||
# print('HERE_city', len(dix_city))
|
||||
# print('HERE_country', len(dix_country))
|
||||
|
||||
|
||||
def index_multiple_matchings(pairs):
    """Count the candidate matches found for each keyword.

    ``pairs`` is a list of non-empty groups; the first element of the first
    entry in a group is used as that group's keyword. Returns a dict mapping
    each keyword to the size of its group (a later group with the same
    keyword overwrites an earlier one).
    """
    return {group[0][0]: len(group) for group in pairs}
|
||||
|
||||
def keep_highest_url(lst):
    """Keep, for every name, the entry whose third field compares highest.

    Each item of ``lst`` must unpack into exactly three values
    ``(name, score, url)``. On equal ``url`` values the earlier entry is
    kept. The result lists one full entry per name, ordered by each name's
    first appearance.
    """
    winners = {}

    for entry in lst:
        name, _score, url = entry
        if name in winners and not url > winners[name][2]:
            continue  # the stored entry already has an equal or higher url
        winners[name] = entry

    return list(winners.values())
|
||||
|
||||
def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
||||
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
similar_k = []
|
||||
pairs_k = []
|
||||
total_pairs = 0
|
||||
# if keyword in dix_org:
|
||||
# print('lucky')
|
||||
# pairs_k.append((keyword,keyword,1,dix_org[keyword], dix_id_country[dix_org[keyword]]))
|
||||
|
||||
for x in candidates_:
|
||||
# print('keyword', keyword)
|
||||
|
||||
if is_contained(keyword, x):
|
||||
# print(0,x,total_pairs)
|
||||
|
||||
if is_contained(keyword, x):# and ('univ' in x or 'inst' in x or len(get_candidates([])) < len(dix_name)):
|
||||
# print('keyword contained')
|
||||
x_vector = vectorizer.fit_transform([x]).toarray()
|
||||
keyword_vector = vectorizer.transform([keyword]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, keyword_vector)[0][0]
|
||||
# print('similarity', similarity)
|
||||
if similarity > min(simU, simG):
|
||||
if ('univ' in keyword and 'univ' in x) and similarity > simU:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
|
||||
|
|
@ -65,24 +39,17 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
|||
|
||||
|
||||
elif (not 'univ'in keyword and not 'univ' in x) and similarity > simG:
|
||||
# print('pass', keyword, x, similarity)
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
|
||||
|
||||
elif is_contained(x, keyword):
|
||||
# print(0.5,x,total_pairs)
|
||||
if ('univ'in keyword and 'univ' in x):
|
||||
# print(1,x,total_pairs)
|
||||
|
||||
|
||||
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
|
@ -91,7 +58,7 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
|||
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
|
||||
if similarity > simU: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
if k not in dix:
|
||||
|
|
@ -102,25 +69,27 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
|||
|
||||
|
||||
elif not 'univ' in keyword and not 'univ' in x:
|
||||
|
||||
# print('not uni')
|
||||
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
|
||||
|
||||
if similarity > simG: #max(0.82,sim):
|
||||
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
# total_pairs += len(pairs_k) # Track total number of pairs
|
||||
|
||||
if total_pairs >= limit: # Stop if we reach
|
||||
return []
|
||||
|
||||
# print('end find_candidate', pairs_k)
|
||||
return pairs_k
|
||||
|
||||
|
||||
|
|
@ -131,7 +100,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
"""
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
univ_num = light_raw.lower().count('univ')
|
||||
result = []
|
||||
best = []
|
||||
|
||||
|
|
@ -141,7 +109,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
affil = pair_group[0][0]
|
||||
num_uni_p = affil.count('univ')
|
||||
|
||||
# print('AFFIL', affil)
|
||||
for p in pair_group:
|
||||
organization, confidence = p[1], p[2]
|
||||
|
||||
|
|
@ -183,10 +150,8 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
|
||||
# Sort by similarity score (descending) and then lexicographically
|
||||
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
||||
# print('REDUCED BEST: ', reduced_best)
|
||||
|
||||
result.extend(reduced_best)
|
||||
# print('RESULT EXT: ', result)
|
||||
|
||||
# Step 3: Limit university-related matches
|
||||
univ_list = [r for r in result if 'univ' in r[0]]
|
||||
|
|
@ -214,247 +179,7 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
# Convert to list format
|
||||
final_result = [[key, value[0]] for key, value in sorted(result_dict.items(), key=lambda x: x[1][1], reverse=True)]
|
||||
|
||||
# print("RESULT TO USE: ", final_result)
|
||||
return final_result
|
||||
|
||||
|
||||
|
||||
def Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit):
|
||||
|
||||
"""
|
||||
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
|
||||
|
||||
Args:
|
||||
m (int): The number of DOIs to check.
|
||||
DF (DataFrame): The input DataFrame containing affiliation data.
|
||||
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
|
||||
simU (float): Similarity threshold for universities.
|
||||
simG (float): Similarity threshold for non-universities.
|
||||
|
||||
Returns:
|
||||
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
|
||||
"""
|
||||
clean_aff = input[0]
|
||||
# print('CLEAN_AFF (LVL1): ', clean_aff)
|
||||
light_aff = input[1].replace(' gmbh', ' ').strip()
|
||||
# print('LIGHT_AFF (LVL2): ', light_aff)
|
||||
|
||||
df_list = input[2]
|
||||
|
||||
countries_list = input[3]
|
||||
# print('COUNTRIES_LIST: ', countries_list)
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
||||
result = {}
|
||||
pairs = []
|
||||
|
||||
keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
|
||||
|
||||
candidates = get_candidates(countries_list)
|
||||
|
||||
# print('KEYWORDS: ', keywords)
|
||||
if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0])>1:
|
||||
|
||||
for k,s in enumerate(keywords):
|
||||
pairs_k = []
|
||||
# print('try', s)
|
||||
try:
|
||||
pairs_k.append((s,s,1,dix_org[s],dix_id_country[dix_org[s]]))
|
||||
# print('LUCKY')
|
||||
|
||||
# pairs.append((s,s,similarity,dix_org[s], dix_id_country[dix_org[s]]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [s]
|
||||
else:
|
||||
dix[k].append(s)
|
||||
|
||||
except:
|
||||
# print('NOT LUCKY')
|
||||
pairs_k = find_candidate(s, k , dix, simU, simG, candidates, limit)
|
||||
# print('PAIRS K: ', pairs_k)
|
||||
|
||||
result[k] = pairs_k
|
||||
if len(pairs_k)>0:
|
||||
# print('PAIRS K>0: ', pairs_k)
|
||||
|
||||
pairs.append(pairs_k)
|
||||
|
||||
# print('PAIRS: ', pairs)
|
||||
multi = index_multiple_matchings(pairs)
|
||||
# print('MULTIL ',multi)
|
||||
|
||||
need_check_keys = []
|
||||
ready_keys = []
|
||||
ready_best = []
|
||||
for keyword in multi:
|
||||
try:
|
||||
if multi[keyword]>1:
|
||||
need_check_keys.append(keyword)
|
||||
else:
|
||||
for p in pairs:
|
||||
if keyword in p[0]:
|
||||
if p[0][1] not in ready_keys:
|
||||
ready_keys.append(p[0][1])
|
||||
|
||||
ready_best.append([p[0][1], p[0][2]])
|
||||
except:
|
||||
pass
|
||||
# print('READY KEYWORD: ', ready_keys)
|
||||
# print('READY BEST: ', ready_best)
|
||||
|
||||
# print('NEED CHECK KEYWORD: ', need_check_keys)
|
||||
|
||||
pairs_check = [ pair for pair in pairs if pair[0][0] in need_check_keys ]
|
||||
# print('NEED CHECK PAIRS: ', pairs_check)
|
||||
|
||||
|
||||
if len(need_check_keys)>0:
|
||||
best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
|
||||
# print('OUTPUT BEST: ', best0)
|
||||
best1 = {x[0]:dix_org[x[0]] for x in best0 }
|
||||
best01 = unique_subset(best0, best1)
|
||||
matched_org = list(set([x[0] for x in best01])) + ready_keys
|
||||
best = best01 + ready_best
|
||||
|
||||
|
||||
|
||||
# print('NEW BEST',best01)
|
||||
else:
|
||||
best = ready_best
|
||||
matched_org = ready_keys
|
||||
|
||||
|
||||
# print('FINAL BEST: ', best)
|
||||
## print('MATCHED: ', matched_org)
|
||||
|
||||
id_list = []
|
||||
|
||||
for org_list in best:
|
||||
org = org_list[0]
|
||||
conf = org_list[1]
|
||||
if dix_mult[org] == 'unique':
|
||||
# print('unique:', org)
|
||||
if 'institu' in org and 'univ' in org:
|
||||
#print('both inst and univ', clean_aff)
|
||||
if dix_city[org][0] not in clean_aff and dix_country[org][0] not in clean_aff:
|
||||
#print('pass')
|
||||
pass
|
||||
else:
|
||||
#print('correct')
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
else:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
|
||||
|
||||
else:
|
||||
# print('not unique:', org)
|
||||
if org in dix_city:
|
||||
match_found = False
|
||||
|
||||
for city in dix_city[org]:
|
||||
if city[0] in clean_aff:
|
||||
if city[0] not in org:
|
||||
# print('city', city[0], org)
|
||||
id_list.append([org, conf, city[1]])
|
||||
match_found = True
|
||||
break
|
||||
else:
|
||||
if clean_aff.count(city[0]) >1:
|
||||
id_list.append([org, conf, city[1]])
|
||||
match_found = True
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
for city in dix_city[org]:
|
||||
if city[0] in clean_aff and city[0] not in org:
|
||||
id_list.append([org, conf, city[1]])
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
match_found2 = False
|
||||
match_found3 = False
|
||||
|
||||
all_countries = list(set([c[0] for c in dix_country[org]]))
|
||||
if len(all_countries) > 1:
|
||||
|
||||
for country in dix_country[org]:
|
||||
# print('country', country[0], org)
|
||||
|
||||
tokens = set(clean_aff.lower().split())
|
||||
text = clean_aff.lower()
|
||||
|
||||
if country[0] == 'united states' and (
|
||||
'united states' in text
|
||||
or {'usa', 'usa.'} & tokens
|
||||
or 'u.s.a.' in text
|
||||
):
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
|
||||
if country[0] == 'united kingdom' and (
|
||||
'united kingdom' in text
|
||||
or {'uk', 'uk.'} & tokens
|
||||
or 'u.k.' in text
|
||||
):
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
# print('check country', clean_aff)
|
||||
# if country[0] == 'united states' and (country[0] in clean_aff or 'usa' in clean_aff.split() or 'usa.' in clean_aff.split() or 'u.s.a.' in clean_aff):
|
||||
# id_list.append([org, conf, country[1]])
|
||||
# match_found2 = True
|
||||
# match_found3 = True
|
||||
# break
|
||||
|
||||
# if country[0] == 'united kingdom' and (country[0] in clean_aff or 'uk' in clean_aff.split() or 'u.k.' in clean_aff):
|
||||
# id_list.append([org, conf, country[1]])
|
||||
# match_found2 = True
|
||||
# match_found3 = True
|
||||
# break
|
||||
if country[0] == 'turkey' and (
|
||||
'turkiye' in text
|
||||
#or 'u.k.' in text
|
||||
):
|
||||
# print('here turkey')
|
||||
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
elif country[0].split()[0] in clean_aff:
|
||||
|
||||
if country[0] not in org:
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
else:
|
||||
single_country = all_countries[0]
|
||||
if single_country in clean_aff:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if not match_found3:
|
||||
for country in dix_country[org]:
|
||||
if country[0] in clean_aff and country[0] in org:
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
break
|
||||
if not match_found2:
|
||||
for sp in specific:
|
||||
if sp in org:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
|
||||
|
||||
# print("RESULT: ", id_list)
|
||||
id_list_final = keep_highest_url(id_list)
|
||||
|
||||
return id_list_final
|
||||
File diff suppressed because one or more lines are too long
536192
affro/jsons/dix_city.json
536192
affro/jsons/dix_city.json
File diff suppressed because it is too large
Load Diff
536184
affro/jsons/dix_country.json
536184
affro/jsons/dix_country.json
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
119124
affro/jsons/dix_id_name.json
119124
affro/jsons/dix_id_name.json
File diff suppressed because it is too large
Load Diff
128381
affro/jsons/dix_mult.json
128381
affro/jsons/dix_mult.json
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
128381
affro/jsons/dix_org.json
128381
affro/jsons/dix_org.json
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
|
@ -16,9 +16,12 @@
|
|||
"universitatskinderklinik": "univer childrens hospital",
|
||||
"universitatskliniken": "univer hospital",
|
||||
"universitätsklinik": "univer hospital",
|
||||
"uniklinik" : "univer hospital",
|
||||
"universitatsmedizin": "univer medicine",
|
||||
"universitatsbibliothek": "univer library",
|
||||
"universiteitsmuseum": "univer museum",
|
||||
"nat.": "national",
|
||||
"pamantasan": "univer",
|
||||
"uniaersity": "univer",
|
||||
"univesity": "univer",
|
||||
"unversity": "univer",
|
||||
|
|
@ -76,5 +79,29 @@
|
|||
"medisch": "medical",
|
||||
"hahn-meitner-institut berlin" : "helmholtz-zentrum berlin",
|
||||
"fachhochschule gelsenkirchen" : "westfalische hochschule",
|
||||
"turkiye" : "turkey"
|
||||
"turkiye" : "turkey",
|
||||
"trinity colege university" : "trinity colege",
|
||||
"tyndal institute" : "tyndal national institute",
|
||||
"st patricks colege, drumcondra" : "dublin city university",
|
||||
"ucd dublin" : "univer colege dublin",
|
||||
"department university" : "department, university",
|
||||
"xi an" : "xian",
|
||||
"sligo general hospital" : "sligo univer hospital",
|
||||
"trinity colege cambridge" : "univer cambridge",
|
||||
"trinity colege, cambridge" : "univer cambridge",
|
||||
"st johns colege, cambridge" : "univer cambridge",
|
||||
"st johns colege cambridge" : "univer cambridge",
|
||||
"kings colege, cambridge" : "univer cambridge",
|
||||
"kings colege cambridge" : "univer cambridge",
|
||||
"eire" : "ireland",
|
||||
"trinity colege, ireland" : "trinity colege dublin",
|
||||
"trinity colege ireland" : "trinity colege dublin",
|
||||
"gilan" : "guilan",
|
||||
"freiberg univer mining techn" : "techn univer bergakademie freiberg",
|
||||
"vishwavidyalaya" : "univer",
|
||||
"rwi esen" : "rwi – leibniz institu economic research",
|
||||
"t. d. medical colege" : "alapuzha medical colege",
|
||||
"sulaymaniyah" : "sulaimani",
|
||||
"-ang" : " ang"
|
||||
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +1,9 @@
|
|||
eire
|
||||
turkiye
|
||||
trinidad
|
||||
united
|
||||
kingdom
|
||||
states
|
||||
emirates
|
||||
hong
|
||||
niederland
|
||||
holand
|
||||
|
|
|
|||
|
|
@ -20,4 +20,6 @@ di
|
|||
l
|
||||
street
|
||||
post-box
|
||||
e.v.
|
||||
e.v.
|
||||
do
|
||||
ng
|
||||
|
|
@ -8,4 +8,11 @@ universitatsbibliothek
|
|||
universitatspital
|
||||
universitetsjukhuset
|
||||
universitatsaugenklinik
|
||||
univesitatsfrauenklinik
|
||||
univesitatsfrauenklinik
|
||||
universitetscentralsjukhus
|
||||
universitatsverlag
|
||||
universitaetsklinikum
|
||||
universitatsalianz
|
||||
universalmuseum
|
||||
universitatszahnklinik
|
||||
universitetsforlaget
|
||||
|
|
@ -1 +1 @@
|
|||
__version__ = "2.2.2"
|
||||
__version__ = "3.1.1"
|
||||
|
|
|
|||
|
|
@ -3,156 +3,150 @@ import sys
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.matching import *
|
||||
from affro.helpers.create_input import *
|
||||
import json
|
||||
from affro.helpers.find_name import *
|
||||
from affro.helpers.find_id import *
|
||||
from affro.helpers.disambiguation import *
|
||||
|
||||
|
||||
from . import __version__
|
||||
|
||||
VERSION = __version__
|
||||
|
||||
dix_org = load_json('jsons/dix_org.json')
|
||||
dix_mult = load_json('jsons/dix_mult.json')
|
||||
dix_city = load_json('jsons/dix_city.json')
|
||||
dix_country = load_json('jsons/dix_country.json')
|
||||
dix_status = load_json('jsons/dix_status.json')
|
||||
dix_id_name = load_json('jsons/dix_id_name.json')
|
||||
dix_id_country = load_json('jsons/dix_id_country.json')
|
||||
dix_id_name = load_json('jsons/dix_id_name.json')
|
||||
|
||||
dix_id = load_json('jsons/dix_id.json')
|
||||
dix_name = load_json('jsons/dix_name.json')
|
||||
|
||||
|
||||
dix_status_new = {k :[dix_status[k][0], dix_status[k][1].split(', ')] for k in dix_status}
|
||||
us_states = [
|
||||
"alabama", "alaska", "arizona", "arkansas", "california",
|
||||
"colorado", "conecticut", "delaware", "florida", "georgia",
|
||||
"hawaii", "idaho", "ilinois", "indiana", "iowa",
|
||||
"kansas", "kentucky", "louisiana", "maine", "maryland",
|
||||
"masachusets", "michigan", "minesota", "misisipi", "misouri",
|
||||
"montana", "nebraska", "nevada", "new hampshire", "new jersey",
|
||||
"new mexico", "new york", "north carolina", "north dakota", "ohio",
|
||||
"oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
|
||||
"south dakota", "tenesee", "texas", "utah", "vermont",
|
||||
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
|
||||
]
|
||||
def produce_result(input, simU, simG, limit):
|
||||
best_name = find_name(input, dix_name, simU, simG, limit)
|
||||
id_result = find_id(input, best_name, dix_name)
|
||||
result = disamb(input, id_result, dix_id)
|
||||
|
||||
|
||||
|
||||
def contains_us_state(text):
|
||||
text = text.lower()
|
||||
return any(state in text for state in us_states)
|
||||
|
||||
def find_ror(input, simU, simG, limit):
|
||||
light_aff = input[0]
|
||||
result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
|
||||
results_upd = []
|
||||
|
||||
for r in result:
|
||||
|
||||
if "openorgs" in r[2]:
|
||||
results_upd.append([r[1], 'openorgs', r[2], 'active', dix_id_country[r[2]]])
|
||||
|
||||
else:
|
||||
if dix_status_new[r[2]][0] == 'active':
|
||||
results_upd.append([r[1], 'ror', r[2], 'active', dix_id_country[r[2]]])
|
||||
else:
|
||||
if dix_status_new[r[2]][1][0] == '':
|
||||
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
|
||||
|
||||
|
||||
else:
|
||||
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0],dix_id_country[r[2]]])
|
||||
for link in (dix_status_new[r[2]][1]):
|
||||
results_upd.append([r[1], 'ror', link, 'active',dix_id_country[r[2]],dix_id_country[link]])
|
||||
|
||||
if len(results_upd) > len(set(description(light_aff)[1])):
|
||||
|
||||
|
||||
final_matching = []
|
||||
light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
|
||||
for id_ in results_upd:
|
||||
country = dix_id_country[id_[2]]
|
||||
if country == 'united states':
|
||||
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country == 'united kingdom':
|
||||
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif 'korea' in country:
|
||||
|
||||
if 'korea' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country in light_aff:
|
||||
final_matching.append(id_)
|
||||
|
||||
|
||||
if len(final_matching)>0:
|
||||
result_dict = [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION,'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in final_matching]
|
||||
return result_dict
|
||||
else:
|
||||
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]],'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif len(results_upd)>0:
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
|
||||
else:
|
||||
result_dict = []
|
||||
|
||||
return result_dict
|
||||
return result
|
||||
|
||||
|
||||
def run_affro(raw_aff_string):
|
||||
lucky_guess = clean_string_ror(raw_aff_string)
|
||||
lucky_guess = clean_string_lucky(raw_aff_string)
|
||||
# print(lucky_guess)
|
||||
try:
|
||||
if lucky_guess in dix_org:
|
||||
if dix_mult[lucky_guess] == "unique":
|
||||
if 'openorgs' in dix_org[lucky_guess]:
|
||||
if lucky_guess in dix_name:
|
||||
# print('lucky guess hit', lucky_guess)
|
||||
# print('lucky guess found', dix_name[lucky_guess])
|
||||
if len(dix_name[lucky_guess]) == 1:
|
||||
id_ = dix_name[lucky_guess][0]['id']
|
||||
name_ = dix_id[id_]['name']
|
||||
country_ = dix_id[id_]['country']
|
||||
status_ = dix_id[id_]['status']
|
||||
if 'openorgs' in id_:
|
||||
|
||||
return[{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
else:
|
||||
if dix_status_new[dix_org[lucky_guess]][0] == 'active':
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
elif dix_status_new[dix_org[lucky_guess]][1][0]== '':
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
if status_[0] == 'active':
|
||||
# print('active')
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
elif status_[0]== '':
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
else:
|
||||
res = [{'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
for successor in dix_status_new[dix_org[lucky_guess]][1]:
|
||||
res.append({'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]})
|
||||
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
for successor in status_[1]:
|
||||
if successor != '':
|
||||
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
|
||||
return res
|
||||
else:
|
||||
cand_ids = [
|
||||
key
|
||||
for _, key in dix_city[lucky_guess]
|
||||
if ("ror" in key and dix_status_new[key][0] == "active") or ("openorgs" in key)
|
||||
]
|
||||
num_countries = len(
|
||||
set(
|
||||
dix_id_country[x[1]]
|
||||
for x in dix_city[lucky_guess]
|
||||
if ("ror" in x[1] and dix_status_new[x[1]][0] == "active") or ("openorgs" in x[1])
|
||||
)
|
||||
)
|
||||
# print('multiple candidates')
|
||||
ids = [x['id'] for x in dix_name[lucky_guess]]
|
||||
cand_ids = [id for id in ids if is_first(id, lucky_guess) == 'y']
|
||||
# print('cand_ids', cand_ids)
|
||||
# pick the ror id where 'first' == 'y' (None if not found)
|
||||
if len(cand_ids) !=1:
|
||||
# print('secondary conditions')
|
||||
conditions = [
|
||||
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
|
||||
and dix_id[key]['top_level'][0] == 'y') \
|
||||
or ("openorgs" in key),
|
||||
|
||||
if len(cand_ids) == 1 or num_countries == 1:
|
||||
if 'openorgs' in dix_org[lucky_guess]:
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
|
||||
and dix_id[key]['parent'][0] == 'y') \
|
||||
or ("openorgs" in key),
|
||||
|
||||
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active") \
|
||||
or ("openorgs" in key)
|
||||
]
|
||||
|
||||
for cond in conditions:
|
||||
cand_ids = [key for key in ids if cond(key)]
|
||||
if cand_ids:
|
||||
# print('break')
|
||||
break
|
||||
|
||||
if len(cand_ids) == 0:
|
||||
# print('check result')
|
||||
result = produce_result(create_df_algorithm(raw_aff_string, 10), 0.42, 0.82, 500)
|
||||
|
||||
return result
|
||||
|
||||
# print('cand_ids',cand_ids)
|
||||
if len(cand_ids) == 1:# or num_countries == 1:
|
||||
id_ = cand_ids[0]
|
||||
# print('id',id_)
|
||||
name_ = dix_id[id_]['name']
|
||||
country_ = dix_id[id_]['country']
|
||||
status_ = dix_id[id_]['status']
|
||||
if 'openorgs' in id_:
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
else:
|
||||
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
|
||||
else:
|
||||
return []
|
||||
if status_[0] == 'active':
|
||||
# print('active')
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
elif status_[0]== '':
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
else:
|
||||
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
for successor in status_[1]:
|
||||
if successor != '':
|
||||
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
|
||||
return res
|
||||
# return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country':country_}]
|
||||
|
||||
else:
|
||||
found = False
|
||||
for triplet in dix_name[lucky_guess]:
|
||||
if triplet['first'] == 'y':
|
||||
found = True
|
||||
id_ = triplet['id']
|
||||
name_ = dix_id[id_]['name']
|
||||
country_ = dix_id[id_]['country']
|
||||
status_ = dix_id[id_]['status']
|
||||
if 'openorgs' in id_:
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
else:
|
||||
if status_[0] == 'active':
|
||||
# print('active')
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
|
||||
elif status_[0]== '':
|
||||
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
else:
|
||||
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
|
||||
for successor in status_[1]:
|
||||
if successor != '':
|
||||
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
|
||||
return res
|
||||
|
||||
if found == False:
|
||||
return []
|
||||
else:
|
||||
# print('No lucky guess, running algorithm...')
|
||||
result = find_ror(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
|
||||
|
||||
# print('lucky guess miss')
|
||||
result = produce_result(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
# Return some indication of an error, or log the row
|
||||
print(f"Error: {str(e)}")
|
||||
print(f"Error end: {str(e)}")
|
||||
print(raw_aff_string)
|
||||
pass
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def matchings_affro(aff_string):
|
||||
|
|
@ -168,7 +162,7 @@ def matchings_affro(aff_string):
|
|||
# Create the result as a tuple that matches matchings_schema
|
||||
result = []
|
||||
for matching in matchings:
|
||||
# Assuming 'matching' is a dictionary that contains 'provenance', 'version', 'pid', 'value', 'name', 'confidence', 'status', 'country'
|
||||
# Assuming 'matching' is a dictionary that contains 'provenance', 'affro', 'value', 'confidence', 'status'
|
||||
result.append((
|
||||
matching.get("provenance", None),
|
||||
matching.get("version", None),
|
||||
|
|
@ -178,7 +172,6 @@ def matchings_affro(aff_string):
|
|||
float(matching.get("confidence", None)),
|
||||
matching.get("status", None),
|
||||
matching.get("country", None)
|
||||
|
||||
))
|
||||
if len(result)>0:
|
||||
return result
|
||||
|
|
@ -189,5 +182,4 @@ def matchings_affro(aff_string):
|
|||
return ()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,12 +1,9 @@
|
|||
from affro.helpers.functions import *
|
||||
|
||||
def valueToCategory(value):
|
||||
flag = 0
|
||||
|
||||
for k in categ_dicts:
|
||||
if k in value and categ_dicts[k] in categ_string.split('|'):
|
||||
flag = 1
|
||||
return flag
|
||||
return categ_dicts[k]
|
||||
|
||||
|
||||
# tokenization
|
||||
|
|
@ -28,21 +25,30 @@ protect = ['national univer ireland',
|
|||
'state univer',
|
||||
'rijksuniver',
|
||||
'rijks univer',
|
||||
'univer medical center'
|
||||
'univer medical center',
|
||||
'royal colege surgeons',
|
||||
'st patricks colege',
|
||||
'institu techn',
|
||||
'trinity colege',
|
||||
'st johns colege',
|
||||
'wiliam beaumont hospital'
|
||||
]
|
||||
|
||||
def create_df_algorithm(raw_aff_string, radius_u):
|
||||
clean_aff = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
|
||||
#print(0, clean_aff)
|
||||
countries_list = description(clean_aff)[1]
|
||||
aff_no_symbols_d = substrings_dict(reduce(clean_aff))
|
||||
#print(0.5, aff_no_symbols_d)
|
||||
substring_list = [replace_abbr_univ(x) for x in list(aff_no_symbols_d.values())]
|
||||
#print(1, substring_list)
|
||||
# for k, word in enumerate(substring_list):
|
||||
# print(word)
|
||||
# if word in protect and substring_list[k+1] in city_names:
|
||||
# print('y')
|
||||
# word = word + ', ' + substring_list[k+1]
|
||||
# substring_list[k] = word
|
||||
i = 0
|
||||
|
||||
# print(substring_list,'substring_list')
|
||||
while i < len(substring_list) - 1:
|
||||
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names): #substring_list[i+1] in city_names:
|
||||
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names+countries): #substring_list[i+1] in city_names:
|
||||
substring_list[i] = substring_list[i] + ' ' + substring_list[i+1]
|
||||
i = i+2
|
||||
continue
|
||||
|
|
@ -61,31 +67,18 @@ def create_df_algorithm(raw_aff_string, radius_u):
|
|||
i = i+1
|
||||
continue
|
||||
|
||||
# elif 'lab' in substring_list[i] and ('colege' in substring_list[i+1] or 'dep' in substring_list[i+1] or 'school' in substring_list[i+1]):
|
||||
# if not 'univ' in substring_list[i]: #'inst' in substring_list[i+1] or
|
||||
# substring_list.pop(i)
|
||||
# else:
|
||||
# i = i+1
|
||||
# continue
|
||||
|
||||
else:
|
||||
i += 1
|
||||
# print(1.4, substring_list)
|
||||
|
||||
|
||||
light_aff = (', '.join((substring_list)))
|
||||
# print(1.5, light_aff)
|
||||
|
||||
|
||||
substring_list = [x for x in substring_list if x.replace(' gmbh','') not in city_names+remove_list]
|
||||
# print(1.7,substring_list)
|
||||
|
||||
|
||||
substring_list0 = [shorten_keywords([x], radius_u) for x in substring_list if len(shorten_keywords([x],radius_u))>0]
|
||||
# print(2,substring_list0 )
|
||||
|
||||
substring_list1 = [inner for outer in substring_list0 for inner in outer]
|
||||
# print(3,substring_list1 )
|
||||
|
||||
aff_list = [{"index": i, "keywords": substring_list1[i], "category": valueToCategory(substring_list1[i])} for i in range(len(substring_list1))]
|
||||
|
||||
filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
|
||||
filtered_list = [entry for entry in aff_list if type(entry.get("category")) == str]
|
||||
|
||||
return [clean_aff, light_aff, filtered_list, countries_list]
|
||||
|
|
@ -0,0 +1,155 @@
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
from .. import __version__
|
||||
|
||||
VERSION = __version__
|
||||
|
||||
|
||||
# US state names used for substring matching against affiliation text.
# Every entry is spelled with doubled consonants collapsed ("ilinois",
# "masachusets", "pensylvania"), presumably to match the output of the
# package's string-cleaning helpers -- TODO confirm against clean_string.
# "tenesee" follows that same spelling; the previous "tennesee" matched
# neither the raw name "tennessee" nor the collapsed form.
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "conecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "ilinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "masachusets", "michigan", "minesota", "misisipi", "misouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
    "south dakota", "tenesee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming",
]


def contains_us_state(text):
    """Return True if any entry of ``us_states`` occurs in *text*.

    The comparison is a case-insensitive substring test, so a state name
    embedded in a longer word also counts as a hit.
    """
    text = text.lower()
    return any(state in text for state in us_states)
|
||||
|
||||
# def get_city(name, dix_name):
|
||||
# return {x['city'] : x['id'] for x in dix_name[name]}
|
||||
|
||||
|
||||
def _make_entry(pid, val, nm, conf, st, ctry):
    """Build one output record with the fixed affro provenance/version."""
    return {
        'provenance': 'affro',
        'version': VERSION,
        'pid': pid,
        'value': val,
        'name': nm,
        'confidence': conf,
        'status': st,
        'country': ctry,
    }


def convert_to_result(id_list_, dix_id):
    """Turn matched ids into output records enriched with ``dix_id`` metadata.

    Args:
        id_list_: rows of the form ``[something, score, value]``; only
            ``row[1]`` (confidence) and ``row[2]`` (the id) are read.
        dix_id: mapping ``id -> {'name': ..., 'country': ...,
            'status': [primary, secondary_list]}``.

    Returns:
        A list of dicts with the keys ``provenance``, ``version``, ``pid``,
        ``value``, ``name``, ``confidence``, ``status`` and ``country``.
        Rows whose id is missing from ``dix_id`` are dropped. For a ROR id
        whose primary status is not ``'active'``, the record itself is
        emitted with that status, followed by one ``'active'`` record per
        non-empty id listed in the secondary status field.
    """
    result_dict = []
    for r in id_list_:
        # Confidence is in r[1]; cap it at 1.0.
        score = min(r[1], 1.0)
        value = r[2]

        rec = dix_id.get(value)
        if rec is None:
            # No metadata for this id: nothing meaningful can be reported.
            continue

        name = rec.get('name')
        country = rec.get('country')
        status_field = rec.get('status', [])
        primary_status = status_field[0] if len(status_field) > 0 else None
        secondary = status_field[1] if len(status_field) > 1 else []

        if "openorgs" in value:
            # OpenOrgs ids are always reported as active.
            result_dict.append(
                _make_entry('openorgs', value, name, score, 'active', country))
            continue

        if primary_status == 'active':
            result_dict.append(
                _make_entry('ror', value, name, score, 'active', country))
            continue

        # Non-active ROR record: always report the record itself.
        result_dict.append(
            _make_entry('ror', value, name, score, primary_status, country))

        # A leading empty string in the secondary field means there are no
        # linked records to report.
        if secondary and secondary[0] == '':
            continue

        for link in secondary:
            if not link:
                continue
            # Linked records carry their own name/country (None when the
            # link is absent from dix_id).
            link_rec = dix_id.get(link, {})
            result_dict.append(
                _make_entry('ror', link, link_rec.get('name'), score,
                            'active', link_rec.get('country')))

    return result_dict
|
||||
|
||||
def count_active(items):
    """Return how many entries of *items* have ``"status"`` equal to ``"active"``."""
    active_entries = [entry for entry in items if entry.get("status") == "active"]
    return len(active_entries)
|
||||
|
||||
def disamb(input, id_list_, dix_id):
    """Convert matched ids to result records and prune them by country.

    Args:
        input: sequence whose first element is the cleaned affiliation
            string (only ``input[0]`` is read here).
        id_list_: rows ``[org, confidence, id]`` as consumed by
            ``convert_to_result``.
        dix_id: id -> metadata mapping passed through to
            ``convert_to_result``.

    Returns:
        A list of result dicts. With a single candidate the converted
        records are returned unchanged. With several candidates:

        * if no country is detected in the affiliation, only records from
          countries that have a record whose name contains ``'Uni'`` are
          kept (all records when there is no such record);
        * if there are more active records than distinct detected
          countries, only records whose country is mentioned in the
          affiliation are kept (all records when none qualifies);
        * otherwise all records are returned.
    """
    if not id_list_:
        return []

    clean_aff = input[0]
    result_dict = convert_to_result(id_list_, dix_id)

    if len(id_list_) == 1:
        return result_dict

    # Evaluate the country detection once and reuse it below.
    # NOTE(review): assumes description() is deterministic for a given
    # string -- confirm.
    countries_in_aff = description(clean_aff)[1]

    if len(countries_in_aff) == 0:
        # No country in the affiliation: fall back to the countries of the
        # university-like records. Names can be None when the id metadata
        # lacks a name, so guard the substring test.
        countries_uni = [res['country'] for res in result_dict
                         if 'Uni' in (res['name'] or '')]
        if len(countries_uni) > 0:
            return [res for res in result_dict
                    if res['country'] in countries_uni]
        return result_dict

    if count_active(result_dict) > len(set(countries_in_aff)):
        # More active results than countries: keep only the records whose
        # country is actually mentioned in the affiliation.
        final_matching = []
        light_aff_tokens = [clean_string_ror(x) for x in set(clean_aff.split())]
        for res in result_dict:
            country = res['country']
            if not country:
                # Unknown country cannot be confirmed against the text
                # (and '' would trivially be "in" every string).
                continue

            if country == 'united states':
                if ('united states' in clean_aff
                        or 'usa' in light_aff_tokens
                        or contains_us_state(clean_aff)):
                    final_matching.append(res)

            elif country == 'united kingdom':
                if 'united kingdom' in clean_aff or 'uk' in light_aff_tokens:
                    final_matching.append(res)

            elif 'korea' in country:
                if 'korea' in light_aff_tokens:
                    final_matching.append(res)

            elif country in clean_aff:
                final_matching.append(res)

        if final_matching:
            return final_matching
        return result_dict

    return result_dict
|
||||
|
||||
|
|
@ -0,0 +1,167 @@
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
|
||||
# Keywords whose category is 'Specific' or 'Acronyms'.
specific = [k for k in categ_dicts if categ_dicts[k] in ('Specific', 'Acronyms')]

# Country name -> list of spellings/tokens accepted as a mention of that
# country. Every known country maps to itself by default; the entries below
# add extra variants.
country_synonyms = {x: [x] for x in countries}
country_synonyms["united states"] = ["united states", "u.s.a.", "usa", "usa.", "states"]
country_synonyms["germany"] = ["germany", "deutschland"]
country_synonyms["united kingdom"] = ["united kingdom", "u.k.", "uk", "uk.", "kingdom", "england"]
country_synonyms["turkey"] = ["turkey", "turkiye", "cyprus"]
country_synonyms["china"] = ["china", "prc", "chinese"]
country_synonyms["ireland"] = ["eire", "ireland"]
country_synonyms["south korea"] = ["south korea", "korea"]

# Fixed typo: 'united kngdom' could never equal a key of country_synonyms.
special_countries = {'united states', 'united kingdom', 'germany', 'china', 'turkey'}
|
||||
|
||||
|
||||
|
||||
|
||||
def keep_highest_score(data):
    """Keep one inner list per distinct last element, preferring the highest score.

    Each inner list is read as ``[..., score at index 1, ..., key at index -1]``.
    For every distinct key, the list with the greatest score is kept; when
    several share the greatest score, the first one encountered wins.

    Args:
        data: Iterable of indexable records (at least two elements each).

    Returns:
        list: The surviving records, ordered by first appearance of their key.
    """
    best = {}
    for lst in data:
        key = lst[-1]
        # Strict '>' so an equal score never displaces the earlier record.
        if key not in best or lst[1] > best[key][1]:
            best[key] = lst
    return list(best.values())
|
||||
|
||||
|
||||
def _mentions_special_country(country, text, tokens):
    """Return True when the affiliation names *country* through a hard-coded alias."""
    if country == 'united states':
        return bool('united states' in text or {'usa', 'usa.'} & tokens or 'u.s.a.' in text)
    if country == 'germany':
        return 'deutschland' in text
    if country == 'united kingdom':
        return bool('united kingdom' in text or {'uk', 'uk.'} & tokens or 'u.k.' in text)
    if country == 'turkey':
        return 'turkiye' in text
    if country == 'china':
        return 'chinese' in text or 'prc' in text
    return False


def find_id(input, best_names, dix_name):
    """Pick an organisation id for every matched organisation name.

    Args:
        input: Sequence whose element 0 is the cleaned affiliation string and
            element 1 the "light" affiliation string.
        best_names: Iterable of ``[org_name, confidence, ...]`` entries.
        dix_name: Mapping from organisation name to a list of records, each a
            dict with the keys ``'id'``, ``'city'``, ``'country'`` and
            ``'first'`` (the keys read below).

    Returns:
        list: ``[org_name, confidence, id]`` entries, reduced by
        ``keep_highest_score`` to one entry per id.
    """
    clean_aff = input[0]
    light_aff = input[1]

    # Invariant per call: computed once instead of once per candidate record.
    light_tokens = set(light_aff.split())
    text = clean_aff.lower()
    tokens = {x.replace(',', '') for x in text.split()}

    id_list = []

    for org_list in best_names:
        org = org_list[0]
        conf = org_list[1]
        records = dix_name[org]

        if len(records) == 1:
            # Unambiguous name: accept it unless nothing in the affiliation
            # (city, country alias) supports it AND the name is not of a kind
            # that is trusted on its own.
            record = records[0]
            id_ = record['id']
            city_ = record['city']
            country_ = record['country']
            # .get() keeps countries without a synonym entry from raising.
            aliases = set(country_synonyms.get(country_, [country_]))
            unsupported = (
                city_ not in light_aff
                and not aliases & light_tokens
                and 'univ' not in org
                and 'inst' not in org
                and 'national' not in org
                and valueToCategory(org) not in ['Company', 'Acronyms', 'Specific']
            )
            if not unsupported:
                id_list.append([org, conf, id_])
            continue

        # Ambiguous name: try progressively weaker disambiguation signals.
        match_found = False

        # 1. City of the record appears in the affiliation.  When the city is
        #    part of the organisation name itself it must appear a second time.
        #    No break: several records may match.
        for record in records:
            city_ = record['city']
            if city_ in clean_aff:
                if city_ not in org or clean_aff.count(city_) > 1:
                    id_list.append([org, conf, record['id']])
                    match_found = True

        # 2. Alias-based check for the special countries.
        if not match_found and {r['country'] for r in records} & special_countries:
            for record in records:
                if _mentions_special_country(record['country'], text, tokens):
                    id_list.append([org, conf, record['id']])
                    match_found = True
                    break

        # 3. First word of the country appears in the affiliation and the
        #    country is not part of the organisation name.
        if not match_found:
            for record in records:
                country_ = record['country']
                if country_.split()[0] in clean_aff and country_ not in org:
                    id_list.append([org, conf, record['id']])
                    match_found = True
                    break

        # 4. Country appears both in the affiliation and in the name.
        #    No break: several records may match.
        if not match_found:
            for record in records:
                country_ = record['country']
                if country_ in clean_aff and country_ in org:
                    id_list.append([org, conf, record['id']])
                    match_found = True

        # 5. Names containing a 'Specific'/'Acronyms' keyword: prefer a
        #    top-level record, otherwise a parent record.  The parent test
        #    used to be a bare comparison with no effect, so a record was
        #    appended whether or not it was flagged as parent.
        #    NOTE(review): the source's original nesting here is ambiguous;
        #    confirm that "top-level first, then parent" is the intent.
        if not match_found and any(sp in org for sp in specific):
            for flag in ('top_level', 'parent'):
                hit = next((r for r in records if dix_id[r['id']][flag] == 'y'), None)
                if hit is not None:
                    id_list.append([org, conf, hit['id']])
                    match_found = True
                    break

        # 6. Last resort: the record flagged as 'first', except for
        #    department / laboratory names.
        if not match_found and 'department' not in org and 'labora' not in org:
            for record in records:
                if record['first'] == 'y':
                    id_list.append([org, conf, record['id']])
                    break

    return keep_highest_score(id_list)
|
||||
|
|
@ -0,0 +1,94 @@
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
from affro.helpers.matching import *
|
||||
|
||||
def find_name(input, dix_name, simU, simG, limit):
    """Match the keywords of one affiliation against known organisation names.

    Args:
        input: Sequence with, in order: the cleaned affiliation string, the
            "light" affiliation string, a list of dicts each holding a
            ``"keywords"`` string, and the list of countries handed to
            ``get_candidates``.
        dix_name: Mapping from organisation name to a list of records (dicts
            with at least ``'id'`` and ``'country'``).
        simU: Similarity threshold passed on for university names.
        simG: Similarity threshold passed on for other names.
        limit: Passed to ``find_candidate`` as its pair limit.

    Returns:
        list: The entries produced by ``unique_subset`` for keywords with
        several candidates, followed by ``[name, score]`` entries for keywords
        with exactly one candidate.
    """
    clean_aff = input[0]
    light_aff = input[1].replace(' gmbh', ' ').strip()
    df_list = input[2]
    countries_list = input[3]

    dix = {}    # keyword index -> matched names (also filled by find_candidate)
    pairs = []  # one non-empty candidate list per keyword

    keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
    candidates = get_candidates(countries_list)

    # The per-keyword length check below already covers the former outer
    # "more than one keyword, or one keyword longer than one char" guard.
    for k, s in enumerate(keywords):
        if len(s) <= 1 or s in countries:
            continue

        # Exact hit: the keyword is itself a known organisation name.
        try:
            first = dix_name[s][0]
            exact = (s, s, 1, first['id'], first['country'])
        except (KeyError, IndexError, TypeError):
            exact = None

        if exact is not None:
            pairs_k = [exact]
            dix.setdefault(k, []).append(s)
        else:
            # Best effort: a failing similarity search means "no candidates".
            # 'except Exception' instead of a bare except, so KeyboardInterrupt
            # and SystemExit still propagate.
            try:
                pairs_k = find_candidate(s, k, dix, simU, simG, candidates, limit)
            except Exception:
                pairs_k = []

        if pairs_k:
            pairs.append(pairs_k)

    multi = index_multiple_matchings(pairs)

    need_check_keys = []  # keywords with more than one candidate
    ready_keys = []       # names already accepted (for de-duplication)
    ready_best = []       # accepted [name, score] entries
    for keyword in multi:
        try:
            if multi[keyword] > 1:
                need_check_keys.append(keyword)
            else:
                for p in pairs:
                    if keyword in p[0] and p[0][1] not in ready_keys:
                        ready_keys.append(p[0][1])
                        ready_best.append([p[0][1], p[0][2]])
        except Exception as e:
            print('ERROR, find_name', e)

    if not need_check_keys:
        return ready_best

    pairs_check = [pair for pair in pairs if pair[0][0] in need_check_keys]
    best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
    best1 = {x[0]: dix_name[x[0]][0]['id'] for x in best0}
    return unique_subset(best0, best1) + ready_best
|
||||
|
|
@ -28,17 +28,10 @@ def load_txt(relative_path, package="affro"):
|
|||
with full_path.open("r", encoding="utf-8") as file:
|
||||
return [line.strip() for line in file]
|
||||
|
||||
|
||||
|
||||
#categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Museum|Government|Company'
|
||||
categ_string = 'Academia|Hospitals|Foundations|Specific|Government|Company|Acronyms'
|
||||
|
||||
dix_org = load_json('jsons/dix_org.json')
|
||||
dix_city = load_json('jsons/dix_city.json')
|
||||
dix_country = load_json('jsons/dix_country.json')
|
||||
dix_mult = load_json('jsons/dix_mult.json')
|
||||
|
||||
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
|
||||
|
||||
us_states = [
|
||||
"alabama", "alaska", "arizona", "arkansas", "california",
|
||||
"colorado", "conecticut", "delaware", "florida", "georgia",
|
||||
|
|
@ -52,6 +45,10 @@ us_states = [
|
|||
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
|
||||
]
|
||||
|
||||
dix_name = load_json('jsons/dix_name.json')
|
||||
|
||||
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
|
||||
|
||||
def replace_double_consonants(text):
|
||||
# This regex pattern matches any double consonant
|
||||
pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
|
||||
|
|
@ -59,18 +56,32 @@ def replace_double_consonants(text):
|
|||
result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
|
||||
return result
|
||||
|
||||
|
||||
#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und','di']
|
||||
|
||||
|
||||
def remove_stop_words(text):
|
||||
words = text.split()
|
||||
filtered_words = [word for word in words if word not in stop_words]
|
||||
return ' '.join(filtered_words)
|
||||
filtered_words = []
|
||||
|
||||
for word in words:
|
||||
if word.endswith(","):
|
||||
core = word[:-1] # remove the comma
|
||||
if core not in stop_words:
|
||||
filtered_words.append(core + ",")
|
||||
else:
|
||||
filtered_words.append(",") # keep only the comma
|
||||
else:
|
||||
if word not in stop_words:
|
||||
filtered_words.append(word)
|
||||
|
||||
result = " ".join(filtered_words)
|
||||
# remove spaces before commas
|
||||
result = result.replace(" ,", ",")
|
||||
return result
|
||||
|
||||
|
||||
stop_words = load_txt('txts/stop_words.txt')
|
||||
|
||||
dix_id_country = load_json('jsons/dix_id_country.json')
|
||||
dix_id = load_json('jsons/dix_id.json')
|
||||
|
||||
categ_dicts = load_json('jsons/dix_categ.json')
|
||||
replacements = load_json('jsons/replacements.json')
|
||||
|
|
@ -82,6 +93,10 @@ stop_words.remove('at')
|
|||
university_terms = [replace_double_consonants(x) for x in load_txt('txts/university_terms.txt')]
|
||||
city_names = [replace_double_consonants(x) for x in load_txt('txts/city_names.txt')]
|
||||
|
||||
def is_first(id, name):
    """Return the ``'first'`` flag of the record of *name* whose id equals *id*.

    Looks through ``dix_name[name]`` and returns the ``'first'`` value of the
    first record with a matching ``'id'``; returns ``None`` when no record
    matches.
    """
    matching_flags = (rec['first'] for rec in dix_name[name] if rec['id'] == id)
    return next(matching_flags, None)
|
||||
|
||||
|
||||
def get_candidates(country_list):
|
||||
|
|
@ -89,7 +104,7 @@ def get_candidates(country_list):
|
|||
cand = [dix_country_legalnames[country] for country in country_list if country in dix_country_legalnames]
|
||||
return list(set([item for sublist in cand for item in sublist]))
|
||||
else:
|
||||
return list(dix_org.keys())
|
||||
return list(dix_name.keys())
|
||||
|
||||
|
||||
def is_contained(s, w):
|
||||
|
|
@ -109,6 +124,11 @@ def is_contained(s, w):
|
|||
return False # Return False immediately
|
||||
return True # If all words from 's' are found in 'w', return True
|
||||
|
||||
def split_sub(s: str) -> str:
    """Insert a comma between ``univer`` and a directly following sub-unit word.

    A whole-word ``univer`` followed by whitespace and a whole-word
    ``department``, ``faculty`` or ``institu`` (case-insensitive) becomes
    ``univer, <word>``; everything else is left untouched.
    """
    univ_then_unit = re.compile(
        r'\b((?:univer))\s+(department|faculty|institu)\b',
        flags=re.IGNORECASE,
    )
    with_comma = univ_then_unit.sub(r'\1, \2', s)
    return with_comma
|
||||
|
||||
|
||||
def starts_with_any(string, prefixes):
|
||||
"""
|
||||
|
|
@ -158,39 +178,37 @@ def replace_roman_numerals(text):
|
|||
|
||||
def insert_space_between_lower_and_upper(s):
|
||||
"""
|
||||
Inserts a space between a lowercase letter followed by an uppercase letter in a string.
|
||||
|
||||
Parameters:
|
||||
s (str): The input string.
|
||||
|
||||
Returns:
|
||||
str: The modified string with spaces inserted.
|
||||
Insert a space between a lowercase letter and a following uppercase letter,
|
||||
while protecting listed substrings (case-sensitive) and restoring them in lowercase.
|
||||
"""
|
||||
# Temporarily replace 'AstraZeneca' to prevent modification
|
||||
s = s.replace('gGmbH','gmbh')
|
||||
s = s.replace('AstraZeneca', 'ASTRAZENECA_TEMP')
|
||||
s = s.replace('BioNTech', 'BIONTECH_TEMP')
|
||||
s = s.replace('GlaxoSmithKline', 'GLAXO_TEMP')
|
||||
s = s.replace('GmbH', 'GMBH_TEMP')
|
||||
s = s.replace('gmbH', 'GMBH_TEMP')
|
||||
s = s.replace('gGmbH', 'GMBH_TEMP')
|
||||
protected = ['DePaul',
|
||||
'AstraZeneca',
|
||||
'BioNTech',
|
||||
'GlaxoSmithKline',
|
||||
'LifeWatch',
|
||||
'SoBigData',
|
||||
'GmbH',
|
||||
'gGmbH',
|
||||
'gmbH'
|
||||
]
|
||||
|
||||
# Replace protected words with placeholders mapping to their lowercase versions
|
||||
placeholders = {}
|
||||
for i, word in enumerate(protected):
|
||||
key = f"__PROT_{i}__"
|
||||
s = s.replace(word, key)
|
||||
placeholders[key] = word.lower()
|
||||
|
||||
|
||||
# Exclude cases where 'Mc' is followed by a capital letter
|
||||
modified_string = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
|
||||
|
||||
# Ensure no spaces are inserted within 'Mc' sequences
|
||||
modified_string = re.sub(r'(Mc) ([A-Z])', r'\1\2', modified_string)
|
||||
|
||||
# Restore 'AstraZeneca'
|
||||
modified_string = modified_string.replace('ASTRAZENECA_TEMP', 'AstraZeneca')
|
||||
modified_string = modified_string.replace('BIONTECH_TEMP', 'BioNTech')
|
||||
modified_string = modified_string.replace('GLAXO_TEMP', 'GlaxoSmithKline')
|
||||
modified_string = modified_string.replace('GMBH_TEMP', 'gmbh')
|
||||
# Add space between lowercase and uppercase (except after 'Mc')
|
||||
s = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
|
||||
s = re.sub(r'(Mc) ([A-Z])', r'\1\2', s)
|
||||
|
||||
# Restore placeholders to lowercase
|
||||
for key, lower_word in placeholders.items():
|
||||
s = s.replace(key, lower_word)
|
||||
|
||||
return s
|
||||
|
||||
|
||||
return modified_string
|
||||
|
||||
|
||||
|
||||
|
|
@ -216,7 +234,7 @@ def replace_abbr_univ(token):
|
|||
elif token == "u " + city:
|
||||
return "univer " + city
|
||||
elif token == "tu " + city:
|
||||
return "technical univer " + city
|
||||
return "techn univer " + city
|
||||
else:
|
||||
return token
|
||||
|
||||
|
|
@ -224,7 +242,8 @@ def replace_abbr_univ(token):
|
|||
def remove_parentheses(text):
|
||||
return re.sub(r'\([^()]*\)', '', text)
|
||||
|
||||
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik']
|
||||
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik'] + [s.title() for s in countries] + countries
|
||||
|
||||
word_pattern = "|".join(map(re.escape, L))
|
||||
|
||||
def process_parentheses(text):
|
||||
|
|
@ -239,16 +258,15 @@ def process_parentheses(text):
|
|||
Returns:
|
||||
str: The modified string after processing parentheses.
|
||||
"""
|
||||
|
||||
text = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text)
|
||||
text_lower = text.lower()
|
||||
text_lower = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text_lower)
|
||||
|
||||
# Replace `(` with `,` and `)` with `,` if a word from L is inside
|
||||
text = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text)
|
||||
text_lower = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text_lower)
|
||||
|
||||
return text
|
||||
return text_lower
|
||||
|
||||
|
||||
|
||||
|
||||
def replace_comma_spaces(text):
|
||||
return text.replace(' ', ' ').replace(' , ', ', ')
|
||||
|
|
@ -313,6 +331,7 @@ def replace_newlines_with_space(text: str, repl: str = " ") -> str:
|
|||
|
||||
return cleaned
|
||||
|
||||
|
||||
def substrings_dict(string):
|
||||
"""
|
||||
Processes a given string by performing the following transformations:
|
||||
|
|
@ -361,7 +380,8 @@ def substrings_dict(string):
|
|||
modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bteknologi\b', 'technology', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bpolitehnica\b', 'polytechnic', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bpolite\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
|
||||
modified_value = re.sub(r'\bpolyte\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
|
||||
|
||||
modified_value = re.sub(r'\btechn\w*', 'techn', modified_value, flags=re.IGNORECASE)
|
||||
#modified_value = re.sub(r'techno\w*', 'techno', modified_value, flags=re.IGNORECASE)
|
||||
|
|
@ -375,16 +395,22 @@ def substrings_dict(string):
|
|||
index += 1
|
||||
|
||||
# Add the original substring to the dictionary
|
||||
# else:
|
||||
# dict_string[index] = value.lower().strip()
|
||||
# index += 1
|
||||
|
||||
|
||||
return dict_string
|
||||
|
||||
def split_country(text):
    """Separate a trailing country name from the rest of *text* with a comma.

    When the last space-separated word (lowercased) is in ``countries`` and
    the word before it does not start with ``univ``, the result is
    ``"<everything before>, <country>"`` with the country lowercased.
    Otherwise *text* is returned unchanged.

    The earlier version called an undefined bare ``startswith(...)``; the
    resulting NameError was swallowed by a bare ``except``, so the function
    never changed its input.
    """
    if not isinstance(text, str):
        return text

    words = text.split(' ')
    if len(words) < 2:
        # Nothing precedes the last word, so there is nothing to split off.
        return text

    last = words[-1].lower()
    previous = words[-2].lower()
    if last not in countries:
        return text
    # Keep e.g. "univer <country>" together, and do not add a second comma
    # when the country is already comma-separated.
    if previous.startswith('univ') or previous.endswith(','):
        return text
    return " ".join(words[:-1]) + ", " + last
|
||||
|
||||
|
||||
def clean_string_ror(input_string):
|
||||
def clean_string_lucky(input_string):
|
||||
|
||||
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
|
||||
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
|
||||
|
||||
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
|
||||
result = result.replace(' and ',' ')
|
||||
|
|
@ -402,7 +428,8 @@ def clean_string_ror(input_string):
|
|||
|
||||
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
|
||||
'universitatspital', 'universitatskliniken', 'universitetshospital',
|
||||
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik'
|
||||
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
|
||||
'universiteitsmuseum'
|
||||
}
|
||||
|
||||
result = replace_acronyms(result).replace('.', ' ')
|
||||
|
|
@ -425,7 +452,65 @@ def clean_string_ror(input_string):
|
|||
|
||||
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolitehnica\b', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
|
||||
# result = re.sub(r'techno\w*', 'techno', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'scien\w*', 'scien', result, flags=re.IGNORECASE)
|
||||
# result = re.sub(r'\bsaint\b', 'st', result, flags=re.IGNORECASE)
|
||||
|
||||
return result.strip()
|
||||
|
||||
|
||||
|
||||
def clean_string_ror(input_string):
|
||||
|
||||
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(fully_unescape(input_string.replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
|
||||
|
||||
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
|
||||
result = result.replace(' and ',' ')
|
||||
|
||||
|
||||
# Remove characters that are not from the Latin alphabet, or allowed punctuation
|
||||
result = remove_multi_digit_numbers(replace_comma_spaces(re.sub(r'[^a-zA-Z0-9\s,;/:.\-\—]', '', result).strip()))
|
||||
|
||||
# Restore the " - " sequence from the placeholder
|
||||
#result = result.replace(placeholder, " – ")
|
||||
result = result.replace(':',' ').replace(';',' ').replace('-',' ').replace('—',' ').replace(',',' ')
|
||||
# Replace consecutive whitespace with a single space
|
||||
|
||||
|
||||
|
||||
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
|
||||
'universitatspital', 'universitatskliniken', 'universitetshospital',
|
||||
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
|
||||
'universiteitsmuseum'
|
||||
}
|
||||
|
||||
result = replace_acronyms(result).replace('.', ' ')
|
||||
result = re.sub(r'\s+', ' ', result)
|
||||
|
||||
# Replace consecutive whitespace with a single space
|
||||
if not any(term in result.lower() for term in university_terms):
|
||||
|
||||
result = re.sub(r'universi\w*', 'univer', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bsaint\b', 'st', result,flags=re.IGNORECASE)
|
||||
result = re.sub(r'institu\w*', 'institu', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'labora\w*', 'labora', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'centre\b', 'center', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'centrum\b', 'center', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'hopital\b', 'hospital', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'hospital(?!s)\w*', 'hospital', result, flags=re.IGNORECASE)
|
||||
|
||||
#result = re.sub(r'centro\b', 'center', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
|
||||
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
|
||||
|
||||
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
|
||||
|
|
@ -436,13 +521,13 @@ def clean_string_ror(input_string):
|
|||
return result.strip()
|
||||
|
||||
def clean_string(input_string):
|
||||
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", " ")))))).strip()
|
||||
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(insert_space_between_lower_and_upper(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("’","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
|
||||
|
||||
# result = re.sub(r'(?<! )[–—-](?! )', ' ', input_string)
|
||||
|
||||
# print('h',input_string)
|
||||
|
||||
result = remove_stop_words(replace_double_consonants(replace_roman_numerals(insert_space_between_lower_and_upper(input_string).lower())))
|
||||
result = remove_stop_words(replace_double_consonants(replace_roman_numerals((input_string).lower())))
|
||||
|
||||
|
||||
# Remove characters that are not from the Latin alphabet, or allowed punctuation
|
||||
|
|
@ -458,12 +543,12 @@ def clean_string(input_string):
|
|||
#result = replace_roman_numerals(remove_stop_words(insert_space_between_lower_and_upper(result).lower()))
|
||||
|
||||
|
||||
return result.strip() # Strip leading/trailing spaces
|
||||
return split_country(result.strip()) # Strip leading/trailing spaces
|
||||
|
||||
def description(aff_string):
|
||||
aff_string = aff_string.replace('turkiye', 'turkey')
|
||||
aff_string = aff_string.replace('turkiye', 'turkey').lower()
|
||||
aff_string = aff_string.replace('kirgizistan', 'kyrgyzstan')
|
||||
|
||||
|
||||
descr = []
|
||||
countries_ = []
|
||||
words = re.split(r'[ ,;:/]+', aff_string)
|
||||
|
|
@ -473,16 +558,15 @@ def description(aff_string):
|
|||
# if w in city_names:
|
||||
# descr.append('city')
|
||||
w = re.sub(r'[^A-Za-z\s]', '', w)
|
||||
|
||||
if replace_acronyms(w) in countries:
|
||||
descr.append('country')
|
||||
countries_.append(w)
|
||||
|
||||
|
||||
if replace_acronyms(w) in us_states:
|
||||
descr.append('country')
|
||||
countries_.append('usa')
|
||||
|
||||
elif w in ['univer', 'institu', 'hospital', 'labora']:
|
||||
|
||||
elif w in ['univer', 'institu', 'hospital', 'labora', 'colege']:
|
||||
|
||||
descr.append('basic_key')
|
||||
elif w == 'and':
|
||||
|
|
@ -531,12 +615,12 @@ def split_and(string):
|
|||
tok_no_sl1 = ' '.join(token.replace('-', ' ').split())
|
||||
tok_no_sl2 = ' '.join(token.replace('—', ' ').split())
|
||||
tok_no = ' '.join(token.replace(' and ', ' ').replace(' at ', ' ').replace(' an ', ' ').replace('-', ' ').replace('—', ' ').split())
|
||||
if tok_no in dix_org:
|
||||
if tok_no in dix_name:
|
||||
token = tok_no
|
||||
|
||||
|
||||
else:
|
||||
if tok_no_and not in dix_org:
|
||||
if tok_no_and not in dix_name:
|
||||
# Store once instead of calling multiple times
|
||||
|
||||
if is_subsequence(replace_sequence, token_description):# and token.split(' and ', ' ') not in dix_org:
|
||||
|
|
@ -547,20 +631,20 @@ def split_and(string):
|
|||
else:
|
||||
token = tok_no_and
|
||||
|
||||
if tok_no_at not in dix_org:
|
||||
if tok_no_at not in dix_name:
|
||||
token = ' '.join(token.replace(' at ', ', ').split())
|
||||
else:
|
||||
token = tok_no_at
|
||||
|
||||
if tok_no_an not in dix_org:
|
||||
if tok_no_an not in dix_name:
|
||||
token = ' '.join(token.replace(' an ', ', ').split())
|
||||
else:
|
||||
token = tok_no_an
|
||||
if tok_no_sl1 not in dix_org:
|
||||
if tok_no_sl1 not in dix_name:
|
||||
token = ' '.join(token.replace('-', ',').split())
|
||||
else:
|
||||
token = tok_no_sl1
|
||||
if tok_no_sl2 not in dix_org:
|
||||
if tok_no_sl2 not in dix_name:
|
||||
token = ' '.join(token.replace('—', ',').split())
|
||||
else:
|
||||
token = tok_no_sl2
|
||||
|
|
@ -577,10 +661,10 @@ def reduce(light_aff):
|
|||
aff_no_symbols_d = substrings_dict(light_aff)
|
||||
substring_list = list(aff_no_symbols_d.values())
|
||||
#light_aff_final = ', '.join((substring_list))
|
||||
# print('h', substring_list)
|
||||
# print('h', substring_list)
|
||||
light_aff_final = split_and(', '.join((substring_list)))
|
||||
# print('th', light_aff_final)
|
||||
return light_aff_final
|
||||
# print('th', light_aff_final)
|
||||
return split_sub(light_aff_final)
|
||||
|
||||
|
||||
def unique_subset(L, D):
|
||||
|
|
@ -615,6 +699,7 @@ def str_radius_u(string, radius_u):
|
|||
return result
|
||||
|
||||
|
||||
sp_specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' and ' ' in k]
|
||||
|
||||
def str_radius_spec(string):
|
||||
spec = False
|
||||
|
|
@ -626,16 +711,32 @@ def str_radius_spec(string):
|
|||
except:
|
||||
pass
|
||||
if spec == False:
|
||||
return string
|
||||
|
||||
for x in sp_specific:
|
||||
if x in string:# or categ_dicts[x] == 'Acronyms':
|
||||
spec = True
|
||||
# print('CHECK',x)
|
||||
return x
|
||||
if spec ==False:
|
||||
return string
|
||||
#
|
||||
# def str_radius_spec(string):
|
||||
# spec = False
|
||||
# for x in only_specific:
|
||||
# if x in string:# or categ_dicts[x] == 'Acronyms':
|
||||
# spec = True
|
||||
# return x
|
||||
# if spec ==False:
|
||||
# return string
|
||||
|
||||
|
||||
|
||||
def shorten_keywords(affiliations_simple, radius_u):
|
||||
affiliations_simple_n = []
|
||||
|
||||
for aff in affiliations_simple:
|
||||
# print('check aff', aff)
|
||||
if aff in dix_org:
|
||||
# print('check aff', aff)
|
||||
if aff in dix_name:
|
||||
# print('in dix')
|
||||
affiliations_simple_n.append(aff)
|
||||
|
||||
elif 'univer' in aff:
|
||||
|
|
|
|||
|
|
@ -2,59 +2,33 @@ import Levenshtein
|
|||
from affro.helpers.functions import *
|
||||
from affro.helpers.create_input import *
|
||||
|
||||
|
||||
|
||||
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']
|
||||
|
||||
|
||||
# print('HERE', len(dix_org))
|
||||
# print('HERE_city', len(dix_city))
|
||||
# print('HERE_country', len(dix_country))
|
||||
|
||||
|
||||
def index_multiple_matchings(pairs):
    """Map each keyword to the number of candidates found for it.

    Every element of *pairs* is a non-empty list of candidate tuples whose
    first field is the keyword; the keyword of the first tuple is used as the
    key and the list length as the value.  A later list with the same keyword
    overwrites an earlier one.
    """
    return {group[0][0]: len(group) for group in pairs}
|
||||
|
||||
def keep_highest_url(lst):
    """Keep, for every name, the ``(name, score, url)`` entry with the greatest url.

    Entries are compared on their third field only; on equal urls the entry
    seen first is kept.  The result preserves the order in which names first
    appeared.
    """
    winners = {}
    for entry in lst:
        name, _score, url = entry
        current = winners.get(name)
        if current is None or url > current[2]:
            winners[name] = entry
    return [winners[name] for name in winners]
|
||||
|
||||
def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
||||
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
similar_k = []
|
||||
pairs_k = []
|
||||
total_pairs = 0
|
||||
# if keyword in dix_org:
|
||||
# print('lucky')
|
||||
# pairs_k.append((keyword,keyword,1,dix_org[keyword], dix_id_country[dix_org[keyword]]))
|
||||
|
||||
for x in candidates_:
|
||||
# print('keyword', keyword)
|
||||
|
||||
if is_contained(keyword, x):
|
||||
# print(0,x,total_pairs)
|
||||
|
||||
if is_contained(keyword, x):# and ('univ' in x or 'inst' in x or len(get_candidates([])) < len(dix_name)):
|
||||
# print('keyword contained')
|
||||
x_vector = vectorizer.fit_transform([x]).toarray()
|
||||
keyword_vector = vectorizer.transform([keyword]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, keyword_vector)[0][0]
|
||||
# print('similarity', similarity)
|
||||
if similarity > min(simU, simG):
|
||||
if ('univ' in keyword and 'univ' in x) and similarity > simU:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
|
||||
|
|
@ -65,24 +39,17 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
|||
|
||||
|
||||
elif (not 'univ'in keyword and not 'univ' in x) and similarity > simG:
|
||||
# print('pass', keyword, x, similarity)
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
|
||||
|
||||
elif is_contained(x, keyword):
|
||||
# print(0.5,x,total_pairs)
|
||||
if ('univ'in keyword and 'univ' in x):
|
||||
# print(1,x,total_pairs)
|
||||
|
||||
|
||||
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
|
@ -91,7 +58,7 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
|||
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
|
||||
if similarity > simU: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
if k not in dix:
|
||||
|
|
@ -102,25 +69,27 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
|
|||
|
||||
|
||||
elif not 'univ' in keyword and not 'univ' in x:
|
||||
|
||||
# print('not uni')
|
||||
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
|
||||
|
||||
if similarity > simG: #max(0.82,sim):
|
||||
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
|
||||
pairs_k.append((keyword,x,similarity))
|
||||
total_pairs += 1 # Track total number of pairs
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
# total_pairs += len(pairs_k) # Track total number of pairs
|
||||
|
||||
if total_pairs >= limit: # Stop if we reach
|
||||
return []
|
||||
|
||||
# print('end find_candidate', pairs_k)
|
||||
return pairs_k
|
||||
|
||||
|
||||
|
|
@ -131,7 +100,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
"""
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
univ_num = light_raw.lower().count('univ')
|
||||
result = []
|
||||
best = []
|
||||
|
||||
|
|
@ -141,7 +109,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
affil = pair_group[0][0]
|
||||
num_uni_p = affil.count('univ')
|
||||
|
||||
# print('AFFIL', affil)
|
||||
for p in pair_group:
|
||||
organization, confidence = p[1], p[2]
|
||||
|
||||
|
|
@ -183,10 +150,8 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
|
||||
# Sort by similarity score (descending) and then lexicographically
|
||||
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
||||
# print('REDUCED BEST: ', reduced_best)
|
||||
|
||||
result.extend(reduced_best)
|
||||
# print('RESULT EXT: ', result)
|
||||
|
||||
# Step 3: Limit university-related matches
|
||||
univ_list = [r for r in result if 'univ' in r[0]]
|
||||
|
|
@ -214,247 +179,7 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
|
|||
# Convert to list format
|
||||
final_result = [[key, value[0]] for key, value in sorted(result_dict.items(), key=lambda x: x[1][1], reverse=True)]
|
||||
|
||||
# print("RESULT TO USE: ", final_result)
|
||||
return final_result
|
||||
|
||||
|
||||
|
||||
def Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit):
|
||||
|
||||
"""
|
||||
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
|
||||
|
||||
Args:
|
||||
m (int): The number of DOIs to check.
|
||||
DF (DataFrame): The input DataFrame containing affiliation data.
|
||||
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
|
||||
simU (float): Similarity threshold for universities.
|
||||
simG (float): Similarity threshold for non-universities.
|
||||
|
||||
Returns:
|
||||
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
|
||||
"""
|
||||
clean_aff = input[0]
|
||||
# print('CLEAN_AFF (LVL1): ', clean_aff)
|
||||
light_aff = input[1].replace(' gmbh', ' ').strip()
|
||||
# print('LIGHT_AFF (LVL2): ', light_aff)
|
||||
|
||||
df_list = input[2]
|
||||
|
||||
countries_list = input[3]
|
||||
# print('COUNTRIES_LIST: ', countries_list)
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
||||
result = {}
|
||||
pairs = []
|
||||
|
||||
keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
|
||||
|
||||
candidates = get_candidates(countries_list)
|
||||
|
||||
# print('KEYWORDS: ', keywords)
|
||||
if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0])>1:
|
||||
|
||||
for k,s in enumerate(keywords):
|
||||
pairs_k = []
|
||||
# print('try', s)
|
||||
try:
|
||||
pairs_k.append((s,s,1,dix_org[s],dix_id_country[dix_org[s]]))
|
||||
# print('LUCKY')
|
||||
|
||||
# pairs.append((s,s,similarity,dix_org[s], dix_id_country[dix_org[s]]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [s]
|
||||
else:
|
||||
dix[k].append(s)
|
||||
|
||||
except:
|
||||
# print('NOT LUCKY')
|
||||
pairs_k = find_candidate(s, k , dix, simU, simG, candidates, limit)
|
||||
# print('PAIRS K: ', pairs_k)
|
||||
|
||||
result[k] = pairs_k
|
||||
if len(pairs_k)>0:
|
||||
# print('PAIRS K>0: ', pairs_k)
|
||||
|
||||
pairs.append(pairs_k)
|
||||
|
||||
# print('PAIRS: ', pairs)
|
||||
multi = index_multiple_matchings(pairs)
|
||||
# print('MULTIL ',multi)
|
||||
|
||||
need_check_keys = []
|
||||
ready_keys = []
|
||||
ready_best = []
|
||||
for keyword in multi:
|
||||
try:
|
||||
if multi[keyword]>1:
|
||||
need_check_keys.append(keyword)
|
||||
else:
|
||||
for p in pairs:
|
||||
if keyword in p[0]:
|
||||
if p[0][1] not in ready_keys:
|
||||
ready_keys.append(p[0][1])
|
||||
|
||||
ready_best.append([p[0][1], p[0][2]])
|
||||
except:
|
||||
pass
|
||||
# print('READY KEYWORD: ', ready_keys)
|
||||
# print('READY BEST: ', ready_best)
|
||||
|
||||
# print('NEED CHECK KEYWORD: ', need_check_keys)
|
||||
|
||||
pairs_check = [ pair for pair in pairs if pair[0][0] in need_check_keys ]
|
||||
# print('NEED CHECK PAIRS: ', pairs_check)
|
||||
|
||||
|
||||
if len(need_check_keys)>0:
|
||||
best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
|
||||
# print('OUTPUT BEST: ', best0)
|
||||
best1 = {x[0]:dix_org[x[0]] for x in best0 }
|
||||
best01 = unique_subset(best0, best1)
|
||||
matched_org = list(set([x[0] for x in best01])) + ready_keys
|
||||
best = best01 + ready_best
|
||||
|
||||
|
||||
|
||||
# print('NEW BEST',best01)
|
||||
else:
|
||||
best = ready_best
|
||||
matched_org = ready_keys
|
||||
|
||||
|
||||
# print('FINAL BEST: ', best)
|
||||
## print('MATCHED: ', matched_org)
|
||||
|
||||
id_list = []
|
||||
|
||||
for org_list in best:
|
||||
org = org_list[0]
|
||||
conf = org_list[1]
|
||||
if dix_mult[org] == 'unique':
|
||||
# print('unique:', org)
|
||||
if 'institu' in org and 'univ' in org:
|
||||
#print('both inst and univ', clean_aff)
|
||||
if dix_city[org][0] not in clean_aff and dix_country[org][0] not in clean_aff:
|
||||
#print('pass')
|
||||
pass
|
||||
else:
|
||||
#print('correct')
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
else:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
|
||||
|
||||
else:
|
||||
# print('not unique:', org)
|
||||
if org in dix_city:
|
||||
match_found = False
|
||||
|
||||
for city in dix_city[org]:
|
||||
if city[0] in clean_aff:
|
||||
if city[0] not in org:
|
||||
# print('city', city[0], org)
|
||||
id_list.append([org, conf, city[1]])
|
||||
match_found = True
|
||||
break
|
||||
else:
|
||||
if clean_aff.count(city[0]) >1:
|
||||
id_list.append([org, conf, city[1]])
|
||||
match_found = True
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
for city in dix_city[org]:
|
||||
if city[0] in clean_aff and city[0] not in org:
|
||||
id_list.append([org, conf, city[1]])
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
match_found2 = False
|
||||
match_found3 = False
|
||||
|
||||
all_countries = list(set([c[0] for c in dix_country[org]]))
|
||||
if len(all_countries) > 1:
|
||||
|
||||
for country in dix_country[org]:
|
||||
# print('country', country[0], org)
|
||||
|
||||
tokens = set(clean_aff.lower().split())
|
||||
text = clean_aff.lower()
|
||||
|
||||
if country[0] == 'united states' and (
|
||||
'united states' in text
|
||||
or {'usa', 'usa.'} & tokens
|
||||
or 'u.s.a.' in text
|
||||
):
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
|
||||
if country[0] == 'united kingdom' and (
|
||||
'united kingdom' in text
|
||||
or {'uk', 'uk.'} & tokens
|
||||
or 'u.k.' in text
|
||||
):
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
# print('check country', clean_aff)
|
||||
# if country[0] == 'united states' and (country[0] in clean_aff or 'usa' in clean_aff.split() or 'usa.' in clean_aff.split() or 'u.s.a.' in clean_aff):
|
||||
# id_list.append([org, conf, country[1]])
|
||||
# match_found2 = True
|
||||
# match_found3 = True
|
||||
# break
|
||||
|
||||
# if country[0] == 'united kingdom' and (country[0] in clean_aff or 'uk' in clean_aff.split() or 'u.k.' in clean_aff):
|
||||
# id_list.append([org, conf, country[1]])
|
||||
# match_found2 = True
|
||||
# match_found3 = True
|
||||
# break
|
||||
if country[0] == 'turkey' and (
|
||||
'turkiye' in text
|
||||
#or 'u.k.' in text
|
||||
):
|
||||
# print('here turkey')
|
||||
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
elif country[0].split()[0] in clean_aff:
|
||||
|
||||
if country[0] not in org:
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
else:
|
||||
single_country = all_countries[0]
|
||||
if single_country in clean_aff:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if not match_found3:
|
||||
for country in dix_country[org]:
|
||||
if country[0] in clean_aff and country[0] in org:
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
break
|
||||
if not match_found2:
|
||||
for sp in specific:
|
||||
if sp in org:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
|
||||
|
||||
# print("RESULT: ", id_list)
|
||||
id_list_final = keep_highest_url(id_list)
|
||||
|
||||
return id_list_final
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because one or more lines are too long
|
|
@ -16,9 +16,12 @@
|
|||
"universitatskinderklinik": "univer childrens hospital",
|
||||
"universitatskliniken": "univer hospital",
|
||||
"universitätsklinik": "univer hospital",
|
||||
"uniklinik" : "univer hospital",
|
||||
"universitatsmedizin": "univer medicine",
|
||||
"universitatsbibliothek": "univer library",
|
||||
"universiteitsmuseum": "univer museum",
|
||||
"nat.": "national",
|
||||
"pamantasan": "univer",
|
||||
"uniaersity": "univer",
|
||||
"univesity": "univer",
|
||||
"unversity": "univer",
|
||||
|
|
@ -76,5 +79,29 @@
|
|||
"medisch": "medical",
|
||||
"hahn-meitner-institut berlin" : "helmholtz-zentrum berlin",
|
||||
"fachhochschule gelsenkirchen" : "westfalische hochschule",
|
||||
"turkiye" : "turkey"
|
||||
"turkiye" : "turkey",
|
||||
"trinity colege university" : "trinity colege",
|
||||
"tyndal institute" : "tyndal national institute",
|
||||
"st patricks colege, drumcondra" : "dublin city university",
|
||||
"ucd dublin" : "univer colege dublin",
|
||||
"department university" : "department, university",
|
||||
"xi an" : "xian",
|
||||
"sligo general hospital" : "sligo univer hospital",
|
||||
"trinity colege cambridge" : "univer cambridge",
|
||||
"trinity colege, cambridge" : "univer cambridge",
|
||||
"st johns colege, cambridge" : "univer cambridge",
|
||||
"st johns colege cambridge" : "univer cambridge",
|
||||
"kings colege, cambridge" : "univer cambridge",
|
||||
"kings colege cambridge" : "univer cambridge",
|
||||
"eire" : "ireland",
|
||||
"trinity colege, ireland" : "trinity colege dublin",
|
||||
"trinity colege ireland" : "trinity colege dublin",
|
||||
"gilan" : "guilan",
|
||||
"freiberg univer mining techn" : "techn univer bergakademie freiberg",
|
||||
"vishwavidyalaya" : "univer",
|
||||
"rwi esen" : "rwi – leibniz institu economic research",
|
||||
"t. d. medical colege" : "alapuzha medical colege",
|
||||
"sulaymaniyah" : "sulaimani",
|
||||
"-ang" : " ang"
|
||||
|
||||
}
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
|
|
@ -1,6 +1,9 @@
|
|||
eire
|
||||
turkiye
|
||||
trinidad
|
||||
united
|
||||
kingdom
|
||||
states
|
||||
emirates
|
||||
hong
|
||||
niederland
|
||||
holand
|
||||
|
|
|
|||
|
|
@ -20,4 +20,6 @@ di
|
|||
l
|
||||
street
|
||||
post-box
|
||||
e.v.
|
||||
e.v.
|
||||
do
|
||||
ng
|
||||
|
|
@ -8,4 +8,11 @@ universitatsbibliothek
|
|||
universitatspital
|
||||
universitetsjukhuset
|
||||
universitatsaugenklinik
|
||||
univesitatsfrauenklinik
|
||||
univesitatsfrauenklinik
|
||||
universitetscentralsjukhus
|
||||
universitatsverlag
|
||||
universitaetsklinikum
|
||||
universitatsalianz
|
||||
universalmuseum
|
||||
universitatszahnklinik
|
||||
universitetsforlaget
|
||||
|
|
@ -1,4 +1,4 @@
|
|||
python_Levenshtein==0.27.1
|
||||
scikit_learn==1.4.2
|
||||
setuptools==75.8.0
|
||||
Unidecode==1.3.8
|
||||
unidecode==1.3.8
|
||||
|
|
|
|||
387
test.ipynb
387
test.ipynb
|
|
@ -1,387 +0,0 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from affro.core import run_affro\n",
|
||||
"from affro.core import matchings_affro\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/05ect4e57',\n",
|
||||
" 'Louisiana State University',\n",
|
||||
" 0.8660254037844388,\n",
|
||||
" 'active',\n",
|
||||
" 'united states')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('Center for Computation and Technology#R##N#Louisiana State University')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/02x8svs93',\n",
|
||||
" 'Near East University',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'cyprus')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('Department of Electrical and Electronic Engineering, Intelligent Systems Research Group (ISRG), Near East University, Mersin 10, Türkiye')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/010b9wj87',\n",
|
||||
" 'Boston Medical Center',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'united states'),\n",
|
||||
" ('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/03ps5d564',\n",
|
||||
" 'Boston University School of Medicine',\n",
|
||||
" 1.0,\n",
|
||||
" 'withdrawn',\n",
|
||||
" 'united states'),\n",
|
||||
" ('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/05qwgg493',\n",
|
||||
" 'Boston University',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'united states')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('--label omited: 2--maxwel finland laboratory infectious diseases, boston medical center, boston university school medicine, boston, masachusets')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/010b9wj87',\n",
|
||||
" 'Boston Medical Center',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'united states'),\n",
|
||||
" ('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/03ps5d564',\n",
|
||||
" 'Boston University School of Medicine',\n",
|
||||
" 1.0,\n",
|
||||
" 'withdrawn',\n",
|
||||
" 'united states'),\n",
|
||||
" ('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/05qwgg493',\n",
|
||||
" 'Boston University',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'united states')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('--label omited: 2--maxwel finland laboratory infectious diseases, boston medical center, boston university school medicine, boston, masachusets')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/045hgzm75',\n",
|
||||
" 'Selçuk University',\n",
|
||||
" 0.7071067811865475,\n",
|
||||
" 'active',\n",
|
||||
" 'turkey'),\n",
|
||||
" ('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/04frf8n21',\n",
|
||||
" 'Kyrgyz-Türkish Manas Üniversity',\n",
|
||||
" 0.816496580927726,\n",
|
||||
" 'active',\n",
|
||||
" 'kyrgyzstan')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('selcuk universitesi veteriner fakultesi, patoloji anabilim dali, kampus, konya,turkiye kirgizistan-turkiye manas universitesi, veteriner fakultesi, biskek/kirgizistan')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/045hgzm75',\n",
|
||||
" 'Selçuk University',\n",
|
||||
" 0.7071067811865475,\n",
|
||||
" 'active',\n",
|
||||
" 'turkey'),\n",
|
||||
" ('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/04frf8n21',\n",
|
||||
" 'Kyrgyz-Türkish Manas Üniversity',\n",
|
||||
" 0.816496580927726,\n",
|
||||
" 'active',\n",
|
||||
" 'kyrgyzstan')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('selcuk universitesi veteriner fakultesi, patoloji anabilim dali, kampus, konya,turkiye kirgizistan-turkiye manas universitesi, veteriner fakultesi, biskek/kirgizistan')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/03ad39j10',\n",
|
||||
" 'University of Pisa',\n",
|
||||
" 0.816496580927726,\n",
|
||||
" 'active',\n",
|
||||
" 'italy')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro(\"Universita'vjh di pisa\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/010jx2260',\n",
|
||||
" 'National Institute of Agricultural Botany',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'united kingdom')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro('niab, united kingdom')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/024bc3e07',\n",
|
||||
" 'Google (United Kingdom)',\n",
|
||||
" 1.0,\n",
|
||||
" 'active',\n",
|
||||
" 'united kingdom')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro(\"google, United Kingdom\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[('affro',\n",
|
||||
" '2.2.2',\n",
|
||||
" 'ror',\n",
|
||||
" 'https://ror.org/03ad39j10',\n",
|
||||
" 'University of Pisa',\n",
|
||||
" 0.816496580927726,\n",
|
||||
" 'active',\n",
|
||||
" 'italy')]"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"matchings_affro(\"Universita'vhj di pisa\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"[{'provenance': 'affro',\n",
|
||||
" 'version': '2.2.2',\n",
|
||||
" 'pid': 'ror',\n",
|
||||
" 'value': 'https://ror.org/04gnjpq42',\n",
|
||||
" 'name': 'National and Kapodistrian University of Athens',\n",
|
||||
" 'confidence': 1,\n",
|
||||
" 'status': 'active',\n",
|
||||
" 'country': 'greece'}]"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"run_affro('university of athens')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "base",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.9"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 2
|
||||
}
|
||||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue