Compare commits

...

7 Commits
2.2.2 ... main

Author SHA1 Message Date
mkallipo 30e88e87f0 clean_string_lucky 2025-12-15 11:49:48 +01:00
mkallipo abd11209b9 bugs 2025-12-12 17:52:37 +01:00
mkallipo b274291443 bugs, new ids 2025-11-19 11:23:53 +01:00
mkallipo baceddf91f just a bug 2025-11-18 19:52:58 +01:00
mkallipo 1ecee1372f just a bug 2025-11-18 18:45:57 +01:00
mkallipo 2da4348884 Fixed bugs, added new OpenOrgs IDs, updated ROR IDs, updated file structure, added is_first label to org names, improved organization's category handling 2025-11-18 13:16:17 +01:00
mkallipo c9914a47d0 new structure for the dictionaries, new openorgs ids, ror version oct 2025-10-17 17:25:01 +02:00
53 changed files with 5046772 additions and 3136690 deletions

View File

@ -1,6 +1,6 @@
Metadata-Version: 2.4
Name: affro
Version: 2.2.2
Version: 3.1.1
Summary: A tool to resolve organization names to ROR or OpenOrgs IDs
Home-page: https://code-repo.d4science.org/mkallipo/affRo
Author: Myrto Kallipoliti

View File

@ -10,17 +10,15 @@ affro.egg-info/dependency_links.txt
affro.egg-info/top_level.txt
affro/helpers/__init__.py
affro/helpers/create_input.py
affro/helpers/disambiguation.py
affro/helpers/find_id.py
affro/helpers/find_name.py
affro/helpers/functions.py
affro/helpers/matching.py
affro/jsons/dix_categ.json
affro/jsons/dix_city.json
affro/jsons/dix_country.json
affro/jsons/dix_country_legalnames.json
affro/jsons/dix_id_country.json
affro/jsons/dix_id_name.json
affro/jsons/dix_mult.json
affro/jsons/dix_org.json
affro/jsons/dix_status.json
affro/jsons/dix_id.json
affro/jsons/dix_name.json
affro/jsons/replacements.json
affro/txts/city_names.txt
affro/txts/country_names.txt

View File

@ -1 +1 @@
__version__ = "2.2.2"
__version__ = "3.1.1"

View File

@ -3,156 +3,150 @@ import sys
from affro.helpers.functions import *
from affro.helpers.matching import *
from affro.helpers.create_input import *
import json
from affro.helpers.find_name import *
from affro.helpers.find_id import *
from affro.helpers.disambiguation import *
from . import __version__
VERSION = __version__
dix_org = load_json('jsons/dix_org.json')
dix_mult = load_json('jsons/dix_mult.json')
dix_city = load_json('jsons/dix_city.json')
dix_country = load_json('jsons/dix_country.json')
dix_status = load_json('jsons/dix_status.json')
dix_id_name = load_json('jsons/dix_id_name.json')
dix_id_country = load_json('jsons/dix_id_country.json')
dix_id_name = load_json('jsons/dix_id_name.json')
dix_id = load_json('jsons/dix_id.json')
dix_name = load_json('jsons/dix_name.json')
dix_status_new = {k :[dix_status[k][0], dix_status[k][1].split(', ')] for k in dix_status}
us_states = [
"alabama", "alaska", "arizona", "arkansas", "california",
"colorado", "conecticut", "delaware", "florida", "georgia",
"hawaii", "idaho", "ilinois", "indiana", "iowa",
"kansas", "kentucky", "louisiana", "maine", "maryland",
"masachusets", "michigan", "minesota", "misisipi", "misouri",
"montana", "nebraska", "nevada", "new hampshire", "new jersey",
"new mexico", "new york", "north carolina", "north dakota", "ohio",
"oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
"south dakota", "tenesee", "texas", "utah", "vermont",
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
]
def produce_result(input, simU, simG, limit):
best_name = find_name(input, dix_name, simU, simG, limit)
id_result = find_id(input, best_name, dix_name)
result = disamb(input, id_result, dix_id)
def contains_us_state(text):
text = text.lower()
return any(state in text for state in us_states)
def find_ror(input, simU, simG, limit):
light_aff = input[0]
result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
results_upd = []
for r in result:
if "openorgs" in r[2]:
results_upd.append([r[1], 'openorgs', r[2], 'active', dix_id_country[r[2]]])
else:
if dix_status_new[r[2]][0] == 'active':
results_upd.append([r[1], 'ror', r[2], 'active', dix_id_country[r[2]]])
else:
if dix_status_new[r[2]][1][0] == '':
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
else:
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0],dix_id_country[r[2]]])
for link in (dix_status_new[r[2]][1]):
results_upd.append([r[1], 'ror', link, 'active',dix_id_country[r[2]],dix_id_country[link]])
if len(results_upd) > len(set(description(light_aff)[1])):
final_matching = []
light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
for id_ in results_upd:
country = dix_id_country[id_[2]]
if country == 'united states':
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
final_matching.append(id_)
elif country == 'united kingdom':
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
final_matching.append(id_)
elif 'korea' in country:
if 'korea' in light_aff_tokens:
final_matching.append(id_)
elif country in light_aff:
final_matching.append(id_)
if len(final_matching)>0:
result_dict = [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION,'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in final_matching]
return result_dict
else:
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]],'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
elif len(results_upd)>0:
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
else:
result_dict = []
return result_dict
return result
def run_affro(raw_aff_string):
lucky_guess = clean_string_ror(raw_aff_string)
lucky_guess = clean_string_lucky(raw_aff_string)
# print(lucky_guess)
try:
if lucky_guess in dix_org:
if dix_mult[lucky_guess] == "unique":
if 'openorgs' in dix_org[lucky_guess]:
if lucky_guess in dix_name:
# print('lucky guess hit', lucky_guess)
# print('lucky guess found', dix_name[lucky_guess])
if len(dix_name[lucky_guess]) == 1:
id_ = dix_name[lucky_guess][0]['id']
name_ = dix_id[id_]['name']
country_ = dix_id[id_]['country']
status_ = dix_id[id_]['status']
if 'openorgs' in id_:
return[{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
else:
if dix_status_new[dix_org[lucky_guess]][0] == 'active':
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
elif dix_status_new[dix_org[lucky_guess]][1][0]== '':
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
if status_[0] == 'active':
# print('active')
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
elif status_[0]== '':
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
else:
res = [{'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
for successor in dix_status_new[dix_org[lucky_guess]][1]:
res.append({'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]})
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
for successor in status_[1]:
if successor != '':
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
return res
else:
cand_ids = [
key
for _, key in dix_city[lucky_guess]
if ("ror" in key and dix_status_new[key][0] == "active") or ("openorgs" in key)
]
num_countries = len(
set(
dix_id_country[x[1]]
for x in dix_city[lucky_guess]
if ("ror" in x[1] and dix_status_new[x[1]][0] == "active") or ("openorgs" in x[1])
)
)
# print('multiple candidates')
ids = [x['id'] for x in dix_name[lucky_guess]]
cand_ids = [id for id in ids if is_first(id, lucky_guess) == 'y']
# print('cand_ids', cand_ids)
# pick the ror id where 'first' == 'y' (None if not found)
if len(cand_ids) !=1:
# print('secondary conditions')
conditions = [
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
and dix_id[key]['top_level'][0] == 'y') \
or ("openorgs" in key),
if len(cand_ids) == 1 or num_countries == 1:
if 'openorgs' in dix_org[lucky_guess]:
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
and dix_id[key]['parent'][0] == 'y') \
or ("openorgs" in key),
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active") \
or ("openorgs" in key)
]
for cond in conditions:
cand_ids = [key for key in ids if cond(key)]
if cand_ids:
# print('break')
break
if len(cand_ids) == 0:
# print('check result')
result = produce_result(create_df_algorithm(raw_aff_string, 10), 0.42, 0.82, 500)
return result
# print('cand_ids',cand_ids)
if len(cand_ids) == 1:# or num_countries == 1:
id_ = cand_ids[0]
# print('id',id_)
name_ = dix_id[id_]['name']
country_ = dix_id[id_]['country']
status_ = dix_id[id_]['status']
if 'openorgs' in id_:
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
else:
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
else:
return []
if status_[0] == 'active':
# print('active')
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
elif status_[0]== '':
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
else:
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
for successor in status_[1]:
if successor != '':
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
return res
# return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country':country_}]
else:
found = False
for triplet in dix_name[lucky_guess]:
if triplet['first'] == 'y':
found = True
id_ = triplet['id']
name_ = dix_id[id_]['name']
country_ = dix_id[id_]['country']
status_ = dix_id[id_]['status']
if 'openorgs' in id_:
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
else:
if status_[0] == 'active':
# print('active')
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
elif status_[0]== '':
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
else:
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
for successor in status_[1]:
if successor != '':
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
return res
if found == False:
return []
else:
# print('No lucky guess, running algorithm...')
result = find_ror(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
# print('lucky guess miss')
result = produce_result(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
return result
except Exception as e:
# Return some indication of an error, or log the row
print(f"Error: {str(e)}")
print(f"Error end: {str(e)}")
print(raw_aff_string)
pass
def matchings_affro(aff_string):
@ -168,7 +162,7 @@ def matchings_affro(aff_string):
# Create the result as a tuple that matches matchings_schema
result = []
for matching in matchings:
# Assuming 'matching' is a dictionary that contains 'provenance', 'version', 'pid', 'value', 'name', 'confidence', 'status', 'country'
# Assuming 'matching' is a dictionary that contains 'provenance', 'affro', 'value', 'confidence', 'status'
result.append((
matching.get("provenance", None),
matching.get("version", None),
@ -178,7 +172,6 @@ def matchings_affro(aff_string):
float(matching.get("confidence", None)),
matching.get("status", None),
matching.get("country", None)
))
if len(result)>0:
return result
@ -189,5 +182,4 @@ def matchings_affro(aff_string):
return ()

View File

@ -1,12 +1,9 @@
from affro.helpers.functions import *
def valueToCategory(value):
flag = 0
for k in categ_dicts:
if k in value and categ_dicts[k] in categ_string.split('|'):
flag = 1
return flag
return categ_dicts[k]
# tokenization
@ -28,21 +25,30 @@ protect = ['national univer ireland',
'state univer',
'rijksuniver',
'rijks univer',
'univer medical center'
'univer medical center',
'royal colege surgeons',
'st patricks colege',
'institu techn',
'trinity colege',
'st johns colege',
'wiliam beaumont hospital'
]
def create_df_algorithm(raw_aff_string, radius_u):
clean_aff = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
#print(0, clean_aff)
countries_list = description(clean_aff)[1]
aff_no_symbols_d = substrings_dict(reduce(clean_aff))
#print(0.5, aff_no_symbols_d)
substring_list = [replace_abbr_univ(x) for x in list(aff_no_symbols_d.values())]
#print(1, substring_list)
# for k, word in enumerate(substring_list):
# print(word)
# if word in protect and substring_list[k+1] in city_names:
# print('y')
# word = word + ', ' + substring_list[k+1]
# substring_list[k] = word
i = 0
# print(substring_list,'substring_list')
while i < len(substring_list) - 1:
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names): #substring_list[i+1] in city_names:
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names+countries): #substring_list[i+1] in city_names:
substring_list[i] = substring_list[i] + ' ' + substring_list[i+1]
i = i+2
continue
@ -61,31 +67,18 @@ def create_df_algorithm(raw_aff_string, radius_u):
i = i+1
continue
# elif 'lab' in substring_list[i] and ('colege' in substring_list[i+1] or 'dep' in substring_list[i+1] or 'school' in substring_list[i+1]):
# if not 'univ' in substring_list[i]: #'inst' in substring_list[i+1] or
# substring_list.pop(i)
# else:
# i = i+1
# continue
else:
i += 1
# print(1.4, substring_list)
light_aff = (', '.join((substring_list)))
# print(1.5, light_aff)
substring_list = [x for x in substring_list if x.replace(' gmbh','') not in city_names+remove_list]
# print(1.7,substring_list)
substring_list0 = [shorten_keywords([x], radius_u) for x in substring_list if len(shorten_keywords([x],radius_u))>0]
# print(2,substring_list0 )
substring_list1 = [inner for outer in substring_list0 for inner in outer]
# print(3,substring_list1 )
aff_list = [{"index": i, "keywords": substring_list1[i], "category": valueToCategory(substring_list1[i])} for i in range(len(substring_list1))]
filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
filtered_list = [entry for entry in aff_list if type(entry.get("category")) == str]
return [clean_aff, light_aff, filtered_list, countries_list]

View File

@ -0,0 +1,155 @@
from affro.helpers.functions import *
from affro.helpers.create_input import *
from .. import __version__
VERSION = __version__
# US state names with double consonants collapsed (e.g. "conecticut",
# "ilinois", "masachusets") so they match affiliation strings that have been
# normalized by replace_double_consonants before lookup.
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "conecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "ilinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "masachusets", "michigan", "minesota", "misisipi", "misouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
    # "tenesee" fixed from "tennesee": the stray double 'n' broke the
    # collapsed-consonant convention (and disagrees with the sibling list in
    # the package entry module), so Tennessee affiliations never matched.
    "south dakota", "tenesee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming"
]


def contains_us_state(text):
    """Return True if any normalized US state name occurs as a substring of *text*.

    The input is lowercased first; matching is plain substring containment.
    """
    text = text.lower()
    return any(state in text for state in us_states)
# def get_city(name, dix_name):
# return {x['city'] : x['id'] for x in dix_name[name]}
def convert_to_result(id_list_, dix_id):
    """
    Convert raw match rows into affro result dictionaries.

    Args:
        id_list_: rows of the form [org_name, score, id_value].
        dix_id: mapping id -> {'name': ..., 'country': ...,
                'status': [primary_status, successor_id_list]}.

    Returns:
        list[dict]: one entry per id — plus one per non-empty successor of an
        inactive ROR id — each carrying provenance/version/pid/value/name/
        confidence/status/country.
    """
    def make_entry(pid, val, nm, conf, st, ctry):
        # Single place that shapes the output record.
        return {
            'provenance': 'affro',
            'version': VERSION,
            'pid': pid,
            'value': val,
            'name': nm,
            'confidence': conf,
            'status': st,
            'country': ctry
        }

    result_dict = []
    for r in id_list_:
        # Confidence is in r[1]; cap it at 1.0.
        score = min(r[1], 1.0)
        value = r[2]
        rec = dix_id.get(value)
        if rec is None:
            # Missing metadata for this id — skip (or log if you want).
            continue
        name = rec.get('name')
        country = rec.get('country')
        status_field = rec.get('status', [])
        primary_status = status_field[0] if len(status_field) > 0 else None
        secondary = status_field[1] if len(status_field) > 1 else []

        if "openorgs" in value:
            # OpenOrgs ids are always reported as active.
            result_dict.append(make_entry('openorgs', value, name, score, 'active', country))
            continue
        # ROR branch.
        if primary_status == 'active':
            result_dict.append(make_entry('ror', value, name, score, 'active', country))
            continue
        # Primary status is not active: always report the record itself with
        # its real status...
        result_dict.append(make_entry('ror', value, name, score, primary_status, country))
        # ...then add each successor as active, unless the successor list's
        # first element is the empty-string sentinel (meaning "no links").
        if not (secondary and secondary[0] == ''):
            for link in secondary:
                if not link:
                    continue
                link_rec = dix_id.get(link, {})
                result_dict.append(
                    make_entry('ror', link, link_rec.get('name'), score, 'active', link_rec.get('country'))
                )
    return result_dict
def count_active(items):
    """Count the entries whose 'status' field equals 'active'."""
    active_entries = [entry for entry in items if entry.get("status") == "active"]
    return len(active_entries)
def disamb(input, id_list_, dix_id):
    """
    Pick the final matches for an affiliation from candidate id rows.

    Args:
        input: algorithm tuple; input[0] is the cleaned affiliation string
               (the only element used here).
        id_list_: rows [org_name, confidence, id] produced by find_id.
        dix_id: id -> metadata mapping, passed through to convert_to_result.

    Returns:
        list[dict]: affro result entries, possibly filtered down using the
        country evidence found in the affiliation string.
    """
    if id_list_ == []:
        return []
    clean_aff = input[0]
    result_dict = convert_to_result(id_list_, dix_id)
    num_actives = count_active(result_dict)
    if len(id_list_) == 1:
        # Single candidate: nothing to disambiguate.
        return result_dict
    elif len(description(clean_aff)[1]) == 0:
        # No country detected in the affiliation string.
        # NOTE(review): description(...)[1] is presumed to be the list of
        # detected countries — consistent with its use as countries_list in
        # create_df_algorithm; confirm against helpers.functions.
        # Prefer the countries of university-like results, if any.
        countries_uni = [res['country'] for res in result_dict if 'Uni' in res['name']]
        if len(countries_uni) > 0:
            final_matching = [res for res in result_dict if res['country'] in countries_uni]
            return final_matching
        else:
            # No university among the results: keep everything.
            return result_dict
    elif num_actives > len(set(description(clean_aff)[1])):
        # More active results than distinct countries in the string: keep only
        # results whose country is actually evidenced in the text.
        final_matching = []
        light_aff_tokens = [clean_string_ror(x) for x in set(clean_aff.split())]
        for res in result_dict:
            country = res['country']
            if country == 'united states':
                # US evidence: full phrase, a 'usa' token, or a US state name.
                if 'united states' in clean_aff or 'usa' in light_aff_tokens or contains_us_state(clean_aff):
                    final_matching.append(res)
            elif country == 'united kingdom':
                if 'united kingdom' in clean_aff or 'uk' in light_aff_tokens:
                    final_matching.append(res)
            elif 'korea' in country:
                # Covers both North and South Korea country strings.
                if 'korea' in light_aff_tokens:
                    final_matching.append(res)
            elif country in clean_aff:
                final_matching.append(res)
        if final_matching:
            return final_matching
        else:
            # Country evidence confirmed nothing: fall back to all results.
            return result_dict
    elif len(result_dict) > 0:
        return result_dict
    else:
        # Nothing survived conversion.
        return []

167
affro/helpers/find_id.py Normal file
View File

@ -0,0 +1,167 @@
from affro.helpers.functions import *
from affro.helpers.create_input import *
# Name keys whose category marks them for dedicated handling in find_id.
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']

# Alternative surface forms per country; default is the country name itself.
country_synonyms = {x: [x] for x in countries}
country_synonyms["united states"] = ["united states", "u.s.a.", "usa", "usa.", "states"]
country_synonyms["germany"] = ["germany", "deutschland"]
country_synonyms["united kingdom"] = ["united kingdom", "u.k.", "uk", "uk.", "kingdom", "england"]
country_synonyms["turkey"] = ["turkey", "turkiye", "cyprus"]
country_synonyms["china"] = ["china", "prc", "chinese"]
country_synonyms["ireland"] = ["eire", "ireland"]
country_synonyms["south korea"] = ["south korea", "korea"]

# Countries that get dedicated token/phrase checks in find_id.
# Fixed typo 'united kngdom' -> 'united kingdom': the misspelled entry could
# never intersect the data's country values, so UK records silently skipped
# the special-country branch that explicitly tests country_ == 'united kingdom'.
special_countries = {'united states', 'united kingdom', 'germany', 'china', 'turkey'}
def keep_highest_score(data):
    """
    Keep one inner list per unique last element.

    For each group (keyed by the row's last element) the row with the
    greatest second value wins; ties are resolved arbitrarily (first seen).
    """
    winners = {}
    for row in data:
        group = row[-1]
        current = winners.get(group)
        if current is None or row[1] > current[1]:
            winners[group] = row
    return list(winners.values())
def find_id(input, best_names, dix_name):
    """
    Map best-matching organization names to concrete ids.

    Args:
        input: algorithm tuple; input[0] is the cleaned affiliation string,
               input[1] the lighter comma-joined form.
        best_names: rows [org_name, confidence] from find_name.
        dix_name: org name -> list of records with keys
                  'id', 'city', 'country', 'first' (and more).

    Returns:
        list: rows [org_name, confidence, id], deduplicated per id by
        keep_highest_score.

    NOTE(review): recovered from a whitespace-stripped dump; the nesting
    below is a reconstruction — verify against the original file.
    """
    clean_aff = input[0]
    light_aff = input[1]
    id_list = []
    for org_list in best_names:
        org = org_list[0]
        conf = org_list[1]
        if len(dix_name[org]) == 1:
            # Unique record for this name.
            id_ = dix_name[org][0]['id']
            city_ = dix_name[org][0]['city']
            country_ = dix_name[org][0]['country']
            if (
                # Drop weak matches: neither the record's city nor any synonym
                # of its country appears in the string, the name carries no
                # university/institute/national hint, and its category is not
                # one of the trusted ones.
                (
                    city_ not in light_aff
                    and not set(country_synonyms[country_]) & set(light_aff.split())
                    and 'univ' not in org
                    and 'inst' not in org
                    and 'national' not in org
                    and valueToCategory(org) not in ['Company', 'Acronyms', 'Specific']
                )
            ):
                pass
            else:
                id_list.append([org, conf, id_])
        else:
            # Multiple records share this name: disambiguate by city, then
            # country, then category, then the 'first' flag.
            match_found = False
            for quadruple in dix_name[org]:
                city_ = quadruple['city']
                id_ = quadruple['id']
                if city_ in clean_aff:
                    if city_ not in org:
                        id_list.append([org, conf, id_])
                        match_found = True
                    else:
                        # City is part of the org name itself; require a second
                        # occurrence in the affiliation to count as evidence.
                        if clean_aff.count(city_) > 1:
                            id_list.append([org, conf, id_])
                            match_found = True
            if not match_found:
                countries_ids = {quadruple['country'] for quadruple in dix_name[org]}
                if countries_ids & special_countries:
                    # Countries with dedicated token/phrase checks.
                    for quadruple in dix_name[org]:
                        country_ = quadruple['country']
                        id_ = quadruple['id']
                        tokens = set([x.replace(',', '') for x in clean_aff.lower().split()])
                        text = clean_aff.lower()
                        if ((country_ == 'united states' and ('united states' in text or {'usa', 'usa.'} & tokens or 'u.s.a.' in text)) or
                                (country_ == 'germany' and ('deutschland' in text)) or
                                (country_ == 'united kingdom' and ('united kingdom' in text or ({'uk', 'uk.'} & tokens) or 'u.k.' in text)) or
                                (country_ == 'turkey' and ('turkiye' in text)) or
                                (country_ == 'china' and ('chinese' in text or 'prc' in text))):
                            id_list.append([org, conf, id_])
                            match_found = True
                            break
            if not match_found:
                # Generic country check: first word of the record's country is
                # in the string and the country is not part of the org name.
                for quadruple in dix_name[org]:
                    country_ = quadruple['country']
                    id_ = quadruple['id']
                    if country_.split()[0] in clean_aff:
                        if country_ not in org:
                            id_list.append([org, conf, id_])
                            match_found = True
                            break
            if not match_found:
                # Country appears both in the string and in the org name.
                for quadruple in dix_name[org]:
                    country_ = quadruple['country']
                    id_ = quadruple['id']
                    if country_ in clean_aff and country_ in org:
                        id_list.append([org, conf, id_])
                        match_found = True
            if not match_found:
                # 'Specific'/'Acronyms' names: prefer the top-level record.
                for sp in specific:
                    if sp in org:
                        for rec in dix_name[org]:
                            if dix_id[rec['id']]['top_level'] == 'y':
                                id_list.append([org, conf, rec['id']])
                                match_found = True
                                break
                        if not match_found:
                            # NOTE(review): the bare comparison below has no
                            # effect — it was almost certainly meant as a guard,
                            # `if dix_id[rec['id']]['parent'] == 'y':`, around
                            # the append. As written the last record is always
                            # appended. Confirm intent before fixing.
                            dix_id[rec['id']]['parent'] == 'y'
                            id_list.append([org, conf, rec['id']])
                            match_found = True
                            break
            if not match_found:
                # Last resort: take the record flagged first='y', unless the
                # name looks like a department or laboratory.
                for quadruple in dix_name[org]:
                    if 'department' not in org and 'labora' not in org and quadruple['first'] == 'y':
                        id_list.append([org, conf, quadruple['id']])
                        break
    # Keep one row per id: the one with the highest confidence.
    id_list_final = keep_highest_score(id_list)
    return id_list_final

View File

@ -0,0 +1,94 @@
from affro.helpers.functions import *
from affro.helpers.create_input import *
from affro.helpers.matching import *
def find_name(input, dix_name, simU, simG, limit):
    """
    Match keyword substrings of an affiliation to organization legal names.

    Args:
        input: algorithm tuple [clean_aff, light_aff, df_list, countries_list]
               as produced by create_df_algorithm.
        dix_name: organization name -> list of records ('id', 'country', ...).
        simU (float): similarity threshold for universities.
        simG (float): similarity threshold for non-universities.
        limit: cap on the number of candidates considered per keyword.

    Returns:
        list: rows [org_name, score] of the best name matches.

    NOTE(review): recovered from a whitespace-stripped dump; the nesting
    below is a reconstruction — verify against the original file.
    """
    clean_aff = input[0]
    # ' gmbh' is noise for matching; strip it from the light form.
    light_aff = input[1].replace(' gmbh', ' ').strip()
    df_list = input[2]
    countries_list = input[3]
    dix = {}  # keyword index -> legal names matched so far { i : [name1, ...] }
    result = {}  # keyword index -> candidate pairs for that keyword
    pairs = []
    keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
    candidates = get_candidates(countries_list)
    if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0]) > 1:
        for k, s in enumerate(keywords):
            if len(s) > 1 and s not in countries:
                pairs_k = []
                try:
                    # Exact dictionary hit: record a perfect-score pair.
                    pairs_k.append((s, s, 1, dix_name[s][0]['id'], dix_name[s][0]['country']))
                    if k not in dix:
                        dix[k] = [s]
                    else:
                        dix[k].append(s)
                except Exception as e:
                    # Not an exact legal name: fall back to similarity search.
                    try:
                        pairs_k = find_candidate(s, k, dix, simU, simG, candidates, limit)
                    except:
                        pairs_k = []
                result[k] = pairs_k
                if len(pairs_k) > 0:
                    pairs.append(pairs_k)
    # Count how many keywords each matched name was produced by.
    multi = index_multiple_matchings(pairs)
    need_check_keys = []
    ready_keys = []
    ready_best = []
    for keyword in multi:
        try:
            if multi[keyword] > 1:
                # Ambiguous: matched by several keywords, needs re-scoring.
                need_check_keys.append(keyword)
            else:
                # Unambiguous: accept the first pair for this keyword, once.
                for p in pairs:
                    if keyword in p[0]:
                        if p[0][1] not in ready_keys:
                            ready_keys.append(p[0][1])
                            ready_best.append([p[0][1], p[0][2]])
        except Exception as e:
            print('ERROR, find_name', e)
            pass
    pairs_check = [pair for pair in pairs if pair[0][0] in need_check_keys]
    if len(need_check_keys) > 0:
        # Re-score the ambiguous pairs and keep a unique best subset.
        best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
        best1 = {x[0]: dix_name[x[0]][0]['id'] for x in best0}
        best01 = unique_subset(best0, best1)
        best = best01 + ready_best
    else:
        best = ready_best
    return best

View File

@ -28,17 +28,10 @@ def load_txt(relative_path, package="affro"):
with full_path.open("r", encoding="utf-8") as file:
return [line.strip() for line in file]
#categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Museum|Government|Company'
categ_string = 'Academia|Hospitals|Foundations|Specific|Government|Company|Acronyms'
dix_org = load_json('jsons/dix_org.json')
dix_city = load_json('jsons/dix_city.json')
dix_country = load_json('jsons/dix_country.json')
dix_mult = load_json('jsons/dix_mult.json')
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
us_states = [
"alabama", "alaska", "arizona", "arkansas", "california",
"colorado", "conecticut", "delaware", "florida", "georgia",
@ -52,6 +45,10 @@ us_states = [
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
]
dix_name = load_json('jsons/dix_name.json')
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
def replace_double_consonants(text):
# This regex pattern matches any double consonant
pattern = r'([bcdfghjklmnpqrstvwxyz])\1'
@ -59,18 +56,32 @@ def replace_double_consonants(text):
result = re.sub(pattern, r'\1', text, flags=re.IGNORECASE)
return result
#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und','di']
def remove_stop_words(text):
words = text.split()
filtered_words = [word for word in words if word not in stop_words]
return ' '.join(filtered_words)
filtered_words = []
for word in words:
if word.endswith(","):
core = word[:-1] # remove the comma
if core not in stop_words:
filtered_words.append(core + ",")
else:
filtered_words.append(",") # keep only the comma
else:
if word not in stop_words:
filtered_words.append(word)
result = " ".join(filtered_words)
# remove spaces before commas
result = result.replace(" ,", ",")
return result
stop_words = load_txt('txts/stop_words.txt')
dix_id_country = load_json('jsons/dix_id_country.json')
dix_id = load_json('jsons/dix_id.json')
categ_dicts = load_json('jsons/dix_categ.json')
replacements = load_json('jsons/replacements.json')
@ -82,6 +93,10 @@ stop_words.remove('at')
university_terms = [replace_double_consonants(x) for x in load_txt('txts/university_terms.txt')]
city_names = [replace_double_consonants(x) for x in load_txt('txts/city_names.txt')]
def is_first(id, name):
    """Return the 'first' flag of the dix_name[name] record whose id matches *id*, or None."""
    flags = (record['first'] for record in dix_name[name] if record['id'] == id)
    return next(flags, None)
def get_candidates(country_list):
@ -89,7 +104,7 @@ def get_candidates(country_list):
cand = [dix_country_legalnames[country] for country in country_list if country in dix_country_legalnames]
return list(set([item for sublist in cand for item in sublist]))
else:
return list(dix_org.keys())
return list(dix_name.keys())
def is_contained(s, w):
@ -109,6 +124,11 @@ def is_contained(s, w):
return False # Return False immediately
return True # If all words from 's' are found in 'w', return True
def split_sub(s: str) -> str:
# Add comma after certain word pairs
pattern = r'\b((?:univer))\s+(department|faculty|institu)\b'
return re.sub(pattern, r'\1, \2', s, flags=re.IGNORECASE)
def starts_with_any(string, prefixes):
"""
@ -158,39 +178,37 @@ def replace_roman_numerals(text):
def insert_space_between_lower_and_upper(s):
"""
Inserts a space between a lowercase letter followed by an uppercase letter in a string.
Parameters:
s (str): The input string.
Returns:
str: The modified string with spaces inserted.
Insert a space between a lowercase letter and a following uppercase letter,
while protecting listed substrings (case-sensitive) and restoring them in lowercase.
"""
# Temporarily replace 'AstraZeneca' to prevent modification
s = s.replace('gGmbH','gmbh')
s = s.replace('AstraZeneca', 'ASTRAZENECA_TEMP')
s = s.replace('BioNTech', 'BIONTECH_TEMP')
s = s.replace('GlaxoSmithKline', 'GLAXO_TEMP')
s = s.replace('GmbH', 'GMBH_TEMP')
s = s.replace('gmbH', 'GMBH_TEMP')
s = s.replace('gGmbH', 'GMBH_TEMP')
protected = ['DePaul',
'AstraZeneca',
'BioNTech',
'GlaxoSmithKline',
'LifeWatch',
'SoBigData',
'GmbH',
'gGmbH',
'gmbH'
]
# Replace protected words with placeholders mapping to their lowercase versions
placeholders = {}
for i, word in enumerate(protected):
key = f"__PROT_{i}__"
s = s.replace(word, key)
placeholders[key] = word.lower()
# Exclude cases where 'Mc' is followed by a capital letter
modified_string = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
# Ensure no spaces are inserted within 'Mc' sequences
modified_string = re.sub(r'(Mc) ([A-Z])', r'\1\2', modified_string)
# Restore 'AstraZeneca'
modified_string = modified_string.replace('ASTRAZENECA_TEMP', 'AstraZeneca')
modified_string = modified_string.replace('BIONTECH_TEMP', 'BioNTech')
modified_string = modified_string.replace('GLAXO_TEMP', 'GlaxoSmithKline')
modified_string = modified_string.replace('GMBH_TEMP', 'gmbh')
# Add space between lowercase and uppercase (except after 'Mc')
s = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
s = re.sub(r'(Mc) ([A-Z])', r'\1\2', s)
# Restore placeholders to lowercase
for key, lower_word in placeholders.items():
s = s.replace(key, lower_word)
return s
return modified_string
@ -216,7 +234,7 @@ def replace_abbr_univ(token):
elif token == "u " + city:
return "univer " + city
elif token == "tu " + city:
return "technical univer " + city
return "techn univer " + city
else:
return token
@ -224,7 +242,8 @@ def replace_abbr_univ(token):
def remove_parentheses(text):
    """Delete every innermost parenthesized group from *text*.

    Single pass only: for nested parentheses the outer pair survives,
    e.g. "(a(b)c)" -> "(ac)".
    """
    innermost_parens = re.compile(r'\([^()]*\)')
    return innermost_parens.sub('', text)
# Keyword list used by process_parentheses(): a parenthetical containing any
# of these tokens is kept (converted to comma-separated text) instead of
# being deleted outright.
# NOTE(review): the diff view appears to have merged the old and the new
# assignment of L; as written, only the second assignment (which also appends
# title-cased and raw country names) takes effect.
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik']
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik'] + [s.title() for s in countries] + countries
# Alternation pattern over the keywords, with regex metacharacters escaped.
word_pattern = "|".join(map(re.escape, L))
def process_parentheses(text):
@ -239,16 +258,15 @@ def process_parentheses(text):
Returns:
str: The modified string after processing parentheses.
"""
text = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text)
text_lower = text.lower()
text_lower = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text_lower)
# Replace `(` with `,` and `)` with `,` if a word from L is inside
text = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text)
text_lower = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text_lower)
return text
return text_lower
def replace_comma_spaces(text):
    # Normalise spacing around commas: " , " becomes ", ".
    # NOTE(review): as rendered here the first replace() swaps a single space
    # for a single space (a no-op); it was presumably '  ' -> ' ' (double-space
    # collapse) with one space lost in extraction — confirm against the repo
    # before relying on or "fixing" this line.
    return text.replace(' ', ' ').replace(' , ', ', ')
@ -313,6 +331,7 @@ def replace_newlines_with_space(text: str, repl: str = " ") -> str:
return cleaned
def substrings_dict(string):
"""
Processes a given string by performing the following transformations:
@ -361,7 +380,8 @@ def substrings_dict(string):
modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bteknologi\b', 'technology', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bpolitehnica\b', 'polytechnic', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bpolite\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bpolyte\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\btechn\w*', 'techn', modified_value, flags=re.IGNORECASE)
#modified_value = re.sub(r'techno\w*', 'techno', modified_value, flags=re.IGNORECASE)
@ -375,16 +395,22 @@ def substrings_dict(string):
index += 1
# Add the original substring to the dictionary
# else:
# dict_string[index] = value.lower().strip()
# index += 1
return dict_string
def split_country(text, countries_=None):
    """If *text* ends with a country name, separate it with a comma.

    "dublin city ireland" -> "dublin city, ireland" (country lower-cased),
    unless the word before the country starts with 'univ' (e.g. part of a
    university name), in which case *text* is returned unchanged.

    Args:
        text: Affiliation string (space-separated words).
        countries_: Optional iterable of lower-cased country names; defaults
            to the module-level ``countries`` list.

    Bug fixed: the original guard called ``startswith(...)`` as a free
    function and compared the result to 'univ'; the resulting NameError was
    swallowed by a bare ``except``, so the country was never split off.
    """
    if countries_ is None:
        countries_ = countries  # module-level country-name list
    try:
        words = text.split(' ')
        if words[-1].lower() in countries_ and not words[-2].lower().startswith('univ'):
            return " ".join(words[:-1]) + ", " + words[-1].lower()
        return text
    except IndexError:
        # Single-word input matching a country has no preceding word to test.
        return text
def clean_string_ror(input_string):
def clean_string_lucky(input_string):
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
result = result.replace(' and ',' ')
@ -402,7 +428,8 @@ def clean_string_ror(input_string):
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
'universitatspital', 'universitatskliniken', 'universitetshospital',
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik'
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
'universiteitsmuseum'
}
result = replace_acronyms(result).replace('.', ' ')
@ -425,7 +452,65 @@ def clean_string_ror(input_string):
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolitehnica\b', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
# result = re.sub(r'techno\w*', 'techno', result, flags=re.IGNORECASE)
result = re.sub(r'scien\w*', 'scien', result, flags=re.IGNORECASE)
# result = re.sub(r'\bsaint\b', 'st', result, flags=re.IGNORECASE)
return result.strip()
def clean_string_ror(input_string):
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(fully_unescape(input_string.replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
result = result.replace(' and ',' ')
# Remove characters that are not from the Latin alphabet, or allowed punctuation
result = remove_multi_digit_numbers(replace_comma_spaces(re.sub(r'[^a-zA-Z0-9\s,;/:.\-\—]', '', result).strip()))
# Restore the " - " sequence from the placeholder
#result = result.replace(placeholder, " ")
result = result.replace(':',' ').replace(';',' ').replace('-',' ').replace('',' ').replace(',',' ')
# Replace consecutive whitespace with a single space
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
'universitatspital', 'universitatskliniken', 'universitetshospital',
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
'universiteitsmuseum'
}
result = replace_acronyms(result).replace('.', ' ')
result = re.sub(r'\s+', ' ', result)
# Replace consecutive whitespace with a single space
if not any(term in result.lower() for term in university_terms):
result = re.sub(r'universi\w*', 'univer', result, flags=re.IGNORECASE)
result = re.sub(r'\bsaint\b', 'st', result,flags=re.IGNORECASE)
result = re.sub(r'institu\w*', 'institu', result, flags=re.IGNORECASE)
result = re.sub(r'labora\w*', 'labora', result, flags=re.IGNORECASE)
result = re.sub(r'centre\b', 'center', result, flags=re.IGNORECASE)
result = re.sub(r'centrum\b', 'center', result, flags=re.IGNORECASE)
result = re.sub(r'hopital\b', 'hospital', result, flags=re.IGNORECASE)
result = re.sub(r'hospital(?!s)\w*', 'hospital', result, flags=re.IGNORECASE)
#result = re.sub(r'centro\b', 'center', result, flags=re.IGNORECASE)
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
@ -436,13 +521,13 @@ def clean_string_ror(input_string):
return result.strip()
def clean_string(input_string):
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", " ")))))).strip()
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(insert_space_between_lower_and_upper(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
# result = re.sub(r'(?<! )[–—-](?! )', ' ', input_string)
# print('h',input_string)
result = remove_stop_words(replace_double_consonants(replace_roman_numerals(insert_space_between_lower_and_upper(input_string).lower())))
result = remove_stop_words(replace_double_consonants(replace_roman_numerals((input_string).lower())))
# Remove characters that are not from the Latin alphabet, or allowed punctuation
@ -458,12 +543,12 @@ def clean_string(input_string):
#result = replace_roman_numerals(remove_stop_words(insert_space_between_lower_and_upper(result).lower()))
return result.strip() # Strip leading/trailing spaces
return split_country(result.strip()) # Strip leading/trailing spaces
def description(aff_string):
aff_string = aff_string.replace('turkiye', 'turkey')
aff_string = aff_string.replace('turkiye', 'turkey').lower()
aff_string = aff_string.replace('kirgizistan', 'kyrgyzstan')
descr = []
countries_ = []
words = re.split(r'[ ,;:/]+', aff_string)
@ -473,16 +558,15 @@ def description(aff_string):
# if w in city_names:
# descr.append('city')
w = re.sub(r'[^A-Za-z\s]', '', w)
if replace_acronyms(w) in countries:
descr.append('country')
countries_.append(w)
if replace_acronyms(w) in us_states:
descr.append('country')
countries_.append('usa')
elif w in ['univer', 'institu', 'hospital', 'labora']:
elif w in ['univer', 'institu', 'hospital', 'labora', 'colege']:
descr.append('basic_key')
elif w == 'and':
@ -531,12 +615,12 @@ def split_and(string):
tok_no_sl1 = ' '.join(token.replace('-', ' ').split())
tok_no_sl2 = ' '.join(token.replace('', ' ').split())
tok_no = ' '.join(token.replace(' and ', ' ').replace(' at ', ' ').replace(' an ', ' ').replace('-', ' ').replace('', ' ').split())
if tok_no in dix_org:
if tok_no in dix_name:
token = tok_no
else:
if tok_no_and not in dix_org:
if tok_no_and not in dix_name:
# Store once instead of calling multiple times
if is_subsequence(replace_sequence, token_description):# and token.split(' and ', ' ') not in dix_org:
@ -547,20 +631,20 @@ def split_and(string):
else:
token = tok_no_and
if tok_no_at not in dix_org:
if tok_no_at not in dix_name:
token = ' '.join(token.replace(' at ', ', ').split())
else:
token = tok_no_at
if tok_no_an not in dix_org:
if tok_no_an not in dix_name:
token = ' '.join(token.replace(' an ', ', ').split())
else:
token = tok_no_an
if tok_no_sl1 not in dix_org:
if tok_no_sl1 not in dix_name:
token = ' '.join(token.replace('-', ',').split())
else:
token = tok_no_sl1
if tok_no_sl2 not in dix_org:
if tok_no_sl2 not in dix_name:
token = ' '.join(token.replace('', ',').split())
else:
token = tok_no_sl2
@ -577,10 +661,10 @@ def reduce(light_aff):
aff_no_symbols_d = substrings_dict(light_aff)
substring_list = list(aff_no_symbols_d.values())
#light_aff_final = ', '.join((substring_list))
# print('h', substring_list)
# print('h', substring_list)
light_aff_final = split_and(', '.join((substring_list)))
# print('th', light_aff_final)
return light_aff_final
# print('th', light_aff_final)
return split_sub(light_aff_final)
def unique_subset(L, D):
@ -615,6 +699,7 @@ def str_radius_u(string, radius_u):
return result
# Multi-word keywords categorised as 'Specific' (the ' ' membership test
# excludes single-word ones); scanned by str_radius_spec() below.
sp_specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' and ' ' in k]
def str_radius_spec(string):
spec = False
@ -626,16 +711,32 @@ def str_radius_spec(string):
except:
pass
if spec == False:
return string
for x in sp_specific:
if x in string:# or categ_dicts[x] == 'Acronyms':
spec = True
# print('CHECK',x)
return x
if spec ==False:
return string
#
# def str_radius_spec(string):
# spec = False
# for x in only_specific:
# if x in string:# or categ_dicts[x] == 'Acronyms':
# spec = True
# return x
# if spec ==False:
# return string
def shorten_keywords(affiliations_simple, radius_u):
affiliations_simple_n = []
for aff in affiliations_simple:
# print('check aff', aff)
if aff in dix_org:
# print('check aff', aff)
if aff in dix_name:
# print('in dix')
affiliations_simple_n.append(aff)
elif 'univer' in aff:

View File

@ -2,59 +2,33 @@ import Levenshtein
from affro.helpers.functions import *
from affro.helpers.create_input import *
# Keywords categorised as 'Specific' or 'Acronyms'; used as a last-resort
# filter when an organization cannot be disambiguated by city/country.
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']
# print('HERE', len(dix_org))
# print('HERE_city', len(dix_city))
# print('HERE_country', len(dix_country))
def index_multiple_matchings(pairs):
    """Map each group's keyword to the number of candidate pairs it matched.

    *pairs* is a list of pair groups; each group is a non-empty list of
    tuples whose first element's first item is the group's keyword. If two
    groups share a keyword, the later group's size wins (same overwrite
    semantics as the original loop).
    """
    return {group[0][0]: len(group) for group in pairs}
def keep_highest_url(lst):
    """Collapse duplicate names, keeping the entry whose url compares greatest.

    Each element of *lst* is a (name, score, url) triple; for repeated names
    only the entry with the highest url (plain ``>`` comparison, so string
    urls compare lexicographically) is retained. First-seen insertion order
    of names is preserved in the returned list.
    """
    winners = {}
    for entry in lst:
        name, _score, url = entry
        kept = winners.get(name)
        if kept is None or url > kept[2]:
            winners[name] = entry  # store the original entry untouched
    return list(winners.values())
def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
vectorizer = CountVectorizer()
similar_k = []
pairs_k = []
total_pairs = 0
# if keyword in dix_org:
# print('lucky')
# pairs_k.append((keyword,keyword,1,dix_org[keyword], dix_id_country[dix_org[keyword]]))
for x in candidates_:
# print('keyword', keyword)
if is_contained(keyword, x):
# print(0,x,total_pairs)
if is_contained(keyword, x):# and ('univ' in x or 'inst' in x or len(get_candidates([])) < len(dix_name)):
# print('keyword contained')
x_vector = vectorizer.fit_transform([x]).toarray()
keyword_vector = vectorizer.transform([keyword]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(x_vector, keyword_vector)[0][0]
# print('similarity', similarity)
if similarity > min(simU, simG):
if ('univ' in keyword and 'univ' in x) and similarity > simU:
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
@ -65,24 +39,17 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
elif (not 'univ'in keyword and not 'univ' in x) and similarity > simG:
# print('pass', keyword, x, similarity)
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
elif is_contained(x, keyword):
# print(0.5,x,total_pairs)
if ('univ'in keyword and 'univ' in x):
# print(1,x,total_pairs)
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
x_vector = vectorizer.transform([x]).toarray()
@ -91,7 +58,7 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
if similarity > simU: #max(0.82,sim):
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
if k not in dix:
@ -102,25 +69,27 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
elif not 'univ' in keyword and not 'univ' in x:
# print('not uni')
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
x_vector = vectorizer.transform([x]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
if similarity > simG: #max(0.82,sim):
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
# total_pairs += len(pairs_k) # Track total number of pairs
if total_pairs >= limit: # Stop if we reach
return []
# print('end find_candidate', pairs_k)
return pairs_k
@ -131,7 +100,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
"""
vectorizer = CountVectorizer()
univ_num = light_raw.lower().count('univ')
result = []
best = []
@ -141,7 +109,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
affil = pair_group[0][0]
num_uni_p = affil.count('univ')
# print('AFFIL', affil)
for p in pair_group:
organization, confidence = p[1], p[2]
@ -183,10 +150,8 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
# Sort by similarity score (descending) and then lexicographically
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
# print('REDUCED BEST: ', reduced_best)
result.extend(reduced_best)
# print('RESULT EXT: ', result)
# Step 3: Limit university-related matches
univ_list = [r for r in result if 'univ' in r[0]]
@ -214,247 +179,7 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
# Convert to list format
final_result = [[key, value[0]] for key, value in sorted(result_dict.items(), key=lambda x: x[1][1], reverse=True)]
# print("RESULT TO USE: ", final_result)
return final_result
def Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit):
"""
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
Args:
m (int): The number of DOIs to check.
DF (DataFrame): The input DataFrame containing affiliation data.
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
simU (float): Similarity threshold for universities.
simG (float): Similarity threshold for non-universities.
Returns:
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
"""
clean_aff = input[0]
# print('CLEAN_AFF (LVL1): ', clean_aff)
light_aff = input[1].replace(' gmbh', ' ').strip()
# print('LIGHT_AFF (LVL2): ', light_aff)
df_list = input[2]
countries_list = input[3]
# print('COUNTRIES_LIST: ', countries_list)
vectorizer = CountVectorizer()
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
result = {}
pairs = []
keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
candidates = get_candidates(countries_list)
# print('KEYWORDS: ', keywords)
if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0])>1:
for k,s in enumerate(keywords):
pairs_k = []
# print('try', s)
try:
pairs_k.append((s,s,1,dix_org[s],dix_id_country[dix_org[s]]))
# print('LUCKY')
# pairs.append((s,s,similarity,dix_org[s], dix_id_country[dix_org[s]]))
if k not in dix:
dix[k] = [s]
else:
dix[k].append(s)
except:
# print('NOT LUCKY')
pairs_k = find_candidate(s, k , dix, simU, simG, candidates, limit)
# print('PAIRS K: ', pairs_k)
result[k] = pairs_k
if len(pairs_k)>0:
# print('PAIRS K>0: ', pairs_k)
pairs.append(pairs_k)
# print('PAIRS: ', pairs)
multi = index_multiple_matchings(pairs)
# print('MULTIL ',multi)
need_check_keys = []
ready_keys = []
ready_best = []
for keyword in multi:
try:
if multi[keyword]>1:
need_check_keys.append(keyword)
else:
for p in pairs:
if keyword in p[0]:
if p[0][1] not in ready_keys:
ready_keys.append(p[0][1])
ready_best.append([p[0][1], p[0][2]])
except:
pass
# print('READY KEYWORD: ', ready_keys)
# print('READY BEST: ', ready_best)
# print('NEED CHECK KEYWORD: ', need_check_keys)
pairs_check = [ pair for pair in pairs if pair[0][0] in need_check_keys ]
# print('NEED CHECK PAIRS: ', pairs_check)
if len(need_check_keys)>0:
best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
# print('OUTPUT BEST: ', best0)
best1 = {x[0]:dix_org[x[0]] for x in best0 }
best01 = unique_subset(best0, best1)
matched_org = list(set([x[0] for x in best01])) + ready_keys
best = best01 + ready_best
# print('NEW BEST',best01)
else:
best = ready_best
matched_org = ready_keys
# print('FINAL BEST: ', best)
## print('MATCHED: ', matched_org)
id_list = []
for org_list in best:
org = org_list[0]
conf = org_list[1]
if dix_mult[org] == 'unique':
# print('unique:', org)
if 'institu' in org and 'univ' in org:
#print('both inst and univ', clean_aff)
if dix_city[org][0] not in clean_aff and dix_country[org][0] not in clean_aff:
#print('pass')
pass
else:
#print('correct')
id_list.append([org, conf, dix_org[org]])
else:
id_list.append([org, conf, dix_org[org]])
else:
# print('not unique:', org)
if org in dix_city:
match_found = False
for city in dix_city[org]:
if city[0] in clean_aff:
if city[0] not in org:
# print('city', city[0], org)
id_list.append([org, conf, city[1]])
match_found = True
break
else:
if clean_aff.count(city[0]) >1:
id_list.append([org, conf, city[1]])
match_found = True
break
if not match_found:
for city in dix_city[org]:
if city[0] in clean_aff and city[0] not in org:
id_list.append([org, conf, city[1]])
break
if not match_found:
match_found2 = False
match_found3 = False
all_countries = list(set([c[0] for c in dix_country[org]]))
if len(all_countries) > 1:
for country in dix_country[org]:
# print('country', country[0], org)
tokens = set(clean_aff.lower().split())
text = clean_aff.lower()
if country[0] == 'united states' and (
'united states' in text
or {'usa', 'usa.'} & tokens
or 'u.s.a.' in text
):
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
if country[0] == 'united kingdom' and (
'united kingdom' in text
or {'uk', 'uk.'} & tokens
or 'u.k.' in text
):
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
# print('check country', clean_aff)
# if country[0] == 'united states' and (country[0] in clean_aff or 'usa' in clean_aff.split() or 'usa.' in clean_aff.split() or 'u.s.a.' in clean_aff):
# id_list.append([org, conf, country[1]])
# match_found2 = True
# match_found3 = True
# break
# if country[0] == 'united kingdom' and (country[0] in clean_aff or 'uk' in clean_aff.split() or 'u.k.' in clean_aff):
# id_list.append([org, conf, country[1]])
# match_found2 = True
# match_found3 = True
# break
if country[0] == 'turkey' and (
'turkiye' in text
#or 'u.k.' in text
):
# print('here turkey')
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
elif country[0].split()[0] in clean_aff:
if country[0] not in org:
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
else:
single_country = all_countries[0]
if single_country in clean_aff:
id_list.append([org, conf, dix_org[org]])
match_found2 = True
match_found3 = True
break
if not match_found3:
for country in dix_country[org]:
if country[0] in clean_aff and country[0] in org:
id_list.append([org, conf, country[1]])
match_found2 = True
break
if not match_found2:
for sp in specific:
if sp in org:
id_list.append([org, conf, dix_org[org]])
# print("RESULT: ", id_list)
id_list_final = keep_highest_url(id_list)
return id_list_final

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

1443450
affro/jsons/dix_id.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

1078325
affro/jsons/dix_name.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -16,9 +16,12 @@
"universitatskinderklinik": "univer childrens hospital",
"universitatskliniken": "univer hospital",
"universitätsklinik": "univer hospital",
"uniklinik" : "univer hospital",
"universitatsmedizin": "univer medicine",
"universitatsbibliothek": "univer library",
"universiteitsmuseum": "univer museum",
"nat.": "national",
"pamantasan": "univer",
"uniaersity": "univer",
"univesity": "univer",
"unversity": "univer",
@ -76,5 +79,29 @@
"medisch": "medical",
"hahn-meitner-institut berlin" : "helmholtz-zentrum berlin",
"fachhochschule gelsenkirchen" : "westfalische hochschule",
"turkiye" : "turkey"
"turkiye" : "turkey",
"trinity colege university" : "trinity colege",
"tyndal institute" : "tyndal national institute",
"st patricks colege, drumcondra" : "dublin city university",
"ucd dublin" : "univer colege dublin",
"department university" : "department, university",
"xi an" : "xian",
"sligo general hospital" : "sligo univer hospital",
"trinity colege cambridge" : "univer cambridge",
"trinity colege, cambridge" : "univer cambridge",
"st johns colege, cambridge" : "univer cambridge",
"st johns colege cambridge" : "univer cambridge",
"kings colege, cambridge" : "univer cambridge",
"kings colege cambridge" : "univer cambridge",
"eire" : "ireland",
"trinity colege, ireland" : "trinity colege dublin",
"trinity colege ireland" : "trinity colege dublin",
"gilan" : "guilan",
"freiberg univer mining techn" : "techn univer bergakademie freiberg",
"vishwavidyalaya" : "univer",
"rwi esen" : "rwi leibniz institu economic research",
"t. d. medical colege" : "alapuzha medical colege",
"sulaymaniyah" : "sulaimani",
"-ang" : " ang"
}

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,9 @@
eire
turkiye
trinidad
united
kingdom
states
emirates
hong
niederland
holand

View File

@ -20,4 +20,6 @@ di
l
street
post-box
e.v.
e.v.
do
ng

View File

@ -8,4 +8,11 @@ universitatsbibliothek
universitatspital
universitetsjukhuset
universitatsaugenklinik
univesitatsfrauenklinik
univesitatsfrauenklinik
universitetscentralsjukhus
universitatsverlag
universitaetsklinikum
universitatsalianz
universalmuseum
universitatszahnklinik
universitetsforlaget

View File

@ -1 +1 @@
__version__ = "2.2.2"
__version__ = "3.1.1"

View File

@ -3,156 +3,150 @@ import sys
from affro.helpers.functions import *
from affro.helpers.matching import *
from affro.helpers.create_input import *
import json
from affro.helpers.find_name import *
from affro.helpers.find_id import *
from affro.helpers.disambiguation import *
from . import __version__
VERSION = __version__
dix_org = load_json('jsons/dix_org.json')
dix_mult = load_json('jsons/dix_mult.json')
dix_city = load_json('jsons/dix_city.json')
dix_country = load_json('jsons/dix_country.json')
dix_status = load_json('jsons/dix_status.json')
dix_id_name = load_json('jsons/dix_id_name.json')
dix_id_country = load_json('jsons/dix_id_country.json')
dix_id_name = load_json('jsons/dix_id_name.json')
dix_id = load_json('jsons/dix_id.json')
dix_name = load_json('jsons/dix_name.json')
dix_status_new = {k :[dix_status[k][0], dix_status[k][1].split(', ')] for k in dix_status}
# Lower-cased US state names used to map state mentions to 'usa'.
# NOTE(review): several spellings are deliberately consonant-collapsed
# ("conecticut", "ilinois", "masachusets", "misisipi", "tenesee", ...) —
# presumably to match the replace_double_consonants() normalization applied
# to affiliation strings; confirm before "correcting" them.
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "conecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "ilinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "masachusets", "michigan", "minesota", "misisipi", "misouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
    "south dakota", "tenesee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming"
]
def produce_result(input, simU, simG, limit):
best_name = find_name(input, dix_name, simU, simG, limit)
id_result = find_id(input, best_name, dix_name)
result = disamb(input, id_result, dix_id)
def contains_us_state(text, states=None):
    """Check (case-insensitively) whether any US state name occurs in *text*.

    Args:
        text: Arbitrary affiliation string.
        states: Optional iterable of lower-cased state names; defaults to the
            module-level ``us_states`` list.

    Returns:
        True if any state name is a substring of the lower-cased text.
    """
    if states is None:
        states = us_states  # module-level, consonant-collapsed spellings
    lowered = text.lower()
    return any(state in lowered for state in states)
def find_ror(input, simU, simG, limit):
light_aff = input[0]
result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
results_upd = []
for r in result:
if "openorgs" in r[2]:
results_upd.append([r[1], 'openorgs', r[2], 'active', dix_id_country[r[2]]])
else:
if dix_status_new[r[2]][0] == 'active':
results_upd.append([r[1], 'ror', r[2], 'active', dix_id_country[r[2]]])
else:
if dix_status_new[r[2]][1][0] == '':
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
else:
results_upd.append([r[1], 'ror', r[2], dix_status_new[r[2]][0],dix_id_country[r[2]]])
for link in (dix_status_new[r[2]][1]):
results_upd.append([r[1], 'ror', link, 'active',dix_id_country[r[2]],dix_id_country[link]])
if len(results_upd) > len(set(description(light_aff)[1])):
final_matching = []
light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
for id_ in results_upd:
country = dix_id_country[id_[2]]
if country == 'united states':
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
final_matching.append(id_)
elif country == 'united kingdom':
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
final_matching.append(id_)
elif 'korea' in country:
if 'korea' in light_aff_tokens:
final_matching.append(id_)
elif country in light_aff:
final_matching.append(id_)
if len(final_matching)>0:
result_dict = [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION,'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in final_matching]
return result_dict
else:
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]],'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
elif len(results_upd)>0:
return [{'provenance': 'affro', 'version': VERSION, 'pid':'openorgs', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'provenance': 'affro', 'version': VERSION, 'pid':'ror', 'value':x[2], 'name': dix_id_name[x[2]], 'confidence':x[0], 'status':x[3], 'country':dix_id_country[x[2]]} for x in results_upd]
else:
result_dict = []
return result_dict
return result
def run_affro(raw_aff_string):
lucky_guess = clean_string_ror(raw_aff_string)
lucky_guess = clean_string_lucky(raw_aff_string)
# print(lucky_guess)
try:
if lucky_guess in dix_org:
if dix_mult[lucky_guess] == "unique":
if 'openorgs' in dix_org[lucky_guess]:
if lucky_guess in dix_name:
# print('lucky guess hit', lucky_guess)
# print('lucky guess found', dix_name[lucky_guess])
if len(dix_name[lucky_guess]) == 1:
id_ = dix_name[lucky_guess][0]['id']
name_ = dix_id[id_]['name']
country_ = dix_id[id_]['country']
status_ = dix_id[id_]['status']
if 'openorgs' in id_:
return[{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
else:
if dix_status_new[dix_org[lucky_guess]][0] == 'active':
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
elif dix_status_new[dix_org[lucky_guess]][1][0]== '':
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
if status_[0] == 'active':
# print('active')
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
elif status_[0]== '':
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
else:
res = [{'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': dix_status_new[dix_org[lucky_guess]][0], 'country':dix_id_country[dix_org[lucky_guess]]}]
for successor in dix_status_new[dix_org[lucky_guess]][1]:
res.append({'provenance': 'affro', 'version': VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]})
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
for successor in status_[1]:
if successor != '':
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
return res
else:
cand_ids = [
key
for _, key in dix_city[lucky_guess]
if ("ror" in key and dix_status_new[key][0] == "active") or ("openorgs" in key)
]
num_countries = len(
set(
dix_id_country[x[1]]
for x in dix_city[lucky_guess]
if ("ror" in x[1] and dix_status_new[x[1]][0] == "active") or ("openorgs" in x[1])
)
)
# print('multiple candidates')
ids = [x['id'] for x in dix_name[lucky_guess]]
cand_ids = [id for id in ids if is_first(id, lucky_guess) == 'y']
# print('cand_ids', cand_ids)
# pick the ror id where 'first' == 'y' (None if not found)
if len(cand_ids) !=1:
# print('secondary conditions')
conditions = [
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
and dix_id[key]['top_level'][0] == 'y') \
or ("openorgs" in key),
if len(cand_ids) == 1 or num_countries == 1:
if 'openorgs' in dix_org[lucky_guess]:
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'openorgs', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active"
and dix_id[key]['parent'][0] == 'y') \
or ("openorgs" in key),
lambda key: ("ror" in key and dix_id[key]['status'][0] == "active") \
or ("openorgs" in key)
]
for cond in conditions:
cand_ids = [key for key in ids if cond(key)]
if cand_ids:
# print('break')
break
if len(cand_ids) == 0:
# print('check result')
result = produce_result(create_df_algorithm(raw_aff_string, 10), 0.42, 0.82, 500)
return result
# print('cand_ids',cand_ids)
if len(cand_ids) == 1:# or num_countries == 1:
id_ = cand_ids[0]
# print('id',id_)
name_ = dix_id[id_]['name']
country_ = dix_id[id_]['country']
status_ = dix_id[id_]['status']
if 'openorgs' in id_:
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
else:
return [{'provenance': 'affro', 'version': VERSION, 'pid': 'ror', 'value': dix_org[lucky_guess], 'name': dix_id_name[ dix_org[lucky_guess]], 'confidence': 1, 'status': 'active', 'country':dix_id_country[dix_org[lucky_guess]]}]
else:
return []
if status_[0] == 'active':
# print('active')
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
elif status_[0]== '':
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
else:
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
for successor in status_[1]:
if successor != '':
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
return res
# return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country':country_}]
else:
found = False
for triplet in dix_name[lucky_guess]:
if triplet['first'] == 'y':
found = True
id_ = triplet['id']
name_ = dix_id[id_]['name']
country_ = dix_id[id_]['country']
status_ = dix_id[id_]['status']
if 'openorgs' in id_:
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'openorgs', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
else:
if status_[0] == 'active':
# print('active')
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': 'active', 'country': country_}]
elif status_[0]== '':
return [{'provenance': 'affro', 'version' : VERSION, 'pid': 'ror', 'value':id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
else:
res = [{'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': id_, 'name': name_, 'confidence': 1, 'status': status_[0], 'country': country_}]
for successor in status_[1]:
if successor != '':
res.append({'provenance': 'affro', 'version' : VERSION, 'pid' : 'ror', 'value': successor, 'name': dix_id[successor]['name'], 'confidence': 1, 'status': 'active', 'country':dix_id[successor]['country']})
return res
if found == False:
return []
else:
# print('No lucky guess, running algorithm...')
result = find_ror(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
# print('lucky guess miss')
result = produce_result(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
return result
except Exception as e:
# Return some indication of an error, or log the row
print(f"Error: {str(e)}")
print(f"Error end: {str(e)}")
print(raw_aff_string)
pass
def matchings_affro(aff_string):
@ -168,7 +162,7 @@ def matchings_affro(aff_string):
# Create the result as a tuple that matches matchings_schema
result = []
for matching in matchings:
# Assuming 'matching' is a dictionary that contains 'provenance', 'version', 'pid', 'value', 'name', 'confidence', 'status', 'country'
# Assuming 'matching' is a dictionary that contains 'provenance', 'affro', 'value', 'confidence', 'status'
result.append((
matching.get("provenance", None),
matching.get("version", None),
@ -178,7 +172,6 @@ def matchings_affro(aff_string):
float(matching.get("confidence", None)),
matching.get("status", None),
matching.get("country", None)
))
if len(result)>0:
return result
@ -189,5 +182,4 @@ def matchings_affro(aff_string):
return ()

View File

@ -1,12 +1,9 @@
from affro.helpers.functions import *
def valueToCategory(value):
flag = 0
for k in categ_dicts:
if k in value and categ_dicts[k] in categ_string.split('|'):
flag = 1
return flag
return categ_dicts[k]
# tokenization
@ -28,21 +25,30 @@ protect = ['national univer ireland',
'state univer',
'rijksuniver',
'rijks univer',
'univer medical center'
'univer medical center',
'royal colege surgeons',
'st patricks colege',
'institu techn',
'trinity colege',
'st johns colege',
'wiliam beaumont hospital'
]
def create_df_algorithm(raw_aff_string, radius_u):
clean_aff = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
#print(0, clean_aff)
countries_list = description(clean_aff)[1]
aff_no_symbols_d = substrings_dict(reduce(clean_aff))
#print(0.5, aff_no_symbols_d)
substring_list = [replace_abbr_univ(x) for x in list(aff_no_symbols_d.values())]
#print(1, substring_list)
# for k, word in enumerate(substring_list):
# print(word)
# if word in protect and substring_list[k+1] in city_names:
# print('y')
# word = word + ', ' + substring_list[k+1]
# substring_list[k] = word
i = 0
# print(substring_list,'substring_list')
while i < len(substring_list) - 1:
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names): #substring_list[i+1] in city_names:
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names+countries): #substring_list[i+1] in city_names:
substring_list[i] = substring_list[i] + ' ' + substring_list[i+1]
i = i+2
continue
@ -61,31 +67,18 @@ def create_df_algorithm(raw_aff_string, radius_u):
i = i+1
continue
# elif 'lab' in substring_list[i] and ('colege' in substring_list[i+1] or 'dep' in substring_list[i+1] or 'school' in substring_list[i+1]):
# if not 'univ' in substring_list[i]: #'inst' in substring_list[i+1] or
# substring_list.pop(i)
# else:
# i = i+1
# continue
else:
i += 1
# print(1.4, substring_list)
light_aff = (', '.join((substring_list)))
# print(1.5, light_aff)
substring_list = [x for x in substring_list if x.replace(' gmbh','') not in city_names+remove_list]
# print(1.7,substring_list)
substring_list0 = [shorten_keywords([x], radius_u) for x in substring_list if len(shorten_keywords([x],radius_u))>0]
# print(2,substring_list0 )
substring_list1 = [inner for outer in substring_list0 for inner in outer]
# print(3,substring_list1 )
aff_list = [{"index": i, "keywords": substring_list1[i], "category": valueToCategory(substring_list1[i])} for i in range(len(substring_list1))]
filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
filtered_list = [entry for entry in aff_list if type(entry.get("category")) == str]
return [clean_aff, light_aff, filtered_list, countries_list]

View File

@ -0,0 +1,155 @@
from affro.helpers.functions import *
from affro.helpers.create_input import *
from .. import __version__
VERSION = __version__
# The 50 U.S. state names, lower-cased and with doubled consonants collapsed
# (e.g. "masachusets") so they match text normalised by the affro pipeline.
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "conecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "ilinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "masachusets", "michigan", "minesota", "misisipi", "misouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pensylvania", "rhode island", "south carolina",
    "south dakota", "tennesee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming"
]

def contains_us_state(text):
    """Return True when any U.S. state name occurs as a substring of *text*.

    The check is case-insensitive: *text* is lower-cased before scanning.
    """
    lowered = text.lower()
    for state in us_states:
        if state in lowered:
            return True
    return False
# def get_city(name, dix_name):
# return {x['city'] : x['id'] for x in dix_name[name]}
def convert_to_result(id_list_, dix_id):
    """Convert raw match rows into affro result dictionaries.

    Args:
        id_list_: rows shaped [org_name, score, org_id].
        dix_id: mapping id -> {'name': str, 'country': str,
            'status': [primary_status, successor_id_list]}.

    Returns:
        list[dict]: one result entry per match. OpenOrgs ids are always
        reported as 'active'; ROR ids carry their recorded status, and a
        non-active ROR id is followed by one 'active' entry per non-empty
        successor id (using the successor's own metadata).
    """

    # Hoisted out of the loop: the original rebuilt this closure on every
    # iteration for no benefit. Single place that fixes the output schema.
    def make_entry(pid, val, nm, conf, st, ctry):
        return {
            'provenance': 'affro',
            'version': VERSION,
            'pid': pid,
            'value': val,
            'name': nm,
            'confidence': conf,
            'status': st,
            'country': ctry
        }

    result_dict = []
    for r in id_list_:
        # Confidence lives in r[1] and is capped at 1.0; the id is r[2].
        score = min(r[1], 1.0)
        value = r[2]

        rec = dix_id.get(value)
        if rec is None:
            # Missing metadata for this id — skip it.
            continue

        name = rec.get('name')
        country = rec.get('country')
        status_field = rec.get('status', [])
        primary_status = status_field[0] if len(status_field) > 0 else None
        secondary = status_field[1] if len(status_field) > 1 else []

        if "openorgs" in value:
            # OpenOrgs records are treated as always active.
            result_dict.append(make_entry('openorgs', value, name, score, 'active', country))
            continue

        # ROR branch: always report the record itself with its own status.
        # (The original appended this same entry in both the if and else
        # arms of the secondary-status check; collapsed here.)
        result_dict.append(make_entry('ror', value, name, score, primary_status, country))
        if primary_status == 'active':
            continue

        # A secondary list whose first element is '' marks "no successors";
        # otherwise add an active entry for each non-empty linked id.
        if not (secondary and secondary[0] == ''):
            for link in secondary:
                if not link:
                    continue
                link_rec = dix_id.get(link, {})
                result_dict.append(make_entry('ror', link, link_rec.get('name'),
                                              score, 'active', link_rec.get('country')))
    return result_dict
def count_active(items):
    """Count result entries whose 'status' field is exactly 'active'."""
    active_total = 0
    for entry in items:
        if entry.get("status") == "active":
            active_total += 1
    return active_total
def disamb(input, id_list_, dix_id):
    """Disambiguate candidate organisation matches for one affiliation.

    Args:
        input: pipeline tuple; input[0] is the cleaned affiliation string.
            NOTE(review): the parameter name shadows the builtin ``input``.
        id_list_: candidate rows [org_name, score, org_id] (see find_id).
        dix_id: id -> metadata mapping, passed through to convert_to_result.

    Returns:
        list[dict]: affro result entries, filtered by the country evidence
        found in the affiliation text when more matches exist than countries.
    """
    if id_list_ == []:
        return []
    clean_aff = input[0]
    result_dict = convert_to_result(id_list_, dix_id)
    num_actives = count_active(result_dict)
    if len(id_list_) == 1:
        # Single candidate: nothing to disambiguate.
        return result_dict
    elif len(description(clean_aff)[1]) == 0:
        # No country mentioned in the affiliation text.
        # assumes description(...)[1] is the list of countries detected in
        # the text — TODO confirm against helpers.functions.description.
        # Prefer the countries of university-type matches, if any.
        countries_uni = [res['country'] for res in result_dict if 'Uni' in res['name']]
        if len(countries_uni) > 0:
            final_matching = [res for res in result_dict if res['country'] in countries_uni]
            return final_matching
        else:
            # No university among the matches: keep everything.
            return result_dict
    elif num_actives > len(set(description(clean_aff)[1])):
        # More active matches than distinct countries in the text: keep only
        # matches whose country is actually evidenced in the text.
        final_matching = []
        light_aff_tokens = [clean_string_ror(x) for x in set(clean_aff.split())]
        for res in result_dict:
            country = res['country']
            if country == 'united states':
                # US is rarely spelled out — also accept 'usa' or a state name.
                if 'united states' in clean_aff or 'usa' in light_aff_tokens or contains_us_state(clean_aff):
                    final_matching.append(res)
            elif country == 'united kingdom':
                if 'united kingdom' in clean_aff or 'uk' in light_aff_tokens:
                    final_matching.append(res)
            elif 'korea' in country:
                # Covers both Korean record countries via substring match.
                if 'korea' in light_aff_tokens:
                    final_matching.append(res)
            elif country in clean_aff:
                final_matching.append(res)
        if final_matching:
            return final_matching
        else:
            # Country filtering removed everything — fall back to all matches.
            return result_dict
    elif len(result_dict) > 0:
        return result_dict
    else:
        return []

View File

@ -0,0 +1,167 @@
from affro.helpers.functions import *
from affro.helpers.create_input import *
# Organisation-name keywords whose category marks them as highly specific
# (exact names or acronyms) — used by find_id's fallback matching.
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']

# Per-country spelling variants accepted as country evidence in an
# affiliation string. Keys are the canonical (lower-cased) country names.
country_synonyms = {x: [x] for x in countries}
country_synonyms["united states"] = ["united states", "u.s.a.", "usa", "usa.", "states"]
country_synonyms["germany"] = ["germany", "deutschland"]
country_synonyms["united kingdom"] = ["united kingdom", "u.k.", "uk", "uk.", "kingdom", "england"]
country_synonyms["turkey"] = ["turkey", "turkiye", "cyprus"]
country_synonyms["china"] = ["china", "prc", "chinese"]
country_synonyms["ireland"] = ["eire", "ireland"]
country_synonyms["south korea"] = ["south korea", "korea"]

# Countries that get dedicated token/phrase checks in find_id.
# Bug fix: 'united kngdom' was misspelled, so United Kingdom records could
# never enter the special-country branch of find_id.
special_countries = {'united states', 'united kingdom', 'germany', 'china', 'turkey'}
def keep_highest_score(data):
    """Deduplicate rows by their last element.

    For each distinct last element (the id), keep only the row whose second
    element (the score) is greatest; on ties the first-seen row wins.
    Rows come back in first-seen key order.
    """
    winners = {}
    for row in data:
        row_key = row[-1]
        current = winners.get(row_key)
        if current is None or row[1] > current[1]:
            winners[row_key] = row
    return list(winners.values())
def find_id(input, best_names, dix_name):
    """Resolve matched organisation names to concrete organisation ids.

    Args:
        input: pipeline tuple; input[0] is the cleaned affiliation string,
            input[1] the "light" affiliation string.
        best_names: rows [org_name, confidence] as produced by find_name.
        dix_name: org_name -> list of records with keys 'id', 'city',
            'country', 'first' (as used below — confirm against the JSON).
        NOTE(review): the parameter name ``input`` shadows the builtin.

    Returns:
        list: rows [org_name, confidence, org_id], deduplicated per id via
        keep_highest_score.
    """
    clean_aff = input[0]
    light_aff = input[1]
    id_list = []
    for org_list in best_names:
        org = org_list[0]
        conf = org_list[1]
        if len(dix_name[org]) == 1:
            # Unambiguous name: accept it unless there is no supporting
            # evidence at all (no city, no country synonym) AND the name is
            # generic (not university/institute/national, nor a company,
            # acronym or 'specific' category name).
            id_ = dix_name[org][0]['id']
            city_ = dix_name[org][0]['city']
            country_ = dix_name[org][0]['country']
            if (
                city_ not in light_aff
                and not set(country_synonyms[country_]) & set(light_aff.split())
                and 'univ' not in org
                and 'inst' not in org
                and 'national' not in org
                and valueToCategory(org) not in ['Company', 'Acronyms', 'Specific']
            ):
                pass  # evidence too weak: drop this candidate
            else:
                id_list.append([org, conf, id_])
        else:
            # Ambiguous name: try several evidence sources in order.
            match_found = False
            # 1) City evidence.
            for quadruple in dix_name[org]:
                city_ = quadruple['city']
                id_ = quadruple['id']
                if city_ in clean_aff:
                    if city_ not in org:
                        id_list.append([org, conf, id_])
                        match_found = True
                    else:
                        # The city is part of the org name itself; require a
                        # second, independent occurrence in the text.
                        if clean_aff.count(city_) > 1:
                            id_list.append([org, conf, id_])
                            match_found = True
            if not match_found:
                countries_ids = {quadruple['country'] for quadruple in dix_name[org]}
                # 2) Special-country evidence (dedicated token checks).
                if countries_ids & special_countries:
                    for quadruple in dix_name[org]:
                        country_ = quadruple['country']
                        id_ = quadruple['id']
                        tokens = set([x.replace(',', '') for x in clean_aff.lower().split()])
                        text = clean_aff.lower()
                        if ((country_ == 'united states' and ('united states' in text or {'usa', 'usa.'} & tokens or 'u.s.a.' in text)) or
                            (country_ == 'germany' and ('deutschland' in text)) or
                            (country_ == 'united kingdom' and ('united kingdom' in text or ({'uk', 'uk.'} & tokens) or 'u.k.' in text)) or
                            (country_ == 'turkey' and ('turkiye' in text)) or
                            (country_ == 'china' and ('chinese' in text or 'prc' in text))):
                            id_list.append([org, conf, id_])
                            match_found = True
                            break
                # 3) Plain country evidence: country named in the text but
                #    not contained in the org name itself.
                if not match_found:
                    for quadruple in dix_name[org]:
                        country_ = quadruple['country']
                        id_ = quadruple['id']
                        if country_.split()[0] in clean_aff:
                            if country_ not in org:
                                id_list.append([org, conf, id_])
                                match_found = True
                                break
                # 4) Country appears both in the text and in the org name.
                if not match_found:
                    for quadruple in dix_name[org]:
                        country_ = quadruple['country']
                        id_ = quadruple['id']
                        if country_ in clean_aff and country_ in org:
                            id_list.append([org, conf, id_])
                            match_found = True
                # 5) 'Specific'/acronym names: prefer the top-level record,
                #    then a parent record.
                if not match_found:
                    for sp in specific:
                        if sp in org:
                            for rec in dix_name[org]:
                                if dix_id[rec['id']]['top_level'] == 'y':
                                    id_list.append([org, conf, rec['id']])
                                    match_found = True
                                    break
                                if not match_found:
                                    # Bug fix: the original evaluated
                                    # dix_id[rec['id']]['parent'] == 'y' as a
                                    # bare, discarded expression, so every
                                    # record was appended unconditionally.
                                    # The comparison is now an actual guard.
                                    if dix_id[rec['id']]['parent'] == 'y':
                                        id_list.append([org, conf, rec['id']])
                                        match_found = True
                                        break
                # 6) Last resort: the record flagged 'first' == 'y', unless
                #    the name is a department/laboratory.
                if not match_found:
                    for quadruple in dix_name[org]:
                        if 'department' not in org and 'labora' not in org and quadruple['first'] == 'y':
                            id_list.append([org, conf, quadruple['id']])
                            break
    # Keep only the best-scoring row per id.
    id_list_final = keep_highest_score(id_list)
    return id_list_final

View File

@ -0,0 +1,94 @@
from affro.helpers.functions import *
from affro.helpers.create_input import *
from affro.helpers.matching import *
def find_name(input, dix_name, simU, simG, limit):
    """Match the keyword substrings of one affiliation against known
    organisation legal names.

    Args:
        input: pipeline tuple [clean_aff, light_aff, keyword_entries,
            countries_list] as produced by create_df_algorithm.
        dix_name: organisation legal name -> list of records (each holding
            at least 'id' and 'country').
        simU (float): similarity threshold for universities.
        simG (float): similarity threshold for other organisations.
        limit: maximum number of candidates considered per keyword.

    Returns:
        list: [matched_name, score] pairs for the best candidate names.
    """
    clean_aff = input[0]
    light_aff = input[1].replace(' gmbh', ' ').strip()
    df_list = input[2]
    countries_list = input[3]
    dix = {}  # keyword index -> legal names that matched exactly
    result = {}  # keyword index -> candidate pairs (kept for debugging; unused below)
    pairs = []
    keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
    candidates = get_candidates(countries_list)
    # Skip trivial inputs (no keyword, or a single one-character keyword).
    if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0]) > 1:
        for k, s in enumerate(keywords):
            if len(s) > 1 and s not in countries:
                pairs_k = []
                try:
                    # Exact hit: the keyword is itself a known legal name.
                    # A KeyError on dix_name[s] routes to the fuzzy branch.
                    pairs_k.append((s, s, 1, dix_name[s][0]['id'], dix_name[s][0]['country']))
                    if k not in dix:
                        dix[k] = [s]
                    else:
                        dix[k].append(s)
                except Exception as e:
                    # Fuzzy search among the country-restricted candidates;
                    # any failure yields no pairs for this keyword.
                    try:
                        pairs_k = find_candidate(s, k, dix, simU, simG, candidates, limit)
                    except:
                        pairs_k = []
                result[k] = pairs_k
                if len(pairs_k) > 0:
                    pairs.append(pairs_k)
    # Count under how many keywords each matched name appears.
    multi = index_multiple_matchings(pairs)
    need_check_keys = []  # names matched by several keywords -> re-scored below
    ready_keys = []
    ready_best = []
    for keyword in multi:
        try:
            if multi[keyword] > 1:
                need_check_keys.append(keyword)
            else:
                for p in pairs:
                    if keyword in p[0]:
                        if p[0][1] not in ready_keys:
                            ready_keys.append(p[0][1])
                            ready_best.append([p[0][1], p[0][2]])
        except Exception as e:
            print('ERROR, find_name', e)
            pass
    pairs_check = [pair for pair in pairs if pair[0][0] in need_check_keys]
    if len(need_check_keys) > 0:
        # Re-score ambiguous names against the full affiliation string and
        # keep only one entry per resolved id.
        best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
        best1 = {x[0]: dix_name[x[0]][0]['id'] for x in best0}
        best01 = unique_subset(best0, best1)
        best = best01 + ready_best
    else:
        best = ready_best
    return best

View File

@ -28,17 +28,10 @@ def load_txt(relative_path, package="affro"):
with full_path.open("r", encoding="utf-8") as file:
return [line.strip() for line in file]
#categ_string = 'Laboratory|Univ/Inst|Hospital|Foundation|Museum|Government|Company'
categ_string = 'Academia|Hospitals|Foundations|Specific|Government|Company|Acronyms'
dix_org = load_json('jsons/dix_org.json')
dix_city = load_json('jsons/dix_city.json')
dix_country = load_json('jsons/dix_country.json')
dix_mult = load_json('jsons/dix_mult.json')
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
us_states = [
"alabama", "alaska", "arizona", "arkansas", "california",
"colorado", "conecticut", "delaware", "florida", "georgia",
@ -52,6 +45,10 @@ us_states = [
"virginia", "washington", "west virginia", "wisconsin", "wyoming"
]
dix_name = load_json('jsons/dix_name.json')
dix_country_legalnames = load_json('jsons/dix_country_legalnames.json')
def replace_double_consonants(text):
    """Collapse doubled consonants into a single one, case-insensitively.

    E.g. 'commission' -> 'comision'; vowel pairs such as 'aa' are untouched.
    """
    doubled = re.compile(r'([bcdfghjklmnpqrstvwxyz])\1', re.IGNORECASE)
    return doubled.sub(r'\1', text)
#stop_words = ['from', 'the', 'of', 'at', 'de','for','et','für','des', 'in','as','a','and','fur','for','und','di']
def remove_stop_words(text):
words = text.split()
filtered_words = [word for word in words if word not in stop_words]
return ' '.join(filtered_words)
filtered_words = []
for word in words:
if word.endswith(","):
core = word[:-1] # remove the comma
if core not in stop_words:
filtered_words.append(core + ",")
else:
filtered_words.append(",") # keep only the comma
else:
if word not in stop_words:
filtered_words.append(word)
result = " ".join(filtered_words)
# remove spaces before commas
result = result.replace(" ,", ",")
return result
stop_words = load_txt('txts/stop_words.txt')
dix_id_country = load_json('jsons/dix_id_country.json')
dix_id = load_json('jsons/dix_id.json')
categ_dicts = load_json('jsons/dix_categ.json')
replacements = load_json('jsons/replacements.json')
@ -82,6 +93,10 @@ stop_words.remove('at')
university_terms = [replace_double_consonants(x) for x in load_txt('txts/university_terms.txt')]
city_names = [replace_double_consonants(x) for x in load_txt('txts/city_names.txt')]
def is_first(id, name):
    """Return the 'first' flag of the dix_name[name] record whose id equals
    *id*, or None when no record carries that id.

    NOTE: the parameter name ``id`` shadows the builtin (kept for callers).
    """
    for record in dix_name[name]:
        if record['id'] == id:
            return record['first']
    return None
def get_candidates(country_list):
@ -89,7 +104,7 @@ def get_candidates(country_list):
cand = [dix_country_legalnames[country] for country in country_list if country in dix_country_legalnames]
return list(set([item for sublist in cand for item in sublist]))
else:
return list(dix_org.keys())
return list(dix_name.keys())
def is_contained(s, w):
@ -109,6 +124,11 @@ def is_contained(s, w):
return False # Return False immediately
return True # If all words from 's' are found in 'w', return True
def split_sub(s: str) -> str:
    """Insert a comma between 'univer' and an immediately following
    'department' / 'faculty' / 'institu' token (case-insensitive).
    """
    joiner = re.compile(r'\b((?:univer))\s+(department|faculty|institu)\b', re.IGNORECASE)
    return joiner.sub(r'\1, \2', s)
def starts_with_any(string, prefixes):
"""
@ -158,39 +178,37 @@ def replace_roman_numerals(text):
def insert_space_between_lower_and_upper(s):
"""
Inserts a space between a lowercase letter followed by an uppercase letter in a string.
Parameters:
s (str): The input string.
Returns:
str: The modified string with spaces inserted.
Insert a space between a lowercase letter and a following uppercase letter,
while protecting listed substrings (case-sensitive) and restoring them in lowercase.
"""
# Temporarily replace 'AstraZeneca' to prevent modification
s = s.replace('gGmbH','gmbh')
s = s.replace('AstraZeneca', 'ASTRAZENECA_TEMP')
s = s.replace('BioNTech', 'BIONTECH_TEMP')
s = s.replace('GlaxoSmithKline', 'GLAXO_TEMP')
s = s.replace('GmbH', 'GMBH_TEMP')
s = s.replace('gmbH', 'GMBH_TEMP')
s = s.replace('gGmbH', 'GMBH_TEMP')
protected = ['DePaul',
'AstraZeneca',
'BioNTech',
'GlaxoSmithKline',
'LifeWatch',
'SoBigData',
'GmbH',
'gGmbH',
'gmbH'
]
# Replace protected words with placeholders mapping to their lowercase versions
placeholders = {}
for i, word in enumerate(protected):
key = f"__PROT_{i}__"
s = s.replace(word, key)
placeholders[key] = word.lower()
# Exclude cases where 'Mc' is followed by a capital letter
modified_string = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
# Ensure no spaces are inserted within 'Mc' sequences
modified_string = re.sub(r'(Mc) ([A-Z])', r'\1\2', modified_string)
# Restore 'AstraZeneca'
modified_string = modified_string.replace('ASTRAZENECA_TEMP', 'AstraZeneca')
modified_string = modified_string.replace('BIONTECH_TEMP', 'BioNTech')
modified_string = modified_string.replace('GLAXO_TEMP', 'GlaxoSmithKline')
modified_string = modified_string.replace('GMBH_TEMP', 'gmbh')
# Add space between lowercase and uppercase (except after 'Mc')
s = re.sub(r'(?<!Mc)([a-z])([A-Z])', r'\1 \2', s)
s = re.sub(r'(Mc) ([A-Z])', r'\1\2', s)
# Restore placeholders to lowercase
for key, lower_word in placeholders.items():
s = s.replace(key, lower_word)
return s
return modified_string
@ -216,7 +234,7 @@ def replace_abbr_univ(token):
elif token == "u " + city:
return "univer " + city
elif token == "tu " + city:
return "technical univer " + city
return "techn univer " + city
else:
return token
@ -224,7 +242,8 @@ def replace_abbr_univ(token):
def remove_parentheses(text):
    """Delete every innermost '(...)' group from *text* (single pass,
    nested outer parentheses are left behind)."""
    inner_group = re.compile(r'\([^()]*\)')
    return inner_group.sub('', text)
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik']
L = ['univ', 'hospital', 'clinic', 'klinik', 'Univ', 'Hospital', 'Clinic', 'Klinik'] + [s.title() for s in countries] + countries
word_pattern = "|".join(map(re.escape, L))
def process_parentheses(text):
@ -239,16 +258,15 @@ def process_parentheses(text):
Returns:
str: The modified string after processing parentheses.
"""
text = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text)
text_lower = text.lower()
text_lower = re.sub(r'\((?![^)]*(' + word_pattern + r'))[^)]*\)', '', text_lower)
# Replace `(` with `,` and `)` with `,` if a word from L is inside
text = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text)
text_lower = re.sub(r'\(([^)]*(' + word_pattern + r')[^)]*)\)', r', \1,', text_lower)
return text
return text_lower
def replace_comma_spaces(text):
return text.replace(' ', ' ').replace(' , ', ', ')
@ -313,6 +331,7 @@ def replace_newlines_with_space(text: str, repl: str = " ") -> str:
return cleaned
def substrings_dict(string):
"""
Processes a given string by performing the following transformations:
@ -361,7 +380,8 @@ def substrings_dict(string):
modified_value = re.sub(r'\btrinity col\b', 'trinity colege', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\btechnische\b', 'technological', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bteknologi\b', 'technology', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bpolitehnica\b', 'polytechnic', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bpolite\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\bpolyte\w*', 'polytechnic', modified_value, flags=re.IGNORECASE)
modified_value = re.sub(r'\btechn\w*', 'techn', modified_value, flags=re.IGNORECASE)
#modified_value = re.sub(r'techno\w*', 'techno', modified_value, flags=re.IGNORECASE)
@ -375,16 +395,22 @@ def substrings_dict(string):
index += 1
# Add the original substring to the dictionary
# else:
# dict_string[index] = value.lower().strip()
# index += 1
return dict_string
def split_country(text):
    """If the last word of *text* is a country name, set it off with a comma
    ("univer oslo norway" -> "univer oslo, norway"), unless the preceding
    word starts with 'univ' (so names like 'univer georgia' stay intact).

    Any failure (fewer than two words, unavailable globals, ...) returns the
    input unchanged.
    """
    words = text.split(' ')
    try:
        last = words[-1].lower()
        # Bug fix: the original called an undefined free function
        # `startswith(...)`, which always raised NameError (swallowed by the
        # bare except), making this whole branch dead. The intended
        # prefix test on the second-to-last word is applied here.
        if last in countries and not words[-2].lower().startswith('univ'):
            return " ".join(words[:-1]) + ", " + last
        else:
            return text
    except Exception:
        # Best-effort helper: keep the original fall-back-to-input contract.
        return text
def clean_string_ror(input_string):
def clean_string_lucky(input_string):
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(process_parentheses(fully_unescape(input_string.replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
result = result.replace(' and ',' ')
@ -402,7 +428,8 @@ def clean_string_ror(input_string):
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
'universitatspital', 'universitatskliniken', 'universitetshospital',
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik'
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
'universiteitsmuseum'
}
result = replace_acronyms(result).replace('.', ' ')
@ -425,7 +452,65 @@ def clean_string_ror(input_string):
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolitehnica\b', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
# result = re.sub(r'techno\w*', 'techno', result, flags=re.IGNORECASE)
result = re.sub(r'scien\w*', 'scien', result, flags=re.IGNORECASE)
# result = re.sub(r'\bsaint\b', 'st', result, flags=re.IGNORECASE)
return result.strip()
def clean_string_ror(input_string):
input_string = replace_underscore(replace_comma_spaces(replace_double_consonants(unidecode(remove_parentheses(fully_unescape(input_string.replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", ""))))))).strip()
result = remove_stop_words(replace_roman_numerals(input_string.lower()))
result = result.replace(' and ',' ')
# Remove characters that are not from the Latin alphabet, or allowed punctuation
result = remove_multi_digit_numbers(replace_comma_spaces(re.sub(r'[^a-zA-Z0-9\s,;/:.\-\—]', '', result).strip()))
# Restore the " - " sequence from the placeholder
#result = result.replace(placeholder, " ")
result = result.replace(':',' ').replace(';',' ').replace('-',' ').replace('',' ').replace(',',' ')
# Replace consecutive whitespace with a single space
university_terms = {'universitatsklinikum', 'universitatskinderklinik',
'universitatspital', 'universitatskliniken', 'universitetshospital',
'universitatsmedizin', 'universitatsbibliothek','universitatszahnklinik',
'universiteitsmuseum'
}
result = replace_acronyms(result).replace('.', ' ')
result = re.sub(r'\s+', ' ', result)
# Replace consecutive whitespace with a single space
if not any(term in result.lower() for term in university_terms):
result = re.sub(r'universi\w*', 'univer', result, flags=re.IGNORECASE)
result = re.sub(r'\bsaint\b', 'st', result,flags=re.IGNORECASE)
result = re.sub(r'institu\w*', 'institu', result, flags=re.IGNORECASE)
result = re.sub(r'labora\w*', 'labora', result, flags=re.IGNORECASE)
result = re.sub(r'centre\b', 'center', result, flags=re.IGNORECASE)
result = re.sub(r'centrum\b', 'center', result, flags=re.IGNORECASE)
result = re.sub(r'hopital\b', 'hospital', result, flags=re.IGNORECASE)
result = re.sub(r'hospital(?!s)\w*', 'hospital', result, flags=re.IGNORECASE)
#result = re.sub(r'centro\b', 'center', result, flags=re.IGNORECASE)
result = re.sub(r'\btechnische\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bteknologi\b', 'technological', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolite\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'\bpolyte\w*', 'polytechnic', result, flags=re.IGNORECASE)
result = re.sub(r'czechoslovak\b', 'czech', result, flags=re.IGNORECASE)
result = re.sub(r'\btechn\w*', 'techn', result, flags=re.IGNORECASE)
@ -436,13 +521,13 @@ def clean_string_ror(input_string):
return result.strip()
def clean_string(input_string):
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", " ")))))).strip()
input_string = replace_underscore(replace_comma_spaces(unidecode(process_parentheses(insert_space_between_lower_and_upper(fully_unescape(replace_newlines_with_space(input_string).replace("P.O. Box","").replace("","'").replace(" ́e","e").replace("'s", "s").replace("'", " "))))))).strip()
# result = re.sub(r'(?<! )[–—-](?! )', ' ', input_string)
# print('h',input_string)
result = remove_stop_words(replace_double_consonants(replace_roman_numerals(insert_space_between_lower_and_upper(input_string).lower())))
result = remove_stop_words(replace_double_consonants(replace_roman_numerals((input_string).lower())))
# Remove characters that are not from the Latin alphabet, or allowed punctuation
@ -458,12 +543,12 @@ def clean_string(input_string):
#result = replace_roman_numerals(remove_stop_words(insert_space_between_lower_and_upper(result).lower()))
return result.strip() # Strip leading/trailing spaces
return split_country(result.strip()) # Strip leading/trailing spaces
def description(aff_string):
aff_string = aff_string.replace('turkiye', 'turkey')
aff_string = aff_string.replace('turkiye', 'turkey').lower()
aff_string = aff_string.replace('kirgizistan', 'kyrgyzstan')
descr = []
countries_ = []
words = re.split(r'[ ,;:/]+', aff_string)
@ -473,16 +558,15 @@ def description(aff_string):
# if w in city_names:
# descr.append('city')
w = re.sub(r'[^A-Za-z\s]', '', w)
if replace_acronyms(w) in countries:
descr.append('country')
countries_.append(w)
if replace_acronyms(w) in us_states:
descr.append('country')
countries_.append('usa')
elif w in ['univer', 'institu', 'hospital', 'labora']:
elif w in ['univer', 'institu', 'hospital', 'labora', 'colege']:
descr.append('basic_key')
elif w == 'and':
@ -531,12 +615,12 @@ def split_and(string):
tok_no_sl1 = ' '.join(token.replace('-', ' ').split())
tok_no_sl2 = ' '.join(token.replace('', ' ').split())
tok_no = ' '.join(token.replace(' and ', ' ').replace(' at ', ' ').replace(' an ', ' ').replace('-', ' ').replace('', ' ').split())
if tok_no in dix_org:
if tok_no in dix_name:
token = tok_no
else:
if tok_no_and not in dix_org:
if tok_no_and not in dix_name:
# Store once instead of calling multiple times
if is_subsequence(replace_sequence, token_description):# and token.split(' and ', ' ') not in dix_org:
@ -547,20 +631,20 @@ def split_and(string):
else:
token = tok_no_and
if tok_no_at not in dix_org:
if tok_no_at not in dix_name:
token = ' '.join(token.replace(' at ', ', ').split())
else:
token = tok_no_at
if tok_no_an not in dix_org:
if tok_no_an not in dix_name:
token = ' '.join(token.replace(' an ', ', ').split())
else:
token = tok_no_an
if tok_no_sl1 not in dix_org:
if tok_no_sl1 not in dix_name:
token = ' '.join(token.replace('-', ',').split())
else:
token = tok_no_sl1
if tok_no_sl2 not in dix_org:
if tok_no_sl2 not in dix_name:
token = ' '.join(token.replace('', ',').split())
else:
token = tok_no_sl2
@ -577,10 +661,10 @@ def reduce(light_aff):
aff_no_symbols_d = substrings_dict(light_aff)
substring_list = list(aff_no_symbols_d.values())
#light_aff_final = ', '.join((substring_list))
# print('h', substring_list)
# print('h', substring_list)
light_aff_final = split_and(', '.join((substring_list)))
# print('th', light_aff_final)
return light_aff_final
# print('th', light_aff_final)
return split_sub(light_aff_final)
def unique_subset(L, D):
@ -615,6 +699,7 @@ def str_radius_u(string, radius_u):
return result
sp_specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' and ' ' in k]
def str_radius_spec(string):
spec = False
@ -626,16 +711,32 @@ def str_radius_spec(string):
except:
pass
if spec == False:
return string
for x in sp_specific:
if x in string:# or categ_dicts[x] == 'Acronyms':
spec = True
# print('CHECK',x)
return x
if spec ==False:
return string
#
# def str_radius_spec(string):
# spec = False
# for x in only_specific:
# if x in string:# or categ_dicts[x] == 'Acronyms':
# spec = True
# return x
# if spec ==False:
# return string
def shorten_keywords(affiliations_simple, radius_u):
affiliations_simple_n = []
for aff in affiliations_simple:
# print('check aff', aff)
if aff in dix_org:
# print('check aff', aff)
if aff in dix_name:
# print('in dix')
affiliations_simple_n.append(aff)
elif 'univer' in aff:

View File

@ -2,59 +2,33 @@ import Levenshtein
from affro.helpers.functions import *
from affro.helpers.create_input import *
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']
# print('HERE', len(dix_org))
# print('HERE_city', len(dix_city))
# print('HERE_country', len(dix_country))
def index_multiple_matchings(pairs):
d = {}
for p in pairs:
d[p[0][0]] = len(p)
return d
def keep_highest_url(lst):
best = {}
for item in lst:
name, score, url = item
if name not in best or url > best[name][2]: # Keep the highest URL
best[name] = item # Store the full entry
return list(best.values()) # Convert dictionary values back to list
def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
vectorizer = CountVectorizer()
similar_k = []
pairs_k = []
total_pairs = 0
# if keyword in dix_org:
# print('lucky')
# pairs_k.append((keyword,keyword,1,dix_org[keyword], dix_id_country[dix_org[keyword]]))
for x in candidates_:
# print('keyword', keyword)
if is_contained(keyword, x):
# print(0,x,total_pairs)
if is_contained(keyword, x):# and ('univ' in x or 'inst' in x or len(get_candidates([])) < len(dix_name)):
# print('keyword contained')
x_vector = vectorizer.fit_transform([x]).toarray()
keyword_vector = vectorizer.transform([keyword]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(x_vector, keyword_vector)[0][0]
# print('similarity', similarity)
if similarity > min(simU, simG):
if ('univ' in keyword and 'univ' in x) and similarity > simU:
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
@ -65,24 +39,17 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
elif (not 'univ'in keyword and not 'univ' in x) and similarity > simG:
# print('pass', keyword, x, similarity)
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
elif is_contained(x, keyword):
# print(0.5,x,total_pairs)
if ('univ'in keyword and 'univ' in x):
# print(1,x,total_pairs)
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
x_vector = vectorizer.transform([x]).toarray()
@ -91,7 +58,7 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
if similarity > simU: #max(0.82,sim):
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
if k not in dix:
@ -102,25 +69,27 @@ def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
elif not 'univ' in keyword and not 'univ' in x:
# print('not uni')
keyword_vector = vectorizer.fit_transform([keyword]).toarray()
x_vector = vectorizer.transform([x]).toarray()
# Compute similarity between the vectors
similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
if similarity > simG: #max(0.82,sim):
similar_k.append(similarity)
pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
pairs_k.append((keyword,x,similarity))
total_pairs += 1 # Track total number of pairs
if k not in dix:
dix[k] = [x]
else:
dix[k].append(x)
# total_pairs += len(pairs_k) # Track total number of pairs
if total_pairs >= limit: # Stop if we reach
return []
# print('end find_candidate', pairs_k)
return pairs_k
@ -131,7 +100,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
"""
vectorizer = CountVectorizer()
univ_num = light_raw.lower().count('univ')
result = []
best = []
@ -141,7 +109,6 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
affil = pair_group[0][0]
num_uni_p = affil.count('univ')
# print('AFFIL', affil)
for p in pair_group:
organization, confidence = p[1], p[2]
@ -183,10 +150,8 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
# Sort by similarity score (descending) and then lexicographically
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
# print('REDUCED BEST: ', reduced_best)
result.extend(reduced_best)
# print('RESULT EXT: ', result)
# Step 3: Limit university-related matches
univ_list = [r for r in result if 'univ' in r[0]]
@ -214,247 +179,7 @@ def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU,
# Convert to list format
final_result = [[key, value[0]] for key, value in sorted(result_dict.items(), key=lambda x: x[1][1], reverse=True)]
# print("RESULT TO USE: ", final_result)
return final_result
def Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit):
"""
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
Args:
m (int): The number of DOIs to check.
DF (DataFrame): The input DataFrame containing affiliation data.
dix_org (dict): A dictionary of names of organizations and their ROR_ids.
simU (float): Similarity threshold for universities.
simG (float): Similarity threshold for non-universities.
Returns:
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
"""
clean_aff = input[0]
# print('CLEAN_AFF (LVL1): ', clean_aff)
light_aff = input[1].replace(' gmbh', ' ').strip()
# print('LIGHT_AFF (LVL2): ', light_aff)
df_list = input[2]
countries_list = input[3]
# print('COUNTRIES_LIST: ', countries_list)
vectorizer = CountVectorizer()
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
result = {}
pairs = []
keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
candidates = get_candidates(countries_list)
# print('KEYWORDS: ', keywords)
if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0])>1:
for k,s in enumerate(keywords):
pairs_k = []
# print('try', s)
try:
pairs_k.append((s,s,1,dix_org[s],dix_id_country[dix_org[s]]))
# print('LUCKY')
# pairs.append((s,s,similarity,dix_org[s], dix_id_country[dix_org[s]]))
if k not in dix:
dix[k] = [s]
else:
dix[k].append(s)
except:
# print('NOT LUCKY')
pairs_k = find_candidate(s, k , dix, simU, simG, candidates, limit)
# print('PAIRS K: ', pairs_k)
result[k] = pairs_k
if len(pairs_k)>0:
# print('PAIRS K>0: ', pairs_k)
pairs.append(pairs_k)
# print('PAIRS: ', pairs)
multi = index_multiple_matchings(pairs)
# print('MULTIL ',multi)
need_check_keys = []
ready_keys = []
ready_best = []
for keyword in multi:
try:
if multi[keyword]>1:
need_check_keys.append(keyword)
else:
for p in pairs:
if keyword in p[0]:
if p[0][1] not in ready_keys:
ready_keys.append(p[0][1])
ready_best.append([p[0][1], p[0][2]])
except:
pass
# print('READY KEYWORD: ', ready_keys)
# print('READY BEST: ', ready_best)
# print('NEED CHECK KEYWORD: ', need_check_keys)
pairs_check = [ pair for pair in pairs if pair[0][0] in need_check_keys ]
# print('NEED CHECK PAIRS: ', pairs_check)
if len(need_check_keys)>0:
best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
# print('OUTPUT BEST: ', best0)
best1 = {x[0]:dix_org[x[0]] for x in best0 }
best01 = unique_subset(best0, best1)
matched_org = list(set([x[0] for x in best01])) + ready_keys
best = best01 + ready_best
# print('NEW BEST',best01)
else:
best = ready_best
matched_org = ready_keys
# print('FINAL BEST: ', best)
## print('MATCHED: ', matched_org)
id_list = []
for org_list in best:
org = org_list[0]
conf = org_list[1]
if dix_mult[org] == 'unique':
# print('unique:', org)
if 'institu' in org and 'univ' in org:
#print('both inst and univ', clean_aff)
if dix_city[org][0] not in clean_aff and dix_country[org][0] not in clean_aff:
#print('pass')
pass
else:
#print('correct')
id_list.append([org, conf, dix_org[org]])
else:
id_list.append([org, conf, dix_org[org]])
else:
# print('not unique:', org)
if org in dix_city:
match_found = False
for city in dix_city[org]:
if city[0] in clean_aff:
if city[0] not in org:
# print('city', city[0], org)
id_list.append([org, conf, city[1]])
match_found = True
break
else:
if clean_aff.count(city[0]) >1:
id_list.append([org, conf, city[1]])
match_found = True
break
if not match_found:
for city in dix_city[org]:
if city[0] in clean_aff and city[0] not in org:
id_list.append([org, conf, city[1]])
break
if not match_found:
match_found2 = False
match_found3 = False
all_countries = list(set([c[0] for c in dix_country[org]]))
if len(all_countries) > 1:
for country in dix_country[org]:
# print('country', country[0], org)
tokens = set(clean_aff.lower().split())
text = clean_aff.lower()
if country[0] == 'united states' and (
'united states' in text
or {'usa', 'usa.'} & tokens
or 'u.s.a.' in text
):
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
if country[0] == 'united kingdom' and (
'united kingdom' in text
or {'uk', 'uk.'} & tokens
or 'u.k.' in text
):
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
# print('check country', clean_aff)
# if country[0] == 'united states' and (country[0] in clean_aff or 'usa' in clean_aff.split() or 'usa.' in clean_aff.split() or 'u.s.a.' in clean_aff):
# id_list.append([org, conf, country[1]])
# match_found2 = True
# match_found3 = True
# break
# if country[0] == 'united kingdom' and (country[0] in clean_aff or 'uk' in clean_aff.split() or 'u.k.' in clean_aff):
# id_list.append([org, conf, country[1]])
# match_found2 = True
# match_found3 = True
# break
if country[0] == 'turkey' and (
'turkiye' in text
#or 'u.k.' in text
):
# print('here turkey')
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
elif country[0].split()[0] in clean_aff:
if country[0] not in org:
id_list.append([org, conf, country[1]])
match_found2 = True
match_found3 = True
break
else:
single_country = all_countries[0]
if single_country in clean_aff:
id_list.append([org, conf, dix_org[org]])
match_found2 = True
match_found3 = True
break
if not match_found3:
for country in dix_country[org]:
if country[0] in clean_aff and country[0] in org:
id_list.append([org, conf, country[1]])
match_found2 = True
break
if not match_found2:
for sp in specific:
if sp in org:
id_list.append([org, conf, dix_org[org]])
# print("RESULT: ", id_list)
id_list_final = keep_highest_url(id_list)
return id_list_final

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

1443450
build/lib/affro/jsons/dix_id.json Normal file

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -16,9 +16,12 @@
"universitatskinderklinik": "univer childrens hospital",
"universitatskliniken": "univer hospital",
"universitätsklinik": "univer hospital",
"uniklinik" : "univer hospital",
"universitatsmedizin": "univer medicine",
"universitatsbibliothek": "univer library",
"universiteitsmuseum": "univer museum",
"nat.": "national",
"pamantasan": "univer",
"uniaersity": "univer",
"univesity": "univer",
"unversity": "univer",
@ -76,5 +79,29 @@
"medisch": "medical",
"hahn-meitner-institut berlin" : "helmholtz-zentrum berlin",
"fachhochschule gelsenkirchen" : "westfalische hochschule",
"turkiye" : "turkey"
"turkiye" : "turkey",
"trinity colege university" : "trinity colege",
"tyndal institute" : "tyndal national institute",
"st patricks colege, drumcondra" : "dublin city university",
"ucd dublin" : "univer colege dublin",
"department university" : "department, university",
"xi an" : "xian",
"sligo general hospital" : "sligo univer hospital",
"trinity colege cambridge" : "univer cambridge",
"trinity colege, cambridge" : "univer cambridge",
"st johns colege, cambridge" : "univer cambridge",
"st johns colege cambridge" : "univer cambridge",
"kings colege, cambridge" : "univer cambridge",
"kings colege cambridge" : "univer cambridge",
"eire" : "ireland",
"trinity colege, ireland" : "trinity colege dublin",
"trinity colege ireland" : "trinity colege dublin",
"gilan" : "guilan",
"freiberg univer mining techn" : "techn univer bergakademie freiberg",
"vishwavidyalaya" : "univer",
"rwi esen" : "rwi leibniz institu economic research",
"t. d. medical colege" : "alapuzha medical colege",
"sulaymaniyah" : "sulaimani",
"-ang" : " ang"
}

File diff suppressed because it is too large Load Diff

View File

@ -1,6 +1,9 @@
eire
turkiye
trinidad
united
kingdom
states
emirates
hong
niederland
holand

View File

@ -20,4 +20,6 @@ di
l
street
post-box
e.v.
e.v.
do
ng

View File

@ -8,4 +8,11 @@ universitatsbibliothek
universitatspital
universitetsjukhuset
universitatsaugenklinik
univesitatsfrauenklinik
univesitatsfrauenklinik
universitetscentralsjukhus
universitatsverlag
universitaetsklinikum
universitatsalianz
universalmuseum
universitatszahnklinik
universitetsforlaget

View File

@ -1,4 +1,4 @@
python_Levenshtein==0.27.1
scikit_learn==1.4.2
setuptools==75.8.0
Unidecode==1.3.8
unidecode==1.3.8

View File

@ -1,387 +0,0 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from affro.core import run_affro\n",
"from affro.core import matchings_affro\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/05ect4e57',\n",
" 'Louisiana State University',\n",
" 0.8660254037844388,\n",
" 'active',\n",
" 'united states')]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('Center for Computation and Technology#R##N#Louisiana State University')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/02x8svs93',\n",
" 'Near East University',\n",
" 1.0,\n",
" 'active',\n",
" 'cyprus')]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('Department of Electrical and Electronic Engineering, Intelligent Systems Research Group (ISRG), Near East University, Mersin 10, Türkiye')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/010b9wj87',\n",
" 'Boston Medical Center',\n",
" 1.0,\n",
" 'active',\n",
" 'united states'),\n",
" ('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/03ps5d564',\n",
" 'Boston University School of Medicine',\n",
" 1.0,\n",
" 'withdrawn',\n",
" 'united states'),\n",
" ('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/05qwgg493',\n",
" 'Boston University',\n",
" 1.0,\n",
" 'active',\n",
" 'united states')]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('--label omited: 2--maxwel finland laboratory infectious diseases, boston medical center, boston university school medicine, boston, masachusets')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/010b9wj87',\n",
" 'Boston Medical Center',\n",
" 1.0,\n",
" 'active',\n",
" 'united states'),\n",
" ('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/03ps5d564',\n",
" 'Boston University School of Medicine',\n",
" 1.0,\n",
" 'withdrawn',\n",
" 'united states'),\n",
" ('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/05qwgg493',\n",
" 'Boston University',\n",
" 1.0,\n",
" 'active',\n",
" 'united states')]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('--label omited: 2--maxwel finland laboratory infectious diseases, boston medical center, boston university school medicine, boston, masachusets')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/045hgzm75',\n",
" 'Selçuk University',\n",
" 0.7071067811865475,\n",
" 'active',\n",
" 'turkey'),\n",
" ('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/04frf8n21',\n",
" 'Kyrgyz-Türkish Manas Üniversity',\n",
" 0.816496580927726,\n",
" 'active',\n",
" 'kyrgyzstan')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('selcuk universitesi veteriner fakultesi, patoloji anabilim dali, kampus, konya,turkiye kirgizistan-turkiye manas universitesi, veteriner fakultesi, biskek/kirgizistan')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/045hgzm75',\n",
" 'Selçuk University',\n",
" 0.7071067811865475,\n",
" 'active',\n",
" 'turkey'),\n",
" ('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/04frf8n21',\n",
" 'Kyrgyz-Türkish Manas Üniversity',\n",
" 0.816496580927726,\n",
" 'active',\n",
" 'kyrgyzstan')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('selcuk universitesi veteriner fakultesi, patoloji anabilim dali, kampus, konya,turkiye kirgizistan-turkiye manas universitesi, veteriner fakultesi, biskek/kirgizistan')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/03ad39j10',\n",
" 'University of Pisa',\n",
" 0.816496580927726,\n",
" 'active',\n",
" 'italy')]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro(\"Universita'vjh di pisa\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/010jx2260',\n",
" 'National Institute of Agricultural Botany',\n",
" 1.0,\n",
" 'active',\n",
" 'united kingdom')]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro('niab, united kingdom')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/024bc3e07',\n",
" 'Google (United Kingdom)',\n",
" 1.0,\n",
" 'active',\n",
" 'united kingdom')]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro(\"google, United Kingdom\")"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[('affro',\n",
" '2.2.2',\n",
" 'ror',\n",
" 'https://ror.org/03ad39j10',\n",
" 'University of Pisa',\n",
" 0.816496580927726,\n",
" 'active',\n",
" 'italy')]"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"matchings_affro(\"Universita'vhj di pisa\")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[{'provenance': 'affro',\n",
" 'version': '2.2.2',\n",
" 'pid': 'ror',\n",
" 'value': 'https://ror.org/04gnjpq42',\n",
" 'name': 'National and Kapodistrian University of Athens',\n",
" 'confidence': 1,\n",
" 'status': 'active',\n",
" 'country': 'greece'}]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"run_affro('university of athens')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "base",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}

1183
test_gitea.ipynb Normal file

File diff suppressed because it is too large Load Diff