Merge pull request 'redirection of non-active ROR IDs' (#1) from main into openaire-workflow-ready

Reviewed-on: #1
This commit is contained in:
Myrto Kallipoliti 2024-09-12 16:01:46 +02:00
commit b5d32cb730
4 changed files with 29 additions and 17 deletions

BIN
.DS_Store vendored Normal file

Binary file not shown.

View File

@ -15,7 +15,7 @@ def affro(raw_aff_string):
try:
result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.65, 0.82)
if len(result)>0:
result_dict = [json.dumps({'Origin': 'affRo', 'RORid':x[2], 'Confidence':x[1]}) for x in result]
result_dict = [json.dumps({'Origin': 'affRo', 'RORid':x[2], 'Confidence':x[1], 'Status':x[3]}) for x in result]
else:
result_dict = []
@ -25,7 +25,6 @@ def affro(raw_aff_string):
print(f"Error: {str(e)}")
print(raw_aff_string)
pass
#raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
if __name__ == "__main__":

File diff suppressed because one or more lines are too long

View File

@ -9,6 +9,9 @@ from sklearn.metrics.pairwise import cosine_similarity
from functions_cluster import *
from create_input_cluster import *
# Load the status dictionary from a JSON file into `dix_status`.
# Further down in this file its entries are indexed by an ID taken from the
# results list: element [0] is compared with 'active', and element [1] is
# either '' or a value that is emitted as the ID of an extra 'active' result.
# NOTE(review): the path is absolute and specific to one user's machine, so
# this load fails anywhere else — consider resolving it relative to the
# package or passing it in as configuration. TODO confirm intended location.
# NOTE(review): this appears to run at import time (indentation is not
# visible in this view) — confirm.
with open('/Users/myrto/Documents/openAIRE/3. ror/dictionaries/dix_status.json', 'rb') as f:
dix_status = json.load(f)
def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG):
"""
Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
@ -259,39 +262,37 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
# id_list = []
if dix_mult[x] != 'unique':
if x in list(dix_city_ror.keys()):
match_found0 = False
match_found = False
for city in dix_city_ror[x]:
if city[0] in light_aff:
if city[0] not in x:
ids[i] = city[1]
match_found0 = True
ids[i] = city[1]
match_found = True
break
else:
if light_aff.count(city[0]) >1:
ids[i] = city[1]
match_found = True
break
if not match_found:
for city in dix_city_ror[x]:
if city[0] in light_aff and city[0] not in x:
ids[i] = city[1]
match_found0 = True
print('ok')
break
if not match_found:
match_found2 = False
match_found3 = False
for country in dix_country_ror[x]:
if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
ids[i] = country[1]
match_found2 = True
match_found3 = True
break
if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
ids[i] = country[1]
match_found2 = True
match_found3 = True
break
@ -299,7 +300,6 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
if country[0] not in x:
ids[i] = country[1]
match_found2 = True
match_found3 = True
break
@ -307,13 +307,25 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
for country in dix_country_ror[x]:
if country[0] in light_aff and country[0] in x:
ids[i] = country[1]
match_found2 = True
break
results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)]
results_upd = []
return results #[[result[to_check[i]] for i in ready] + [to_check[2]], best[0]]
for r in results:
if dix_status[r[2]][0] == 'active':
r.append('active')
results_upd.append(r)
else:
if dix_status[r[2]][1] == '':
r.append(dix_status[r[2]][0])
results_upd.append(r)
else:
r.append(dix_status[r[2]][0])
results_upd.append(r)
results_upd.append([r[0],r[1], dix_status[r[2]][1], 'active'])
return results_upd