Compare commits
258 Commits
1.0.0
...
openaire-w
| Author | SHA1 | Date |
|---|---|---|
|
|
1d946234f8 | |
|
|
3e896bbfe2 | |
|
|
a95d7cd024 | |
|
|
a8c301de36 | |
|
|
5fbb024a20 | |
|
|
52290e8787 | |
|
|
f03f70e831 | |
|
|
6859f12cd2 | |
|
|
33cab47ac8 | |
|
|
9bf6fe36b8 | |
|
|
6b11f34a89 | |
|
|
4660c63e60 | |
|
|
8434c94f21 | |
|
|
98ba852129 | |
|
|
b53569908d | |
|
|
f5f10da1ea | |
|
|
72a32faefc | |
|
|
0962586053 | |
|
|
34fd7a7a28 | |
|
|
e1377d61fc | |
|
|
c75f4f38ae | |
|
|
3553849645 | |
|
|
96cb07353e | |
|
|
620e3d6691 | |
|
|
dc4dc3c38c | |
|
|
c82ecb1426 | |
|
|
9fe4517a06 | |
|
|
355a11f24d | |
|
|
29bf85307d | |
|
|
2728017f2b | |
|
|
ed7966f6f5 | |
|
|
f258d79f6f | |
|
|
2a8745a787 | |
|
|
89ac8f5274 | |
|
|
dc50bb9757 | |
|
|
ac635da607 | |
|
|
8b99f38175 | |
|
|
7ae5609fbd | |
|
|
7018e58015 | |
|
|
5c66ce9719 | |
|
|
d5d18145ec | |
|
|
f2bf93e6af | |
|
|
5302fa5f8d | |
|
|
7567d9af6c | |
|
|
e6c3672434 | |
|
|
e66c149b38 | |
|
|
3657ed1b7a | |
|
|
ad7df5ba49 | |
|
|
d419d7c137 | |
|
|
f50c93b527 | |
|
|
631409dbc4 | |
|
|
520ee36a9a | |
|
|
c0e1241ce6 | |
|
|
bde5bfeee0 | |
|
|
6b5170f945 | |
|
|
4550216303 | |
|
|
0d87e4f0ae | |
|
|
a82b862e85 | |
|
|
4c5d791c7b | |
|
|
f2049819a6 | |
|
|
fc7ca0e7d8 | |
|
|
8d04dc5550 | |
|
|
e010726b45 | |
|
|
8cdfa270a7 | |
|
|
3853e69079 | |
|
|
255c43665e | |
|
|
7af622b2bd | |
|
|
8a8a8af732 | |
|
|
8b2352c919 | |
|
|
0d74625e8b | |
|
|
3e7cd72cd8 | |
|
|
592630adf4 | |
|
|
7a9df9b72b | |
|
|
f5dae58a47 | |
|
|
dd89e31614 | |
|
|
0e11fdc9fb | |
|
|
6e08f677ae | |
|
|
d3c23584b6 | |
|
|
aa6a5699aa | |
|
|
690d7995f4 | |
|
|
70ae671862 | |
|
|
5615e4d31b | |
|
|
ab51386c85 | |
|
|
79c5c2d402 | |
|
|
27b7164b94 | |
|
|
828d02f980 | |
|
|
38ef05dce9 | |
|
|
9909c49f66 | |
|
|
8e37938f50 | |
|
|
6a5dcde3d9 | |
|
|
1f5616758d | |
|
|
e60699229c | |
|
|
8980be55e7 | |
|
|
18fb501f68 | |
|
|
7c9c7635a0 | |
|
|
622e166a4a | |
|
|
abc9fe937c | |
|
|
b40c6c3499 | |
|
|
ee4130b8f3 | |
|
|
7701f696fb | |
|
|
a9d00653e6 | |
|
|
2518046eb1 | |
|
|
bdae908bb2 | |
|
|
9c871dd923 | |
|
|
4889f6482c | |
|
|
25a60409f8 | |
|
|
58d2bd0603 | |
|
|
a8b2b0beb6 | |
|
|
cbb57a2343 | |
|
|
5267b632f1 | |
|
|
f522ed55af | |
|
|
130989c969 | |
|
|
e3c5632314 | |
|
|
8becc886a7 | |
|
|
5458d3e4ab | |
|
|
d4e02d93b5 | |
|
|
7ccaf6e413 | |
|
|
0ae91fc373 | |
|
|
fe2cfc5b91 | |
|
|
29498eab83 | |
|
|
4777ab4a40 | |
|
|
2cf614ec84 | |
|
|
ab6d437e1e | |
|
|
07ada4db55 | |
|
|
00adbb401f | |
|
|
5e30f5028d | |
|
|
c379fffe21 | |
|
|
dca233b205 | |
|
|
38d8c23dbc | |
|
|
8624c0f0e4 | |
|
|
9dccaa0ae4 | |
|
|
c3ff71a7fd | |
|
|
4310051cb5 | |
|
|
4aca6faef8 | |
|
|
9e2c056d98 | |
|
|
31ee6c3460 | |
|
|
f2dc62e481 | |
|
|
298c9a8b64 | |
|
|
6a92f5b1de | |
|
|
f21e6244ac | |
|
|
e717bf36d8 | |
|
|
1673c5e099 | |
|
|
5cd3229e52 | |
|
|
77c2caad29 | |
|
|
f7929ecc2f | |
|
|
dbcd97d9be | |
|
|
10fcb7cca9 | |
|
|
129799ecd0 | |
|
|
414f07607a | |
|
|
7d26fc23ac | |
|
|
41605c2de0 | |
|
|
24aa99176b | |
|
|
fdbdb19d75 | |
|
|
276b23b4fd | |
|
|
f365d415c5 | |
|
|
cb782e95f1 | |
|
|
157920ed02 | |
|
|
6d7a2bfa97 | |
|
|
a0dc9a3166 | |
|
|
4c84c36f53 | |
|
|
8c516fb558 | |
|
|
c9231e7064 | |
|
|
418ba54def | |
|
|
1222241c53 | |
|
|
30f0c17348 | |
|
|
bcc0a4a144 | |
|
|
f7ce9637c6 | |
|
|
4f8941cc22 | |
|
|
efa33eba22 | |
|
|
37703c6111 | |
|
|
6ca711d61c | |
|
|
9ed2510be9 | |
|
|
b2d4b1d2ac | |
|
|
57bae53c19 | |
|
|
d65bc6c385 | |
|
|
f1ef61bf39 | |
|
|
730090670d | |
|
|
c48f2e787f | |
|
|
0874760a8b | |
|
|
d207df7563 | |
|
|
ada983a780 | |
|
|
8d4d57b33d | |
|
|
8cc00a0e41 | |
|
|
1890b44591 | |
|
|
ffa66e3354 | |
|
|
afe0c6e1a5 | |
|
|
b526a1f391 | |
|
|
028f027f31 | |
|
|
f8d65d5ab1 | |
|
|
6b78f2344b | |
|
|
9146fe892f | |
|
|
3ba978a636 | |
|
|
baa6e3b4d9 | |
|
|
d35a9563b4 | |
|
|
58f46697bf | |
|
|
cbfca1922a | |
|
|
29ec06a7de | |
|
|
7037355f85 | |
|
|
b70bcebb12 | |
|
|
c0ee7428cf | |
|
|
f4c44175ac | |
|
|
f9432b9a2a | |
|
|
e5b9780075 | |
|
|
cbf99acebf | |
|
|
c3f8acf85e | |
|
|
bc0f33973e | |
|
|
fc241e1c81 | |
|
|
076c46c0ff | |
|
|
1bf09651c1 | |
|
|
e16048b50a | |
|
|
4824f5dd92 | |
|
|
5644d5adc4 | |
|
|
a2d9463e81 | |
|
|
0332c85b40 | |
|
|
dd316ceb4e | |
|
|
3eadd45d1a | |
|
|
fd32a8dd90 | |
|
|
41c7fa661a | |
|
|
84a47f6fdc | |
|
|
44f0f9987f | |
|
|
ad691c28c2 | |
|
|
2806511e02 | |
|
|
0043e4051f | |
|
|
a59d0ce9fc | |
|
|
e2f8007433 | |
|
|
f8479083f2 | |
|
|
9440f863c9 | |
|
|
f78456288c | |
|
|
997f2e492f | |
|
|
982a1b0b9f | |
|
|
4fe3d31ed5 | |
|
|
efa4db4e52 | |
|
|
ea2e27a9f4 | |
|
|
e33bf4ef14 | |
|
|
f4704aef4d | |
|
|
0500fc586f | |
|
|
5568aa92ec | |
|
|
600ddf8087 | |
|
|
03dc19fd3b | |
|
|
d9dbc679e3 | |
|
|
413ec3773e | |
|
|
ba98a16bcb | |
|
|
415b45e3ca | |
|
|
8c6f6a5a9a | |
|
|
b4f79adc56 | |
|
|
90426a6d29 | |
|
|
ad656121ed | |
|
|
ca6e8ad3b9 | |
|
|
8325c94e56 | |
|
|
5795ec6493 | |
|
|
57569fbb3b | |
|
|
968ecf9680 | |
|
|
2c6e7b7a70 | |
|
|
9473c30a09 | |
|
|
bace694d21 | |
|
|
a7b703b67d | |
|
|
b38be012a0 | |
|
|
fbf55b3d5d |
|
|
@ -0,0 +1,7 @@
|
|||
# Ignore macOS system files
|
||||
.DS_Store
|
||||
|
||||
# Ignore Python cache files
|
||||
__pycache__/
|
||||
|
||||
.idea
|
||||
393
affro_cluster.py
393
affro_cluster.py
|
|
@ -5,27 +5,396 @@ from matching_cluster import *
|
|||
from create_input_cluster import *
|
||||
import json
|
||||
|
||||
dix_org = load_json('dictionaries/dix_acad.json')
|
||||
dix_mult = load_json('dictionaries/dix_mult.json')
|
||||
dix_city = load_json('dictionaries/dix_city.json')
|
||||
dix_country = load_json('dictionaries/dix_country.json')
|
||||
#path_dict = "dictionaries/"
|
||||
path_dict = ""
|
||||
prefix = ""
|
||||
|
||||
dix_org_ror = load_json(path_dict + 'dix_acad'+prefix+'.json')
|
||||
dix_mult_ror = load_json(path_dict + 'dix_mult'+prefix+'.json')
|
||||
dix_city_ror = load_json(path_dict + 'dix_city'+prefix+'.json')
|
||||
dix_country_ror = load_json(path_dict + 'dix_country'+prefix+'.json')
|
||||
dix_org_oaire = load_json(path_dict + 'dix_acad_oaire.json')
|
||||
dix_mult_oaire = load_json(path_dict + 'dix_mult_oaire.json')
|
||||
dix_city_oaire = load_json(path_dict + 'dix_city_oaire.json')
|
||||
dix_country_oaire = load_json(path_dict + 'dix_country_oaire.json')
|
||||
dix_status = load_json(path_dict + 'dix_status'+prefix+'.json')
|
||||
#dix_grids = load_json('dictionaries/dix_grids_rors.json')
|
||||
dix_id_country_ror = load_json(path_dict + 'dix_id_country'+prefix+'.json')
|
||||
dix_id_country_oaire = load_json(path_dict + 'dix_id_country_oaire.json')
|
||||
|
||||
dix_country_legalnames = load_json(path_dict + 'dix_country_legalnames'+prefix+'.json')
|
||||
dix_org = dict(dix_org_ror)
|
||||
dix_org.update(dix_org_oaire)
|
||||
|
||||
dix_mult = dict(dix_mult_ror)
|
||||
dix_mult.update(dix_mult_oaire)
|
||||
|
||||
dix_city = dict(dix_city_ror)
|
||||
dix_city.update(dix_city_oaire)
|
||||
|
||||
for x in dix_city_oaire:
|
||||
if x in dix_city_ror:
|
||||
if type(dix_city_ror[x][0]) == list:
|
||||
if type(dix_city_oaire[x][0]) == str:
|
||||
dix_city[x] = dix_city_ror[x] +[dix_city_oaire[x]]
|
||||
else:
|
||||
dix_city[x] = dix_city_ror[x]+dix_city_oaire[x]
|
||||
else:
|
||||
if type(dix_city_oaire[x][0]) == str:
|
||||
dix_city[x] = [dix_city_ror[x],dix_city_oaire[x]]
|
||||
else:
|
||||
dix_city[x] = dix_city_oaire[x] + [dix_city_ror[x]]
|
||||
|
||||
dix_country = dict(dix_country_ror)
|
||||
dix_country.update(dix_country_oaire)
|
||||
|
||||
for x in dix_country_oaire:
|
||||
if x in dix_country_ror:
|
||||
if type(dix_country_ror[x][0]) == list:
|
||||
if type(dix_country_oaire[x][0]) == str:
|
||||
dix_country[x] = dix_country_ror[x] +[dix_country_oaire[x]]
|
||||
else:
|
||||
dix_country[x] = dix_country_ror[x]+dix_country_oaire[x]
|
||||
else:
|
||||
if type(dix_country_oaire[x][0]) == str:
|
||||
dix_country[x] = [dix_country_ror[x],dix_country_oaire[x]]
|
||||
else:
|
||||
dix_country[x] = dix_country_oaire[x] + [dix_country_ror[x]]
|
||||
|
||||
|
||||
|
||||
|
||||
dix_id_country_nc = dict(dix_id_country_ror)
|
||||
dix_id_country_nc.update(dix_id_country_oaire)
|
||||
|
||||
# dix_mult = dix_mult_ror | dix_mult_oaire
|
||||
# dix_country = dix_country_ror | dix_country_oaire
|
||||
# dix_city = dix_city_ror | dix_city_oaire
|
||||
# dix_id_country_nc = dix_id_country_ror | dix_id_country_oaire
|
||||
dix_id_country = {x:remove_stop_words(replace_double_consonants(dix_id_country_nc[x])) for x in list(dix_id_country_nc.keys())}
|
||||
# print(dix_id_country_ror['https://ror.org/03vba6259'])
|
||||
# print(dix_id_country['https://ror.org/03vba6259'])
|
||||
|
||||
#dix_org1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_org[x] for x in dix_org}
|
||||
|
||||
# dix_mult1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_mult[x] for x in dix_mult}
|
||||
# dix_city1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_city[x] for x in dix_city}
|
||||
# dix_country1 = {x.replace('clinique', 'center').replace('centers', 'center') : dix_country[x] for x in dix_country}
|
||||
|
||||
dix_status_new = {k :[dix_status[k][0], dix_status[k][1].split(', ')] for k in dix_status}
|
||||
# Lowercase names of the 50 U.S. states, used to recognize U.S. affiliations
# when validating a candidate organization's country against the raw string.
us_states = [
    "alabama", "alaska", "arizona", "arkansas", "california",
    "colorado", "connecticut", "delaware", "florida", "georgia",
    "hawaii", "idaho", "illinois", "indiana", "iowa",
    "kansas", "kentucky", "louisiana", "maine", "maryland",
    "massachusetts", "michigan", "minnesota", "mississippi", "missouri",
    "montana", "nebraska", "nevada", "new hampshire", "new jersey",
    "new mexico", "new york", "north carolina", "north dakota", "ohio",
    "oklahoma", "oregon", "pennsylvania", "rhode island", "south carolina",
    "south dakota", "tennessee", "texas", "utah", "vermont",
    "virginia", "washington", "west virginia", "wisconsin", "wyoming"
]


def contains_us_state(text):
    """Return True if *text* mentions any U.S. state name (case-insensitive).

    The check is a plain substring match, so e.g. "Indianapolis" matches
    "indiana" — callers use this only as a coarse country hint.
    """
    lowered = text.lower()
    for state in us_states:
        if state in lowered:
            return True
    return False
|
||||
|
||||
def find_ror_new1(input, simU, simG, limit):
|
||||
|
||||
light_aff = input[0]
|
||||
result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
|
||||
# print('res', result)
|
||||
results_upd = []
|
||||
|
||||
for r in result:
|
||||
|
||||
if "openorgs" in r[2]:
|
||||
results_upd.append([r[1], 'OpenOrgs', r[2], 'active', dix_id_country[r[2]]])
|
||||
|
||||
else:
|
||||
if dix_status_new[r[2]][0] == 'active':
|
||||
results_upd.append([r[1], 'ROR', r[2], 'active', dix_id_country[r[2]]])
|
||||
else:
|
||||
if dix_status_new[r[2]][1][0] == '':
|
||||
results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
|
||||
|
||||
# elif len(dix_status[r[2]][1]) == 1:
|
||||
|
||||
# results_upd.append([r[1], 'ROR', r[2], dix_status[r[2]][0]])
|
||||
|
||||
# results_upd.append([r[1], 'ROR', dix_status[r[2]][1][0], 'active'])
|
||||
|
||||
else:
|
||||
results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0],dix_id_country[r[2]]])
|
||||
for link in (dix_status_new[r[2]][1]):
|
||||
results_upd.append([r[1], 'ROR', link, 'active',dix_id_country[r[2]],dix_id_country[link]])
|
||||
|
||||
# print('results_upd',results_upd)
|
||||
# print('len(set(description(light_aff)[1]))', len(set(description(light_aff)[1])))
|
||||
if len(results_upd) > len(set(description(light_aff)[1])):
|
||||
|
||||
|
||||
final_matching = []
|
||||
light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
|
||||
# print('light_aff_tokens',light_aff_tokens)
|
||||
for id_ in results_upd:
|
||||
country = dix_id_country[id_[2]]
|
||||
# print(id_, country)
|
||||
|
||||
|
||||
if country == 'united states':
|
||||
# print('united states')
|
||||
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
|
||||
# print('found')
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country == 'united kingdom':
|
||||
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif 'korea' in country:
|
||||
|
||||
if 'korea' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country in light_aff:
|
||||
final_matching.append(id_)
|
||||
|
||||
|
||||
if len(final_matching)>0:
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in final_matching]
|
||||
|
||||
else:
|
||||
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3],'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif len(results_upd) == len(set(description(light_aff)[1])) ==1:
|
||||
light_aff_tokens = [clean_string_ror(x) for x in set(re.split(r'[ ,]+', light_aff))]
|
||||
# print('light_aff_tokens',light_aff_tokens)
|
||||
country = dix_id_country[results_upd[0][2]]
|
||||
|
||||
if country == 'united states':
|
||||
# print('united states')
|
||||
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
|
||||
# print('found')
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3],'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif country == 'united kingdom':
|
||||
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3],'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif 'korea' in country:
|
||||
|
||||
if 'korea' in light_aff_tokens:
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3],'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif country in light_aff:
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3],'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
else:
|
||||
return []
|
||||
|
||||
elif len(results_upd)>0:
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3],'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
else:
|
||||
return []
|
||||
|
||||
def find_ror_new(input, simU, simG, limit):
|
||||
|
||||
light_aff = input[0]
|
||||
result = Aff_Ids(input, dix_org, dix_mult, dix_city, dix_country, simU, simG, limit)
|
||||
# print('res', result)
|
||||
results_upd = []
|
||||
|
||||
for r in result:
|
||||
|
||||
if "openorgs" in r[2]:
|
||||
results_upd.append([r[1], 'OpenOrgs', r[2], 'active', dix_id_country[r[2]]])
|
||||
|
||||
else:
|
||||
if dix_status_new[r[2]][0] == 'active':
|
||||
results_upd.append([r[1], 'ROR', r[2], 'active', dix_id_country[r[2]]])
|
||||
else:
|
||||
if dix_status_new[r[2]][1][0] == '':
|
||||
results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0], dix_id_country[r[2]]])
|
||||
|
||||
# elif len(dix_status[r[2]][1]) == 1:
|
||||
|
||||
# results_upd.append([r[1], 'ROR', r[2], dix_status[r[2]][0]])
|
||||
|
||||
# results_upd.append([r[1], 'ROR', dix_status[r[2]][1][0], 'active'])
|
||||
|
||||
else:
|
||||
results_upd.append([r[1], 'ROR', r[2], dix_status_new[r[2]][0],dix_id_country[r[2]]])
|
||||
for link in (dix_status_new[r[2]][1]):
|
||||
results_upd.append([r[1], 'ROR', link, 'active',dix_id_country[r[2]],dix_id_country[link]])
|
||||
|
||||
# print('results_upd',results_upd)
|
||||
# print('len(set(description(light_aff)[1]))', len(set(description(light_aff)[1])))
|
||||
if len(results_upd) > len(set(description(light_aff)[1])):
|
||||
|
||||
|
||||
final_matching = []
|
||||
light_aff_tokens = [clean_string_ror(x) for x in set(light_aff.split())]
|
||||
# print('light_aff_tokens',light_aff_tokens)
|
||||
for id_ in results_upd:
|
||||
country = dix_id_country[id_[2]]
|
||||
# print(id_, country)
|
||||
|
||||
|
||||
if country == 'united states':
|
||||
# print('united states')
|
||||
if 'united states' in light_aff or 'usa' in light_aff_tokens or contains_us_state(light_aff):
|
||||
# print('found')
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country == 'united kingdom':
|
||||
if 'united kingdom' in light_aff or 'uk' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif 'korea' in country:
|
||||
|
||||
if 'korea' in light_aff_tokens:
|
||||
final_matching.append(id_)
|
||||
|
||||
elif country in light_aff:
|
||||
final_matching.append(id_)
|
||||
|
||||
|
||||
if len(final_matching)>0:
|
||||
result_dict = [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in final_matching]
|
||||
return result_dict
|
||||
else:
|
||||
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
|
||||
elif len(results_upd)>0:
|
||||
return [{'Provenance': 'AffRo', 'PID':'OpenOrgs', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} if "openorgs" in x[2] else {'Provenance': 'AffRo', 'PID':'ROR', 'Value':x[2], 'Confidence':x[0], 'Status':x[3], 'Country':dix_id_country[x[2]]} for x in results_upd]
|
||||
else:
|
||||
result_dict = []
|
||||
|
||||
return result_dict
|
||||
|
||||
|
||||
def affro(raw_aff_string):
|
||||
lucky_guess = clean_string_ror(raw_aff_string)
|
||||
try:
|
||||
result = Aff_Ids(create_df_algorithm(raw_aff_string), dix_org, dix_mult, dix_city, dix_country, 0.65, 0.82)
|
||||
if len(result)>0:
|
||||
result_dict = [json.dumps({'Origin': 'affRo', 'RORid':x[2], 'Confidence':x[1]}) for x in result]
|
||||
if lucky_guess in dix_org:
|
||||
# print('lucky')
|
||||
if dix_mult[lucky_guess] == "unique":
|
||||
# print('unique')
|
||||
if 'openorgs' in dix_org[lucky_guess]:
|
||||
|
||||
return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
else:
|
||||
if dix_status_new[dix_org[lucky_guess]][0] == 'active':
|
||||
# print('lucky, active')
|
||||
return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
elif dix_status_new[dix_org[lucky_guess]][1][0]== '':
|
||||
# print('lucky not active, not succesor')
|
||||
return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0], 'Country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
else:
|
||||
# print('lucky not active, succesor')
|
||||
res = [{'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0], 'Country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
# print('res',res)
|
||||
for successor in dix_status_new[dix_org[lucky_guess]][1]:
|
||||
res.append({'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': successor, 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]})
|
||||
return res
|
||||
else:
|
||||
|
||||
cand_ids = [x[1] for x in dix_city_ror[lucky_guess] if dix_status_new[x[1]][0] == 'active']
|
||||
# print('cand_ids', cand_ids)
|
||||
if len(cand_ids) == 1:
|
||||
if 'OpenOrgs' in dix_org[lucky_guess]:
|
||||
return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
else:
|
||||
return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active', 'Country':dix_id_country[dix_org[lucky_guess]]}]
|
||||
|
||||
else:
|
||||
return []
|
||||
else:
|
||||
result_dict = []
|
||||
|
||||
return result_dict
|
||||
# print("not lucky")
|
||||
result = find_ror_new(create_df_algorithm(raw_aff_string, 3), 0.42, 0.82, 500)
|
||||
|
||||
return result
|
||||
except Exception as e:
|
||||
# Return some indication of an error, or log the row
|
||||
print(f"Error: {str(e)}")
|
||||
print(raw_aff_string)
|
||||
pass
|
||||
#raw_aff = 'university of california, los angeles, university of athens, university of california, san diego, university of athens, greece'
|
||||
|
||||
|
||||
def affro_config(raw_aff_string, rad_u, sim_u, sim_g, limit):
|
||||
lucky_guess = clean_string_ror(raw_aff_string)
|
||||
try:
|
||||
if lucky_guess in dix_org:
|
||||
if dix_mult[lucky_guess] == "unique":
|
||||
if 'openorgs' in dix_org[lucky_guess]:
|
||||
return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
|
||||
else:
|
||||
if dix_status_new[dix_org[lucky_guess]][0] == 'active':
|
||||
return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
|
||||
elif dix_status_new[dix_org[lucky_guess]][1] == '':
|
||||
return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0]}]
|
||||
else:
|
||||
res = [{'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': dix_status_new[dix_org[lucky_guess]][0]}]
|
||||
for successor in dix_status_new[dix_org[lucky_guess]][1]:
|
||||
res.append({'Provenance': 'AffRo', 'PID' : 'ROR', 'Value': successor, 'Confidence': 1, 'Status': 'active'})
|
||||
return res
|
||||
else:
|
||||
cand_ids = [x[1] for x in dix_city_ror[lucky_guess] if dix_status_new[x[1]][0] == 'active']
|
||||
# print('cand_ids', cand_ids)
|
||||
if len(cand_ids) == 1:
|
||||
if 'OpenOrgs' in dix_org[lucky_guess]:
|
||||
return [{'Provenance': 'AffRo', 'PID': 'OpenOrgs', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
|
||||
else:
|
||||
return [{'Provenance': 'AffRo', 'PID': 'ROR', 'Value': dix_org[lucky_guess], 'Confidence': 1, 'Status': 'active'}]
|
||||
|
||||
else:
|
||||
return []
|
||||
else:
|
||||
|
||||
result = find_ror_new(create_df_algorithm(raw_aff_string, rad_u), sim_u, sim_g, limit)
|
||||
return result
|
||||
except Exception as e:
|
||||
# Return some indication of an error, or log the row
|
||||
print(f"Error: {str(e)}")
|
||||
print(raw_aff_string)
|
||||
pass
|
||||
|
||||
|
||||
|
||||
def matchings_affro(aff_string):
    """Run affro() on one raw affiliation string and flatten the matchings
    into (Provenance, PID, Value, Confidence, Status) tuples for the Spark
    matchings schema.

    Returns None when there are no matchings (Spark treats it as null, so
    downstream ``isNotNull`` filters keep working) and () on error.
    """
    # global operation_counter
    try:
        matchings = affro(aff_string)
        # operation_counter += 1

        # affro() returns None on an internal error; treat that as
        # "no matchings" instead of crashing on None.get below.
        if matchings is None:
            return None

        # Ensure matchings is a list, even if affro returns a single dict
        if not isinstance(matchings, list):
            matchings = [matchings]

        # Create the result as a tuple that matches matchings_schema
        result = []
        for matching in matchings:
            confidence = matching.get("Confidence")
            result.append((
                matching.get("Provenance", None),
                matching.get("PID", None),
                matching.get("Value", None),
                # float(None) raises TypeError; keep a missing confidence
                # as None instead of silently turning the row into ().
                float(confidence) if confidence is not None else None,
                matching.get("Status", None)
            ))
        if len(result) > 0:
            return result
        # Falls through to an implicit None when result is empty — kept
        # deliberately so the Spark null filter drops such rows.

    except Exception as e:
        print(f"Error processing affiliation string {aff_string}: {str(e)}")
        return ()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
@ -38,3 +407,5 @@ if __name__ == "__main__":
|
|||
# float_arg2 = float(sys.argv[3])
|
||||
|
||||
print(affro(string_arg))
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,28 +0,0 @@
|
|||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import udf
|
||||
from pyspark.sql.types import StringType
|
||||
|
||||
import sys
|
||||
|
||||
from affro_cluster import *
|
||||
|
||||
# Initialize SparkSession
|
||||
spark = SparkSession.builder.appName("CustomFunctionExample").getOrCreate()
|
||||
|
||||
# Register the function as a UDF
|
||||
affro_udf = udf(affro, StringType())
|
||||
|
||||
# Input list of strings
|
||||
input_data = ["university of athens", "university of vienna", "UCLA"]
|
||||
|
||||
# # Convert the list to a Spark DataFrame
|
||||
df = spark.createDataFrame(input_data, "string").toDF("raw_affiliation_string")
|
||||
|
||||
# # Apply your custom UDF to the DataFrame
|
||||
df_with_custom_value = df.withColumn("affro_value", affro_udf(df["raw_affiliation_string"]))
|
||||
|
||||
|
||||
df_with_custom_value.show(truncate=False)
|
||||
|
||||
# Stop the SparkSession
|
||||
spark.stop()
|
||||
|
|
@ -0,0 +1,163 @@
|
|||
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
|
||||
import sys
|
||||
from threading import Thread
|
||||
from affro_cluster import *
|
||||
from schemas import *
|
||||
|
||||
|
||||
spark = SparkSession.builder.appName("AffRo - Author model").getOrCreate()
|
||||
|
||||
|
||||
input_file = sys.argv[1]
|
||||
author_file = sys.argv[2]
|
||||
|
||||
matching_array_schema = ArrayType(
|
||||
StructType([
|
||||
StructField("Provenance", StringType(), nullable=False),
|
||||
StructField("PID", StringType(), nullable=False),
|
||||
StructField("Value", StringType(), nullable=False),
|
||||
StructField("Confidence", DoubleType(), nullable=False),
|
||||
StructField("Status", StringType(), nullable=False)
|
||||
])
|
||||
)
|
||||
|
||||
affro_udf = udf(matchings_affro, matching_array_schema)
|
||||
|
||||
|
||||
exploded = spark.read.json(input_file) \
|
||||
.filter(col("id").isNotNull()) \
|
||||
.select(
|
||||
col("id"),
|
||||
explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
|
||||
)
|
||||
|
||||
# Explode the "author.rawAffiliationString" column into separate rows
|
||||
affiliations_exploded = exploded.withColumn(
|
||||
"Affiliation",
|
||||
explode(col("author.rawAffiliationString"))
|
||||
)
|
||||
|
||||
matching_df = (
|
||||
affiliations_exploded
|
||||
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
|
||||
.select("Affiliation") # Select only the "Affiliation" column
|
||||
.distinct() # Remove duplicate rows based on "Affiliation"
|
||||
.select(
|
||||
col("Affiliation"),
|
||||
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
|
||||
)
|
||||
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
|
||||
)
|
||||
|
||||
|
||||
#x id id, author, rawaffiliationstring, matchings
|
||||
def toAuthorModel(x):
    """Map one exploded row (id, author, Affiliation, Matchings) onto the
    per-author dict expected by match_author_schema.

    ORCID is taken from the first/last author pid whose qualifier classid
    is 'orcid'; it stays None when no such pid exists or pid is malformed.
    """
    author = x['author']
    name = {}
    name['Full'] = author['fullname']
    name['First'] = author['name']
    name['Last'] = author['surname']
    orcid = None
    if 'pid' in author:
        # NOTE(review): narrowed from a bare `except:`; pid may be null or
        # malformed in the input, and such authors are kept with orcid=None
        # rather than dropped.
        try:
            for p in author['pid']:
                if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
                    orcid = p['value']
        except Exception:
            pass
    name['orcid'] = orcid
    return {
        # key is unique per (record, author) so rows can be regrouped later
        'key': x['id'] + author['fullname'],
        'id': x['id'],
        'author': name,
        'Raw_affiliation': x['Affiliation'],
        'Matchings': getMatchings(x['Matchings']),
    }
|
||||
|
||||
def getMatchings(matches):
    """Project each match onto the five fields of the matchings schema,
    dropping any extra keys the match may carry."""
    keys = ("Provenance", "PID", "Value", "Confidence", "Status")
    return [{key: m[key] for key in keys} for m in matches]
|
||||
|
||||
def regroupAndSelectDistinctMatch(x):
    """Flatten lists of matchings, keep only 'active' entries, and retain
    the highest confidence seen for each distinct Value.

    Returns one ROR-style matching dict per distinct Value, in first-seen
    order.
    """
    best = {}
    for matchings in x:
        for entry in matchings:
            if entry['Status'] != 'active':
                continue
            value = entry['Value']
            confidence = float(entry['Confidence'])
            if value not in best:
                best[value] = 0
            if best[value] < confidence:
                best[value] = confidence
    return [
        {"Provenance": "AffRo", "PID": "ROR", "Value": v, "Confidence": c, "Status": "active"}
        for v, c in best.items()
    ]
|
||||
|
||||
def aggregateAuthor(group):
    """Collapse all rows sharing one (id, author) key into a single record,
    collecting raw affiliations and merging their matchings.

    `group` must be non-empty. id/author are taken from the last element,
    matching the original loop-variable behavior.
    """
    affiliations = [record['Raw_affiliation'] for record in group]
    matchings = [record['Matchings'] for record in group]
    last = group[-1]
    return {
        "id": last['id'],
        "author": last["author"],
        "Raw_affiliation": affiliations,
        "Matchings": regroupAndSelectDistinctMatch(matchings),
    }
|
||||
|
||||
def aggregateResult(group):
    """Merge all per-author records of one publication id into the final
    result: the list of authors plus the distinct active organizations
    matched anywhere on the record."""
    authors = []
    all_matchings = []
    for record in group:
        author = record["author"]
        current = getMatchings(record["Matchings"])
        all_matchings.append(current)
        authors.append({
            "Name": {
                "First": author['First'],
                "Last": author["Last"],
                "Full": author["Full"],
                "orcid": author["orcid"],
            },
            "Corresponding": None,
            "Contributor_roles": None,
            "Raw_affiliations": list(record["Raw_affiliation"]),
            "Matchings": current,
        })
    return {
        "id": group[0]["id"],
        "Authors": authors,
        "Organizations": regroupAndSelectDistinctMatch(all_matchings),
    }
|
||||
|
||||
|
||||
|
||||
applyMatch_udf = udf(toAuthorModel, match_author_schema)
|
||||
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
|
||||
aggregateResult_udf = udf(aggregateResult, author_schema)
|
||||
|
||||
#spark = SparkSession.builder.appName("AffRo-Matchings").getOrCreate()
|
||||
|
||||
#matching_df = spark.read.json(maching_file)
|
||||
|
||||
# exploded = spark.read.schema(result_schema).json(input_file) \
|
||||
# .select(
|
||||
# col("id"),
|
||||
# explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
|
||||
# )
|
||||
|
||||
|
||||
# affiliations_exploded = exploded.withColumn(
|
||||
# "Affiliation",
|
||||
# explode(col("author.rawAffiliationString"))
|
||||
# )
|
||||
|
||||
|
||||
extend = (
|
||||
matching_df
|
||||
.join(affiliations_exploded, on="Affiliation")
|
||||
.filter(col("Matchings").isNotNull())
|
||||
)
|
||||
|
||||
applyMatchDf = extend \
|
||||
.withColumn("application", applyMatch_udf(struct("*"))) \
|
||||
.select("application.*")
|
||||
|
||||
|
||||
groupedAuthorDf = applyMatchDf \
|
||||
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("aggresult", aggregate_udf("group")) \
|
||||
.select("aggresult.*")
|
||||
|
||||
|
||||
groupedResultDf = groupedAuthorDf \
|
||||
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("result", aggregateResult_udf("group")) \
|
||||
.select("result.*")
|
||||
|
||||
groupedResultDf.write.mode("overwrite").json(author_file, compression="gzip")
|
||||
|
|
@ -1,77 +1,91 @@
|
|||
from functions_cluster import *
|
||||
|
||||
def create_df_algorithm(raw_aff_string):
|
||||
aff_no_symbols_d = substrings_dict(clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string))))
|
||||
substring_list = list(aff_no_symbols_d.values())
|
||||
def valueToCategory(value):
|
||||
flag = 0
|
||||
|
||||
for k in categ_dicts:
|
||||
if k in value and categ_dicts[k] in categ_string.split('|'):
|
||||
flag = 1
|
||||
return flag
|
||||
|
||||
|
||||
# tokenization
|
||||
protect = ['national univer ireland',
|
||||
'univer',
|
||||
'univer california',
|
||||
'univer colege hospital',
|
||||
'univer colege',
|
||||
'univer hospital',
|
||||
'imperial colege',
|
||||
'city univer',
|
||||
'univer medical school',
|
||||
'california state univer',
|
||||
'national techn univer',
|
||||
'techn univer',
|
||||
'islamic azad univer',
|
||||
'univer nevada',
|
||||
'univer maryland',
|
||||
'state univer',
|
||||
'rijksuniver',
|
||||
'rijks univer',
|
||||
'univer medical center'
|
||||
]
|
||||
|
||||
def create_df_algorithm(raw_aff_string, radius_u):
|
||||
clean_aff = clean_string(remove_outer_parentheses(remove_leading_numbers(raw_aff_string)))
|
||||
#print(0, clean_aff)
|
||||
countries_list = description(clean_aff)[1]
|
||||
aff_no_symbols_d = substrings_dict(reduce(clean_aff))
|
||||
#print(0.5, aff_no_symbols_d)
|
||||
substring_list = [replace_abbr_univ(x) for x in list(aff_no_symbols_d.values())]
|
||||
#print(1, substring_list)
|
||||
i = 0
|
||||
|
||||
while i < len(substring_list) - 1:
|
||||
if is_contained('progr', substring_list[i]) and is_contained('dep', substring_list[i+1]):
|
||||
substring_list.pop(i)
|
||||
|
||||
|
||||
elif (is_contained('assistant', substring_list[i]) or is_contained('researcher', substring_list[i]) or is_contained('phd', substring_list[i]) or is_contained('student', substring_list[i]) or is_contained('section', substring_list[i]) or is_contained('prof', substring_list[i]) or is_contained('director', substring_list[i])) and (not is_contained('school', substring_list[i+1]) or is_contained('univ', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('lab', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
|
||||
substring_list.pop(i)
|
||||
|
||||
elif (is_contained('engineer', substring_list[i]) or is_contained('progr', substring_list[i]) or is_contained('unit', substring_list[i]) or is_contained('lab', substring_list[i]) or is_contained('dep', substring_list[i]) or is_contained('school', substring_list[i]) or is_contained('inst', substring_list[i]) #or is_contained('hosp', substring_list[i])
|
||||
or is_contained('fac', substring_list[i])) and is_contained('univ', substring_list[i+1]):
|
||||
if not is_contained('univ', substring_list[i]):
|
||||
if substring_list[i] in protect and any(name in substring_list[i+1] for name in city_names): #substring_list[i+1] in city_names:
|
||||
substring_list[i] = substring_list[i] + ' ' + substring_list[i+1]
|
||||
i = i+2
|
||||
continue
|
||||
|
||||
elif ('assistant' in substring_list[i] or 'researcher' in substring_list[i] or 'phd' in substring_list[i] or 'student' in substring_list[i] or 'section' in substring_list[i] or 'prof' in substring_list[i] or 'director' in substring_list[i]) and (not 'school' in substring_list[i+1] or 'univ', substring_list[i+1] or 'inst' in substring_list[i+1] or 'lab' in substring_list[i+1] or 'fac' in substring_list[i+1]):
|
||||
if not 'univ' in substring_list[i]:
|
||||
substring_list.pop(i)
|
||||
else:
|
||||
i = i+1
|
||||
|
||||
elif ('engineer' in substring_list[i] or 'progr'in substring_list[i] or 'unit' in substring_list[i] or 'dep' in substring_list[i] or 'school' in substring_list[i] #or 'lab' in substring_list[i] # or 'inst' in substring_list[i] #or is_contained('hosp', substring_list[i])
|
||||
or 'fac' in substring_list[i]) and 'univ' in substring_list[i+1]:
|
||||
if not 'univ' in substring_list[i]:
|
||||
substring_list.pop(i)
|
||||
else:
|
||||
i = i+1
|
||||
continue
|
||||
|
||||
elif is_contained('lab', substring_list[i]) and (is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('school', substring_list[i+1])):
|
||||
if not is_contained('univ', substring_list[i]):
|
||||
substring_list.pop(i)
|
||||
else:
|
||||
i = i+1
|
||||
continue
|
||||
|
||||
elif is_contained('dep', substring_list[i]) and (is_contained('tech', substring_list[i+1]) or is_contained('colege', substring_list[i+1]) or is_contained('inst', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('school', substring_list[i+1]) or is_contained('fac', substring_list[i+1])):
|
||||
if not is_contained('univ', substring_list[i]):
|
||||
substring_list.pop(i)
|
||||
else:
|
||||
i = i+1
|
||||
continue
|
||||
|
||||
elif is_contained('inst',substring_list[i]) and (is_contained('school', substring_list[i+1]) or is_contained('dep', substring_list[i+1]) or is_contained('acad', substring_list[i+1]) or is_contained('hosp', substring_list[i+1]) or is_contained('clin', substring_list[i+1]) or is_contained('klin', substring_list[i+1]) or is_contained('fak', substring_list[i+1]) or is_contained('fac', substring_list[i+1]) or is_contained('cent', substring_list[i+1]) or is_contained('div', substring_list[i+1])):
|
||||
if not is_contained('univ', substring_list[i]):
|
||||
substring_list.pop(i)
|
||||
else:
|
||||
i = i+1
|
||||
continue
|
||||
|
||||
|
||||
elif is_contained('school',substring_list[i]) and is_contained('colege', substring_list[i+1]):
|
||||
if not is_contained('univ', substring_list[i]):
|
||||
substring_list.pop(i)
|
||||
else:
|
||||
i = i+1
|
||||
continue
|
||||
# elif 'lab' in substring_list[i] and ('colege' in substring_list[i+1] or 'dep' in substring_list[i+1] or 'school' in substring_list[i+1]):
|
||||
# if not 'univ' in substring_list[i]: #'inst' in substring_list[i+1] or
|
||||
# substring_list.pop(i)
|
||||
# else:
|
||||
# i = i+1
|
||||
# continue
|
||||
|
||||
else:
|
||||
i += 1
|
||||
# print(1.4, substring_list)
|
||||
|
||||
light_aff = (', '.join((substring_list)))
|
||||
for x in substring_list:
|
||||
if x in city_names+remove_list:
|
||||
substring_list.remove(x)
|
||||
|
||||
substring_list = [shorten_keywords_spark([x])[0] for x in substring_list]
|
||||
# print(1.5, light_aff)
|
||||
|
||||
substring_list = [x for x in substring_list if x.replace(' gmbh','') not in city_names+remove_list]
|
||||
# print(1.7,substring_list)
|
||||
|
||||
substring_list0 = [shorten_keywords([x], radius_u) for x in substring_list if len(shorten_keywords([x],radius_u))>0]
|
||||
# print(2,substring_list0 )
|
||||
|
||||
def valueToCategory(value):
|
||||
flag = 0
|
||||
substring_list1 = [inner for outer in substring_list0 for inner in outer]
|
||||
# print(3,substring_list1 )
|
||||
|
||||
for k in categ_dicts:
|
||||
if k in value:
|
||||
flag = 1
|
||||
return flag
|
||||
|
||||
aff_list = [{"index": i, "keywords": substring_list[i], "category": valueToCategory(substring_list[i])} for i in range(len(substring_list))]
|
||||
aff_list = [{"index": i, "keywords": substring_list1[i], "category": valueToCategory(substring_list1[i])} for i in range(len(substring_list1))]
|
||||
|
||||
filtered_list = [entry for entry in aff_list if entry.get("category") == 1]
|
||||
|
||||
return [light_aff, filtered_list]
|
||||
return [clean_aff, light_aff, filtered_list, countries_list]
|
||||
|
|
@ -0,0 +1,62 @@
|
|||
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
|
||||
|
||||
from utils import *
|
||||
from affro_cluster import *
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size
|
||||
import sys
|
||||
|
||||
spark = SparkSession.builder.appName("AffRo-Crossref").getOrCreate()
|
||||
|
||||
folder_path = sys.argv[1]
|
||||
hdfs_output_path = sys.argv[2]
|
||||
|
||||
matchings_schema = ArrayType(
|
||||
StructType([
|
||||
StructField("Provenance", StringType(), nullable=True),
|
||||
StructField("PID", StringType(), nullable=True),
|
||||
StructField("Value", StringType(), nullable=True),
|
||||
StructField("Confidence", DoubleType(), nullable=True),
|
||||
StructField("Status", StringType(), nullable=True)
|
||||
])
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
affro_udf = udf(matchings_affro, matchings_schema)
|
||||
distinct_matchings_udf = udf(regroupAndSelectDistinctMatch, matchings_schema)
|
||||
|
||||
|
||||
exploded = spark.read.json(folder_path) \
|
||||
.filter(col("DOI").isNotNull()) \
|
||||
.select(
|
||||
col("DOI").alias("DOI"),
|
||||
explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
|
||||
)
|
||||
|
||||
|
||||
|
||||
affiliations_exploded = exploded.withColumn("affiliation", explode(col("author.affiliation")))
|
||||
|
||||
|
||||
result = (
|
||||
affiliations_exploded
|
||||
.filter(col("affiliation.name").isNotNull())
|
||||
.select(
|
||||
col("DOI"),
|
||||
col("affiliation.name").alias("raw_affiliation_string")
|
||||
)
|
||||
.withColumn("raw_affiliation_string", affro_udf(col("raw_affiliation_string")))
|
||||
.groupBy("DOI")
|
||||
.agg(collect_set("raw_affiliation_string").alias("group"))
|
||||
.withColumn("Matchings", aggregate_udf("group")) # Use collect_set for unique values
|
||||
.filter((col("Matchings").isNotNull()) & (size(col("Matchings")) > 0))
|
||||
)
|
||||
|
||||
|
||||
result.write \
|
||||
.mode("overwrite") \
|
||||
.option("compression", "gzip") \
|
||||
.json(hdfs_output_path)
|
||||
|
|
@ -0,0 +1,104 @@
|
|||
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
|
||||
from affro_cluster import *
|
||||
from schemas import *
|
||||
from utils import *
|
||||
|
||||
import sys
|
||||
|
||||
folder_path = sys.argv[1]
|
||||
hdfs_output_path = sys.argv[2]
|
||||
startFrom = sys.argv[3]
|
||||
affiliationToAuthor = False
|
||||
|
||||
# Initialize Spark session
|
||||
spark = SparkSession.builder.appName("AffRo-DataCite").getOrCreate()
|
||||
|
||||
|
||||
|
||||
affro_udf = udf(matchings_affro, matching_array_schema)
|
||||
applyMatch_udf = udf(toAuthorModel, match_author_schema)
|
||||
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
|
||||
aggregateResult_udf = udf(aggregateResult, author_schema)
|
||||
aggregateResultNoAuthor_udf = udf(aggregateResultNoAuthor, author_schema)
|
||||
|
||||
|
||||
if(startFrom == 'Matchings'):
|
||||
df = spark.read.option("mode", "PERMISSIVE").parquet(folder_path)
|
||||
df_parsed = df.withColumn("json_parsed", from_json(col("json"), json_schema))
|
||||
|
||||
exploded = df_parsed.select(
|
||||
col("json_parsed.attributes.doi").alias("id"), # Mapping "id" to "doi"
|
||||
explode(col("json_parsed.attributes.creators")).alias("author") # Exploding creators instead of attributes
|
||||
).filter(col("id").isNotNull())
|
||||
|
||||
exploded.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/exploded", compression="gzip")
|
||||
|
||||
affiliations_exploded = exploded.withColumn(
|
||||
"Affiliation",
|
||||
explode(col("author.affiliation").alias("Affiliation"))
|
||||
)
|
||||
|
||||
affiliations_exploded.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/affiliations_exploded", compression="gzip")
|
||||
matching_df = (
|
||||
affiliations_exploded
|
||||
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
|
||||
.select("Affiliation") # Select only the "Affiliation" column
|
||||
.distinct() # Remove duplicate rows based on "Affiliation"
|
||||
.select(
|
||||
col("Affiliation"),
|
||||
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
|
||||
)
|
||||
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
|
||||
)
|
||||
matching_df.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/matching_df", compression="gzip")
|
||||
|
||||
matching_df = spark.read.schema(affiliation_schema).json("/tmp/miriam/affroOnDatacite/matching_df")
|
||||
affiliations_exploded = spark.read.schema(affiliation_exploded_schema).json("/tmp/miriam/affroOnDatacite/affiliations_exploded")
|
||||
|
||||
if(not affiliationToAuthor):
|
||||
aff_per_paper = (affiliations_exploded
|
||||
.select("id","Affiliation")
|
||||
.distinct())
|
||||
|
||||
aff_per_paper.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/aff_per_paper", compression="gzip")
|
||||
|
||||
|
||||
(matching_df
|
||||
.join(aff_per_paper, on="Affiliation")
|
||||
.filter(col("Matchings").isNotNull())
|
||||
.groupBy("id").agg(collect_list(struct("*")).alias("group"))
|
||||
.withColumn("result", aggregateResultNoAuthor_udf("group"))
|
||||
.select("result.*")
|
||||
.write.mode("overwrite").json(hdfs_output_path, compression="gzip")
|
||||
)
|
||||
else:
|
||||
extend = (
|
||||
matching_df
|
||||
.join(affiliations_exploded, on="Affiliation")
|
||||
.filter(col("Matchings").isNotNull())
|
||||
)
|
||||
|
||||
extend.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/extend", compression="gzip")
|
||||
|
||||
applyMatchDf = extend \
|
||||
.withColumn("application", applyMatch_udf(struct("*"))) \
|
||||
.select("application.*")
|
||||
|
||||
applyMatchDf.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/applyMatchDf", compression="gzip")
|
||||
|
||||
groupedAuthorDf = applyMatchDf \
|
||||
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("aggresult", aggregate_udf("group")) \
|
||||
.select("aggresult.*")
|
||||
|
||||
groupedAuthorDf.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/groupedAuthorDf", compression="gzip")
|
||||
|
||||
groupedResultDf = groupedAuthorDf \
|
||||
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("result", aggregateResult_udf("group")) \
|
||||
.select("result.*")
|
||||
groupedResultDf.write.mode("overwrite").json("/tmp/miriam/affroOnDatacite/groupedResultDf", compression="gzip")
|
||||
|
||||
groupedResultDf.write.mode("overwrite").json(hdfs_output_path, compression="gzip")
|
||||
|
|
@ -0,0 +1,30 @@
|
|||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
|
||||
import sys
|
||||
|
||||
from affro_cluster import *
|
||||
from schemas import *
|
||||
from utils import *
|
||||
|
||||
|
||||
spark = SparkSession.builder.appName("AffRo - Dataset").getOrCreate()
|
||||
|
||||
#the path of the dataset to be mapped. It contains all the affiliation strings to be processed
|
||||
#The outcoma will be written in Append to the output path.
|
||||
input_path = sys.argv[1]
|
||||
|
||||
#the output file
|
||||
output_path = sys.argv[2]
|
||||
|
||||
affro_udf = udf(matchings_affro, matching_array_schema)
|
||||
|
||||
input_dataset = spark.read.schema(affiliation_string_schema).json(input_path)
|
||||
|
||||
( input_dataset
|
||||
.select(col("raw_affiliation_string").alias("Affiliation"),
|
||||
affro_udf(col("raw_affiliation_string")).alias("Matchings") )
|
||||
.filter(col("Matchings").isNotNull())
|
||||
.write
|
||||
.mode("append")
|
||||
.json(output_path)
|
||||
)
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,60 @@
|
|||
{"copernicus institu sustainable development" : "openorgs____::0000102848",
|
||||
"copernicus institu sustainable development energy resources" : "openorgs____::0000102849",
|
||||
"copernicus institu sustainable development environmental governance" : "openorgs____::0000102850",
|
||||
"copernicus institu sustainable development environmental scien" : "openorgs____::0000102851",
|
||||
"copernicus institu sustainable development inovation studies" : "openorgs____::0000102852",
|
||||
"leiden institu advanced computer scien" : "openorgs____::0000102908",
|
||||
"gorlaeus labora" : "openorgs____::0000102887",
|
||||
"leiden institu chemistry gorlaeus labora" : "openorgs____::0000102887",
|
||||
"institu biology leiden" : "openorgs____::0000102854",
|
||||
"leiden institu chemistry" : "openorgs____::0000102880",
|
||||
"leiden malaria research group": "openorgs____::0000102937",
|
||||
"sylvius labora" : "openorgs____::0000102938",
|
||||
"leiden mathematical institu" : "openorgs____::0000102939",
|
||||
"leiden faculty archaeology" : "openorgs____::0000102940",
|
||||
"erevnitiko idrima pl" : "openorgs____::0000103020",
|
||||
"frederick research center" : "openorgs____::0000103018",
|
||||
"cyens center excelence" : "openorgs____::0000103019",
|
||||
"space systems solutions ltd" : "openorgs____::0000103021",
|
||||
"eratosthenes center excelence" : "openorgs____::0000103022",
|
||||
"cyric cyprus research and innovation center ltd" : "openorgs____::0000103023",
|
||||
"danaos shiping company limited" : "openorgs____::0000103024",
|
||||
"cyprus space exploration organization" : "openorgs____::0000103025",
|
||||
"cyprus museum" : "openorgs____::0000103027",
|
||||
"larnaca general hospital" : "openorgs____::0000103028",
|
||||
"agricultural research institu" : "openorgs____::0000103029",
|
||||
"research education institu child health" : "openorgs____::0000103030",
|
||||
"german oncology center" : "openorgs____::0000103031",
|
||||
"cyprus neuroscien techn institu" : "openorgs____::0000103032",
|
||||
"salzgiter manesman forschung" : "openorgs____::0000103035",
|
||||
"aesculap ag" : "openorgs____::0000103036",
|
||||
"telekom inovation labora" : "openorgs____::0000103037",
|
||||
"dlr institu vernetzte energiesysteme" : "openorgs____::0000103038",
|
||||
"akademie ofentliches gesundheitswesen duseldorf" : "openorgs____::0000103039",
|
||||
"ibe rd institu lung health" : "openorgs____::0000103040",
|
||||
"herzentrum leipzig" : "openorgs____::0000103041",
|
||||
"bundesforschungsanstalt fischerei" : "openorgs____::0000098305",
|
||||
"osteuropa institu" : "openorgs____::0000103042",
|
||||
"hochschule politik Munchen" : "openorgs____::0000103043",
|
||||
"qualcom cdma techn" : "openorgs____::0000103046",
|
||||
"kompetenzentrum obstbau bodensee" : "openorgs____::0000103047",
|
||||
"institu angewandte qualitatsforderung forschung im gesundheitswesen" : "openorgs____::0000103048",
|
||||
"dresearch digital media systems":"openorgs____::0000103049",
|
||||
"eemagine medical imaging solutions": "openorgs____::0000103050",
|
||||
"forschungszentrum energietechnologie":"openorgs____::0000103051",
|
||||
"european radiation dosimetry group" : "openorgs____::0000103052",
|
||||
"ge healthcare":"openorgs____::0000103053",
|
||||
"global energy interconection research institu europe":"openorgs____::0000103054",
|
||||
"proteros biostructures": "openorgs____::0000103055",
|
||||
"frankfurter algemeine zeitung" : "openorgs____::0000103056",
|
||||
"stiftung neanderthal museum": "openorgs____::0000103057",
|
||||
"nvision imaging techn" : "openorgs____::0000103058",
|
||||
"institu scien networking oldenburg" :"openorgs____::0000103059",
|
||||
"zentrum internationale bildungsvergleichstudien" : "openorgs____::0000103060",
|
||||
"evangelisches klinikum bethel": "openorgs____::0000103061",
|
||||
"internationale hochschule liebenzel": "openorgs____::0000103062",
|
||||
"walter schotky institu": "openorgs____::0000103063",
|
||||
"institu phytopathologie": "openorgs____::0000103065",
|
||||
"helios klinikum wupertal": "openorgs____::0000103066",
|
||||
"univer herzentrum hamburg": "openorgs____::0000103067",
|
||||
"zentrum graphische datenverarbeitung" : "openorgs____::0000017671"}
|
||||
|
|
@ -1 +1 @@
|
|||
{"research": "Univ/Inst", "uniwersytet": "Univ/Inst", "investigacions": "Univ/Inst", "institu": "Univ/Inst", "istitut": "Univ/Inst", "univ": "Univ/Inst", "col": "Univ/Inst", "center": "Univ/Inst", "polytechnic": "Univ/Inst", "politecnico": "Univ/Inst", "centre": "Univ/Inst", "cnrs": "Univ/Inst", "faculty": "Univ/Inst", "school": "Univ/Inst", "academ": "Univ/Inst", "akadem": "Univ/Inst", "hochschule": "Univ/Inst", "ecole": "Univ/Inst", "tech": "Univ/Inst", "observ": "Univ/Inst", "escuela": "Univ/Inst", "escola": "Univ/Inst", "discovery programe": "Univ/Inst", "ku leuven": "Univ/Inst", "ucla": "Univ/Inst", "eth zurich": "Univ/Inst", "athena": "Univ/Inst", "openaire": "Univ/Inst", "erasmus": "Univ/Inst", "ist austria": "Univ/Inst", "lmu munich": "Univ/Inst", "cancer trials ireland": "Univ/Inst", "food safety authority": "Univ/Inst", "ucd": "Univ/Inst", "tcd": "Univ/Inst", "apc microbiome": "Univ/Inst", "nasa": "Univ/Inst", "ucl": "Univ/Inst", "zentrum": "Univ/Inst", "ncsr demokritos": "Univ/Inst", "panepistemio": "Univ/Inst", "forth": "Univ/Inst", "nui galway": "Univ/Inst", "nui maynooth": "Univ/Inst", "tu wien": "Univ/Inst", "tu dublin": "Univ/Inst", "lab": "Laboratory", "science": "Laboratory", "cientific": "Laboratory", "hospital": "Hospital", "clinic": "Hospital", "hopital": "Hospital", "klinik": "Hospital", "oncol": "Hospital", "medical": "Hospital", "health": "Hospital", "medicin": "Hospital", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "museum": "Museum", "library": "Museum", "foundation": "Foundation", "asociation": "Foundation", "organization": "Foundation", "society": "Foundation", "group": "Foundation", "royal": "Foundation", "ofice": "Foundation", "trust": "Foundation", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", "agency": "Government", "unknown": "Unknown", 
"google": "Specific", "yahoo": "Specific", "ebay": "Specific", "microsoft": "Specific", "teagasc": "Specific", "ibm research": "Specific", "alergan": "Specific", "analog devices": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific"}
|
||||
{"research": "Academia", "centro": "Academia", "recherche": "Academia", "uniwersytet": "Academia", "investigacions": "Academia", "institu": "Academia", "istitut": "Academia", "univ": "Academia", "col": "Academia", "center": "Academia", "polytechnic": "Academia", "tech": "Academia", "politecnico": "Academia", "polutekhneio": "Academia", "prifysgol": "Academia", "centre": "Academia", "kentro": "Academia", "politechnika": "Academia", "szkola": "Academia", "faculty": "Academia", "school": "Academia", "academ": "Academia", "akadem": "Academia", "schule": "Academia", "ecole": "Academia", "observ": "Academia", "escuela": "Academia", "escola": "Academia", "unibertsi": "Academia", "yliopisto": "Academia", "instytut": "Academia", "zentrum": "Academia", "panepist": "Academia", "lab": "Academia", "scien": "Academia", "cientific": "Academia", "engingeering": "Academia", "egyetem": "Academia", "sterewacht": "Academia", "tecnol": "Academia", "data": "Academia", "hospital": "Hospitals", "clinic": "Hospitals", "hopital": "Hospitals", "klinik": "Hospitals", "oncol": "Hospitals", "medical": "Hospitals", "health": "Hospitals", "medicin": "Hospitals", "nosokomei": "Hospitals", "krankenhaus": "Hospitals", "ziekenhuis": "Hospitals", "spital": "Hospitals", "cancer": "Hospitals", "pharma": "Hospitals", "therapeutics": "Hospitals", "ospedale": "Hospitals", "gmbh": "Company", "company": "Company", "industr": "Company", "etaireia": "Company", "corporation": "Company", "inc": "Company", "ltd": "Company", "foundation": "Foundations", "asociation": "Foundations", "organization": "Foundations", "society": "Foundations", "group": "Foundations", "royal": "Foundations", "ofice": "Foundations", "trust": "Foundations", "survey": "Foundations", "museum": "Foundations", "library": "Foundations", "bank": "Foundations", "comision": "Foundations", "district": "Government", "federation": "Government", "government": "Government", "municipal": "Government", "county": "Government", "council": "Government", 
"agency": "Government", "unknown": "Unknown", "squib": "Specific", "intel": "Specific", "motorola": "Specific", "samsung": "Specific", "hitachi": "Specific", "roche": "Specific", "google": "Specific", "yahoo": "Specific", "ebay": "Specific", "novartis": "Specific", "microsoft": "Specific", "ibm": "Specific", "alergan": "Specific", "medtronic": "Specific", "xilinx": "Specific", "pfizer": "Specific", "glaxosmithkline": "Specific", "astrazeneca": "Specific", "boehringer ingelheim": "Specific", "demokritos": "Specific", "siemens": "Specific", "forth": "Specific", "eli lily": "Specific", "boeing": "Specific", "alphabet": "Specific", "johnson johnson": "Specific", "moderna": "Specific", "bayer": "Specific", "huawei": "Specific", "amazon": "Specific", "sinopec": "Specific", "novo nordisk": "Specific", "sanofi": "Specific", "amgen": "Specific", "abvie": "Specific", "basf": "Specific", "toyota": "Specific", "gilead": "Specific", "teagasc": "Specific", "apc microbiome": "Specific", "nestle": "Specific", "perkinelmer": "Specific", "shel": "Specific", "visteon": "Specific", "hewlet packard": "Specific", "fujitsu": "Specific", "toshiba": "Specific", "eth zurich": "Acronyms", "riken": "Acronyms", "ucd": "Acronyms", "tcd": "Acronyms", "nasa": "Acronyms", "bgi": "Acronyms", "ntt": "Acronyms", "dow": "Acronyms", "ustc": "Acronyms", "zju": "Acronyms", "pku": "Acronyms", "ucas": "Acronyms", "nju": "Acronyms", "scu": "Acronyms", "mit": "Acronyms", "hust": "Acronyms", "ucla": "Acronyms", "uc san diego": "Acronyms", "uc berkeley": "Acronyms", "jhu": "Acronyms", "uchicago": "Acronyms", "caltech": "Acronyms", "usp": "Acronyms", "conicet": "Acronyms", "unicamp": "Acronyms", "cnrs": "Acronyms", "ucl": "Acronyms", "csic": "Acronyms", "epfl": "Acronyms", "ucph": "Acronyms", "tum": "Acronyms", "lmu": "Acronyms", "ku leuven": "Acronyms", "ircs": "Acronyms", "nhs": "Acronyms", "discovery programe": "Acronyms", "ncsr demokritos": "Acronyms", "nui galway": "Acronyms", "nui maynooth": "Acronyms", 
"tu wien": "Acronyms", "tu dublin": "Acronyms", "athena": "Acronyms", "openaire": "Acronyms", "erasmus": "Acronyms", "ist austria": "Acronyms", "nit": "Acronyms", "cern": "Acronyms", "lg": "Acronyms", "chu": "Acronyms", "jst": "Acronyms", "lumc": "Acronyms"}
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,66 @@
|
|||
{"copernicus institu sustainable development" : ["utrecht" , "openorgs____::0000102848"],
|
||||
"copernicus institu sustainable development energy resources" : ["utrecht" , "openorgs____::0000102849"],
|
||||
"copernicus institu sustainable development environmental governance": ["utrecht", "openorgs____::0000102850"],
|
||||
"copernicus institu sustainable development environmental scien" : ["utrecht","openorgs____::0000102851"],
|
||||
"copernicus institu sustainable development inovation studies" : ["utrecht", "openorgs____::0000102852"],
|
||||
"leiden institu advanced computer scien" : ["leiden", "openorgs____::0000102908"],
|
||||
"gorlaeus labora" : ["leiden", "openorgs____::0000102887"],
|
||||
"leiden institu chemistry gorlaeus labora" : ["leiden", "openorgs____::0000102887"],
|
||||
"institu biology leiden" : ["leiden", "openorgs____::0000102854"],
|
||||
"leiden institu chemistry" : ["leiden", "openorgs____::0000102880"],
|
||||
"leiden malaria research group": ["leiden", "openorgs____::0000102937"],
|
||||
"sylvius labora" : ["leiden", "openorgs____::0000102938"],
|
||||
"leiden mathematical institu" : ["leiden", "openorgs____::0000102939"],
|
||||
"leiden faculty archaeology" : ["leiden", "openorgs____::0000102940"],
|
||||
"erevnitiko idrima pl":["nicosia","openorgs____::0000103020"],
|
||||
"frederick research center":["nicosia","openorgs____::0000103018"],
|
||||
"cyens center excelence":["nicosia","openorgs____::0000103019"],
|
||||
"space systems solutions ltd":["nicosia","openorgs____::0000103021"],
|
||||
"eratosthenes center excelence":["limasol","openorgs____::0000103022"],
|
||||
"cyric cyprus research and innovation center ltd":["nicosia","openorgs____::0000103023"],
|
||||
"danaos shiping company limited":["limasol","openorgs____::0000103024"],
|
||||
"cyprus space exploration organization":["nicosia","openorgs____::0000103025"],
|
||||
"cyprus museum":["nicosia","openorgs____::0000103027"],
|
||||
"larnaca general hospital":["larnaca","openorgs____::0000103028"],
|
||||
"agricultural research institu":["nicosia","openorgs____::0000103029"],
|
||||
"research education institu child health":["nicosia","openorgs____::0000103030"],
|
||||
"german oncology center":["limasol","openorgs____::0000103031"],
|
||||
"cyprus neuroscien techno institu":["nicosia","openorgs____::0000103032"],
|
||||
"salzgiter manesman forschung":["salzgiter","openorgs____::0000103035"],
|
||||
"aesculap ag" : ["tutlingen","openorgs____::0000103036"],
|
||||
"telekom inovation labora" : ["berlin","openorgs____::0000103037"],
|
||||
"dlr institu vernetzte energiesysteme" : ["oldenburg","openorgs____::0000103038"],
|
||||
"akademie ofentliches gesundheitswesen duseldorf" : ["duseldorf","openorgs____::0000103039"],
|
||||
"ibe rd institu lung health" : ["munster","openorgs____::0000103040"],
|
||||
"herzentrum leipzig" : ["leipzig","openorgs____::0000103041"],
|
||||
"bundesforschungsanstalt fischerei" : ["hamburg","openorgs____::0000098305"],
|
||||
"osteuropa institu" : ["berlin","openorgs____::0000103042"],
|
||||
"hochschule politik Munchen" : ["munich","openorgs____::0000103043"],
|
||||
"qualcom cdma techn" : ["munich","openorgs____::0000103046"],
|
||||
"kompetenzentrum obstbau bodensee" : ["ravensburg","openorgs____::0000103047"],
|
||||
"institu angewandte qualitatsforderung forschung im gesundheitswesen" : ["gotingen","openorgs____::0000103048"],
|
||||
"dresearch digital media systems": ["berlin","openorgs____::0000103049"],
|
||||
"eemagine medical imaging solutions": ["berlin","openorgs____::0000103050"],
|
||||
"forschungszentrum energietechnologie": ["oldenburg,","openorgs____::0000103051"],
|
||||
"european radiation dosimetry group" : ["neuherberg","openorgs____::0000103052"],
|
||||
"ge healthcare": ["duseldorf","openorgs____::0000103053"],
|
||||
"global energy interconection research institu europe": ["berlin","openorgs____::0000103054"],
|
||||
"proteros biostructures": ["martinsried,","openorgs____::0000103055"],
|
||||
"frankfurter algemeine zeitung" : ["martinsried","openorgs____::0000103056"],
|
||||
"stiftung neanderthal museum": ["metman","openorgs____::0000103057"],
|
||||
"nvision imaging techn" : ["ulm","openorgs____::0000103058"],
|
||||
"institu scien networking oldenburg" : ["oldenburg","openorgs____::0000103059"],
|
||||
"zentrum internationale bildungsvergleichstudien" : ["munich","openorgs____::0000103060"],
|
||||
"evangelisches klinikum bethel": ["bielefeld","openorgs____::0000103061"],
|
||||
"internationale hochschule liebenzel": ["bad liebenzel,","openorgs____::0000103062"],
|
||||
"walter schotky institu": ["munich","openorgs____::0000103063"],
|
||||
"institu phytopathologie": ["kiel","openorgs____::0000103065"],
|
||||
"helios klinikum wupertal": ["wupertal","openorgs____::0000103066"],
|
||||
"univer herzentrum hamburg": ["hamburg","openorgs____::0000103067"],
|
||||
"zentrum graphische datenverarbeitung" : ["darmstadt","openorgs____::0000017671"]}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,65 @@
|
|||
{
|
||||
"copernicus institu sustainable development" : ["netherlands","openorgs____::0000102848"],
|
||||
"copernicus institu sustainable development energy resources" : ["netherlands", "openorgs____::0000102849"],
|
||||
"copernicus institu sustainable development environmental governance" : ["netherlands", "openorgs____::0000102850"],
|
||||
"copernicus institu sustainable development environmental scien" : ["netherlands", "openorgs____::0000102851"],
|
||||
"copernicus institu sustainable development inovation studies": ["netherlands", "openorgs____::0000102852"],
|
||||
"leiden institu advanced computer scien" : ["netherlands", "openorgs____::0000102908"],
|
||||
"gorlaeus labora" : ["netherlands", "openorgs____::0000102887"],
|
||||
"leiden institu chemistry gorlaeus labora" : ["netherlands", "openorgs____::0000102887"],
|
||||
"institu biology leiden" : ["netherlands", "openorgs____::0000102854"],
|
||||
"leiden malaria research group": ["netherlands", "openorgs____::0000102937"],
|
||||
"sylvius labora" : ["netherlands", "openorgs____::0000102938"],
|
||||
"leiden mathematical institu" : ["netherlands", "openorgs____::0000102939"],
|
||||
"leiden faculty archaeology" : ["netherlands", "openorgs____::0000102940"],
|
||||
"erevnitiko idrima pl":["cyprus","openorgs____::0000103020"],
|
||||
"frederick research center":["cyprus","openorgs____::0000103018"],
|
||||
"cyens center excelence":["cyprus","openorgs____::0000103019"],
|
||||
"space systems solutions ltd":["cyprus","openorgs____::0000103021"],
|
||||
"eratosthenes center excelence":["cyprus","openorgs____::0000103022"],
|
||||
"cyric cyprus research and innovation center ltd":["cyprus","openorgs____::0000103023"],
|
||||
"danaos shiping company limited":["cyprus","openorgs____::0000103024"],
|
||||
"cyprus space exploration organization":["cyprus","openorgs____::0000103025"],
|
||||
"cyprus museum":["cyprus","openorgs____::0000103027"],
|
||||
"larnaca general hospital":["cyprus","openorgs____::0000103028"],
|
||||
"agricultural research institu":["cyprus","openorgs____::0000103029"],
|
||||
"research education institu child health":["cyprus","openorgs____::0000103030"],
|
||||
"german oncology center":["cyprus","openorgs____::0000103031"],
|
||||
"cyprus neuroscien techn institu":["cyprus","openorgs____::0000103032"],
|
||||
"salzgiter manesman forschung":["germany","openorgs____::0000103035"],
|
||||
"aesculap ag" : ["germany","openorgs____::0000103036"],
|
||||
"telekom inovation labora" : ["germany","openorgs____::0000103037"],
|
||||
"dlr institu vernetzte energiesysteme" : ["germany","openorgs____::0000103038"],
|
||||
"akademie ofentliches gesundheitswesen duseldorf" : ["germany","openorgs____::0000103039"],
|
||||
"ibe rd institu lung health" : ["germany","openorgs____::0000103040"],
|
||||
"herzentrum leipzig" : ["germany","openorgs____::0000103041"],
|
||||
"bundesforschungsanstalt fischerei" : ["germany","openorgs____::0000098305"],
|
||||
"osteuropa institu" : ["germany","openorgs____::0000103042"],
|
||||
"hochschule politik Munchen" : ["germany","openorgs____::0000103043"],
|
||||
"qualcom cdma techn" : ["germany","openorgs____::0000103046"],
|
||||
"kompetenzentrum obstbau bodensee" : ["germany","openorgs____::0000103047"],
|
||||
"institu angewandte qualitatsforderung forschung im gesundheitswesen" : ["germany","openorgs____::0000103048"],
|
||||
"dresearch digital media systems": ["germany","openorgs____::0000103049"],
|
||||
"eemagine medical imaging solutions": ["germany","openorgs____::0000103050"],
|
||||
"forschungszentrum energietechnologie": ["germany","openorgs____::0000103051"],
|
||||
"european radiation dosimetry group" : ["germany","openorgs____::0000103052"],
|
||||
"ge healthcare": ["germany","openorgs____::0000103053"],
|
||||
"global energy interconection research institu europe": ["germany","openorgs____::0000103054"],
|
||||
"proteros biostructures": ["germany","openorgs____::0000103055"],
|
||||
"frankfurter algemeine zeitung" : ["germany","openorgs____::0000103056"],
|
||||
"stiftung neanderthal museum": ["germany","openorgs____::0000103057"],
|
||||
"nvision imaging techn" : ["germany","openorgs____::0000103058"],
|
||||
"institu scien networking oldenburg" : ["germany","openorgs____::0000103059"],
|
||||
"zentrum internationale bildungsvergleichstudien" : ["germany","openorgs____::0000103060"],
|
||||
"evangelisches klinikum bethel": ["germany","openorgs____::0000103061"],
|
||||
"internationale hochschule liebenzel": ["germany","openorgs____::0000103062"],
|
||||
"walter schotky institu": ["germany","openorgs____::0000103063"],
|
||||
"institu phytopathologie": ["germany","openorgs____::0000103065"],
|
||||
"helios klinikum wupertal": ["germany","openorgs____::0000103066"],
|
||||
"univer herzentrum hamburg": ["germany","openorgs____::0000103067"],
|
||||
"zentrum graphische datenverarbeitung" : ["germany","openorgs____::0000017671"]}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,62 @@
|
|||
{"openorgs____::0000102848" : "netherlands",
|
||||
"openorgs____::0000102849" : "netherlands",
|
||||
"openorgs____::0000102850" : "netherlands",
|
||||
"openorgs____::0000102851" : "netherlands",
|
||||
"openorgs____::0000102852" : "netherlands",
|
||||
"openorgs____::0000102908" : "netherlands",
|
||||
"openorgs____::0000102887" : "netherlands",
|
||||
"openorgs____::0000102854" : "netherlands",
|
||||
"openorgs____::0000102880" : "netherlands",
|
||||
"openorgs____::0000102937" : "netherlands",
|
||||
"openorgs____::0000102938" : "netherlands",
|
||||
"openorgs____::0000102939" : "netherlands",
|
||||
"openorgs____::0000102940" : "netherlands",
|
||||
"openorgs____::0000103018" : "cyprus",
|
||||
"openorgs____::0000103019" : "cyprus",
|
||||
"openorgs____::0000103020" : "cyprus",
|
||||
"openorgs____::0000103021" : "cyprus",
|
||||
"openorgs____::0000103022" : "cyprus",
|
||||
"openorgs____::0000103023" : "cyprus",
|
||||
"openorgs____::0000103024" : "cyprus",
|
||||
"openorgs____::0000103025" : "cyprus",
|
||||
"openorgs____::0000103027" : "cyprus",
|
||||
"openorgs____::0000103028" : "cyprus",
|
||||
"openorgs____::0000103029" : "cyprus",
|
||||
"openorgs____::0000103030" : "cyprus",
|
||||
"openorgs____::0000103031" : "cyprus",
|
||||
"openorgs____::0000103032" : "cyprus",
|
||||
"openorgs____::0000103035" : "germany",
|
||||
"openorgs____::0000103036" : "germany",
|
||||
"openorgs____::0000103037" : "germany",
|
||||
"openorgs____::0000103038" : "germany",
|
||||
"openorgs____::0000103039" : "germany",
|
||||
"openorgs____::0000103040" : "germany",
|
||||
"openorgs____::0000103041" : "germany",
|
||||
"openorgs____::0000098305" : "germany",
|
||||
"openorgs____::0000103042" : "germany",
|
||||
"openorgs____::0000103043" : "germany",
|
||||
"openorgs____::0000103046" : "germany",
|
||||
"openorgs____::0000103047" : "germany",
|
||||
"openorgs____::0000103048" : "germany",
|
||||
"openorgs____::0000103049" : "germany",
|
||||
"openorgs____::0000103050" : "germany",
|
||||
"openorgs____::0000103051" : "germany",
|
||||
"openorgs____::0000103052" : "germany",
|
||||
"openorgs____::0000103053" : "germany",
|
||||
"openorgs____::0000103054" : "germany",
|
||||
"openorgs____::0000103055" : "germany",
|
||||
"openorgs____::0000103056" : "germany",
|
||||
"openorgs____::0000103057" : "germany",
|
||||
"openorgs____::0000103058" : "germany",
|
||||
"openorgs____::0000103059" : "germany",
|
||||
"openorgs____::0000103060" : "germany",
|
||||
"openorgs____::0000103061" : "germany",
|
||||
"openorgs____::0000103062" : "germany",
|
||||
"openorgs____::0000103063" : "germany",
|
||||
"openorgs____::0000103065" : "germany",
|
||||
"openorgs____::0000103066" : "germany",
|
||||
"openorgs____::0000103067" : "germany",
|
||||
"openorgs____::0000017671" : "germany"}
|
||||
|
||||
|
||||
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1,62 @@
|
|||
{
|
||||
"copernicus institu sustainable development": "unique",
|
||||
"copernicus institu sustainable development energy resources": "unique",
|
||||
"copernicus institu sustainable development environmental governance": "unique",
|
||||
"copernicus institu sustainable development environmental scien": "unique",
|
||||
"copernicus institu sustainable development inovation studies": "unique",
|
||||
"leiden institu advanced computer scien": "unique",
|
||||
"gorlaeus labora": "unique",
|
||||
"institu biology leiden": "unique",
|
||||
"leiden institu chemistry": "unique",
|
||||
"leiden malaria research group": "unique",
|
||||
"sylvius labora": "unique",
|
||||
"leiden mathematical institu": "unique",
|
||||
"leiden faculty archaeology": "unique",
|
||||
"leiden institu chemistry gorlaeus labora": "unique",
|
||||
"erevnitiko idrima pl": "unique",
|
||||
"frederick research center": "unique",
|
||||
"cyens center excelence": "unique",
|
||||
"space systems solutions ltd": "unique",
|
||||
"eratosthenes center excelence": "unique",
|
||||
"cyric cyprus research and innovation center ltd": "unique",
|
||||
"danaos shiping company limited": "unique",
|
||||
"cyprus space exploration organization": "unique",
|
||||
"cyprus museum": "unique",
|
||||
"larnaca general hospital": "unique",
|
||||
"agricultural research institu": "many",
|
||||
"research education institu child health": "unique",
|
||||
"german oncology center": "unique",
|
||||
"cyprus neuroscien techn institu": "unique",
|
||||
"salzgiter manesman forschung": "unique",
|
||||
"aesculap ag": "unique",
|
||||
"telekom inovation labora": "unique",
|
||||
"dlr institu vernetzte energiesysteme": "unique",
|
||||
"akademie ofentliches gesundheitswesen duseldorf": "unique",
|
||||
"ibe rd institu lung health": "unique",
|
||||
"herzentrum leipzig": "unique",
|
||||
"bundesforschungsanstalt fischerei": "unique",
|
||||
"osteuropa institu": "unique",
|
||||
"hochschule politik Munchen": "unique",
|
||||
"qualcom cdma techn": "unique",
|
||||
"kompetenzentrum obstbau bodensee": "unique",
|
||||
"institu angewandte qualitatsforderung forschung im gesundheitswesen": "unique",
|
||||
"dresearch digital media systems": "unique",
|
||||
"eemagine medical imaging solutions": "unique",
|
||||
"forschungszentrum energietechnologie": "unique",
|
||||
"european radiation dosimetry group": "unique",
|
||||
"ge healthcare": "many",
|
||||
"global energy interconection research institu europe": "unique",
|
||||
"proteros biostructures": "unique",
|
||||
"frankfurter algemeine zeitung": "unique",
|
||||
"stiftung neanderthal museum": "unique",
|
||||
"nvision imaging techn": "unique",
|
||||
"institu scien networking oldenburg": "unique",
|
||||
"zentrum internationale bildungsvergleichstudien": "unique",
|
||||
"evangelisches klinikum bethel": "unique",
|
||||
"internationale hochschule liebenzel": "unique",
|
||||
"walter schotky institu": "unique",
|
||||
"institu phytopathologie": "unique",
|
||||
"helios klinikum wupertal": "unique",
|
||||
"univer herzentrum hamburg": "unique",
|
||||
"zentrum graphische datenverarbeitung": "unique"
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -0,0 +1 @@
|
|||
{"hopital": "hospital", "hosp.": "hospital", "med. ctr." : "medical center" , "lab.":"labora","czechoslovak": "czech", "saint": "st", "aghia": "agia","kyprou":"kuprou", "technologiko": "tekhnologiko", "panepistimio":"panepistemio","universitatsfrauenklinik": "university hospital", "Universitätsaugenklinik": "university eye hospital", "universitatsklinikum": "univer hospital", "universitetshospital": "univer hospital", "universitatskinderklinik": "univer childrens hospital", "universitatskliniken": "univer hospital", "universit\u00e4tsklinik": "univer hospital", "universitatsmedizin": "univer medicine", "universitatsbibliothek": "univer library", "nat.": "national", "uniaersity":"univer", "univesity":"university", "unversity":"univer", "uni-versity": "univer", "un iversity": "univer", "uniuersity": "univer", "unviersity":"university", "uni versity": "univer", "unive rsity": "univer", "univ ersity": "univer", "ilniversity": "univer","inst ": "institu ", "adv ": "advanced ", "univ ": "univer ", "stud ": "studies ", "inst.": "institu", "sci." : "scien", "acad.": "academy" , "adv.": "advanced", "univ.": "univer", "stud.": "studies", "univcrsity" : "university", "uniuersity": "university", "unirersity": "university", "univsity": "university", "techniche": "technological", "univ col": "university colege", "univ. 
col.": "university colege", "col.": "colege", "medical school university": "university", "hipokration": "hipocration", "belfield, dublin": "dublin", "balsbridge, dublin": "dublin", "earlsfort terace, dublin": "dublin", "bon secours hospital, cork": "bon secours hospital cork", "bon secours hospital, dublin": "bon secours hospital dublin", "bon secours hospital, galway": "bon secours hospital galway", "bon secours hospital, tralee": "bon secours hospital tralee", "bon secours health system": "bon secours hospital dublin", "bon secours hospital, glasnevin": "bon secours hospital dublin", "imperial colege science, technology medicine": "imperial colege science technology medicine", "ucl queen square institute neurology": "ucl, london", "ucl institute neurology": "ucl, london", "royal holoway, university london": "royal holoway univer london", "city, university london": "city univer london", "city university, london": "city univer london", "aeginition": "eginition", "national technical university, athens": "national technical university athens", "leiden institu physics" : "univer leiden", "leyden":"leiden", " leiden law school": "univer leiden", "leiden umc" : "leiden univer medical center", "rijksuniver" : "univer", "leids" : "leiden", "lumc leiden" : "lumc,leiden", "medisch":"medical"}
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,104 @@
|
|||
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
import sys

from affro_cluster import *
from schemas import *
from utils import *


# Driver script: applies the AffRo affiliation-matching algorithm to an
# OpenAIRE graph dump and serialises the enriched author/affiliation records.
spark = SparkSession.builder.appName("AffRo - Graph").getOrCreate()

# the path of the graph to be used
input_path = sys.argv[1]
# eventual already mapped affiliation strings with respective matchings (only
# for the same version of the algorithm); the literal "none" means there is no
# previous serialisation to reuse
already_matched_path = sys.argv[2]
# the output file
output_path = sys.argv[3]
# matching already done for this iteration ("no" -> compute fresh matchings)
matching_done = sys.argv[4]

# UDFs wrapping the matching/aggregation helpers; the functions and schemas
# come from the star-imported project modules (affro_cluster, schemas, utils)
affro_udf = udf(matchings_affro, matching_array_schema)
applyMatch_udf = udf(toAuthorModelGraph, match_author_schema)
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
aggregateResult_udf = udf(aggregateResultGraph, author_schema)

if already_matched_path == "none":
    # no prior matchings: start from an empty DataFrame with the right schema
    already_matched_df = spark.createDataFrame([], affiliation_schema)
else:
    already_matched_df = spark.read.schema(affiliation_schema).json(already_matched_path)

# One row per (result id, author) across all four result types of the graph
exploded = spark.read.schema(graph_entry_schema).json(input_path + "/publication") \
    .union(spark.read.schema(graph_entry_schema).json(input_path + "/dataset")) \
    .union(spark.read.schema(graph_entry_schema).json(input_path + "/software")) \
    .union(spark.read.schema(graph_entry_schema).json(input_path + "/otherresearchproduct")) \
    .select(
        col("id"),  # Extract desired value
        explode(col("author")).alias("author")  # Keep exploding author as before
    ) \
    .filter(col("id").isNotNull())  # Remove rows where 'id' is NULL

# Explode the "author.rawAffiliationString" column into separate rows
affiliations_exploded = exploded.withColumn(
    "Affiliation",
    explode(col("author.rawAffiliationString"))
)

if matching_done == 'no':
    affiliations_df = (
        affiliations_exploded
        .filter(col("Affiliation").isNotNull())  # Keep rows with valid "Affiliation"
        .select("Affiliation")  # Select only the "Affiliation" column
        .distinct()  # Remove duplicate rows based on "Affiliation"
    )
    affiliations_df.write.mode("overwrite").json("/tmp/miriam/exploded", compression="gzip")
    # Select only the not already matched affiliation strings
    to_be_matched_df = (
        affiliations_df
        .join(already_matched_df, affiliations_df["Affiliation"] == already_matched_df["Affiliation"], "left")
        .filter(already_matched_df["Affiliation"].isNull())
        .select(affiliations_df["Affiliation"])
    )

    matching_df = (
        to_be_matched_df  # Remove duplicate rows based on "Affiliation"
        .select(
            col("Affiliation"),
            affro_udf(col("Affiliation")).alias("Matchings")  # Apply UDF to "Affiliation"
        )
        .filter(col("Matchings").isNotNull())  # Exclude rows with null "Matchings"
    )

    if already_matched_path == "none":
        # first run: pick a default location so new matchings can be appended
        already_matched_path = "/tmp/miriam/affroOnGraph/matching_df"

    matching_df.write.mode("append").json(already_matched_path, compression="gzip")

# Re-read the full (previous + freshly appended) matchings so downstream joins
# see a single consistent dataset
matching_df = spark.read.schema(affiliation_schema).json(already_matched_path)

# Attach the computed matchings back to each (id, author, Affiliation) row
extend = (
    affiliations_exploded
    .join(matching_df, on="Affiliation")
    .filter(col("Matchings").isNotNull())
)

extend.write.mode("overwrite").json("/tmp/miriam/affro-serializations/extend", compression="gzip")
# Convert each joined row into the author model used by the graph
applyMatchDf = extend \
    .withColumn("application", applyMatch_udf(struct("*"))) \
    .select("application.*")

applyMatchDf.write.mode("overwrite").json("/tmp/miriam/affro-serializations/applyMatchDf", compression="gzip")
# Fold the per-affiliation rows of one author ("key") back into one record
groupedAuthorDf = applyMatchDf \
    .groupBy("key").agg(collect_list(struct("*")).alias("group")) \
    .withColumn("aggresult", aggregate_udf("group")) \
    .select("aggresult.*")

groupedAuthorDf.write.mode("overwrite").json("/tmp/miriam/affro-serializations/groupedAuthorDf", compression="gzip")

# Fold the authors of one result id into the final output record
groupedResultDf = groupedAuthorDf \
    .groupBy("id").agg(collect_list(struct("*")).alias("group")) \
    .withColumn("result", aggregateResult_udf("group")) \
    .select("result.*")


groupedResultDf.write.mode("overwrite").json(output_path, compression="gzip")
|
||||
|
|
@ -1,121 +1,223 @@
|
|||
from collections import defaultdict
|
||||
from collections import Counter
|
||||
|
||||
import Levenshtein
|
||||
|
||||
from sklearn.feature_extraction.text import CountVectorizer
|
||||
from sklearn.metrics.pairwise import cosine_similarity
|
||||
|
||||
from functions_cluster import *
|
||||
from create_input_cluster import *
|
||||
|
||||
def best_sim_score(light_raw, candidate_num, pairs_list, m, simU, simG):
|
||||
|
||||
|
||||
specific = [k for k in categ_dicts if categ_dicts[k] == 'Specific' or categ_dicts[k] == 'Acronyms']
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def index_multiple_matchings(pairs):
    """Map each affiliation keyword to the number of candidate matches found for it.

    Args:
        pairs: list of match groups; each group is a list of match tuples whose
            first element starts with the keyword, i.e. ``group[0][0]`` is the
            keyword the whole group belongs to.

    Returns:
        dict: keyword -> size of its match group. Empty groups are skipped
        (the original indexed ``p[0][0]`` unconditionally and would raise
        IndexError on an empty group).
    """
    counts = {}
    for group in pairs:
        if group:  # guard: an empty group carries no keyword to index
            counts[group[0][0]] = len(group)

    return counts
|
||||
|
||||
def keep_highest_url(lst):
    """Collapse entries that share a name, retaining the one with the greatest URL value.

    Each entry is a ``(name, score, url)`` triple; for duplicate names only the
    entry whose third field compares highest survives. First-seen insertion
    order of the names is preserved.
    """
    winners = {}

    for entry in lst:
        name, _score, url = entry
        current = winners.get(name)
        # Replace the stored entry only when this one carries a higher URL
        if current is None or url > current[2]:
            winners[name] = entry

    return list(winners.values())
|
||||
|
||||
def find_candidate(keyword, k, dix, simU, simG, candidates_, limit):
    """Collect candidate legal names similar to one affiliation keyword.

    Args:
        keyword: normalised affiliation keyword to match.
        k: index of the keyword in the caller's keyword list; used as key in ``dix``.
        dix: dict updated in place, mapping k -> list of matched legal names.
        simU: cosine-similarity threshold applied when both strings contain 'univ'.
        simG: cosine-similarity threshold for all other (general) names.
        candidates_: iterable of candidate legal names to compare against.
        limit: hard cap on accepted pairs; reaching it aborts the search.

    Returns:
        list of (keyword, name, similarity, org_id, country) tuples, or []
        when ``limit`` pairs were accepted (keyword treated as too ambiguous).

    NOTE(review): relies on module-level ``dix_org``, ``dix_id_country`` and
    ``is_contained`` plus sklearn's CountVectorizer / cosine_similarity being
    in scope -- all defined elsewhere in this project. Pairs where exactly one
    side contains 'univ' are never accepted (both branches require agreement).
    """
    vectorizer = CountVectorizer()

    similar_k = []   # similarity scores of accepted pairs
    pairs_k = []     # accepted match tuples
    total_pairs = 0

    for x in candidates_:
        if is_contained(keyword, x):
            # keyword is contained in candidate x: fit the vocabulary on x
            x_vector = vectorizer.fit_transform([x]).toarray()
            keyword_vector = vectorizer.transform([keyword]).toarray()

            # Compute similarity between the vectors
            similarity = cosine_similarity(x_vector, keyword_vector)[0][0]
            if similarity > min(simU, simG):
                if ('univ' in keyword and 'univ' in x) and similarity > simU:
                    similar_k.append(similarity)
                    pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
                    total_pairs += 1  # Track total number of pairs

                    if k not in dix:
                        dix[k] = [x]
                    else:
                        dix[k].append(x)

                elif (not 'univ'in keyword and not 'univ' in x) and similarity > simG:
                    similar_k.append(similarity)
                    pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
                    total_pairs += 1  # Track total number of pairs

                    if k not in dix:
                        dix[k] = [x]
                    else:
                        dix[k].append(x)

        elif is_contained(x, keyword):
            # candidate x is contained in the keyword: fit on the keyword instead
            if ('univ'in keyword and 'univ' in x):
                keyword_vector = vectorizer.fit_transform([keyword]).toarray()
                x_vector = vectorizer.transform([x]).toarray()

                # Compute similarity between the vectors
                similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
                if similarity > simU:  # max(0.82,sim)
                    similar_k.append(similarity)
                    pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
                    total_pairs += 1  # Track total number of pairs

                    if k not in dix:
                        dix[k] = [x]
                    else:
                        dix[k].append(x)

            elif not 'univ' in keyword and not 'univ' in x:
                keyword_vector = vectorizer.fit_transform([keyword]).toarray()
                x_vector = vectorizer.transform([x]).toarray()

                # Compute similarity between the vectors
                similarity = cosine_similarity(keyword_vector, x_vector)[0][0]
                if similarity > simG:  # max(0.82,sim)
                    similar_k.append(similarity)
                    pairs_k.append((keyword,x,similarity,dix_org[x], dix_id_country[dix_org[x]]))
                    total_pairs += 1  # Track total number of pairs

                    if k not in dix:
                        dix[k] = [x]
                    else:
                        dix[k].append(x)

        # Too many accepted pairs: keyword is ambiguous, give up on it entirely
        if total_pairs >= limit:  # Stop if we reach the limit
            return []

    return pairs_k
|
||||
|
||||
|
||||
|
||||
def best_sim_score(clean_aff, light_raw, candidate_num, pairs_list, multi, simU, simG):
|
||||
"""
|
||||
Finds the best match between a 'key word' and several legal names from the OpenAIRE database.
|
||||
---> corrects special cases in the main map that follows
|
||||
|
||||
Args:
|
||||
light_raw
|
||||
l2 candidate_num: number of candidates.
|
||||
l3 pairs_list: List of pairs. (s, x, score)
|
||||
l4 m: mult
|
||||
|
||||
Returns:
|
||||
List: Resulting list containing OpenAIRE names and their similarity scores.
|
||||
Finds the best match between a keyword (clean_aff) and legal names from the PID database.
|
||||
"""
|
||||
|
||||
vectorizer = CountVectorizer()
|
||||
univ_num = light_raw.lower().count('univ')
|
||||
univ_num = light_raw.lower().count('univ')
|
||||
result = []
|
||||
best = []
|
||||
s = light_raw
|
||||
|
||||
for j in range(len(pairs_list)):
|
||||
x = pairs_list[j][1]
|
||||
|
||||
if [x, pairs_list[j][2]] in result:
|
||||
continue
|
||||
|
||||
if m[pairs_list[j][0]] == 1:
|
||||
|
||||
if is_contained('univ', x.lower()) and pairs_list[j][2] > simU:
|
||||
result.append([x, pairs_list[j][2]])
|
||||
elif pairs_list[j][2] > simG:
|
||||
result.append([x, pairs_list[j][2]])
|
||||
best = []
|
||||
|
||||
elif pairs_list[j][2] >= 0.98: # and (is_contained("univ", x.lower()) or is_contained("college", x.lower()) or is_contained("center", x.lower()) or is_contained("schule", x.lower())): # If the similarity score of a pair (s,x) was 1, we store it to results list
|
||||
result.append([pairs_list[j][1], 1])
|
||||
for pair_group in pairs_list:
|
||||
|
||||
best_j = []
|
||||
affil = pair_group[0][0]
|
||||
num_uni_p = affil.count('univ')
|
||||
|
||||
# print('AFFIL', affil)
|
||||
for p in pair_group:
|
||||
organization, confidence = p[1], p[2]
|
||||
|
||||
# Skip duplicates
|
||||
if [organization, confidence] in result:
|
||||
continue
|
||||
|
||||
else:
|
||||
try:
|
||||
if not is_contained("univ", x.lower()):
|
||||
continue # Skip if x does not contain "university" or "univ"
|
||||
# Check similarity conditions
|
||||
if multi[p[0]] == 1:
|
||||
if 'univ' in organization.lower() and confidence > simU:
|
||||
result.append([organization, confidence])
|
||||
elif confidence > simG:
|
||||
result.append([organization, confidence])
|
||||
|
||||
elif confidence >= 0.98:
|
||||
result.append([organization, 1])
|
||||
else:
|
||||
if "univ" not in organization:
|
||||
continue # Skip if 'univ' is missing
|
||||
|
||||
try:
|
||||
s_vector = vectorizer.fit_transform([light_raw]).toarray()
|
||||
x_vector = vectorizer.transform([organization]).toarray()
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
|
||||
if similarity > 0.1: #use Levenshtein to better handle misspellings
|
||||
similarity_l = 1 - Levenshtein.distance(organization, affil) / max(len(organization), len(affil))
|
||||
best_j.append([organization, similarity, similarity_l])
|
||||
|
||||
# if (is_contained('hosp', x.lower()) and not is_contained('hosp', s)) or (not is_contained('hosp', x.lower()) and is_contained('hosp', s)) or (is_contained('hopital', x.lower()) and not is_contained('hopital', s)) or (not is_contained('hopital', x.lower()) and is_contained('hopital', s)):
|
||||
# continue
|
||||
s_vector = vectorizer.fit_transform([s]).toarray() #Else we compute the similarity of s with the original affiiation name
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
if similarity > 0.1:
|
||||
similarity_l = 1 - Levenshtein.distance(x, pairs_list[j][0]) / max(len(x), len(pairs_list[j][0]))
|
||||
except Exception as ex:
|
||||
print("Error:", ex)
|
||||
|
||||
best.append([x, similarity, similarity_l]) #(similarity+similarity2)/2])
|
||||
except:
|
||||
KeyError
|
||||
|
||||
if best:
|
||||
# max_numbers = defaultdict(float)
|
||||
|
||||
|
||||
# Assuming best is a list of three-element lists
|
||||
# Each element is (string, number1, number2)
|
||||
# Step 2: Keep only the best similarity per organization
|
||||
max_numbers = defaultdict(float)
|
||||
for item in best:
|
||||
string, number1, number2 = item # Unpack the three elements
|
||||
max_numbers[string] = max(max_numbers[string], number1)
|
||||
for org, sim, sim_l in best_j:
|
||||
max_numbers[org] = max(max_numbers[org], sim)
|
||||
|
||||
reduced_best = [[string, number1, number2] for string, number1, number2 in best if number1 == max_numbers[string]]
|
||||
reduced_best = [[org, sim, sim_l] for org, sim, sim_l in best_j if sim == max_numbers[org]]
|
||||
|
||||
# Sort by number1 decreasingly and then by number2 in descending order
|
||||
# Sort by similarity score (descending) and then lexicographically
|
||||
reduced_best.sort(key=lambda x: (x[1], x[2]), reverse=True)
|
||||
# print('REDUCED BEST: ', reduced_best)
|
||||
|
||||
result = result + reduced_best
|
||||
|
||||
univ_list = []
|
||||
other_list = []
|
||||
|
||||
for r in result:
|
||||
if is_contained('univ', r[0]):
|
||||
univ_list.append(r)
|
||||
else:
|
||||
other_list.append(r)
|
||||
|
||||
limit = min(univ_num, candidate_num)
|
||||
result.extend(reduced_best)
|
||||
# print('RESULT EXT: ', result)
|
||||
|
||||
if len(univ_list) > limit:
|
||||
result = univ_list[:limit] + other_list
|
||||
|
||||
# Step 3: Limit university-related matches
|
||||
univ_list = [r for r in result if 'univ' in r[0]]
|
||||
other_list = [r for r in result if 'univ' not in r[0]]
|
||||
|
||||
limit = min(num_uni_p, candidate_num)
|
||||
if len(univ_list) > limit:
|
||||
result = univ_list[:limit] + other_list
|
||||
|
||||
best.append(best_j)
|
||||
|
||||
# Step 4: Construct final dictionary **with highest confidence values**
|
||||
pairs_dict = {p[1]: p[2] for group in pairs_list for p in group}
|
||||
|
||||
# Select the best confidence score for each organization
|
||||
result_dict = {}
|
||||
pairs_dict = {}
|
||||
|
||||
|
||||
for l in pairs_list:
|
||||
pairs_dict[l[1]] = l[2]
|
||||
|
||||
|
||||
for p in result:
|
||||
result_dict[p[0]] = pairs_dict[p[0]]
|
||||
|
||||
|
||||
result_dict_list = [[y[0], result_dict[y[0]]] for y in result]
|
||||
|
||||
return result_dict_list
|
||||
for res in result:
|
||||
org = res[0]
|
||||
similarity_score = res[1]
|
||||
if org in pairs_dict:
|
||||
best_confidence = pairs_dict[org] # Original confidence score from pairs_list
|
||||
if org not in result_dict or similarity_score > result_dict[org][1]:
|
||||
result_dict[org] = [best_confidence, similarity_score]
|
||||
|
||||
# Convert to list format
|
||||
final_result = [[key, value[0]] for key, value in sorted(result_dict.items(), key=lambda x: x[1][1], reverse=True)]
|
||||
|
||||
# print("RESULT TO USE: ", final_result)
|
||||
return final_result
|
||||
|
||||
|
||||
|
||||
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG):
|
||||
|
||||
def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG, limit):
|
||||
|
||||
"""
|
||||
Matches affiliations in DataFrame 'DF' with names from dictionary 'dix_org' and their ROR_ids based on similarity scores.
|
||||
|
|
@ -130,190 +232,207 @@ def Aff_Ids(input, dix_org, dix_mult, dix_city_ror, dix_country_ror, simU, simG)
|
|||
Returns:
|
||||
DataFrame: The final DataFrame with matched affiliations and their corresponding similarity scores.
|
||||
"""
|
||||
df_list = input[1]
|
||||
light_aff = input[0]
|
||||
clean_aff = input[0]
|
||||
# print('CLEAN_AFF (LVL1): ', clean_aff)
|
||||
light_aff = input[1].replace(' gmbh', ' ').strip()
|
||||
# print('LIGHT_AFF (LVL2): ', light_aff)
|
||||
|
||||
df_list = input[2]
|
||||
|
||||
countries_list = input[3]
|
||||
# print('COUNTRIES_LIST: ', countries_list)
|
||||
vectorizer = CountVectorizer()
|
||||
|
||||
lnamelist = list(dix_org.keys())
|
||||
dix = {} # will store indeces and legalnames of organizations of the DOI { i : [legalname1, legalname2,...]}
|
||||
#pairs = []
|
||||
result = {}
|
||||
pairs = []
|
||||
|
||||
|
||||
def get_keywords(filtered_list):
|
||||
# Extract the "keywords" values from the dictionaries in filtered_list
|
||||
keywords_list = [entry["keywords"] for entry in filtered_list]
|
||||
keywords = [entry["keywords"].replace(' gmbh', ' ').strip() for entry in df_list]
|
||||
|
||||
candidates = get_candidates(countries_list)
|
||||
|
||||
# print('KEYWORDS: ', keywords)
|
||||
if len(keywords) > 1 or len(keywords) == 1 and len(keywords[0])>1:
|
||||
for k,s in enumerate(keywords):
|
||||
pairs_k = []
|
||||
# print('try', s)
|
||||
try:
|
||||
pairs_k.append((s,s,1,dix_org[s],dix_id_country[dix_org[s]]))
|
||||
# print('LUCKY')
|
||||
|
||||
# pairs.append((s,s,similarity,dix_org[s], dix_id_country[dix_org[s]]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [s]
|
||||
else:
|
||||
dix[k].append(s)
|
||||
|
||||
except:
|
||||
# print('NOT LUCKY')
|
||||
pairs_k = find_candidate(s, k , dix, simU, simG, candidates, limit)
|
||||
# print('PAIRS K: ', pairs_k)
|
||||
|
||||
return keywords_list
|
||||
keywords = get_keywords(df_list)
|
||||
result[k] = pairs_k
|
||||
if len(pairs_k)>0:
|
||||
# print('PAIRS K>0: ', pairs_k)
|
||||
|
||||
|
||||
for k,s in enumerate(keywords):
|
||||
similar_k = []
|
||||
pairs_k = []
|
||||
|
||||
if s in lnamelist:
|
||||
similarity = 1
|
||||
similar_k.append(similarity)
|
||||
pairs.append(pairs_k)
|
||||
|
||||
pairs_k.append((s,s,similarity,dix_org[s]))
|
||||
pairs.append((s,s,similarity,dix_org[s]))
|
||||
# print('PAIRS: ', pairs)
|
||||
multi = index_multiple_matchings(pairs)
|
||||
# print('MULTIL ',multi)
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [s]
|
||||
else:
|
||||
dix[k].append(s)
|
||||
else:
|
||||
|
||||
for x in lnamelist:
|
||||
if is_contained(s, x):
|
||||
|
||||
x_vector = vectorizer.fit_transform([x]).toarray()
|
||||
s_vector = vectorizer.transform([s]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(x_vector, s_vector)[0][0]
|
||||
if similarity > min(simU, simG):
|
||||
if (is_contained('univ', s) and is_contained('univ', x)) and similarity > simU:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
elif (not is_contained('univ', s) and not is_contained('univ', x)) and similarity > simG:
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
elif is_contained(x, s):
|
||||
if (is_contained('univ', s) and is_contained('univ', x)):
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
||||
if similarity > simU: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
elif not is_contained('univ', s) and not is_contained('univ', x):
|
||||
|
||||
s_vector = vectorizer.fit_transform([s]).toarray()
|
||||
x_vector = vectorizer.transform([x]).toarray()
|
||||
|
||||
# Compute similarity between the vectors
|
||||
similarity = cosine_similarity(s_vector, x_vector)[0][0]
|
||||
if similarity > simG: #max(0.82,sim):
|
||||
similar_k.append(similarity)
|
||||
pairs_k.append((s,x,similarity,dix_org[x]))
|
||||
pairs.append((s,x,similarity,dix_org[x]))
|
||||
|
||||
if k not in dix:
|
||||
dix[k] = [x]
|
||||
else:
|
||||
dix[k].append(x)
|
||||
|
||||
result[k] = pairs_k
|
||||
|
||||
multi = index_multiple_matchings(list(set(pairs)))
|
||||
# need_check = list(set([i for i in range(len(multi)) if list(multi.values())[i]>1]))
|
||||
# print('here', multi)
|
||||
# need_check_keys = [keywords[i] for i in range(len(keywords)) if multi[keywords[i]]>1]
|
||||
need_check_keys = []
|
||||
for i in range(len(keywords)):
|
||||
ready_keys = []
|
||||
ready_best = []
|
||||
for keyword in multi:
|
||||
try:
|
||||
if multi[keywords[i]]>1:
|
||||
need_check_keys.append(keywords[i])
|
||||
if multi[keyword]>1:
|
||||
need_check_keys.append(keyword)
|
||||
else:
|
||||
for p in pairs:
|
||||
if keyword in p[0]:
|
||||
if p[0][1] not in ready_keys:
|
||||
ready_keys.append(p[0][1])
|
||||
|
||||
ready_best.append([p[0][1], p[0][2]])
|
||||
except:
|
||||
pass
|
||||
# print('READY KEYWORD: ', ready_keys)
|
||||
# print('READY BEST: ', ready_best)
|
||||
|
||||
# print('NEED CHECK KEYWORD: ', need_check_keys)
|
||||
|
||||
pairs_check = [ pair for pair in pairs if pair[0][0] in need_check_keys ]
|
||||
# print('NEED CHECK PAIRS: ', pairs_check)
|
||||
|
||||
|
||||
if len(need_check_keys)>0:
|
||||
best0 = best_sim_score(clean_aff, light_aff, len(keywords), pairs_check, multi, simU, simG)
|
||||
# print('OUTPUT BEST: ', best0)
|
||||
best1 = {x[0]:dix_org[x[0]] for x in best0 }
|
||||
best01 = unique_subset(best0, best1)
|
||||
matched_org = list(set([x[0] for x in best01])) + ready_keys
|
||||
best = best01 + ready_best
|
||||
|
||||
|
||||
|
||||
# print('NEW BEST',best01)
|
||||
else:
|
||||
best = ready_best
|
||||
matched_org = ready_keys
|
||||
|
||||
best = best_sim_score(light_aff, len(keywords), pairs, multi, simU, simG)
|
||||
matched_org = [x[0] for x in best]
|
||||
# best_o = []
|
||||
# best_s = []
|
||||
# best_result = []
|
||||
# for x in best:
|
||||
# best_o.append([x[i][0] for i in range(len(x))])
|
||||
# best_s.append([round(x[i][1],2) for i in range(len(x))])
|
||||
# num_mathced = [len(best_s[i]) for i in range(len(need_check))]
|
||||
ids = [dix_org[x[0]] for x in best]
|
||||
for i,x in enumerate(matched_org):
|
||||
# id_list = []
|
||||
if dix_mult[x] != 'unique':
|
||||
if x in list(dix_city_ror.keys()):
|
||||
match_found0 = False
|
||||
|
||||
# print('FINAL BEST: ', best)
|
||||
## print('MATCHED: ', matched_org)
|
||||
|
||||
id_list = []
|
||||
|
||||
for org_list in best:
|
||||
org = org_list[0]
|
||||
conf = org_list[1]
|
||||
if dix_mult[org] == 'unique':
|
||||
# print('unique:', org)
|
||||
if 'institu' in org and 'univ' in org:
|
||||
#print('both inst and univ', clean_aff)
|
||||
if dix_city_ror[org][0] not in clean_aff and dix_country_ror[org][0] not in clean_aff:
|
||||
#print('pass')
|
||||
pass
|
||||
else:
|
||||
#print('correct')
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
else:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
|
||||
|
||||
else:
|
||||
# print('not unique:', org)
|
||||
if org in dix_city_ror:
|
||||
match_found = False
|
||||
|
||||
for city in dix_city_ror[x]:
|
||||
if city[0] in light_aff:
|
||||
if city[0] not in x:
|
||||
ids[i] = city[1]
|
||||
|
||||
match_found0 = True
|
||||
for city in dix_city_ror[org]:
|
||||
if city[0] in clean_aff:
|
||||
if city[0] not in org:
|
||||
# print('city', city[0], org)
|
||||
id_list.append([org, conf, city[1]])
|
||||
match_found = True
|
||||
break
|
||||
else:
|
||||
if clean_aff.count(city[0]) >1:
|
||||
id_list.append([org, conf, city[1]])
|
||||
match_found = True
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
for city in dix_city_ror[x]:
|
||||
if city[0] in light_aff and city[0] not in x:
|
||||
ids[i] = city[1]
|
||||
match_found0 = True
|
||||
print('ok')
|
||||
for city in dix_city_ror[org]:
|
||||
if city[0] in clean_aff and city[0] not in org:
|
||||
id_list.append([org, conf, city[1]])
|
||||
break
|
||||
|
||||
if not match_found:
|
||||
match_found2 = False
|
||||
match_found3 = False
|
||||
|
||||
for country in dix_country_ror[x]:
|
||||
if country[0] == 'united states' and (country[0] in light_aff or 'usa' in light_aff):
|
||||
ids[i] = country[1]
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
for country in dix_country_ror[org]:
|
||||
# print('country', country[0], org)
|
||||
|
||||
if country[0] == 'united kingdom' and (country[0] in light_aff or 'uk' in light_aff):
|
||||
ids[i] = country[1]
|
||||
tokens = set(clean_aff.lower().split())
|
||||
text = clean_aff.lower()
|
||||
|
||||
if country[0] == 'united states' and (
|
||||
'united states' in text
|
||||
or {'usa', 'usa.'} & tokens
|
||||
or 'u.s.a.' in text
|
||||
):
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
elif country[0] in light_aff:
|
||||
|
||||
if country[0] == 'united kingdom' and (
|
||||
'united kingdom' in text
|
||||
or {'uk', 'uk.'} & tokens
|
||||
or 'u.k.' in text
|
||||
):
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
# print('check country', clean_aff)
|
||||
# if country[0] == 'united states' and (country[0] in clean_aff or 'usa' in clean_aff.split() or 'usa.' in clean_aff.split() or 'u.s.a.' in clean_aff):
|
||||
# id_list.append([org, conf, country[1]])
|
||||
# match_found2 = True
|
||||
# match_found3 = True
|
||||
# break
|
||||
|
||||
# if country[0] == 'united kingdom' and (country[0] in clean_aff or 'uk' in clean_aff.split() or 'u.k.' in clean_aff):
|
||||
# id_list.append([org, conf, country[1]])
|
||||
# match_found2 = True
|
||||
# match_found3 = True
|
||||
# break
|
||||
|
||||
if country[0] not in x:
|
||||
ids[i] = country[1]
|
||||
elif country[0].split()[0] in clean_aff:
|
||||
|
||||
if country[0] not in org:
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
match_found3 = True
|
||||
break
|
||||
|
||||
if not match_found3:
|
||||
for country in dix_country_ror[x]:
|
||||
if country[0] in light_aff and country[0] in x:
|
||||
ids[i] = country[1]
|
||||
for country in dix_country_ror[org]:
|
||||
if country[0] in clean_aff and country[0] in org:
|
||||
id_list.append([org, conf, country[1]])
|
||||
match_found2 = True
|
||||
break
|
||||
|
||||
|
||||
|
||||
|
||||
if not match_found2:
|
||||
for sp in specific:
|
||||
if sp in org:
|
||||
id_list.append([org, conf, dix_org[org]])
|
||||
|
||||
results = [[x[0],x[1], ids[i]] for i,x in enumerate(best)]
|
||||
|
||||
return results #[[result[to_check[i]] for i in ready] + [to_check[2]], best[0]]
|
||||
# print("RESULT: ", id_list)
|
||||
id_list_final = keep_highest_url(id_list)
|
||||
|
||||
return id_list_final
|
||||
|
|
@ -0,0 +1,80 @@
|
|||
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
|
||||
from affro_cluster import *
|
||||
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set
|
||||
import sys
|
||||
from schemas import *
|
||||
|
||||
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
|
||||
|
||||
folder_path = sys.argv[1]
|
||||
hdfs_output_path = sys.argv[2]
|
||||
|
||||
#Version of affro application on a single raw_aff_string and returns just the Matchins set
|
||||
def oalex_affro(aff_string):
|
||||
try:
|
||||
matchings = affro(aff_string)
|
||||
# Ensure matchings is a list, even if affro returns a single dict
|
||||
if not isinstance(matchings, list):
|
||||
matchings = [matchings]
|
||||
|
||||
# Create the result as a tuple that matches matchings_schema
|
||||
result = []
|
||||
for matching in matchings:
|
||||
# Assuming 'matching' is a dictionary that contains 'Provenance', 'PID', 'Value', 'Confidence', 'Status'
|
||||
result.append((
|
||||
matching.get("Provenance", None),
|
||||
matching.get("PID", None),
|
||||
matching.get("Value", None),
|
||||
float(matching.get("Confidence", None)),
|
||||
matching.get("Status", None)
|
||||
))
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
print(f"Error processing affiliation string {aff_string}: {str(e)}")
|
||||
return ()
|
||||
|
||||
oalex_affro_udf = udf(oalex_affro, matching_array_schema)
|
||||
|
||||
exploded = spark.read.schema(ddl_schema_aff).json(folder_path) \
|
||||
.filter(col("doi").isNotNull()) \
|
||||
.select(
|
||||
col("doi"),
|
||||
explode("authorships").alias("authors")
|
||||
) \
|
||||
.select(
|
||||
col("doi"),
|
||||
col("authors.raw_affiliation_strings").alias("raw_aff_string")
|
||||
) \
|
||||
.select(
|
||||
col("doi").alias("DOI"),
|
||||
explode(col("raw_aff_string")).alias("affiliation") #this allows to split all the raw_aff_string and to parallelize better
|
||||
)
|
||||
|
||||
exploded.write.mode("overwrite").json("/tmp/miriam/affroOnOalex/exploded", compression="gzip")
|
||||
|
||||
|
||||
affs = exploded \
|
||||
.select("affiliation") \
|
||||
.distinct() \
|
||||
.withColumn("Matchings", oalex_affro_udf(col("affiliation")))
|
||||
|
||||
affs.write.mode("overwrite").json("/tmp/miriam/affroOnOalex/matchings", compression="gzip")
|
||||
|
||||
|
||||
affs.join(exploded, on="affiliation") \
|
||||
.select(col("DOI"),
|
||||
explode(col("Matchings")).alias("match")
|
||||
) \
|
||||
.groupBy("DOI") \
|
||||
.agg(
|
||||
collect_set("match").alias("Matchings") #each exploded match is collected again
|
||||
) \
|
||||
.write \
|
||||
.mode("overwrite") \
|
||||
.option("compression","gzip") \
|
||||
.json(hdfs_output_path)
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -0,0 +1,151 @@
|
|||
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
|
||||
import sys
|
||||
from affro_cluster import *
|
||||
from schemas import *
|
||||
|
||||
|
||||
spark = SparkSession.builder.appName("AffRo - Authors model").getOrCreate()
|
||||
|
||||
|
||||
input_file = sys.argv[1]
|
||||
author_file = sys.argv[2]
|
||||
|
||||
matching_array_schema = ArrayType(
|
||||
StructType([
|
||||
StructField("Provenance", StringType(), nullable=False),
|
||||
StructField("PID", StringType(), nullable=False),
|
||||
StructField("Value", StringType(), nullable=False),
|
||||
StructField("Confidence", DoubleType(), nullable=False),
|
||||
StructField("Status", StringType(), nullable=False)
|
||||
])
|
||||
)
|
||||
|
||||
affro_udf = udf(matchings_affro, matching_array_schema)
|
||||
|
||||
|
||||
exploded = spark.read.json(input_file) \
|
||||
.withColumn("exploded", explode(col("instance"))) \
|
||||
.withColumn("altId", explode(col("exploded.alternateIdentifier"))) \
|
||||
.filter("altId.qualifier.classid == 'doi'") \
|
||||
.select(
|
||||
col("altId.value").alias("id"), # Extract desired value
|
||||
explode(col("author")).alias("author") # Keep exploding author as before
|
||||
) \
|
||||
.filter(col("id").isNotNull()) # Remove rows where 'id' is NULL
|
||||
|
||||
# Explode the "author.rawAffiliationString" column into separate rows
|
||||
affiliations_exploded = exploded.withColumn(
|
||||
"Affiliation",
|
||||
explode(col("author.rawAffiliationString"))
|
||||
)
|
||||
|
||||
matching_df = (
|
||||
affiliations_exploded
|
||||
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
|
||||
.select("Affiliation") # Select only the "Affiliation" column
|
||||
.distinct() # Remove duplicate rows based on "Affiliation"
|
||||
.select(
|
||||
col("Affiliation"),
|
||||
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
|
||||
)
|
||||
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
|
||||
)
|
||||
|
||||
|
||||
#x id id, author, rawaffiliationstring, matchings
|
||||
def toAuthorModel(x):
|
||||
name = {}
|
||||
name['Full'] = x['author']['fullname']
|
||||
name['First'] = x['author']['name']
|
||||
name['Last'] = x['author']['surname']
|
||||
orcid = None
|
||||
if 'pid' in x['author'] :
|
||||
try:
|
||||
for p in x['author']['pid']:
|
||||
if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
|
||||
orcid = p['value']
|
||||
except:
|
||||
pass
|
||||
name['orcid'] = orcid
|
||||
ret = {'key': x['id']+x['author']['fullname'],'id' :x['id'], 'author':name, 'Raw_affiliation' : x['Affiliation'], 'Matchings':getMatchings(x['Matchings'])}
|
||||
return ret
|
||||
|
||||
def getMatchings(matches):
|
||||
matchings = []
|
||||
for m in matches:
|
||||
matchings.append({"Provenance":m['Provenance'], "PID":m["PID"], "Value":m["Value"], "Confidence":m["Confidence"],"Status":m["Status"]})
|
||||
return matchings
|
||||
|
||||
def regroupAndSelectDistinctMatch(x):
|
||||
ret = []
|
||||
dic = {}
|
||||
for m in x:
|
||||
for e in m:
|
||||
if e['Status'] == 'active':
|
||||
if not e['Value'] in dic:
|
||||
dic[e['Value']] = 0
|
||||
if dic[e['Value']] < float(e['Confidence']):
|
||||
dic[e['Value']] = float(e['Confidence'])
|
||||
for e in dic:
|
||||
ret.append({"Provenance":"AffRo", "PID":"ROR", "Value":e,"Confidence":dic[e], "Status":"active"})
|
||||
|
||||
return ret
|
||||
|
||||
def aggregateAuthor(group):
|
||||
affiliations = []
|
||||
matchings = []
|
||||
for e in group:
|
||||
affiliations.append(e['Raw_affiliation'])
|
||||
matchings.append(e['Matchings'])
|
||||
return {"id":e['id'], "author" : e["author"], "Raw_affiliation" : affiliations, "Matchings":regroupAndSelectDistinctMatch(matchings)}
|
||||
|
||||
def aggregateResult(group):
|
||||
authors = []
|
||||
matchings = []
|
||||
for e in group:
|
||||
amatch = getMatchings(e["Matchings"])
|
||||
authors.append({
|
||||
"Name": {"First":e["author"]['First'], "Last":e["author"]["Last"],"Full":e["author"]["Full"],"orcid":e["author"]["orcid"]},
|
||||
"Corresponding": None,
|
||||
"Contributor_roles": None,
|
||||
"Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
|
||||
"Matchings": amatch
|
||||
})
|
||||
matchings.append(amatch)
|
||||
ret = {"id": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
|
||||
return ret
|
||||
|
||||
|
||||
|
||||
applyMatch_udf = udf(toAuthorModel, match_author_schema)
|
||||
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
|
||||
aggregateResult_udf = udf(aggregateResult, author_schema)
|
||||
|
||||
extend = (
|
||||
matching_df
|
||||
.join(affiliations_exploded, on="Affiliation")
|
||||
.filter(col("Matchings").isNotNull())
|
||||
)
|
||||
|
||||
applyMatchDf = extend \
|
||||
.withColumn("application", applyMatch_udf(struct("*"))) \
|
||||
.select("application.*")
|
||||
|
||||
|
||||
|
||||
groupedAuthorDf = applyMatchDf \
|
||||
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("aggresult", aggregate_udf("group")) \
|
||||
.select("aggresult.*")
|
||||
|
||||
|
||||
groupedResultDf = groupedAuthorDf \
|
||||
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("result", aggregateResult_udf("group")) \
|
||||
.select("result.*") \
|
||||
.withColumnRenamed("id", "DOI")
|
||||
|
||||
|
||||
groupedResultDf.write.mode("overwrite").json(author_file, compression="gzip")
|
||||
|
|
@ -0,0 +1,81 @@
|
|||
from pyspark.sql.types import StringType, ArrayType, StructType, StructField, DoubleType
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size
|
||||
|
||||
from affro_cluster import *
|
||||
|
||||
spark = SparkSession.builder.appName("AffRo - Relations").getOrCreate()
|
||||
|
||||
|
||||
input_path = sys.argv[1]
|
||||
maching_file = sys.argv[2]
|
||||
|
||||
|
||||
try:
|
||||
matching_df = spark.read.json(maching_file)
|
||||
affro_dict = dict(matching_df.rdd.map(lambda row: (row[0], row[1])).collect())
|
||||
|
||||
except:
|
||||
schema = StructType([
|
||||
StructField("Affiliation", StringType(), True),
|
||||
StructField("Matchings", StringType(), True)
|
||||
])
|
||||
|
||||
matching_df = spark.createDataFrame([], schema)
|
||||
affro_dict = {}
|
||||
|
||||
|
||||
|
||||
|
||||
matchings_schema = ArrayType(
|
||||
StructType([
|
||||
StructField("Provenance", StringType(), nullable=False),
|
||||
StructField("PID", StringType(), nullable=False),
|
||||
StructField("Value", StringType(), nullable=False),
|
||||
StructField("Confidence", DoubleType(), nullable=False),
|
||||
StructField("Status", StringType(), nullable=False)
|
||||
])
|
||||
)
|
||||
|
||||
|
||||
affro_udf = udf(matchings_affro, matchings_schema)
|
||||
|
||||
|
||||
exploded = spark.read.json(input_path) \
|
||||
.filter(col("id").isNotNull()) \
|
||||
.select(
|
||||
col("id").alias("ID"),
|
||||
explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
|
||||
)
|
||||
|
||||
# Explode the "author.rawAffiliationString" column into separate rows
|
||||
affiliations_exploded = exploded.withColumn(
|
||||
"Affiliation",
|
||||
explode(col("author.rawAffiliationString"))
|
||||
)
|
||||
|
||||
extend = (
|
||||
matching_df
|
||||
.join(affiliations_exploded, on="Affiliation", how="right")
|
||||
.filter(col("Matchings").isNull())
|
||||
)
|
||||
|
||||
result = (
|
||||
extend
|
||||
.filter(col("Affiliation").isNotNull()) # Keep rows with valid "Affiliation"
|
||||
.select("Affiliation") # Select only the "Affiliation" column
|
||||
.distinct() # Remove duplicate rows based on "Affiliation"
|
||||
.select(
|
||||
col("Affiliation"),
|
||||
affro_udf(col("Affiliation")).alias("Matchings") # Apply UDF to "Affiliation"
|
||||
)
|
||||
.filter(col("Matchings").isNotNull()) # Exclude rows with null "Matchings"
|
||||
)
|
||||
|
||||
|
||||
result.write \
|
||||
.format("json") \
|
||||
.option("compression", "gzip") \
|
||||
.mode("append") \
|
||||
.save(maching_file)
|
||||
|
||||
|
|
@ -0,0 +1,117 @@
|
|||
from schemas import *
|
||||
from pyspark.sql.types import *
|
||||
from pyspark.sql import SparkSession
|
||||
from pyspark.sql.functions import col, explode, first, collect_list, udf, collect_set, size, from_json, struct
|
||||
import sys
|
||||
|
||||
|
||||
#x id id, author, rawaffiliationstring, matchings
|
||||
def toAuthorModel(x):
|
||||
name = {}
|
||||
name['Full'] = x['author']['fullname']
|
||||
name['First'] = x['author']['name']
|
||||
name['Last'] = x['author']['surname']
|
||||
orcid = None
|
||||
if 'pid' in x['author'] :
|
||||
for p in x['author']['pid']:
|
||||
if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
|
||||
orcid = p['value']
|
||||
name['orcid'] = orcid
|
||||
ret = {'key': x['id']+x['author']['fullname'],'id' :x['id'], 'author':name, 'Raw_affiliation' : x['Affiliation'], 'Matchings':getMatchings(x['Matchings'])}
|
||||
return ret
|
||||
|
||||
def getMatchings(matches):
|
||||
matchings = []
|
||||
for m in matches:
|
||||
matchings.append({"Provenance":m['Provenance'], "PID":m["PID"], "Value":m["Value"], "Confidence":m["Confidence"],"Status":m["Status"]})
|
||||
return matchings
|
||||
|
||||
def regroupAndSelectDistinctMatch(x):
|
||||
ret = []
|
||||
dic = {}
|
||||
for m in x:
|
||||
for e in m:
|
||||
if e['Status'] == 'active':
|
||||
if not e['Value'] in dic:
|
||||
dic[e['Value']] = 0
|
||||
if dic[e['Value']] < float(e['Confidence']):
|
||||
dic[e['Value']] = float(e['Confidence'])
|
||||
for e in dic:
|
||||
ret.append({"Provenance":"AffRo", "PID":"ROR", "Value":e,"Confidence":dic[e], "Status":"active"})
|
||||
|
||||
return ret
|
||||
|
||||
def aggregateAuthor(group):
|
||||
affiliations = []
|
||||
matchings = []
|
||||
for e in group:
|
||||
affiliations.append(e['Raw_affiliation'])
|
||||
matchings.append(e['Matchings'])
|
||||
return {"id":e['id'], "author" : e["author"], "Raw_affiliation" : affiliations, "Matchings":regroupAndSelectDistinctMatch(matchings)}
|
||||
|
||||
def aggregateResult(group):
|
||||
authors = []
|
||||
matchings = []
|
||||
for e in group:
|
||||
amatch = getMatchings(e["Matchings"])
|
||||
authors.append({
|
||||
"Name": {"First":e["author"]['First'], "Last":e["author"]["Last"],"Full":e["author"]["Full"],"orcid":e["author"]["orcid"]},
|
||||
"Corresponding": None,
|
||||
"Contributor_roles": None,
|
||||
"Raw_affiliations": [aff for aff in e["Raw_affiliation"]],
|
||||
"Matchings": amatch
|
||||
})
|
||||
matchings.append(amatch)
|
||||
ret = {"id": group[0]["id"], "Authors": authors, "Organizations":regroupAndSelectDistinctMatch(matchings)}
|
||||
return ret
|
||||
|
||||
maching_file = sys.argv[2]
|
||||
input_file = sys.argv[1]
|
||||
author_file = sys.argv[3]
|
||||
|
||||
applyMatch_udf = udf(toAuthorModel, match_author_schema)
|
||||
aggregate_udf = udf(aggregateAuthor,match_author_grouped_schema)
|
||||
aggregateResult_udf = udf(aggregateResult, author_schema)
|
||||
|
||||
spark = SparkSession.builder.appName("AffRo-Matchings").getOrCreate()
|
||||
|
||||
matching_df = spark.read.json(maching_file)
|
||||
|
||||
exploded = spark.read.schema(result_schema).json(input_file) \
|
||||
.select(
|
||||
col("id"),
|
||||
explode(col("author")).alias("author") #this allows to split all the raw_aff_string and to parallelize better
|
||||
)
|
||||
|
||||
|
||||
affiliations_exploded = exploded.withColumn(
|
||||
"Affiliation",
|
||||
explode(col("author.rawAffiliationString"))
|
||||
)
|
||||
|
||||
|
||||
extend = (
|
||||
matching_df
|
||||
.join(affiliations_exploded, on="Affiliation")
|
||||
.filter(col("Matchings").isNotNull())
|
||||
)
|
||||
|
||||
applyMatchDf = extend \
|
||||
.withColumn("application", applyMatch_udf(struct("*"))) \
|
||||
.select("application.*")
|
||||
|
||||
|
||||
groupedAuthorDf = applyMatchDf \
|
||||
.groupBy("key").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("aggresult", aggregate_udf("group")) \
|
||||
.select("aggresult.*")
|
||||
|
||||
|
||||
groupedResultDf = groupedAuthorDf \
|
||||
.groupBy("id").agg(collect_list(struct("*")).alias("group")) \
|
||||
.withColumn("result", aggregateResult_udf("group")) \
|
||||
.select("result.*")
|
||||
|
||||
groupedResultDf.write.mode("overwrite").json(author_file, compression="gzip")
|
||||
|
||||
|
||||
|
|
@ -0,0 +1,147 @@
|
|||
from pyspark.sql.types import (
|
||||
StringType, ArrayType, StructType, StructField, FloatType, IntegerType, MapType, BooleanType, DoubleType
|
||||
)
|
||||
|
||||
# Schema per il nome dell'autore
|
||||
author_info_schema = StructType([
|
||||
StructField("Full", StringType(), True),
|
||||
StructField("First", StringType(), True),
|
||||
StructField("Last", StringType(), True),
|
||||
StructField("orcid", StringType(), True)
|
||||
])
|
||||
|
||||
# Schema per un matching
|
||||
matching_schema = StructType([
|
||||
StructField("Provenance", StringType(), True),
|
||||
StructField("PID", StringType(), True),
|
||||
StructField("Value", StringType(), True),
|
||||
StructField("Confidence", DoubleType(), True),
|
||||
StructField("Status", StringType(), True)
|
||||
])
|
||||
|
||||
matching_array_schema = ArrayType(matching_schema)
|
||||
|
||||
affiliation_schema = StructType([
|
||||
StructField("Affiliation", StringType(), nullable=False),
|
||||
StructField("Matchings", matching_array_schema, True)
|
||||
])
|
||||
|
||||
creator_schema = StructType([
|
||||
StructField("name", StringType(), True),
|
||||
StructField("givenName", StringType(), True),
|
||||
StructField("familyName", StringType(), True),
|
||||
StructField("nameType", StringType(), True),
|
||||
StructField("affiliation", ArrayType(StringType()), True),
|
||||
StructField("nameIdentifiers", ArrayType(StringType()), True)
|
||||
])
|
||||
|
||||
json_schema = StructType([
|
||||
StructField("doi", StringType(), True),
|
||||
StructField("attributes", StructType([
|
||||
StructField("doi", StringType(), True),
|
||||
StructField("identifiers", ArrayType(StringType()), True),
|
||||
StructField("creators", ArrayType(creator_schema), True),
|
||||
]), True)
|
||||
])
|
||||
|
||||
graph_author_schema = StructType([
|
||||
StructField("fullname", StringType(), True),
|
||||
StructField("rawAffiliationString", ArrayType(StringType()), True)])
|
||||
|
||||
graph_entry_schema = StructType([
|
||||
StructField("id", StringType(), True),
|
||||
StructField("author", ArrayType(graph_author_schema), True)])
|
||||
|
||||
# Schema per un autore con affiliazione e matching
|
||||
match_author_schema = StructType([
|
||||
StructField("id", StringType(), True),
|
||||
StructField("key", StringType(), True),
|
||||
StructField("author", author_info_schema, True),
|
||||
StructField("Raw_affiliation", StringType(), True),
|
||||
StructField("Matchings", ArrayType(matching_schema), True)
|
||||
])
|
||||
|
||||
# Schema per la versione aggregata degli autori
|
||||
match_author_grouped_schema = StructType([
|
||||
StructField("id", StringType(), True),
|
||||
StructField("author", author_info_schema, True),
|
||||
StructField("Raw_affiliation", ArrayType(StringType()), True),
|
||||
StructField("Matchings", ArrayType(matching_schema), True)
|
||||
])
|
||||
|
||||
# Schema per un'organizzazione
|
||||
organization_schema = matching_array_schema
|
||||
|
||||
# Schema per un autore con informazioni dettagliate
|
||||
detailed_author_schema = StructType([
|
||||
StructField("Name", author_info_schema, True),
|
||||
StructField("Corresponding", BooleanType(), True),
|
||||
StructField("Contributor_roles", ArrayType(MapType(StringType(), StringType())), True),
|
||||
StructField("Raw_affiliations", ArrayType(StringType()), True),
|
||||
StructField("Matchings", ArrayType(matching_schema), True)
|
||||
])
|
||||
|
||||
# Schema per il set di autori e organizzazioni
|
||||
author_schema = StructType([
|
||||
StructField("id", StringType(), True),
|
||||
StructField("Authors", ArrayType(detailed_author_schema), True),
|
||||
StructField("Organizations", organization_schema, True)
|
||||
])
|
||||
|
||||
# Schema per un identificatore con qualificatori
|
||||
qualifier_schema = StructType([
|
||||
StructField("classid", StringType(), True),
|
||||
StructField("classname", StringType(), True),
|
||||
StructField("schemeid", StringType(), True),
|
||||
StructField("schemename", StringType(), True)
|
||||
])
|
||||
|
||||
pid_schema = StructType([
|
||||
StructField("key", StringType(), True),
|
||||
StructField("qualifier", qualifier_schema, True),
|
||||
StructField("value", StringType(), True)
|
||||
])
|
||||
|
||||
# Schema per gli autori nel grafo
|
||||
graph_author_schema = StructType([
|
||||
StructField("fullname", StringType(), True),
|
||||
StructField("name", StringType(), True),
|
||||
StructField("surname", StringType(), True),
|
||||
StructField("rank", IntegerType(), True),
|
||||
StructField("pid", ArrayType(pid_schema), True),
|
||||
StructField("rawAffiliationString", ArrayType(StringType()), True)
|
||||
])
|
||||
|
||||
# Schema per il dataset finale
|
||||
result_schema = StructType([
|
||||
StructField("id", StringType(), False),
|
||||
StructField("author", ArrayType(graph_author_schema), True)
|
||||
])
|
||||
|
||||
affiliation_exploded_schema = StructType([
|
||||
StructField("id", StringType(), True),
|
||||
StructField("author", creator_schema, True),
|
||||
StructField("Affiliation", StringType(), True)
|
||||
|
||||
])
|
||||
|
||||
ddl_schema_aff = StructType([
|
||||
StructField("id", StringType(), True),
|
||||
StructField("doi", StringType(), True),
|
||||
StructField("publication_year", StringType(), True),
|
||||
StructField("authorships", ArrayType(
|
||||
StructType([
|
||||
StructField("institutions", ArrayType(
|
||||
StructType([
|
||||
StructField("ror", StringType(), True),
|
||||
StructField("country_code", StringType(), True)
|
||||
])
|
||||
), True),
|
||||
StructField("raw_affiliation_strings", ArrayType(StringType()), True)
|
||||
])
|
||||
), True)
|
||||
])
|
||||
|
||||
affiliation_string_schema = StructType([
|
||||
StructField("raw_affiliation_string", StringType(), nullable=False)
|
||||
])
|
||||
Binary file not shown.
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
12294
txt_files/city_names.txt
12294
txt_files/city_names.txt
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,239 @@
|
|||
trinidad
|
||||
united
|
||||
hong
|
||||
niederland
|
||||
holand
|
||||
deutschland
|
||||
usa
|
||||
uk
|
||||
korea
|
||||
africa
|
||||
turkey
|
||||
afghanistan
|
||||
aland
|
||||
albania
|
||||
algeria
|
||||
american samoa
|
||||
andora
|
||||
angola
|
||||
antarctica
|
||||
antigua barbuda
|
||||
argentina
|
||||
armenia
|
||||
aruba
|
||||
australia
|
||||
austria
|
||||
azerbaijan
|
||||
bahamas
|
||||
bahrain
|
||||
bangladesh
|
||||
barbados
|
||||
belarus
|
||||
belgium
|
||||
belize
|
||||
benin
|
||||
bermuda
|
||||
bhutan
|
||||
bolivia
|
||||
bonaire sint eustatius saba
|
||||
bosnia herzegovina
|
||||
botswana
|
||||
brazil
|
||||
brunei
|
||||
bulgaria
|
||||
burkina faso
|
||||
burundi
|
||||
cabo verde
|
||||
cambodia
|
||||
cameroon
|
||||
canada
|
||||
cayman islands
|
||||
central african republic
|
||||
chad
|
||||
chile
|
||||
china
|
||||
colombia
|
||||
comoros
|
||||
congo republic
|
||||
costa rica
|
||||
croatia
|
||||
cuba
|
||||
curacao
|
||||
cyprus
|
||||
czechia
|
||||
denmark
|
||||
djibouti
|
||||
dominica
|
||||
dominican republic
|
||||
dr congo
|
||||
ecuador
|
||||
egypt
|
||||
el salvador
|
||||
equatorial guinea
|
||||
eritrea
|
||||
estonia
|
||||
eswatini
|
||||
ethiopia
|
||||
falkland islands
|
||||
faroe islands
|
||||
fiji
|
||||
finland
|
||||
france
|
||||
french guiana
|
||||
french polynesia
|
||||
gabon
|
||||
gambia
|
||||
georgia
|
||||
germany
|
||||
ghana
|
||||
gibraltar
|
||||
greece
|
||||
greenland
|
||||
grenada
|
||||
guadeloupe
|
||||
guam
|
||||
guatemala
|
||||
guinea
|
||||
guinea bisau
|
||||
guyana
|
||||
haiti
|
||||
honduras
|
||||
hong kong
|
||||
hungary
|
||||
iceland
|
||||
india
|
||||
indonesia
|
||||
iran
|
||||
iraq
|
||||
ireland
|
||||
isle man
|
||||
israel
|
||||
italy
|
||||
ivory coast
|
||||
jamaica
|
||||
japan
|
||||
jersey
|
||||
jordan
|
||||
kazakhstan
|
||||
kenya
|
||||
kiribati
|
||||
kosovo
|
||||
kuwait
|
||||
kyrgyzstan
|
||||
laos
|
||||
latvia
|
||||
lebanon
|
||||
lesotho
|
||||
liberia
|
||||
libya
|
||||
liechtenstein
|
||||
lithuania
|
||||
luxembourg
|
||||
macao
|
||||
madagascar
|
||||
malawi
|
||||
malaysia
|
||||
maldives
|
||||
mali
|
||||
malta
|
||||
martinique
|
||||
mauritania
|
||||
mauritius
|
||||
mayote
|
||||
mexico
|
||||
micronesia
|
||||
moldova
|
||||
monaco
|
||||
mongolia
|
||||
montenegro
|
||||
montserat
|
||||
moroco
|
||||
mozambique
|
||||
myanmar
|
||||
namibia
|
||||
nepal
|
||||
netherlands
|
||||
new caledonia
|
||||
new zealand
|
||||
nicaragua
|
||||
niger
|
||||
nigeria
|
||||
niue
|
||||
north korea
|
||||
north macedonia
|
||||
northern mariana islands
|
||||
norway
|
||||
oman
|
||||
pakistan
|
||||
palau
|
||||
palestine
|
||||
panama
|
||||
papua new guinea
|
||||
paraguay
|
||||
peru
|
||||
philipines
|
||||
poland
|
||||
portugal
|
||||
puerto rico
|
||||
qatar
|
||||
reunion
|
||||
romania
|
||||
rusia
|
||||
rwanda
|
||||
samoa
|
||||
san marino
|
||||
sao tome principe
|
||||
saudi arabia
|
||||
senegal
|
||||
serbia
|
||||
seycheles
|
||||
siera leone
|
||||
singapore
|
||||
sint maarten
|
||||
slovakia
|
||||
slovenia
|
||||
solomon islands
|
||||
somalia
|
||||
south africa
|
||||
south korea
|
||||
south sudan
|
||||
spain
|
||||
sri lanka
|
||||
st kits nevis
|
||||
st lucia
|
||||
st vincent grenadines
|
||||
sudan
|
||||
suriname
|
||||
svalbard jan mayen
|
||||
sweden
|
||||
switzerland
|
||||
syria
|
||||
taiwan
|
||||
tajikistan
|
||||
tanzania
|
||||
thailand
|
||||
timor leste
|
||||
togo
|
||||
tonga
|
||||
trinidad tobago
|
||||
tunisia
|
||||
turkiye
|
||||
turkmenistan
|
||||
turks caicos islands
|
||||
tuvalu
|
||||
uganda
|
||||
ukraine
|
||||
united arab emirates
|
||||
united kingdom
|
||||
united states
|
||||
uruguay
|
||||
us virgin islands
|
||||
uzbekistan
|
||||
vanuatu
|
||||
vatican city
|
||||
venezuela
|
||||
vietnam
|
||||
western sahara
|
||||
yemen
|
||||
zambia
|
||||
zimbabwe
|
||||
|
|
@ -1,28 +1,36 @@
|
|||
universi
|
||||
colege street
|
||||
research institu
|
||||
laboratory
|
||||
labora
|
||||
gmbh
|
||||
inc
|
||||
universi of
|
||||
ltd
|
||||
research center
|
||||
foundation
|
||||
faculty
|
||||
national institu
|
||||
school medicine
|
||||
universi school
|
||||
univer school
|
||||
graduate school
|
||||
graduate school engineering
|
||||
institu tropical medicine
|
||||
institu virology
|
||||
faculty medicine
|
||||
laboratory
|
||||
universi park
|
||||
labora
|
||||
univer park
|
||||
institu science
|
||||
polytechnic universi
|
||||
universi 1
|
||||
ciudad universi
|
||||
universi campus
|
||||
universi hospitals
|
||||
polytechnic univer
|
||||
univer 1
|
||||
ciudad univer
|
||||
univer campus
|
||||
univer hospitals
|
||||
colege
|
||||
universi road
|
||||
universitetska str
|
||||
univer road
|
||||
univer str
|
||||
clinic
|
||||
techn
|
||||
univer
|
||||
institu st
|
||||
po box
|
||||
rijksuniver
|
||||
institu
|
||||
hochschule
|
||||
|
|
@ -5,7 +5,6 @@ at
|
|||
de
|
||||
for
|
||||
et
|
||||
für
|
||||
des
|
||||
in
|
||||
as
|
||||
|
|
@ -14,3 +13,11 @@ and
|
|||
fur
|
||||
for
|
||||
und
|
||||
der
|
||||
aus
|
||||
dem
|
||||
di
|
||||
l
|
||||
street
|
||||
post-box
|
||||
e.v.
|
||||
|
|
@ -5,4 +5,7 @@ universitatskliniken
|
|||
universitetshospital
|
||||
universitatsmedizin
|
||||
universitatsbibliothek
|
||||
universitatspital
|
||||
universitatspital
|
||||
universitetsjukhuset
|
||||
universitatsaugenklinik
|
||||
univesitatsfrauenklinik
|
||||
|
|
@ -1,116 +0,0 @@
|
|||
import json
|
||||
import os
|
||||
from pyspark.sql import SparkSession
|
||||
from affro_cluster import *
|
||||
|
||||
# Directory holding the input affiliation-string dataset.
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
#folder_path = 'check'

# Collect every data file in the folder, skipping Spark's _SUCCESS marker.
json_file_names = [
    file_name
    for file_name in os.listdir(folder_path)
    if file_name != '_SUCCESS'
]

# Initialize the Spark session.
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
|
||||
|
||||
def remove_duplicates(list_of_dicts):
    """Return the dicts in original order with exact duplicates dropped.

    Two dicts count as duplicates only when their items appear in the
    same insertion order (the fingerprint is an ordered tuple of items),
    and all values must be hashable.
    """
    seen = set()
    unique = []
    for entry in list_of_dicts:
        fingerprint = tuple(entry.items())
        if fingerprint in seen:
            continue
        seen.add(fingerprint)
        unique.append(entry)
    return unique
|
||||
|
||||
def update_record(record):
    """Normalise one publication record into {ID, Authors, Organizations}.

    For each author: extracts the name (and an ORCID when it is embedded
    in the fullName or in a raw affiliation string), collects the raw
    affiliation strings, and resolves organization PIDs either from
    explicit ror.org links present in the data or by matching the raw
    string with affro(). Returns None (after logging) when anything in
    the record fails to parse.
    """
    id = record['id']
    authors = []
    try:
        for author in record['authors']:
            author_object = {}
            if 'orcid.org/0' in author['fullName']:
                # fullName is "<orcid-url>, <name>"; split the two parts.
                author_object['Name'] = {'Full': author['fullName'].split(',')[1], 'First': None, 'Last': None}
                author_object['ORCID'] = author['fullName'].split(',')[0][:36]
            else:
                author_object['Name'] = {'Full': author['fullName'], 'First': None, 'Last': None}
                author_object['ORCID'] = None
            author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
            # BUG FIX: make sure the key always exists. Previously an
            # author with an empty affiliation list left it unset, the
            # reorder below raised KeyError, and the whole record was
            # silently dropped (returned as None).
            author_object['Organization_PIDs'] = []
            all_affs_with_ror = []
            have_ror = False
            for affiliation in author['affiliations']:
                raw = affiliation['raw_affiliation_string']
                if 'ORCID: 0' in raw:
                    author_object['ORCID'] = 'https://orcid.org/' + raw.split('ORCID: ')[1]
                elif 'ORCID 0' in raw:
                    author_object['ORCID'] = 'https://orcid.org/' + raw.split('ORCID ')[1]
                if 'ror.org' in raw:
                    # The record itself carries a ROR link: trust it.
                    have_ror = True
                    all_affs_with_ror.append({
                        'Origin': 'data',
                        'RORid': raw[0:25],
                        'Confidence': None
                    })
                else:
                    # Otherwise try to match the raw string with affro
                    # (call it once instead of twice as before).
                    matched = affro(raw)
                    if len(matched) > 0:
                        # NOTE(review): each affiliation overwrites the
                        # previous matches — only the last affiliation's
                        # matches survive; preserved as-is, confirm intent.
                        author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in matched])
                    else:
                        author_object['Organization_PIDs'] = []

            # Explicit ROR links from the data take precedence over affro.
            if have_ror:
                author_object['Organization_PIDs'] = all_affs_with_ror
            order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
            reordered_data = {k: author_object[k] for k in order}
            authors.append(reordered_data)

        organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
        updt = {'ID': id, 'Authors': authors, 'Organizations': organizations}
        return updt
    except Exception as e:
        # Boundary handler: log and drop the record rather than abort the job.
        print(f"Error processing record with id {record.get('id')}: {str(e)}")
        return None
|
||||
|
||||
|
||||
|
||||
# Process each input file independently.
for file in json_file_names:
    print('start processing '+str(file))
    df = spark.read.json(folder_path + '/' + file)

    # Normalise every record, serialise each result back to JSON text,
    # and pull everything to the driver.
    json_data = (
        df.rdd
        .map(lambda row: update_record(row.asDict()))
        .map(lambda record: json.dumps(record))
        .collect()
    )

    # Derive the output name from the input file name.
    output_file_name = file+'_output.json'
    print('end processing '+str(file))

    # One JSON object per line.
    with open(output_file_name, 'w') as f:
        for i, item in enumerate(json_data):
            print('write '+str(i))
            f.write(item + '\n')
|
||||
|
|
@ -0,0 +1,123 @@
|
|||
|
||||
|
||||
# Input x: a row with fields id, author, Affiliation (raw string), Matchings.
def toAuthorModelGraph(x):
    """Map a graph-exploded row to the internal author model.

    Builds {'Full', 'orcid'} from x['author'] (taking the value of the
    last pid whose qualifier classid is 'orcid', if any), keys the row by
    id plus full name when the name is known, and normalises Matchings.
    """
    name = {}
    name['Full'] = x['author']['fullname']
    orcid = None
    if 'pid' in x['author']:
        try:
            for p in x['author']['pid']:
                if 'qualifier' in p and p['qualifier']['classid'] == 'orcid':
                    orcid = p['value']
        except Exception:
            # BUG FIX: was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit. pid may be null or malformed;
            # treat that as "no ORCID".
            pass
    name['orcid'] = orcid
    if name['Full'] is None:
        key = x['id']
    else:
        key = x['id'] + name['Full']

    return {'key': key, 'id': x['id'], 'author': name, 'Raw_affiliation': x['Affiliation'], 'Matchings': getMatchings(x['Matchings'])}
|
||||
|
||||
|
||||
def toAuthorModel(x):
    """Map a creator-style row (name/givenName/familyName) to the author model.

    Derives the full name, falling back to "family, given" (or whichever
    part exists), pulls an ORCID from nameIdentifiers when present, and
    keys the row by id plus full name when the name is known.
    """
    name = {}
    name['Full'] = x['author']['name']
    name['First'] = x['author']['givenName']
    name['Last'] = x['author']['familyName']
    if name['Full'] is None:
        # Reconstruct the full name from its parts.
        if not x['author']['familyName'] is None and not x['author']['givenName'] is None:
            name['Full'] = x['author']['familyName'] + ", " + x['author']['givenName']
        elif not x['author']['familyName'] is None:
            name['Full'] = x['author']['familyName']
        else:
            name['Full'] = x['author']['givenName']
    orcid = None
    # BUG FIX: the guard used to test for 'pid' although the loop reads
    # 'nameIdentifiers'; rows carrying identifiers without a 'pid' key
    # never had their ORCID extracted.
    if 'nameIdentifiers' in x['author']:
        try:
            for p in x['author']['nameIdentifiers']:
                if 'nameIdentifierScheme' in p and p['nameIdentifierScheme'].lower() == 'orcid':
                    orcid = p['nameIdentifier']
        except Exception:
            # Malformed identifier entries: treat as "no ORCID"
            # (narrowed from a bare `except:`).
            pass
    name['orcid'] = orcid
    if name['Full'] is None:
        key = x['id']
    else:
        key = x['id'] + name['Full']

    return {'key': key, 'id': x['id'], 'author': name, 'Raw_affiliation': x['Affiliation'], 'Matchings': getMatchings(x['Matchings'])}
|
||||
|
||||
def getMatchings(matches):
    """Project each match onto the five canonical matching fields.

    Extra keys in the input dicts are discarded; the listed keys must all
    be present.
    """
    wanted = ("Provenance", "PID", "Value", "Confidence", "Status")
    return [{field: m[field] for field in wanted} for m in matches]
|
||||
|
||||
def regroupAndSelectDistinctMatch(x):
    """Flatten lists of matchings and keep one best entry per Value.

    Only 'active' matches are considered; for each distinct Value the
    highest Confidence wins (values are coerced with float, and a first
    sighting registers the Value even at confidence 0). Results are
    re-emitted with AffRo/ROR provenance in first-seen order.
    """
    best = {}
    for matches in x:
        for match in matches:
            if match['Status'] != 'active':
                continue
            # Register the value on first sighting, mirroring the
            # original "default to 0 then raise" behaviour.
            current = best.setdefault(match['Value'], 0)
            confidence = float(match['Confidence'])
            if confidence > current:
                best[match['Value']] = confidence
    return [
        {"Provenance": "AffRo", "PID": "ROR", "Value": value, "Confidence": confidence, "Status": "active"}
        for value, confidence in best.items()
    ]
|
||||
|
||||
def aggregateAuthor(group):
    """Merge the rows of one (id, author) group into a single record.

    Concatenates the raw affiliation strings and reduces all matchings to
    the distinct best-confidence matches. Requires a non-empty group.
    """
    raw_affiliations = []
    collected_matchings = []
    for entry in group:
        raw_affiliations.append(entry['Raw_affiliation'])
        collected_matchings.append(entry['Matchings'])
    # NOTE(review): 'entry' is the last element of the group here; this
    # assumes id/author are constant within a group — confirm upstream.
    return {"id": entry['id'], "author": entry["author"], "Raw_affiliation": raw_affiliations, "Matchings": regroupAndSelectDistinctMatch(collected_matchings)}
|
||||
|
||||
def aggregateResultGraph(group):
    """Aggregate a group of author rows into one graph result record.

    Equivalent to aggregateResult(group, graph=True); the body was a
    byte-for-byte duplicate of that function, so it now delegates to keep
    the two flavours from drifting apart. Kept as a separate entry point
    for callers that map it directly.
    """
    return aggregateResult(group, graph=True)
|
||||
|
||||
|
||||
def aggregateResult(group, graph=False):
    """Aggregate a group of author rows into one result record.

    Collects every author (with its per-author matchings) and the distinct
    organization matches across the whole group. The record identifier is
    labelled "DOI" for the plain flavour and "id" when graph=True.
    Requires a non-empty group.
    """
    authors = []
    collected_matchings = []
    for entry in group:
        entry_matches = getMatchings(entry["Matchings"])
        author = entry["author"]
        authors.append({
            "Name": {
                "First": author['First'],
                "Last": author["Last"],
                "Full": author["Full"],
                "orcid": author["orcid"],
            },
            "Corresponding": None,
            "Contributor_roles": None,
            "Raw_affiliations": list(entry["Raw_affiliation"]),
            "Matchings": entry_matches,
        })
        collected_matchings.append(entry_matches)
    id_label = "id" if graph else "DOI"
    return {id_label: group[0]["id"], "Authors": authors, "Organizations": regroupAndSelectDistinctMatch(collected_matchings)}
|
||||
|
||||
def aggregateResultNoAuthor(group):
    """Aggregate a group's matchings only, emitting an empty author list.

    Requires a non-empty group; the id is taken from its first element.
    """
    collected = [getMatchings(entry["Matchings"]) for entry in group]
    return {"id": group[0]["id"], "Authors": [], "Organizations": regroupAndSelectDistinctMatch(collected)}
|
||||
Loading…
Reference in New Issue