This commit is contained in:
mkallipo 2024-12-01 20:00:49 +01:00
parent 413ec3773e
commit d9dbc679e3
7 changed files with 36 additions and 19 deletions

View File

@ -13,6 +13,7 @@ dix_org_oaire = load_json('dictionaries/dix_acad_oaire.json')
dix_mult_oaire = load_json('dictionaries/dix_mult_oaire.json')
dix_country_oaire = load_json('dictionaries/dix_country_oaire.json')
dix_status = load_json('dictionaries/dix_status.json')
dix_grids = load_json('dictionaries/dix_grids_rors.json')
def find_ror(input, simU, simG):

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -3,7 +3,8 @@ import os
from pyspark.sql import SparkSession
from affro_cluster import *
# Folder whose entries are listed to build json_file_names.
# NOTE(review): folder_path is assigned twice in a row, so only the second
# (machine-local) value is in effect — confirm which location is intended.
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
#folder_path = 'check'
# Accumulates the file names found in folder_path.
json_file_names = []
@ -14,7 +15,6 @@ for file_name in os.listdir(folder_path):
if file_name != '_SUCCESS':
json_file_names.append(file_name)
# json_file_names now contains the name of every entry in folder_path except '_SUCCESS'
# Initialize Spark session
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
@ -40,14 +40,14 @@ def update_record(record):
for author in record['authors']:
author_object = {}
if 'orcid.org/0' in author['fullName']:
author_object['Name'] = {'Full':author['fullName'].split(',')[1], 'First' : None, 'Last' : None}
author_object['Name'] = {'Full':author['fullName'].split(',')[1].strip(), 'First' : 'None', 'Last' : 'None', 'Type' : 'None'}
author_object['ORCID'] = author['fullName'].split(',')[0][:36]
else:
author_object['Name'] = {'Full':author['fullName'], 'First' : None, 'Last' : None}
author_object['ORCID'] = None
author_object['Name'] = {'Full':author['fullName'].strip(), 'First' : 'None', 'Last' : 'None', 'Type' : 'None'}
author_object['ORCID'] = 'None'
author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
all_affs_with_ror = []
have_ror = False
have_id = False
for affiliation in author['affiliations']:
# author_object['Raw_affiliations'] = [x for x in affiliation['raw_affiliation_string']]
if 'ORCID: 0' in affiliation['raw_affiliation_string']:
@ -57,32 +57,48 @@ def update_record(record):
x = affiliation['raw_affiliation_string']
author_object['ORCID'] = 'https://orcid.org/'+x.split('ORCID ')[1]
if 'ror.org' in affiliation['raw_affiliation_string']:
have_ror = True
have_id = True
all_affs_with_ror.append({
'Origin': 'data',
'RORid': affiliation['raw_affiliation_string'][0:25],
'Confidence': None
})
elif 'grid.' in affiliation['raw_affiliation_string']:
have_id = True
for k in dix_grids:
if k in affiliation['raw_affiliation_string'].split(' ')[0]:
try:
all_affs_with_ror.append({
'Provenance': 'Data',
'PID' : 'ROR',
'Value' : dix_grids[k] ,
'Confidence': 1
})
except:
pass
else:
if len(affro(affiliation['raw_affiliation_string']))>0:
author_object['Organization_PIDs'] = affro(affiliation['raw_affiliation_string'])
author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in author_object['Organization_PIDs']])
author_object['Matchings'] = affro(affiliation['raw_affiliation_string'])
try:
author_object['Matchings'] = remove_duplicates([json.loads(x) for x in author_object['Matchings']])
except:
author_object['Matchings'] = remove_duplicates([x for x in author_object['Matchings']])
else:
author_object['Organization_PIDs'] = []
author_object['Matchings'] = []
if have_ror == True:
author_object['Organization_PIDs'] = all_affs_with_ror
order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
if have_id == True:
author_object['Matchings'] = all_affs_with_ror
order = ["Name", "Raw_affiliations", "Matchings", "ORCID"]
reordered_data = {k: author_object[k] for k in order}
authors.append(reordered_data)
organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
organizations = remove_duplicates([x for author in authors for x in author['Matchings']])
updt = {'ID' : id, 'Authors' : authors, 'Organizations' : organizations}
return updt