updates
This commit is contained in:
parent
413ec3773e
commit
d9dbc679e3
Binary file not shown.
Binary file not shown.
|
@ -13,6 +13,7 @@ dix_org_oaire = load_json('dictionaries/dix_acad_oaire.json')
|
|||
dix_mult_oaire = load_json('dictionaries/dix_mult_oaire.json')
|
||||
dix_country_oaire = load_json('dictionaries/dix_country_oaire.json')
|
||||
dix_status = load_json('dictionaries/dix_status.json')
|
||||
dix_grids = load_json('dictionaries/dix_grids_rors.json')
|
||||
|
||||
|
||||
def find_ror(input, simU, simG):
|
||||
|
|
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
@ -3,7 +3,8 @@ import os
|
|||
from pyspark.sql import SparkSession
|
||||
from affro_cluster import *
|
||||
|
||||
folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
|
||||
folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
|
||||
|
||||
#folder_path = 'check'
|
||||
|
||||
json_file_names = []
|
||||
|
@ -14,7 +15,6 @@ for file_name in os.listdir(folder_path):
|
|||
if file_name != '_SUCCESS':
|
||||
json_file_names.append(file_name)
|
||||
|
||||
# json_file_names now contains the name of every entry in the folder except the '_SUCCESS' marker
|
||||
|
||||
# Initialize Spark session
|
||||
spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
|
||||
|
@ -40,14 +40,14 @@ def update_record(record):
|
|||
for author in record['authors']:
|
||||
author_object = {}
|
||||
if 'orcid.org/0' in author['fullName']:
|
||||
author_object['Name'] = {'Full':author['fullName'].split(',')[1], 'First' : None, 'Last' : None}
|
||||
author_object['Name'] = {'Full':author['fullName'].split(',')[1].strip(), 'First' : 'None', 'Last' : 'None', 'Type' : 'None'}
|
||||
author_object['ORCID'] = author['fullName'].split(',')[0][:36]
|
||||
else:
|
||||
author_object['Name'] = {'Full':author['fullName'], 'First' : None, 'Last' : None}
|
||||
author_object['ORCID'] = None
|
||||
author_object['Name'] = {'Full':author['fullName'].strip(), 'First' : 'None', 'Last' : 'None', 'Type' : 'None'}
|
||||
author_object['ORCID'] = 'None'
|
||||
author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
|
||||
all_affs_with_ror = []
|
||||
have_ror = False
|
||||
have_id = False
|
||||
for affiliation in author['affiliations']:
|
||||
# author_object['Raw_affiliations'] = [x for x in affiliation['raw_affiliation_string']]
|
||||
if 'ORCID: 0' in affiliation['raw_affiliation_string']:
|
||||
|
@ -57,32 +57,48 @@ def update_record(record):
|
|||
x = affiliation['raw_affiliation_string']
|
||||
author_object['ORCID'] = 'https://orcid.org/'+x.split('ORCID ')[1]
|
||||
if 'ror.org' in affiliation['raw_affiliation_string']:
|
||||
have_ror = True
|
||||
have_id = True
|
||||
all_affs_with_ror.append({
|
||||
'Origin': 'data',
|
||||
'RORid': affiliation['raw_affiliation_string'][0:25],
|
||||
'Confidence': None
|
||||
})
|
||||
|
||||
|
||||
elif 'grid.' in affiliation['raw_affiliation_string']:
|
||||
have_id = True
|
||||
|
||||
for k in dix_grids:
|
||||
if k in affiliation['raw_affiliation_string'].split(' ')[0]:
|
||||
try:
|
||||
all_affs_with_ror.append({
|
||||
'Provenance': 'Data',
|
||||
'PID' : 'ROR',
|
||||
'Value' : dix_grids[k] ,
|
||||
'Confidence': 1
|
||||
})
|
||||
except:
|
||||
pass
|
||||
else:
|
||||
if len(affro(affiliation['raw_affiliation_string']))>0:
|
||||
author_object['Organization_PIDs'] = affro(affiliation['raw_affiliation_string'])
|
||||
author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in author_object['Organization_PIDs']])
|
||||
|
||||
author_object['Matchings'] = affro(affiliation['raw_affiliation_string'])
|
||||
try:
|
||||
author_object['Matchings'] = remove_duplicates([json.loads(x) for x in author_object['Matchings']])
|
||||
except:
|
||||
author_object['Matchings'] = remove_duplicates([x for x in author_object['Matchings']])
|
||||
|
||||
else:
|
||||
author_object['Organization_PIDs'] = []
|
||||
author_object['Matchings'] = []
|
||||
|
||||
if have_ror == True:
|
||||
author_object['Organization_PIDs'] = all_affs_with_ror
|
||||
order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
|
||||
if have_id == True:
|
||||
author_object['Matchings'] = all_affs_with_ror
|
||||
order = ["Name", "Raw_affiliations", "Matchings", "ORCID"]
|
||||
|
||||
reordered_data = {k: author_object[k] for k in order}
|
||||
|
||||
authors.append(reordered_data)
|
||||
|
||||
|
||||
organizations = remove_duplicates([x for author in authors for x in author['Organization_PIDs']])
|
||||
organizations = remove_duplicates([x for author in authors for x in author['Matchings']])
|
||||
|
||||
updt = {'ID' : id, 'Authors' : authors, 'Organizations' : organizations}
|
||||
return updt
|
||||
|
|
Loading…
Reference in New Issue