updates

2024-12-01 20:00:49 +01:00 · 2024-12-01 20:00:49 +01:00 · d9dbc679e3
parent 413ec3773e
commit d9dbc679e3
7 changed files with 36 additions and 19 deletions
--- a/__pycache__/affro_cluster.cpython-311.pyc
+++ b/__pycache__/affro_cluster.cpython-311.pyc
--- a/__pycache__/functions_cluster.cpython-311.pyc
+++ b/__pycache__/functions_cluster.cpython-311.pyc
--- a/affro_cluster.py
+++ b/affro_cluster.py
@@ -13,6 +13,7 @@ dix_org_oaire = load_json('dictionaries/dix_acad_oaire.json')
 dix_mult_oaire = load_json('dictionaries/dix_mult_oaire.json')
 dix_country_oaire = load_json('dictionaries/dix_country_oaire.json')
 dix_status = load_json('dictionaries/dix_status.json')
+dix_grids = load_json('dictionaries/dix_grids_rors.json')

    
 def find_ror(input, simU, simG):
--- a/dictionaries/dix_acad_oaire.json
+++ b/dictionaries/dix_acad_oaire.json
--- a/dictionaries/dix_country_oaire.json
+++ b/dictionaries/dix_country_oaire.json
--- a/dictionaries/dix_mult_oaire.json
+++ b/dictionaries/dix_mult_oaire.json
--- a/update_records.py
+++ b/update_records.py
@@ -3,7 +3,8 @@ import os
 from pyspark.sql import SparkSession
 from affro_cluster import *

-folder_path = '/user/zeppelin/miriam.baglioni/AffStringFromIISDataset2'
+folder_path ='/Users/myrto/Documents/openAIRE/7. matching/data_samples/iis_short'
+
 #folder_path = 'check'

 json_file_names = []
@@ -14,7 +15,6 @@ for file_name in os.listdir(folder_path):
    if file_name != '_SUCCESS':
        json_file_names.append(file_name)

-# json_file_names now contains the names of all JSON files in the folder

 # Initialize Spark session
 spark = SparkSession.builder.appName("JSONProcessing").getOrCreate()
@@ -40,14 +40,14 @@ def update_record(record):
        for author in record['authors']:
            author_object = {}
            if 'orcid.org/0'  in author['fullName']:
-                author_object['Name'] = {'Full':author['fullName'].split(',')[1], 'First' : None, 'Last' : None}
+                author_object['Name'] = {'Full':author['fullName'].split(',')[1].strip(), 'First' : 'None', 'Last' : 'None', 'Type' : 'None'}
                author_object['ORCID'] = author['fullName'].split(',')[0][:36]
            else:
-                author_object['Name'] = {'Full':author['fullName'], 'First' : None, 'Last' : None}
-                author_object['ORCID'] = None 
+                author_object['Name'] = {'Full':author['fullName'].strip(), 'First' : 'None', 'Last' : 'None', 'Type' : 'None'}
+                author_object['ORCID'] = 'None' 
            author_object['Raw_affiliations'] = [affiliation['raw_affiliation_string'] for affiliation in author['affiliations']]
            all_affs_with_ror = []
-            have_ror = False
+            have_id = False
            for affiliation in author['affiliations']:
           #     author_object['Raw_affiliations'] = [x for x in affiliation['raw_affiliation_string']]
                if 'ORCID: 0' in affiliation['raw_affiliation_string']:
@@ -57,32 +57,48 @@ def update_record(record):
                    x = affiliation['raw_affiliation_string']
                    author_object['ORCID'] = 'https://orcid.org/'+x.split('ORCID ')[1]
                if 'ror.org' in affiliation['raw_affiliation_string']:
-                    have_ror = True
+                    have_id = True
                    all_affs_with_ror.append({
                    'Origin': 'data',
                    'RORid': affiliation['raw_affiliation_string'][0:25],
                    'Confidence': None
                    })
             
-                        
+                elif 'grid.' in affiliation['raw_affiliation_string']:
+                    have_id = True
+                                            
+                    for k in dix_grids:
+                        if k in affiliation['raw_affiliation_string'].split(' ')[0]:
+                            try: 
+                                all_affs_with_ror.append({
+                                'Provenance': 'Data',
+                                'PID' : 'ROR',
+                                'Value' : dix_grids[k] ,
+                                'Confidence': 1
+                                })
+                            except:
+                                pass      
                else:
                    if len(affro(affiliation['raw_affiliation_string']))>0:
-                        author_object['Organization_PIDs'] = affro(affiliation['raw_affiliation_string'])
-                        author_object['Organization_PIDs'] = remove_duplicates([json.loads(x) for x in author_object['Organization_PIDs']])
-
+                        author_object['Matchings'] = affro(affiliation['raw_affiliation_string'])
+                        try:
+                            author_object['Matchings'] = remove_duplicates([json.loads(x) for x in author_object['Matchings']])
+                        except:
+                            author_object['Matchings'] = remove_duplicates([x for x in author_object['Matchings']])
+                      
                    else:
-                        author_object['Organization_PIDs'] = []
+                        author_object['Matchings'] = []
                    
-            if have_ror == True:
-                author_object['Organization_PIDs'] = all_affs_with_ror
-            order = ["Name", "Raw_affiliations", "Organization_PIDs", "ORCID"]
+            if have_id == True:
+                author_object['Matchings'] = all_affs_with_ror
+            order = ["Name", "Raw_affiliations", "Matchings", "ORCID"]

            reordered_data = {k: author_object[k] for k in order}

            authors.append(reordered_data)
        
        
-        organizations =  remove_duplicates([x for author in authors for x in  author['Organization_PIDs']])
+        organizations =  remove_duplicates([x for author in authors for x in  author['Matchings']])

        updt = {'ID' : id, 'Authors' : authors, 'Organizations' : organizations}
        return updt