DiamondOA/ParseISSN.ipynb

7.8 KiB

Campi del dump ROAD che ci interessano: 001 - ISSN; 041 - lingua di pubblicazione; 044 - publisher country; 082 a) DDC subject classification; 245 a) Title proper; 246 - acronimo titolo; 260 b) editore; 260 c) date of publication; 856 - url della risorsa; 981 a) subject; 982 - sottocategorie di subject; 983 - sottocategorie di subject; 984 - sottocategorie di subject.
In [18]:
# Column-oriented accumulator: one list per output column, each starting
# empty. Key order is the order in which the columns are declared here.
data = {
    column: []
    for column in (
        'Record ID',
        'Date of Publication',
        'Country of Publication',
        'Language',
        'ISSN',
        'ISSNL',
        'ddc_subject_classification',
        'Publisher',
        'Title',
        'Access URL',
        'subject',
        'subject_level1',
        'subject_level2',
        'subject_level3',
    )
}
In [19]:
# Output columns, in the order in which one value per record is appended.
_OUTPUT_COLUMNS = (
    'Record ID',
    'Date of Publication',
    'Country of Publication',
    'Language',
    'ISSN',
    'ISSNL',
    'ddc_subject_classification',
    'Publisher',
    'Title',
    'Access URL',
    'subject',
    'subject_level1',
    'subject_level2',
    'subject_level3',
)

# Tags whose subfield 'a' values are collected into a set, mapped to the
# output column that receives the joined values.
_SUBJECT_COLUMNS = {
    '981': 'subject',
    '982': 'subject_level1',
    '983': 'subject_level2',
    '984': 'subject_level3',
}


def _subfield_value(value, code, default):
    """Return the first subfield of *value* that carries *code*, else *default*."""
    for subfield in value.get('subfields', []):
        if code in subfield:
            return subfield[code]
    return default


def parseRecord(record, table=None):
    """Extract the fields of interest from one record and append them as a row.

    ``record`` is a mapping with a ``'fields'`` list; every element maps a
    tag to its value. The values of tags ``001`` and ``008`` are used as
    plain strings, every other handled tag is expected to be a mapping with
    a ``'subfields'`` list of single-code dicts.

    Exactly one value is appended to each column of ``table``; columns the
    record has no data for receive an empty string.

    Args:
        record: The record to parse.
        table: Mapping of column name to list that receives the row.
            Defaults to the module-level ``data`` accumulator.

    Returns:
        None. The result is the row appended to ``table``.
    """
    if table is None:
        table = data

    parsed_record = {}
    subject_values = {tag: set() for tag in _SUBJECT_COLUMNS}
    access_url = set()

    for field in record.get('fields', []):
        for tag, value in field.items():
            # Handle control fields
            if tag == '001':
                parsed_record['Record ID'] = value
            elif tag == '008':
                # Fixed-position string: two 4-character dates, a
                # 3-character country code and a 3-character language code.
                parsed_record['Date of Publication'] = (
                    value[7:11].strip() + " - " + value[11:15].strip()
                )
                parsed_record['Country of Publication'] = value[15:18].strip()
                parsed_record['Language'] = value[35:38].strip()
            elif tag == '022':
                # When a code is repeated, the last occurrence wins.
                for subfield in value.get('subfields', []):
                    if 'a' in subfield:
                        parsed_record['ISSN'] = subfield['a']
                    if 'l' in subfield:
                        parsed_record['ISSNL'] = subfield['l']
            elif tag == '044':
                # Overrides the country taken from 008 when it comes later.
                parsed_record['Country of Publication'] = _subfield_value(
                    value, 'c', 'Unknown')
            elif tag == '082':
                parsed_record['ddc_subject_classification'] = _subfield_value(
                    value, 'a', 'Unknown')
            elif tag == '245':
                parsed_record['Title'] = ' '.join(
                    sub['a'] for sub in value.get('subfields', []) if 'a' in sub
                ).strip()
            elif tag == '260':
                # Look the codes up wherever they appear instead of assuming
                # 'a' is the first subfield and 'b' the second; a field with
                # fewer than two subfields used to raise IndexError.
                parts = (
                    _subfield_value(value, 'a', ''),
                    _subfield_value(value, 'b', ''),
                )
                parsed_record['Publisher'] = " ".join(
                    part for part in parts if part)
            elif tag == '856':
                url = _subfield_value(value, 'u', None)
                if url:
                    access_url.add(url)
            elif tag in subject_values:
                subject_values[tag].add(_subfield_value(value, 'a', 'Unknown'))

    # Sort before joining so the output does not depend on set iteration
    # order, which varies between interpreter runs for strings.
    for tag, column in _SUBJECT_COLUMNS.items():
        parsed_record[column] = ", ".join(sorted(subject_values[tag]))
    parsed_record['Access URL'] = ", ".join(sorted(access_url))

    for column in _OUTPUT_COLUMNS:
        table[column].append(parsed_record.get(column, ''))
                
In [20]:
import json

# Load the first dump: every non-blank line holds a JSON array of records,
# each of which is handed to parseRecord.
with open('./input/downloadedITNotOpenHandled.json', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip()
        # Skip blank lines (such as the one produced by a trailing newline);
        # json.loads rejects an empty document.
        if not line:
            continue
        for record in json.loads(line):
            parseRecord(record)
In [21]:
# Load the second dump, one record per line. The context manager closes
# the file once the loop is done.
with open('./input/downloadedITNotOpenoalex.json', encoding='utf-8') as fin:
    for line in fin:
        # Only lines starting with '[' or ',' are parsed: that leading
        # character is dropped and the rest is decoded as one record.
        # Every other line is skipped.
        if line.startswith(('[', ',')):
            parseRecord(json.loads(line[1:].strip()))
                    
In [23]:
import pandas as pd

# Column order of the exported table.
output_columns = [
    'Record ID',
    'Date of Publication',
    'Country of Publication',
    'Language',
    'ISSN',
    'ISSNL',
    'ddc_subject_classification',
    'Publisher',
    'Title',
    'Access URL',
    'subject',
    'subject_level1',
    'subject_level2',
    'subject_level3',
]

# Build the table from the accumulated columns and write it out
# tab-separated.
idf = pd.DataFrame(data, columns=output_columns)
idf.to_csv('./input/issnExtracted.tsv', sep="\t")