7.8 KiB
7.8 KiB
Campi del dump ROAD che ci interessano:
- 001 - ISSN
- 041 - lingua di pubblicazione
- 044 - publisher country
- 082 a) DDC subject classification
- 245 a) Title proper
- 246 - acronimo del titolo
- 260 b) editore
- 260 c) date of publication
- 856 - URL della risorsa
- 981 a) subject
- 982 sottocategorie di subject
- 983 sottocategorie di subject
- 984 sottocategorie di subject
In [18]:
# Column-oriented accumulator: one list per output column, filled one value
# per parsed record. Each column gets its own, independent empty list.
data = {
    column: []
    for column in (
        'Record ID',
        'Date of Publication',
        'Country of Publication',
        'Language',
        'ISSN',
        'ISSNL',
        'ddc_subject_classification',
        'Publisher',
        'Title',
        'Access URL',
        'subject',
        'subject_level1',
        'subject_level2',
        'subject_level3',
    )
}
In [19]:
# Output columns, in the order they are appended to the module-level ``data``.
_RECORD_COLUMNS = (
    'Record ID',
    'Date of Publication',
    'Country of Publication',
    'Language',
    'ISSN',
    'ISSNL',
    'ddc_subject_classification',
    'Publisher',
    'Title',
    'Access URL',
    'subject',
    'subject_level1',
    'subject_level2',
    'subject_level3',
)


def _subfields(field_value):
    """Return the list of subfield dicts of a data field, or [] if it has none."""
    if isinstance(field_value, dict):
        return field_value.get('subfields') or []
    return []


def _first_subfield(field_value, code, default=None):
    """Return the first value stored under subfield *code*, else *default*."""
    for subfield in _subfields(field_value):
        if code in subfield:
            return subfield[code]
    return default


def parseRecord(record):
    """Flatten one record and append one value per column to ``data``.

    ``record`` is a dict whose ``'fields'`` entry is a list of
    ``{tag: value}`` dicts; data fields carry a ``'subfields'`` list of
    ``{code: text}`` dicts (presumably MARC-in-JSON; confirm against the
    dump format). Missing values are stored as an empty string.

    Subfields are looked up by code rather than by position, so a field
    with fewer subfields than expected, or in a different order, no longer
    raises ``IndexError`` or yields a spurious ``'Unknown'``.

    Returns ``None``; the result is the side effect on the module-level
    ``data`` dict.
    """
    parsed_record = {}
    # Dicts are used as insertion-ordered sets: duplicates are dropped and the
    # joined output is stable between runs (a plain set of strings is not).
    access_url = {}
    subjects_by_tag = {'981': {}, '982': {}, '983': {}, '984': {}}

    for field in record.get('fields', []):
        for tag, value in field.items():
            if tag == '001':
                parsed_record['Record ID'] = value
            elif tag == '008':
                # Fixed-position control field: two 4-character dates,
                # a 3-character country and a 3-character language.
                general_info = value
                parsed_record['Date of Publication'] = (
                    general_info[7:11].strip() + " - " + general_info[11:15].strip()
                )
                parsed_record['Country of Publication'] = general_info[15:18].strip()
                parsed_record['Language'] = general_info[35:38].strip()
            elif tag == '022':
                for subfield in _subfields(value):
                    if 'a' in subfield:
                        parsed_record['ISSN'] = subfield['a']
                    if 'l' in subfield:
                        parsed_record['ISSNL'] = subfield['l']
            elif tag == '044':
                country_code = _first_subfield(value, 'c')
                if country_code is not None:
                    parsed_record['Country of Publication'] = country_code
                else:
                    # Do not clobber a country already taken from 008.
                    parsed_record.setdefault('Country of Publication', 'Unknown')
            elif tag == '082':
                parsed_record['ddc_subject_classification'] = _first_subfield(
                    value, 'a', 'Unknown'
                )
            elif tag == '245':
                title = ' '.join(sub.get('a', '') for sub in _subfields(value))
                parsed_record['Title'] = title.strip()
            elif tag == '260':
                publisher_parts = (
                    _first_subfield(value, 'a', ''),
                    _first_subfield(value, 'b', ''),
                )
                parsed_record['Publisher'] = " ".join(
                    part for part in publisher_parts if part
                )
            elif tag == '856':
                url = _first_subfield(value, 'u')
                if url:
                    access_url[url] = None
            elif tag in subjects_by_tag:
                subjects_by_tag[tag][_first_subfield(value, 'a', 'Unknown')] = None

    parsed_record['subject'] = ", ".join(subjects_by_tag['981'])
    parsed_record['subject_level1'] = ", ".join(subjects_by_tag['982'])
    parsed_record['subject_level2'] = ", ".join(subjects_by_tag['983'])
    parsed_record['subject_level3'] = ", ".join(subjects_by_tag['984'])
    parsed_record['Access URL'] = ", ".join(access_url)

    # Every column receives exactly one value so the lists stay aligned.
    for column in _RECORD_COLUMNS:
        data[column].append(parsed_record.get(column, ''))
In [20]:
import json
# First dump: each non-empty line is a JSON array of records.
# The context manager closes the file even if a line fails to parse, and the
# encoding is pinned so decoding does not depend on the platform default.
with open('./input/downloadedITNotOpenHandled.json', encoding='utf-8') as fin:
    for line in fin:
        line = line.strip()
        if not line:
            # A trailing newline (or any blank line) carries no JSON and
            # would make json.loads raise.
            continue
        entry = json.loads(line)
        for record in entry:
            parseRecord(record)
In [21]:
# Second dump: only lines starting with '[' or ',' are parsed; the first
# character is dropped and the rest is decoded as a single JSON record.
# The context manager closes the file, which the earlier version never did.
with open('./input/downloadedITNotOpenoalex.json', encoding='utf-8') as fin:
    for line in fin:
        if not line.startswith(('[', ',')):
            continue
        payload = line[1:].strip()
        if not payload:
            # A line holding only the bracket or comma has nothing to decode.
            continue
        parseRecord(json.loads(payload))
In [23]:
import pandas as pd
# Build the table from the accumulated columns, fixing the column order
# explicitly, then write it out tab-separated (the index is written too).
idf = pd.DataFrame(
    data=data,
    columns=[
        'Record ID',
        'Date of Publication',
        'Country of Publication',
        'Language',
        'ISSN',
        'ISSNL',
        'ddc_subject_classification',
        'Publisher',
        'Title',
        'Access URL',
        'subject',
        'subject_level1',
        'subject_level2',
        'subject_level3',
    ],
)
idf.to_csv(path_or_buf='./input/issnExtracted.tsv', sep="\t")