DiamondOA/ExtractFromROAD.ipynb

6.9 KiB

In [1]:
import xml.etree.ElementTree as ET
In [2]:
# Parse the ROAD XML dump at ./input/ROAD.xml into an ElementTree.
tree = ET.parse('./input/ROAD.xml')
In [3]:
# Root element of the parsed tree; the cells below query it for its `record` children.
root = tree.getroot()
In [4]:
# Number of `record` elements that are direct children of the root.
len(root.findall('./record'))
Out[4]:
66173

Campi del dump ROAD che ci interessano:

- 001 - ISSN
- 041 - lingua di pubblicazione
- 044 - publisher country
- 082 a) DDC subject classification
- 245 a) Title proper
- 246 - acronimo del titolo
- 260 b) editore
- 260 c) date of publication
- 856 - URL della risorsa
- 981 a) subject
- 982 - sottocategorie di subject
- 983 - sottocategorie di subject
- 984 - sottocategorie di subject

In [7]:
import pandas as pd

# Output column -> (MARC datafield tag, subfield code, value stored when the
# subfield is absent from the record). Only the first matching subfield is used.
# NOTE(review): "title_acronym" falls back to the literal string "none" while
# every other column falls back to ""; kept as-is so the TSV stays identical
# for downstream consumers -- confirm whether this is intentional.
SUBFIELD_COLUMNS = {
    "publisher_country": ("044", "c", ""),
    "ddc_subject_classification": ("082", "a", ""),
    "title": ("245", "a", ""),
    "title_acronym": ("246", "a", "none"),
    "editor": ("260", "b", ""),
    "date_of_publication": ("260", "c", ""),
    "url": ("856", "u", ""),
    "subject": ("981", "a", ""),
    "subject_level1": ("982", "a", ""),
    "subject_level2": ("983", "a", ""),
    "subject_level3": ("984", "a", ""),
}


def _controlfield_text(record, tag, default=""):
    """Return the text of the first controlfield with `tag`, or `default` if the record has none."""
    node = record.find(f'.//controlfield[@tag="{tag}"]')
    return default if node is None else node.text


def _first_subfield_text(record, tag, code, default=""):
    """Return the text of the first `tag`/`code` subfield, or `default` if the record has none."""
    # Look the element up once instead of once for the test and once for the value.
    node = record.find(f'.//datafield[@tag="{tag}"]/subfield[@code="{code}"]')
    return default if node is None else node.text


def _subfield_texts(record, tag, code):
    """Return the non-empty texts of every `tag`/`code` subfield, in document order."""
    nodes = record.findall(f'.//datafield[@tag="{tag}"]/subfield[@code="{code}"]')
    # Empty subfields have text None, which would make str.join raise TypeError.
    return [node.text for node in nodes if node.text]


# Column-oriented accumulator; key order defines the column order of the TSV.
data = {"ISSN": [], "lang": [], **{column: [] for column in SUBFIELD_COLUMNS}}

for item in root.findall('./record'):
    # A record without a 001 controlfield gets an empty ISSN instead of
    # aborting the whole extraction with AttributeError.
    data["ISSN"].append(_controlfield_text(item, "001"))
    # A record may carry several languages; they are joined with ';'.
    data["lang"].append(";".join(_subfield_texts(item, "041", "a")))
    for column, (tag, code, missing) in SUBFIELD_COLUMNS.items():
        data[column].append(_first_subfield_text(item, tag, code, missing))

idf = pd.DataFrame(data, columns=list(data))

# Tab-separated output; the default row index is written as the first, unnamed column.
idf.to_csv('roadExtracted.tsv', sep="\t")
    
In [5]:
# Peek at the first record element. `records` was never assigned anywhere in
# the notebook, so this cell raised NameError on a fresh kernel; define it
# from `root` here and guard against a dump with no records.
records = root.findall('./record')
records[0] if records else None
In [9]:
# Sanity check: number of `record` elements directly under the root.
# len() on the findall() result replaces the manual counting loop.
count = len(root.findall('./record'))

count
Out[9]:
66173