6.9 KiB
6.9 KiB
In [1]:
import xml.etree.ElementTree as ET
In [2]:
# Parse the ROAD XML dump from the local input directory into an ElementTree.
tree = ET.parse('./input/ROAD.xml')
In [3]:
# Root element of the parsed tree; the cells below look for `record` elements directly under it.
root = tree.getroot()
In [4]:
# Number of `record` elements that are direct children of the root.
len(root.findall('./record'))
Out[4]:
Campi del dump ROAD che ci interessano:

- 001 - ISSN
- 041 - lingua di pubblicazione
- 044 - publisher country
- 082 a) DDC subject classification
- 245 a) title proper
- 246 - acronimo del titolo
- 260 b) editore
- 260 c) date of publication
- 856 - URL della risorsa
- 981 a) subject
- 982 - sottocategorie di subject
- 983 - sottocategorie di subject
- 984 - sottocategorie di subject
In [7]:
import pandas as pd

# One entry per single-valued output column:
# (column name, MARC datafield tag, subfield code, placeholder when the subfield is absent).
# "title_acronym" keeps its historical "none" placeholder so the TSV output does not change.
SUBFIELD_COLUMNS = [
    ("publisher_country", "044", "c", ""),
    ("ddc_subject_classification", "082", "a", ""),
    ("title", "245", "a", ""),
    ("title_acronym", "246", "a", "none"),
    ("editor", "260", "b", ""),
    ("date_of_publication", "260", "c", ""),
    ("url", "856", "u", ""),
    ("subject", "981", "a", ""),
    ("subject_level1", "982", "a", ""),
    ("subject_level2", "983", "a", ""),
    ("subject_level3", "984", "a", ""),
]

# Column order of the exported table.
COLUMNS = ["ISSN", "lang"] + [column for column, _, _, _ in SUBFIELD_COLUMNS]


def _subfield_path(tag, code):
    """Build the XPath selecting subfield `code` of datafield `tag` anywhere below a record."""
    return './/datafield[@tag="{}"]/subfield[@code = "{}"]'.format(tag, code)


def _first_text(record, path, missing=""):
    """Return the text of the first element matching `path` under `record`.

    `missing` is returned when no element matches. The lookup is done once,
    instead of once for the test and once more for the value.
    """
    node = record.find(path)
    return missing if node is None else node.text


# Column-oriented accumulator: one list per output column, one entry per record.
data = {column: [] for column in COLUMNS}

for item in root.findall('./record'):
    # A record without an 001 controlfield yields an empty ISSN instead of
    # aborting the whole export with an AttributeError.
    data['ISSN'].append(_first_text(item, './/controlfield[@tag="001"]'))

    # A record may carry several 041$a subfields; join them with ';'.
    # Empty subfields have text None and are skipped so that join() cannot fail.
    data['lang'].append(';'.join(
        elem.text
        for elem in item.findall(_subfield_path("041", "a"))
        if elem.text
    ))

    for column, tag, code, missing in SUBFIELD_COLUMNS:
        data[column].append(_first_text(item, _subfield_path(tag, code), missing))

idf = pd.DataFrame(data, columns=COLUMNS)
idf.to_csv('roadExtracted.tsv', sep="\t")
In [5]:
# NOTE(review): no visible cell defines `records`, so on a fresh kernel this cell
# raised NameError. Bind it here to the same `./record` selection the other
# cells use — confirm this matches the original intent.
records = root.findall('./record')
records[0]
In [9]:
# Number of `record` elements directly under the root; findall() returns a list,
# so len() replaces the manual counting loop.
count = len(root.findall('./record'))
count
Out[9]: