203 lines
6.9 KiB
Plaintext
203 lines
6.9 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"id": "f98fd5d8-c045-4519-b3cd-4767163a4b8d",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import xml.etree.ElementTree as ET"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"id": "23ec1ff8-8385-4f5a-8875-a56fa9147afb",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"tree = ET.parse('./input/ROAD.xml')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"id": "e66f071b-77d9-483c-ac88-cad275f4f7d3",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"root = tree.getroot()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"id": "6082528c-b442-4707-9485-e6fba11df440",
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"66173"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"len(root.findall('./record'))"
|
|
]
|
|
},
|
|
{
 "cell_type": "markdown",
 "id": "284d016c-3fdc-4692-8084-9bdf19c1f358",
 "metadata": {},
 "source": [
  "Campi del dump ROAD che ci interessano:\n",
  "\n",
  "- 001 - ISSN\n",
  "- 041 - lingua di pubblicazione\n",
  "- 044 - publisher country\n",
  "- 082 a) DDC subject classification\n",
  "- 245 a) Title proper\n",
  "- 246 - acronimo titolo\n",
  "- 260 b) editore\n",
  "- 260 c) date of publication\n",
  "- 856 - url della risorsa\n",
  "- 981 a) subject\n",
  "- 982 - sottocategorie di subject\n",
  "- 983 - sottocategorie di subject\n",
  "- 984 - sottocategorie di subject\n"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "5e898ae6-7293-48da-a36b-ed14646bdba8",
 "metadata": {},
 "outputs": [],
 "source": [
  "import pandas as pd\n",
  "\n",
  "# XPath, relative to a record element, of the control field that holds the ISSN.\n",
  "ISSN_XPATH = './/controlfield[@tag=\"001\"]'\n",
  "\n",
  "\n",
  "def subfield_xpath(tag, code):\n",
  "    \"\"\"Return the XPath, relative to a record element, of subfield `code` in datafield `tag`.\"\"\"\n",
  "    return f'.//datafield[@tag=\"{tag}\"]/subfield[@code=\"{code}\"]'\n",
  "\n",
  "\n",
  "def first_text(record, xpath, default=''):\n",
  "    \"\"\"Return the text of the first element matching `xpath`, or `default` when nothing matches.\n",
  "\n",
  "    A matching element with no text yields None, exactly as `Element.text` reports it.\n",
  "    \"\"\"\n",
  "    node = record.find(xpath)  # single lookup per field\n",
  "    return default if node is None else node.text\n",
  "\n",
  "\n",
  "# Output column -> (XPath of the source subfield, value stored when the subfield is absent).\n",
  "# Insertion order is the column order of the exported table.\n",
  "SINGLE_VALUE_FIELDS = {\n",
  "    'publisher_country': (subfield_xpath('044', 'c'), ''),\n",
  "    'ddc_subject_classification': (subfield_xpath('082', 'a'), ''),\n",
  "    'title': (subfield_xpath('245', 'a'), ''),\n",
  "    # NOTE(review): this column alone uses 'none' rather than '' as its placeholder;\n",
  "    # kept as-is so the exported TSV does not change. Confirm whether '' was intended.\n",
  "    'title_acronym': (subfield_xpath('246', 'a'), 'none'),\n",
  "    'editor': (subfield_xpath('260', 'b'), ''),\n",
  "    'date_of_publication': (subfield_xpath('260', 'c'), ''),\n",
  "    'url': (subfield_xpath('856', 'u'), ''),\n",
  "    'subject': (subfield_xpath('981', 'a'), ''),\n",
  "    'subject_level1': (subfield_xpath('982', 'a'), ''),\n",
  "    'subject_level2': (subfield_xpath('983', 'a'), ''),\n",
  "    'subject_level3': (subfield_xpath('984', 'a'), ''),\n",
  "}\n",
  "LANG_XPATH = subfield_xpath('041', 'a')\n",
  "COLUMNS = ['ISSN', 'lang', *SINGLE_VALUE_FIELDS]\n",
  "\n",
  "data = {column: [] for column in COLUMNS}\n",
  "\n",
  "for item in root.findall('./record'):\n",
  "    # A record without a 001 control field gets an empty ISSN instead of aborting the whole export.\n",
  "    data['ISSN'].append(first_text(item, ISSN_XPATH))\n",
  "    # A record may carry several language subfields: join them with ';'.\n",
  "    # Subfields without text are skipped because str.join() raises TypeError on None.\n",
  "    data['lang'].append(';'.join(\n",
  "        node.text for node in item.findall(LANG_XPATH) if node.text\n",
  "    ))\n",
  "    for column, (xpath, placeholder) in SINGLE_VALUE_FIELDS.items():\n",
  "        data[column].append(first_text(item, xpath, placeholder))\n",
  "\n",
  "idf = pd.DataFrame(data, columns=COLUMNS)\n",
  "\n",
  "# Tab-separated export; the DataFrame index is written as the first, unnamed column.\n",
  "idf.to_csv('roadExtracted.tsv', sep='\\t')"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "38235b0e-1677-453d-8ceb-0407d8f1db27",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Peek at the first record element under the root.\n",
  "# The previous `records[0]` relied on a name that no cell defines, so it failed on a fresh kernel.\n",
  "root.find('./record')"
 ]
},
|
|
{
 "cell_type": "code",
 "execution_count": null,
 "id": "44630017-c87e-492b-9629-3ba310f134c2",
 "metadata": {},
 "outputs": [],
 "source": [
  "# Count the record elements that are direct children of the root.\n",
  "count = len(root.findall('./record'))\n",
  "count"
 ]
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|