{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f98fd5d8-c045-4519-b3cd-4767163a4b8d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import xml.etree.ElementTree as ET\n",
    "\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "23ec1ff8-8385-4f5a-8875-a56fa9147afb",
   "metadata": {},
   "outputs": [],
   "source": [
    "tree = ET.parse('./input/ROAD.xml')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "e66f071b-77d9-483c-ac88-cad275f4f7d3",
   "metadata": {},
   "outputs": [],
   "source": [
    "root = tree.getroot()\n",
    "# All <record> elements directly under the root; looked up once and reused by the cells below.\n",
    "records = root.findall('./record')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6082528c-b442-4707-9485-e6fba11df440",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "66173"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(records)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "284d016c-3fdc-4692-8084-9bdf19c1f358",
   "metadata": {},
   "source": [
    "Campi del dump ROAD che ci interessano:\n",
    "\n",
    "- 001 - ISSN\n",
    "- 041 - lingua di pubblicazione\n",
    "- 044 - publisher country\n",
    "- 082 a) DDC subject classification\n",
    "- 245 a) Title proper\n",
    "- 246 acronimo titolo\n",
    "- 260 b) editore\n",
    "- 260 c) date of publication\n",
    "- 856 - url della risorsa\n",
    "- 981 a) subject\n",
    "- 982 sottocategorie di subject\n",
    "- 983 sottocategorie di subject\n",
    "- 984 sottocategorie di subject\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5e898ae6-7293-48da-a36b-ed14646bdba8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Output column -> (datafield tag, subfield code) its value is read from.\n",
    "# Insertion order is the column order of the exported table (after ISSN and lang).\n",
    "SUBFIELD_BY_COLUMN = {\n",
    "    'publisher_country': ('044', 'c'),\n",
    "    'ddc_subject_classification': ('082', 'a'),\n",
    "    'title': ('245', 'a'),\n",
    "    'title_acronym': ('246', 'a'),\n",
    "    'editor': ('260', 'b'),\n",
    "    'date_of_publication': ('260', 'c'),\n",
    "    'url': ('856', 'u'),\n",
    "    'subject': ('981', 'a'),\n",
    "    'subject_level1': ('982', 'a'),\n",
    "    'subject_level2': ('983', 'a'),\n",
    "    'subject_level3': ('984', 'a'),\n",
    "}\n",
    "\n",
    "# Value stored when a record lacks the subfield ('' unless listed here).\n",
    "# NOTE(review): title_acronym is the only column using 'none' instead of '';\n",
    "# confirm whether consumers of the TSV rely on it before unifying.\n",
    "MISSING_DEFAULTS = {'title_acronym': 'none'}\n",
    "\n",
    "\n",
    "def subfield_path(tag, code):\n",
    "    \"\"\"Return the ElementPath selecting subfield `code` of datafield `tag` below a record.\"\"\"\n",
    "    return f'.//datafield[@tag=\"{tag}\"]/subfield[@code=\"{code}\"]'\n",
    "\n",
    "\n",
    "def first_text(record, path, default=''):\n",
    "    \"\"\"Return the text of the first element under `record` matching `path`.\n",
    "\n",
    "    `default` is returned when nothing matches; an element that exists but\n",
    "    carries no text yields '' (so callers never receive None).\n",
    "    \"\"\"\n",
    "    node = record.find(path)\n",
    "    if node is None:\n",
    "        return default\n",
    "    return node.text or ''\n",
    "\n",
    "\n",
    "def extract_row(record):\n",
    "    \"\"\"Build one output row (column name -> string) from a single <record> element.\"\"\"\n",
    "    language_nodes = record.findall(subfield_path('041', 'a'))\n",
    "    row = {\n",
    "        # A record without a 001 controlfield gets an empty ISSN instead of raising.\n",
    "        'ISSN': first_text(record, './/controlfield[@tag=\"001\"]'),\n",
    "        # 041 a) may repeat: keep every language, ';'-separated, skipping text-less subfields.\n",
    "        'lang': ';'.join(node.text for node in language_nodes if node.text),\n",
    "    }\n",
    "    for column, (tag, code) in SUBFIELD_BY_COLUMN.items():\n",
    "        missing = MISSING_DEFAULTS.get(column, '')\n",
    "        row[column] = first_text(record, subfield_path(tag, code), missing)\n",
    "    return row\n",
    "\n",
    "\n",
    "# One row per record, built in a single pass and handed to pandas at once.\n",
    "idf = pd.DataFrame(\n",
    "    [extract_row(record) for record in records],\n",
    "    columns=['ISSN', 'lang', *SUBFIELD_BY_COLUMN],\n",
    ")\n",
    "\n",
    "idf.to_csv('roadExtracted.tsv', sep='\\t')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "38235b0e-1677-453d-8ceb-0407d8f1db27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Show the raw XML of the first record to eyeball the tags used above.\n",
    "print(ET.tostring(records[0], encoding='unicode'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44630017-c87e-492b-9629-3ba310f134c2",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: every record must have produced exactly one row.\n",
    "assert len(idf) == len(records)\n",
    "\n",
    "len(idf)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
"display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }