Code to extract the relevant information from ROAD metadata

This commit is contained in:
Miriam Baglioni 2024-12-11 10:36:52 +01:00
commit 8e7410d840
1 changed files with 202 additions and 0 deletions

202
ExtractFromROAD.ipynb Normal file
View File

@ -0,0 +1,202 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "f98fd5d8-c045-4519-b3cd-4767163a4b8d",
"metadata": {},
"outputs": [],
"source": [
"import xml.etree.ElementTree as ET"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "23ec1ff8-8385-4f5a-8875-a56fa9147afb",
"metadata": {},
"outputs": [],
"source": [
"tree = ET.parse('./input/ROAD.xml')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "e66f071b-77d9-483c-ac88-cad275f4f7d3",
"metadata": {},
"outputs": [],
"source": [
"root = tree.getroot()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "6082528c-b442-4707-9485-e6fba11df440",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"66173"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"len(root.findall('./record'))"
]
},
{
"cell_type": "markdown",
"id": "284d016c-3fdc-4692-8084-9bdf19c1f358",
"metadata": {},
"source": [
"Campi del dump ROAD che ci interessano\n",
"001 - ISSN\n",
"041 - lingua di pubblicazione\n",
"044 - publisher country\n",
"082 a) DDC subject classification\n",
"245 a) Title proper\n",
"246 acronimo titolo\n",
"260 b) editore\n",
"260 c) date of pubblication\n",
"856 - url della risorsa\n",
"981 a)subject\n",
"982 sottocategorie di subject\n",
"983 sottocategorie di subject\n",
"984 sottocategorie di subject\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "5e898ae6-7293-48da-a36b-ed14646bdba8",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"data = {\"ISSN\" : [],\n",
"\"lang\":[],\n",
"\"publisher_country\": [],\n",
"\"ddc_subject_classification\": [],\n",
"\"title\": [],\n",
"\"title_acronym\": [],\n",
"\"editor\": [],\n",
"\"date_of_publication\":[],\n",
"\"url\":[], \n",
"\"subject\":[],\n",
"\"subject_level1\":[],\n",
"\"subject_level2\":[],\n",
"\"subject_level3\":[]}\n",
"\n",
" \n",
"for item in root.findall('./record'): \n",
" data['ISSN'].append(item.find('.//controlfield[@tag=\"001\"]').text) \n",
" data['lang'].append(';'.join(elem.text for elem in item.findall('.//datafield[@tag=\"041\"]/subfield[@code = \"a\"]')))\n",
" data['publisher_country'].append(item.find('.//datafield[@tag=\"044\"]/subfield[@code = \"c\"]').text) if item.find('.//datafield[@tag=\"044\"]/subfield[@code = \"c\"]') is not None else data['publisher_country'].append(\"\")\n",
" data['ddc_subject_classification'].append(item.find('.//datafield[@tag=\"082\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"082\"]/subfield[@code = \"a\"]') is not None else data['ddc_subject_classification'].append(\"\")\n",
" data['title'].append(item.find('.//datafield[@tag=\"245\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"245\"]/subfield[@code = \"a\"]') is not None else data['title'].append(\"\")\n",
" data['title_acronym'].append(item.find('.//datafield[@tag=\"246\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"246\"]/subfield[@code = \"a\"]') is not None else data[\"title_acronym\"].append(\"none\")\n",
" data['editor'].append(item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"b\"]').text) if item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"b\"]') is not None else data['editor'].append(\"\")\n",
" data['date_of_publication'].append(item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"c\"]').text) if item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"c\"]') is not None else data['date_of_publication'].append(\"\")\n",
" data['url'].append(item.find('.//datafield[@tag=\"856\"]/subfield[@code = \"u\"]').text) if item.find('.//datafield[@tag=\"856\"]/subfield[@code = \"u\"]') is not None else data['url'].append(\"\")\n",
" data['subject'].append(item.find('.//datafield[@tag=\"981\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"981\"]/subfield[@code = \"a\"]') is not None else data['subject'].append(\"\")\n",
" data['subject_level1'].append(item.find('.//datafield[@tag=\"982\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"982\"]/subfield[@code = \"a\"]') is not None else data['subject_level1'].append(\"\")\n",
" data['subject_level2'].append(item.find('.//datafield[@tag=\"983\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"983\"]/subfield[@code = \"a\"]') is not None else data['subject_level2'].append(\"\")\n",
" data['subject_level3'].append(item.find('.//datafield[@tag=\"984\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"984\"]/subfield[@code = \"a\"]') is not None else data['subject_level3'].append(\"\")\n",
"\n",
"\n",
"idf = pd.DataFrame(data, columns = [\"ISSN\" ,\n",
"\"lang\",\n",
"\"publisher_country\",\n",
"\"ddc_subject_classification\",\n",
"\"title\",\n",
"\"title_acronym\",\n",
"\"editor\",\n",
"\"date_of_publication\",\n",
"\"url\",\n",
"\"subject\",\n",
"\"subject_level1\",\n",
"\"subject_level2\",\n",
"\"subject_level3\"])\n",
"\n",
"idf.to_csv('roadExtracted.tsv',sep=\"\\t\")\n",
" "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "38235b0e-1677-453d-8ceb-0407d8f1db27",
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'records' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mrecords\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
"\u001b[0;31mNameError\u001b[0m: name 'records' is not defined"
]
}
],
"source": [
"records[0]"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "44630017-c87e-492b-9629-3ba310f134c2",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"66173"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"count = 0\n",
"for i in root.findall('./record'):\n",
" count += 1\n",
"\n",
"count"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}