Code to extract the relevant information from ROAD metadata
This commit is contained in:
commit
8e7410d840
|
@ -0,0 +1,202 @@
|
|||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "f98fd5d8-c045-4519-b3cd-4767163a4b8d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import xml.etree.ElementTree as ET"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "23ec1ff8-8385-4f5a-8875-a56fa9147afb",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"tree = ET.parse('./input/ROAD.xml')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "e66f071b-77d9-483c-ac88-cad275f4f7d3",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"root = tree.getroot()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "6082528c-b442-4707-9485-e6fba11df440",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"66173"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"len(root.findall('./record'))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "284d016c-3fdc-4692-8084-9bdf19c1f358",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Campi del dump ROAD che ci interessano\n",
|
||||
"001 - ISSN\n",
|
||||
"041 - lingua di pubblicazione\n",
|
||||
"044 - publisher country\n",
|
||||
"082 a) DDC subject classification\n",
|
||||
"245 a) Title proper\n",
|
||||
"246 acronimo titolo\n",
|
||||
"260 b) editore\n",
|
||||
"260 c) date of pubblication\n",
|
||||
"856 - url della risorsa\n",
|
||||
"981 a)subject\n",
|
||||
"982 sottocategorie di subject\n",
|
||||
"983 sottocategorie di subject\n",
|
||||
"984 sottocategorie di subject\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "5e898ae6-7293-48da-a36b-ed14646bdba8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"data = {\"ISSN\" : [],\n",
|
||||
"\"lang\":[],\n",
|
||||
"\"publisher_country\": [],\n",
|
||||
"\"ddc_subject_classification\": [],\n",
|
||||
"\"title\": [],\n",
|
||||
"\"title_acronym\": [],\n",
|
||||
"\"editor\": [],\n",
|
||||
"\"date_of_publication\":[],\n",
|
||||
"\"url\":[], \n",
|
||||
"\"subject\":[],\n",
|
||||
"\"subject_level1\":[],\n",
|
||||
"\"subject_level2\":[],\n",
|
||||
"\"subject_level3\":[]}\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"for item in root.findall('./record'): \n",
|
||||
" data['ISSN'].append(item.find('.//controlfield[@tag=\"001\"]').text) \n",
|
||||
" data['lang'].append(';'.join(elem.text for elem in item.findall('.//datafield[@tag=\"041\"]/subfield[@code = \"a\"]')))\n",
|
||||
" data['publisher_country'].append(item.find('.//datafield[@tag=\"044\"]/subfield[@code = \"c\"]').text) if item.find('.//datafield[@tag=\"044\"]/subfield[@code = \"c\"]') is not None else data['publisher_country'].append(\"\")\n",
|
||||
" data['ddc_subject_classification'].append(item.find('.//datafield[@tag=\"082\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"082\"]/subfield[@code = \"a\"]') is not None else data['ddc_subject_classification'].append(\"\")\n",
|
||||
" data['title'].append(item.find('.//datafield[@tag=\"245\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"245\"]/subfield[@code = \"a\"]') is not None else data['title'].append(\"\")\n",
|
||||
" data['title_acronym'].append(item.find('.//datafield[@tag=\"246\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"246\"]/subfield[@code = \"a\"]') is not None else data[\"title_acronym\"].append(\"none\")\n",
|
||||
" data['editor'].append(item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"b\"]').text) if item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"b\"]') is not None else data['editor'].append(\"\")\n",
|
||||
" data['date_of_publication'].append(item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"c\"]').text) if item.find('.//datafield[@tag=\"260\"]/subfield[@code = \"c\"]') is not None else data['date_of_publication'].append(\"\")\n",
|
||||
" data['url'].append(item.find('.//datafield[@tag=\"856\"]/subfield[@code = \"u\"]').text) if item.find('.//datafield[@tag=\"856\"]/subfield[@code = \"u\"]') is not None else data['url'].append(\"\")\n",
|
||||
" data['subject'].append(item.find('.//datafield[@tag=\"981\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"981\"]/subfield[@code = \"a\"]') is not None else data['subject'].append(\"\")\n",
|
||||
" data['subject_level1'].append(item.find('.//datafield[@tag=\"982\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"982\"]/subfield[@code = \"a\"]') is not None else data['subject_level1'].append(\"\")\n",
|
||||
" data['subject_level2'].append(item.find('.//datafield[@tag=\"983\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"983\"]/subfield[@code = \"a\"]') is not None else data['subject_level2'].append(\"\")\n",
|
||||
" data['subject_level3'].append(item.find('.//datafield[@tag=\"984\"]/subfield[@code = \"a\"]').text) if item.find('.//datafield[@tag=\"984\"]/subfield[@code = \"a\"]') is not None else data['subject_level3'].append(\"\")\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"idf = pd.DataFrame(data, columns = [\"ISSN\" ,\n",
|
||||
"\"lang\",\n",
|
||||
"\"publisher_country\",\n",
|
||||
"\"ddc_subject_classification\",\n",
|
||||
"\"title\",\n",
|
||||
"\"title_acronym\",\n",
|
||||
"\"editor\",\n",
|
||||
"\"date_of_publication\",\n",
|
||||
"\"url\",\n",
|
||||
"\"subject\",\n",
|
||||
"\"subject_level1\",\n",
|
||||
"\"subject_level2\",\n",
|
||||
"\"subject_level3\"])\n",
|
||||
"\n",
|
||||
"idf.to_csv('roadExtracted.tsv',sep=\"\\t\")\n",
|
||||
" "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "38235b0e-1677-453d-8ceb-0407d8f1db27",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"ename": "NameError",
|
||||
"evalue": "name 'records' is not defined",
|
||||
"output_type": "error",
|
||||
"traceback": [
|
||||
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
||||
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
||||
"Cell \u001b[0;32mIn[5], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mrecords\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
|
||||
"\u001b[0;31mNameError\u001b[0m: name 'records' is not defined"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"records[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "44630017-c87e-492b-9629-3ba310f134c2",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"66173"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"count = 0\n",
|
||||
"for i in root.findall('./record'):\n",
|
||||
" count += 1\n",
|
||||
"\n",
|
||||
"count"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
Loading…
Reference in New Issue