Code to extract the relevant information from ROAD metadata

2024-12-11 10:36:52 +01:00 · 2024-12-11 10:36:52 +01:00 · 8e7410d840
commit 8e7410d840
1 changed files with 202 additions and 0 deletions
--- a/ExtractFromROAD.ipynb
+++ b/ExtractFromROAD.ipynb
@ -0,0 +1,202 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "f98fd5d8-c045-4519-b3cd-4767163a4b8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import xml.etree.ElementTree as ET"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "23ec1ff8-8385-4f5a-8875-a56fa9147afb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Parse the ROAD XML dump from the local input folder into an ElementTree.\n",
+    "tree = ET.parse('./input/ROAD.xml')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e66f071b-77d9-483c-ac88-cad275f4f7d3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Root element of the parsed tree; later cells search it for <record> children.\n",
+    "root = tree.getroot()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "6082528c-b442-4707-9485-e6fba11df440",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "66173"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Number of <record> elements that are direct children of the root.\n",
+    "len(root.findall('./record'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "284d016c-3fdc-4692-8084-9bdf19c1f358",
+   "metadata": {},
+   "source": [
+    "Campi del dump ROAD che ci interessano:\n",
+    "\n",
+    "- 001 - ISSN\n",
+    "- 041 - lingua di pubblicazione\n",
+    "- 044 - publisher country\n",
+    "- 082 a) DDC subject classification\n",
+    "- 245 a) Title proper\n",
+    "- 246 acronimo titolo\n",
+    "- 260 b) editore\n",
+    "- 260 c) date of publication\n",
+    "- 856 - url della risorsa\n",
+    "- 981 a) subject\n",
+    "- 982 sottocategorie di subject\n",
+    "- 983 sottocategorie di subject\n",
+    "- 984 sottocategorie di subject\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "5e898ae6-7293-48da-a36b-ed14646bdba8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "\n",
+    "# Output column -> (datafield tag, subfield code, placeholder stored when the subfield is absent).\n",
+    "# NOTE(review): title_acronym is the only column whose placeholder is \"none\" instead of \"\";\n",
+    "# it is kept unchanged so the generated TSV stays the same - confirm whether that is intended.\n",
+    "SUBFIELD_COLUMNS = {\n",
+    "    'publisher_country': ('044', 'c', ''),\n",
+    "    'ddc_subject_classification': ('082', 'a', ''),\n",
+    "    'title': ('245', 'a', ''),\n",
+    "    'title_acronym': ('246', 'a', 'none'),\n",
+    "    'editor': ('260', 'b', ''),\n",
+    "    'date_of_publication': ('260', 'c', ''),\n",
+    "    'url': ('856', 'u', ''),\n",
+    "    'subject': ('981', 'a', ''),\n",
+    "    'subject_level1': ('982', 'a', ''),\n",
+    "    'subject_level2': ('983', 'a', ''),\n",
+    "    'subject_level3': ('984', 'a', ''),\n",
+    "}\n",
+    "\n",
+    "# Column order of the exported table: ISSN and lang first, then the subfield columns above.\n",
+    "COLUMNS = ['ISSN', 'lang', *SUBFIELD_COLUMNS]\n",
+    "\n",
+    "\n",
+    "def first_text(record, path, default=''):\n",
+    "    \"\"\"Return the text of the first element under `record` matching `path`, or `default` if none matches.\"\"\"\n",
+    "    node = record.find(path)\n",
+    "    return node.text if node is not None else default\n",
+    "\n",
+    "\n",
+    "data = {column: [] for column in COLUMNS}\n",
+    "\n",
+    "for item in root.findall('./record'):\n",
+    "    # A record without a 001 controlfield gets an empty ISSN instead of raising AttributeError.\n",
+    "    data['ISSN'].append(first_text(item, './/controlfield[@tag=\"001\"]'))\n",
+    "    # Every 041 subfield a, joined with ';'. Elements without text are skipped so join() never receives None.\n",
+    "    data['lang'].append(';'.join(\n",
+    "        elem.text\n",
+    "        for elem in item.findall('.//datafield[@tag=\"041\"]/subfield[@code=\"a\"]')\n",
+    "        if elem.text\n",
+    "    ))\n",
+    "    # One lookup per column (the first matching subfield wins), falling back to the column's placeholder.\n",
+    "    for column, (tag, code, placeholder) in SUBFIELD_COLUMNS.items():\n",
+    "        path = f'.//datafield[@tag=\"{tag}\"]/subfield[@code=\"{code}\"]'\n",
+    "        data[column].append(first_text(item, path, placeholder))\n",
+    "\n",
+    "idf = pd.DataFrame(data, columns=COLUMNS)\n",
+    "\n",
+    "# Tab-separated export; the DataFrame index is written as the first (unnamed) column.\n",
+    "idf.to_csv('roadExtracted.tsv',sep=\"\\t\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "38235b0e-1677-453d-8ceb-0407d8f1db27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Peek at the first <record> element. The original cell indexed an undefined name\n",
+    "# (`records`) and raised NameError; find() returns None when there is no record at all.\n",
+    "root.find('./record')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "44630017-c87e-492b-9629-3ba310f134c2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "66173"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Tally the <record> children of the root with a generator sum rather than a manual counter.\n",
+    "count = sum(1 for _ in root.findall('./record'))\n",
+    "count"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}