# %% Notes (translated from the notebook's first raw cell)
# Fields of the ROAD dump we are interested in:
#   001      ISSN
#   041      language of publication
#   044      publisher country
#   082 $a   DDC subject classification
#   245 $a   title proper
#   246      title acronym
#   260 $b   publisher
#   260 $c   date of publication
#   856      URL of the resource
#   981 $a   subject
#   982-984  sub-categories of subject
#
# What the code below actually reads: 001, 008 (dates / country / language
# slices), 022 ($a, $l), 044 ($c), 082 ($a), 245 ($a), 260 ($a, $b), 856 ($u)
# and 981-984 ($a). Tags 041 and 246 are listed above but are not parsed.

# %% Imports and configuration
import json

import pandas as pd

HANDLED_DUMP_PATH = './input/downloadedITNotOpenHandled.json'
OPENALEX_DUMP_PATH = './input/downloadedITNotOpenoalex.json'
OUTPUT_TSV_PATH = './input/issnExtracted.tsv'

# Output columns, in the order they are written to the TSV.
COLUMNS = [
    'Record ID',
    'Date of Publication',
    'Country of Publication',
    'Language',
    'ISSN',
    'ISSNL',
    'ddc_subject_classification',
    'Publisher',
    'Title',
    'Access URL',
    'subject',
    'subject_level1',
    'subject_level2',
    'subject_level3',
]

# Tags whose $a values are collected (de-duplicated) into one joined column.
SUBJECT_COLUMNS = {
    '981': 'subject',
    '982': 'subject_level1',
    '983': 'subject_level2',
    '984': 'subject_level3',
}

# %% Accumulator: one list per output column, one entry per parsed record.
data = {column: [] for column in COLUMNS}


# %% Record parsing
def _subfields(value):
    """Return the list of subfield dicts of a data field, or [] if there is none."""
    if isinstance(value, dict):
        return value.get('subfields') or []
    return []


def _first_subfield(value, code, default=None):
    """Return the value of the first subfield carrying `code`, else `default`."""
    for sub in _subfields(value):
        if code in sub:
            return sub[code]
    return default


def parseRecord(record):
    """Flatten one record and append it as a new row to the module-level `data`.

    Args:
        record: dict with an optional 'fields' list. Each field is read as a
            dict mapping a tag to either a string (tags 001 and 008) or a dict
            holding a 'subfields' list of {code: value} dicts.

    Returns:
        None. Exactly one value is appended to every column list in `data`;
        columns the record does not provide receive ''.

    Notes:
        * Subfields are located by code, not by position, so a field with
          missing, reordered or empty subfields no longer raises IndexError.
        * Repeated subjects / URLs are de-duplicated and joined in first-seen
          order, which keeps the exported file identical across runs.
    """
    parsed_record = {}
    # dicts are used as insertion-ordered sets (a plain set would make the
    # joined strings depend on string hash randomisation).
    collected = {column: {} for column in SUBJECT_COLUMNS.values()}
    access_urls = {}

    for field in record.get('fields', []):
        for tag, value in field.items():
            if tag == '001':
                parsed_record['Record ID'] = value
            elif tag == '008':
                # Fixed-width control field: fixed character slices.
                general_info = value
                parsed_record['Date of Publication'] = (
                    general_info[7:11].strip() + " - " + general_info[11:15].strip()
                )
                parsed_record['Country of Publication'] = general_info[15:18].strip()
                parsed_record['Language'] = general_info[35:38].strip()
            elif tag == '022':
                for subfield in _subfields(value):
                    if 'a' in subfield:
                        parsed_record['ISSN'] = subfield['a']
                    if 'l' in subfield:
                        parsed_record['ISSNL'] = subfield['l']
            elif tag == '044':
                country_code = _first_subfield(value, 'c')
                if country_code is not None:
                    parsed_record['Country of Publication'] = country_code
                else:
                    # Do not overwrite a country already taken from 008.
                    parsed_record.setdefault('Country of Publication', 'Unknown')
            elif tag == '082':
                parsed_record['ddc_subject_classification'] = _first_subfield(
                    value, 'a', 'Unknown'
                )
            elif tag == '245':
                title = ' '.join(sub.get('a', '') for sub in _subfields(value))
                parsed_record['Title'] = title.strip()
            elif tag == '260':
                # $a and $b wherever they occur; absent parts are left out.
                publisher_parts = (
                    _first_subfield(value, 'a', ''),
                    _first_subfield(value, 'b', ''),
                )
                parsed_record['Publisher'] = ' '.join(
                    part.strip() for part in publisher_parts if part and part.strip()
                )
            elif tag == '856':
                url = _first_subfield(value, 'u')
                if url:
                    access_urls[url] = None
            elif tag in SUBJECT_COLUMNS:
                subject = _first_subfield(value, 'a', 'Unknown')
                collected[SUBJECT_COLUMNS[tag]][subject] = None

    for column, values in collected.items():
        parsed_record[column] = ", ".join(values)
    parsed_record['Access URL'] = ", ".join(access_urls)

    # Append to every column so all lists stay the same length.
    for column in data:
        data[column].append(parsed_record.get(column, ''))


# %% Loaders
def load_handled_dump(path=HANDLED_DUMP_PATH):
    """Parse a dump in which every non-blank line is a JSON array of records.

    Blank lines (including the empty string a trailing newline produces) are
    skipped instead of being passed to json.loads.
    """
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                continue
            for record in json.loads(line):
                parseRecord(record)


def load_openalex_dump(path=OPENALEX_DUMP_PATH):
    """Parse a dump with one record per line, each line prefixed by '[' or ','.

    Lines starting with any other character are ignored, as are lines that
    contain nothing after the prefix character.
    """
    with open(path, encoding='utf-8') as fin:
        for line in fin:
            if line[:1] not in ('[', ','):
                continue
            payload = line[1:].strip()
            if payload:
                parseRecord(json.loads(payload))


# %% Export
def export_tsv(path=OUTPUT_TSV_PATH):
    """Write the accumulated rows to a tab-separated file and return the frame.

    The DataFrame index is written as the first column, as before.
    """
    frame = pd.DataFrame(data, columns=list(data))
    frame.to_csv(path, sep="\t")
    return frame


# %% Run the pipeline
# Inside a Jupyter kernel __name__ is "__main__", so "Run All" still executes
# these steps; the guard only keeps a plain import free of file I/O.
if __name__ == "__main__":
    load_handled_dump()
    load_openalex_dump()
    idf = export_tsv()
"metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.3" } }, "nbformat": 4, "nbformat_minor": 5 }