206 lines
7.8 KiB
Plaintext
206 lines
7.8 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "raw",
|
|
"id": "65cbfa78-4ff5-4381-ab5f-f3d37ff5a285",
|
|
"metadata": {},
|
|
"source": [
|
|
"Campi del dump ROAD che ci interessano: 001 - ISSN; 041 - lingua di pubblicazione; 044 - publisher country; 082 a) DDC subject classification; 245 a) Title proper; 246 - acronimo titolo; 260 b) editore; 260 c) date of publication; 856 - url della risorsa; 981 a) subject; 982 - sottocategorie di subject; 983 - sottocategorie di subject; 984 - sottocategorie di subject"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 18,
|
|
"id": "6bfd5c76-6e02-42f7-8d25-3d605efe2c13",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Column store for the extracted table: one list per output column,\n",
"# filled with one value per record by parseRecord.\n",
"data = {\n",
"    column: []\n",
"    for column in (\n",
"        'Record ID',\n",
"        'Date of Publication',\n",
"        'Country of Publication',\n",
"        'Language',\n",
"        'ISSN',\n",
"        'ISSNL',\n",
"        'ddc_subject_classification',\n",
"        'Publisher',\n",
"        'Title',\n",
"        'Access URL',\n",
"        'subject',\n",
"        'subject_level1',\n",
"        'subject_level2',\n",
"        'subject_level3',\n",
"    )\n",
"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 19,
|
|
"id": "1eb7bad3-f208-43fc-b4eb-21991ec5324b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"def _subfield_values(value, code):\n",
"    \"\"\"Return, in order, every value stored under subfield `code` of a field.\"\"\"\n",
"    if not isinstance(value, dict):\n",
"        return []\n",
"    return [sub[code] for sub in value.get('subfields', []) if code in sub]\n",
"\n",
"\n",
"def _first_subfield(value, code, default=None):\n",
"    \"\"\"Return the first value of subfield `code`, or `default` when absent.\"\"\"\n",
"    found = _subfield_values(value, code)\n",
"    return found[0] if found else default\n",
"\n",
"\n",
"def parseRecord(record):\n",
"    \"\"\"Flatten one record of the dump and append it as a row to the global `data`.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    record : dict\n",
"        Its 'fields' entry is a list of single-tag dicts; a tag maps either to\n",
"        a string or to a dict whose 'subfields' entry is a list of\n",
"        {code: value} dicts.\n",
"\n",
"    Returns\n",
"    -------\n",
"    None\n",
"        Every column list in `data` grows by one item; columns the record\n",
"        does not supply get an empty string.\n",
"    \"\"\"\n",
"    parsed_record = {}\n",
"    # Tags that may occur several times are gathered in sets so duplicates collapse.\n",
"    subject_columns = {\n",
"        '981': 'subject',\n",
"        '982': 'subject_level1',\n",
"        '983': 'subject_level2',\n",
"        '984': 'subject_level3',\n",
"    }\n",
"    subject_values = {tag: set() for tag in subject_columns}\n",
"    access_urls = set()\n",
"\n",
"    for field in record.get('fields', []):\n",
"        for tag, value in field.items():\n",
"            if tag == '001':\n",
"                parsed_record['Record ID'] = value\n",
"            elif tag == '008':\n",
"                # Fixed-position string: the pieces are sliced by character offset.\n",
"                parsed_record['Date of Publication'] = value[7:11].strip() + ' - ' + value[11:15].strip()\n",
"                parsed_record['Country of Publication'] = value[15:18].strip()\n",
"                parsed_record['Language'] = value[35:38].strip()\n",
"            elif tag == '022':\n",
"                for subfield in value.get('subfields', []):\n",
"                    if 'a' in subfield:\n",
"                        parsed_record['ISSN'] = subfield['a']\n",
"                    if 'l' in subfield:\n",
"                        parsed_record['ISSNL'] = subfield['l']\n",
"            elif tag == '044':\n",
"                # Overwrites any country code already stored (e.g. from field 008).\n",
"                parsed_record['Country of Publication'] = _first_subfield(value, 'c', 'Unknown')\n",
"            elif tag == '082':\n",
"                parsed_record['ddc_subject_classification'] = _first_subfield(value, 'a', 'Unknown')\n",
"            elif tag == '245':\n",
"                parsed_record['Title'] = ' '.join(_subfield_values(value, 'a')).strip()\n",
"            elif tag == '260':\n",
"                # Looked up by code rather than by position, so a field with\n",
"                # fewer than two subfields does not raise IndexError.\n",
"                parsed_record['Publisher'] = _first_subfield(value, 'a', ' ') + ' ' + _first_subfield(value, 'b', ' ')\n",
"            elif tag == '856':\n",
"                url = _first_subfield(value, 'u')\n",
"                if url:\n",
"                    access_urls.add(url)\n",
"            elif tag in subject_values:\n",
"                subject_values[tag].add(_first_subfield(value, 'a', 'Unknown'))\n",
"\n",
"    # Sorted so the joined cells are identical from one run to the next.\n",
"    for tag, column in subject_columns.items():\n",
"        parsed_record[column] = ', '.join(sorted(subject_values[tag]))\n",
"    parsed_record['Access URL'] = ', '.join(sorted(access_urls))\n",
"\n",
"    for column, values in data.items():\n",
"        values.append(parsed_record.get(column, ''))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 20,
|
|
"id": "13d8b15a-6ba6-485c-9faf-f910e3f60359",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import json\n",
"\n",
"# Every non-blank line of this file is parsed as a JSON array of records.\n",
"with open('./input/downloadedITNotOpenHandled.json', encoding='utf-8') as fin:\n",
"    for line in fin:\n",
"        line = line.strip()\n",
"        if not line:\n",
"            # A blank line (e.g. after a final newline) is not valid JSON; skip it.\n",
"            continue\n",
"        for record in json.loads(line):\n",
"            parseRecord(record)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 21,
|
|
"id": "a8785d2d-90d7-437b-894a-72ed3caafb3b",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Lines starting with '[' or ',' carry one record after that first character;\n",
"# every other line is skipped.\n",
"with open('./input/downloadedITNotOpenoalex.json', encoding='utf-8') as fin:\n",
"    for line in fin:\n",
"        if not line.startswith(('[', ',')):\n",
"            continue\n",
"        payload = line[1:].strip()\n",
"        if payload:\n",
"            # An opening '[' alone on its line leaves nothing to decode.\n",
"            parseRecord(json.loads(payload))"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 23,
|
|
"id": "4fd825b4-472d-4081-959f-8ae6db0a9f46",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
"\n",
"# Column order of the exported table.\n",
"output_columns = [\n",
"    'Record ID',\n",
"    'Date of Publication',\n",
"    'Country of Publication',\n",
"    'Language',\n",
"    'ISSN',\n",
"    'ISSNL',\n",
"    'ddc_subject_classification',\n",
"    'Publisher',\n",
"    'Title',\n",
"    'Access URL',\n",
"    'subject',\n",
"    'subject_level1',\n",
"    'subject_level2',\n",
"    'subject_level3',\n",
"]\n",
"idf = pd.DataFrame(data, columns=output_columns)\n",
"\n",
"# Tab-separated export; the DataFrame index is written as the first column.\n",
"idf.to_csv('./input/issnExtracted.tsv', sep='\\t')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"id": "463c0438-fe6d-4d73-9927-9351408f9218",
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3 (ipykernel)",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.12.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 5
|
|
}
|