DiamondOA/ParseISSN.ipynb

206 lines
7.8 KiB
Plaintext

{
"cells": [
{
"cell_type": "raw",
"id": "65cbfa78-4ff5-4381-ab5f-f3d37ff5a285",
"metadata": {},
"source": [
"Campi del dump ROAD che ci interessano: 001 - ISSN; 041 - lingua di pubblicazione; 044 - publisher country; 082 a) DDC subject classification; 245 a) Title proper; 246 - acronimo titolo; 260 b) editore; 260 c) date of publication; 856 - url della risorsa; 981 a) subject; 982 - sottocategorie di subject; 983 - sottocategorie di subject; 984 - sottocategorie di subject."
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "6bfd5c76-6e02-42f7-8d25-3d605efe2c13",
"metadata": {},
"outputs": [],
"source": [
"# Column-oriented accumulator: maps every output column name to a list that\n",
"# holds one entry per parsed record. parseRecord appends to these lists.\n",
"data = {\n",
"    column: []\n",
"    for column in (\n",
"        'Record ID',\n",
"        'Date of Publication',\n",
"        'Country of Publication',\n",
"        'Language',\n",
"        'ISSN',\n",
"        'ISSNL',\n",
"        'ddc_subject_classification',\n",
"        'Publisher',\n",
"        'Title',\n",
"        'Access URL',\n",
"        'subject',\n",
"        'subject_level1',\n",
"        'subject_level2',\n",
"        'subject_level3',\n",
"    )\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "1eb7bad3-f208-43fc-b4eb-21991ec5324b",
"metadata": {},
"outputs": [],
"source": [
"def _subfield_values(field_value, code):\n",
"    \"\"\"Return every value stored under subfield `code` of one data field, in order.\"\"\"\n",
"    return [sub[code] for sub in field_value.get('subfields', []) if code in sub]\n",
"\n",
"\n",
"def _first_subfield(field_value, code, default='Unknown'):\n",
"    \"\"\"Return the first value of subfield `code`, or `default` if the field has none.\"\"\"\n",
"    values = _subfield_values(field_value, code)\n",
"    return values[0] if values else default\n",
"\n",
"\n",
"def parseRecord(record):\n",
"    \"\"\"Flatten one record and append it as a new row to the global `data` dict.\n",
"\n",
"    Args:\n",
"        record: dict with a 'fields' list. Each item maps a tag to a string\n",
"            (tags 001 and 008) or to a dict holding a 'subfields' list of\n",
"            dicts keyed by subfield code. Presumably MARC-in-JSON from the\n",
"            ROAD dump; confirm against the input files.\n",
"\n",
"    Returns:\n",
"        The flattened record as a dict keyed by column name. The same values\n",
"        are appended, one per column, to the lists in `data`.\n",
"    \"\"\"\n",
"    parsed_record = {}\n",
"    # Repeatable fields are collected in dicts used as insertion-ordered sets:\n",
"    # duplicates are dropped and the joined text is the same on every run\n",
"    # (the iteration order of a set of strings varies between interpreter runs).\n",
"    repeated = {\n",
"        'subject': {},\n",
"        'subject_level1': {},\n",
"        'subject_level2': {},\n",
"        'subject_level3': {},\n",
"        'Access URL': {},\n",
"    }\n",
"    subject_columns = {\n",
"        '981': 'subject',\n",
"        '982': 'subject_level1',\n",
"        '983': 'subject_level2',\n",
"        '984': 'subject_level3',\n",
"    }\n",
"\n",
"    for field in record.get('fields', []):\n",
"        for tag, value in field.items():\n",
"            if tag == '001':\n",
"                parsed_record['Record ID'] = value\n",
"            elif tag == '008':\n",
"                # Fixed-position control field: characters 7-10 and 11-14 hold\n",
"                # the two dates, 15-17 the country code, 35-37 the language.\n",
"                parsed_record['Date of Publication'] = value[7:11].strip() + ' - ' + value[11:15].strip()\n",
"                parsed_record['Country of Publication'] = value[15:18].strip()\n",
"                parsed_record['Language'] = value[35:38].strip()\n",
"            elif tag == '022':\n",
"                # When a code is repeated, the last occurrence wins.\n",
"                issns = _subfield_values(value, 'a')\n",
"                linking_issns = _subfield_values(value, 'l')\n",
"                if issns:\n",
"                    parsed_record['ISSN'] = issns[-1]\n",
"                if linking_issns:\n",
"                    parsed_record['ISSNL'] = linking_issns[-1]\n",
"            elif tag == '044':\n",
"                # Shares its column with the 008 country code; whichever field\n",
"                # comes later in the record wins.\n",
"                parsed_record['Country of Publication'] = _first_subfield(value, 'c')\n",
"            elif tag == '082':\n",
"                parsed_record['ddc_subject_classification'] = _first_subfield(value, 'a')\n",
"            elif tag == '245':\n",
"                parsed_record['Title'] = ' '.join(_subfield_values(value, 'a')).strip()\n",
"            elif tag == '260':\n",
"                # Look subfields up by code, not by position: the field may\n",
"                # carry only one of them or list them in a different order.\n",
"                parts = [_first_subfield(value, code, '') for code in ('a', 'b')]\n",
"                parsed_record['Publisher'] = ' '.join(part for part in parts if part)\n",
"            elif tag == '856':\n",
"                urls = _subfield_values(value, 'u')\n",
"                if urls and urls[0]:\n",
"                    repeated['Access URL'][urls[0]] = None\n",
"            elif tag in subject_columns:\n",
"                repeated[subject_columns[tag]][_first_subfield(value, 'a')] = None\n",
"\n",
"    for column, values in repeated.items():\n",
"        parsed_record[column] = ', '.join(values)\n",
"\n",
"    # Append exactly one value to every column so all lists keep the same length.\n",
"    for column in data:\n",
"        data[column].append(parsed_record.get(column, ''))\n",
"    return parsed_record"
]
},
{
"cell_type": "code",
"execution_count": 20,
"id": "13d8b15a-6ba6-485c-9faf-f910e3f60359",
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Every non-empty line of this file is parsed as one JSON array of records.\n",
"# The context manager closes the file even if a line fails to parse.\n",
"with open('./input/downloadedITNotOpenHandled.json', encoding='utf-8') as fin:\n",
"    for line in fin:\n",
"        # Skip blank lines (such as the one left by a trailing newline),\n",
"        # which json.loads would reject.\n",
"        if not line.strip():\n",
"            continue\n",
"        for record in json.loads(line):\n",
"            parseRecord(record)"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "a8785d2d-90d7-437b-894a-72ed3caafb3b",
"metadata": {},
"outputs": [],
"source": [
"# Uses `json` and `parseRecord` from the cells above.\n",
"# Lines starting with '[' or ',' carry one JSON record after that first\n",
"# character; every other line is ignored.\n",
"with open('./input/downloadedITNotOpenoalex.json', encoding='utf-8') as fin:\n",
"    for line in fin:\n",
"        if not line.startswith(('[', ',')):\n",
"            continue\n",
"        payload = line[1:].strip()\n",
"        # A line holding only the delimiter has no record to parse.\n",
"        if payload:\n",
"            parseRecord(json.loads(payload))"
]
},
{
"cell_type": "code",
"execution_count": 23,
"id": "4fd825b4-472d-4081-959f-8ae6db0a9f46",
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"# Build the table from the accumulated columns, in an explicit column order.\n",
"idf = pd.DataFrame(\n",
"    data,\n",
"    columns=[\n",
"        'Record ID', 'Date of Publication', 'Country of Publication', 'Language',\n",
"        'ISSN', 'ISSNL', 'ddc_subject_classification', 'Publisher', 'Title',\n",
"        'Access URL', 'subject', 'subject_level1', 'subject_level2', 'subject_level3',\n",
"    ],\n",
")\n",
"\n",
"# Tab-separated export; the DataFrame index is written as the first column.\n",
"idf.to_csv(path_or_buf='./input/issnExtracted.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "463c0438-fe6d-4d73-9927-9351408f9218",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}