1175 lines
37 KiB
Plaintext
1175 lines
37 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 1,
|
||
"id": "d2717360-0015-4f77-9c1c-a1adf1c003c8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import requests, json, os\n",
|
||
"from typing import List\n",
|
||
"import xml.etree.ElementTree as ET\n",
|
||
"from tqdm.notebook import tqdm\n",
|
||
"import pandas as pd\n",
|
||
"from IPython.display import JSON as pretty_print"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "64910fc4-cae7-43cc-8fa0-ff4c103ebfe6",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Fetching and retrieving the id of all the EZB DOA JOURNALS"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "d8c8c9bf-ca6c-422a-8309-afdac259b494",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### Parameters"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "70f52e18-45c9-43ca-9763-094ec7aae600",
|
||
"metadata": {},
|
||
"source": [
|
||
"From what I understood:\n",
|
||
"\n",
|
||
"- jq_term → filtering term\n",
|
||
"- xmloutput → 1 returns XML; any other value returns HTML\n",
|
||
"- sc → Starting Character of the output list\n",
|
||
"- sindex → Starting Index to iterate over the xml output journals\n",
|
||
"- hits_per_page → number of items per page. Default or not specified: 50\n",
|
||
"\n",
|
||
"documentation: https://ezb.ur.de/services/xmloutput.phtml?bibid=AAAAA&colors=1&lang=en\n",
|
||
"\n",
|
||
"However, I don't know what FKW means for *jq_type* (it is not listed in #6.6 of the documentation).\n",
|
||
"\n",
|
||
"The same goes for the possible values of *jq_term*. I suppose they are the categories shown in the advanced search > journal categories list (https://ezb.ur.de/search.phtml?bibid=AAAAA&colors=1&lang=en)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 2,
|
||
"id": "0b5bd5d9-166f-4bf0-8649-69082df8b6ec",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"jq_type1='FKW'\n",
|
||
"jq_term1='Diamond_Open_Access_Journal'\n",
|
||
"xmloutput=1\n",
|
||
"lang='en'\n",
|
||
"sc='A'\n",
|
||
"sindex=0"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 3,
|
||
"id": "c18a0c19-9ba0-47c4-88a1-04529be5d8e8",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"base_page = f\"https://ezb.ur.de/searchres.phtml?jq_type1={jq_type1}&jq_term1={jq_term1}&xmloutput={xmloutput}&lang={lang}\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 4,
|
||
"id": "e1fd5855-fcf9-4058-98ba-333e0b714653",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def fetch(url, timeout=30):
    """Download ``url`` with HTTP GET and parse the response body as XML.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float, optional
        Seconds to wait for the server (default 30). Without a timeout a
        stalled connection would block the notebook indefinitely.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        Root element of the parsed body, or ``None`` when the status code is
        not 200 or the body is not well-formed XML.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    try:
        return ET.fromstring(response.content)
    except ET.ParseError:
        # A 200 response that is not XML is treated like any other
        # unusable response.
        return None
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 5,
|
||
"id": "4376fb42-b255-4206-99ca-fae55a8fcf30",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def get_journals_for_page(sc, sindex):
    """Fetch one result page and return the journals listed on it.

    Parameters
    ----------
    sc : str
        Value sent as the ``sc`` query parameter.
    sindex : int or str
        Value sent as the ``sindex`` query parameter.

    Returns
    -------
    list of dict
        One ``{'ezb-id': ..., 'journame': ...}`` record per ``journal``
        element found below ``alphabetical_order``; empty when the page
        could not be fetched or contains no such element.
    """
    url = f"{base_page}&sc={sc}&sindex={sindex}"
    root = fetch(url)
    # Compare with None explicitly: an Element without children is falsy,
    # and truth-testing Elements is deprecated.
    if root is None:
        return []
    alphabetical_order = root.find(".//alphabetical_order")
    if alphabetical_order is None:
        return []
    journals = []
    for journal in alphabetical_order.findall(".//journal"):
        title = journal.find("title")
        journals.append({
            'ezb-id': journal.attrib.get("jourid", "unknown"),
            'journame': title.text if title is not None else "",
        })
    return journals
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "0cabceab-78e6-414a-a233-62d97e4b3059",
|
||
"metadata": {},
|
||
"source": [
|
||
"In the XML output, the `next_fifty` elements contain the start indices (`sindex`) of the following pages."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 6,
|
||
"id": "70bdb93c-77fd-44e1-b57f-7205fb05b605",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def extract_journals_from_section(sc):
    """Collect the journals of every result page belonging to section ``sc``.

    The first page of the section is fetched to read the ``sindex``
    attribute of each ``next_fifty`` element; every page (starting with
    index 0) is then loaded through ``get_journals_for_page``.

    Parameters
    ----------
    sc : str
        Value sent as the ``sc`` query parameter.

    Returns
    -------
    list of dict
        Journal records of all pages, in page order; empty when the section
        page could not be fetched.
    """
    section_journals = []
    root = fetch(f"{base_page}&sc={sc}")
    if root is None:
        # Nothing to paginate over when the section page itself failed.
        return section_journals
    page_starts = [0]
    for next_fifty in root.findall('.//next_fifty'):
        sindex = next_fifty.attrib.get('sindex')
        if sindex is not None:
            page_starts.append(sindex)
    for sindex in tqdm(page_starts):
        section_journals.extend(get_journals_for_page(sc, sindex))
    return section_journals
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 7,
|
||
"id": "c66d6db4-97e9-4b86-81b8-e26030f92f68",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def iterate_and_extract_journals():
    """Walk every section announced by the base result page and gather its journals.

    The ``sc`` attribute of each ``other_pages`` element on the base page
    names one section; each section is processed with
    ``extract_journals_from_section``.

    Returns
    -------
    list of dict
        Journal records of all sections (duplicates are not removed here);
        an empty list when the base page could not be fetched, so callers
        can always iterate over the result.
    """
    root = fetch(base_page)
    if root is None:
        return []
    sections = [page.attrib.get("sc") for page in root.findall(".//other_pages")]
    doa_journals = []
    for sc in tqdm(sections):
        doa_journals.extend(extract_journals_from_section(sc))
    return doa_journals
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 8,
|
||
"id": "5081f71e-eeea-4c95-87b7-12fbb8139c38",
|
||
"metadata": {
|
||
"scrolled": true,
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "333030eac5e346ec915d61223e70c9b3",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/24 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "b8cdec9f97c34bb8866d34b63f8b1149",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/12 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "da8ac2def3f148c1b397df8c0e4bb94f",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/4 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "2b7b85e2233e4fd4908a9966fa600886",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/8 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "91583038af8f4c9da77a4ac1ca136ef0",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/3 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "4cd6152440374056bd3b9fa43c262240",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/7 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "7dc6080fba74434891ef9f5a8d643290",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "e9c722f22fe74ce58fcd3086267bcf6d",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/3 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "4cb4f66406b746148bebfcaf365d90b2",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "384cb6875c3b4837ad5d2c3d43d75f39",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/7 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "99c1fbb7a76d42eeb0098c132c0bf763",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/10 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "293cf3d864fc4ef9ac457a9e22316219",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "63893fa3933845ababd2687ae1af2706",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/3 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "52de4965928c4705b4005ace48c49a8f",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/5 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "35d27b5ad8034a4e8806509e61aa9d94",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/3 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "be8d3ef7d0dd4e38815677b5ef258f0e",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "d58aea3038c04de1a75d7c02fb8b8026",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/6 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "ef1e15697e864d328365d96a1cb04a9f",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "49a27f652259445f8e22401130919c99",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/15 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "324906b86a3246739c0129b8a4e601bf",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/7 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "98368cb86e3548d199dcecbc1fcccae9",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/3 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "9a64a6142404459ba079de75f3a2eb01",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "3d5b82b491d647e28732de3d29a0e1c2",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "307af57f828b402199113a06a6e0e3c6",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/2 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
},
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "c6a47b0a70e74feab8cdcb8945b14efe",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/1 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
||
"doa_journals = iterate_and_extract_journals()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "51c04a8e-91c6-40d3-9d10-35e8018359dd",
|
||
"metadata": {},
|
||
"source": [
|
||
"I noticed that, in the EZB search functionality, the journals starting with **Z** appear both in **W** and in **Z**, so I'm deduplicating the output."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 9,
|
||
"id": "aaef2e1e-6055-42f9-96b7-cb138b5c1a25",
|
||
"metadata": {
|
||
"scrolled": true,
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
# Keep only the first record seen for every EZB id, preserving the order
# of doa_journals.
seen_ids = set()
journals = []
for doa in doa_journals:
    id_ = doa['ezb-id']
    if id_ not in seen_ids:
        seen_ids.add(id_)
        journals.append(doa)
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "0feb1f13-73db-4904-874d-3435989b8211",
|
||
"metadata": {},
|
||
"source": [
|
||
"Saving journals in a local JSON file"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "e6f443cc-0830-4e8a-b771-8ed1fad1420d",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open(\"ezb_doa_journals.json\", \"w\") as f:\n",
|
||
" json.dump(journals, f)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "markdown",
|
||
"id": "03bc5bd6-004a-488b-8b16-06a0711ec0e1",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Retrieving Journals Info"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 10,
|
||
"id": "e3d0a049-59d4-4528-b468-9b980911c794",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"base_url=\"https://ezb.ur.de/detail.phtml?lang=en&xmloutput=1\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 11,
|
||
"id": "c6dd4d70-5c6d-42ad-bd73-91c6b1df9fb0",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def get_publisher_name(xml: ET.Element) -> str:
    """Return the text of the first ``publisher`` element below ``xml``.

    Returns an empty string when no such element exists or when it has no
    text, so the result is always a ``str``.
    """
    publisher = xml.find(".//publisher")
    if publisher is None or publisher.text is None:
        return ""
    return publisher.text
|
||
"\n",
|
def get_e_issns(xml: ET.Element) -> List[str]:
    """Return the text of every ``E_ISSN`` element below ``xml``, in document order.

    Elements without text are skipped so the list only holds strings and
    can be joined safely later on.
    """
    return [node.text for node in xml.findall(".//E_ISSN") if node.text is not None]
|
||
"\n",
|
def get_p_issns(xml: ET.Element) -> List[str]:
    """Return the text of every ``P_ISSN`` element below ``xml``, in document order.

    Elements without text are skipped so the list only holds strings and
    can be joined safely later on.
    """
    return [node.text for node in xml.findall(".//P_ISSN") if node.text is not None]
|
||
"\n",
|
def get_subjects(xml: ET.Element) -> List[str]:
    """Return the text of every ``subject`` element below ``xml``, in document order.

    Elements without text are skipped so the list only holds strings and
    can be joined safely later on.
    """
    return [node.text for node in xml.findall(".//subject") if node.text is not None]
|
||
"\n",
|
def get_categories(xml: ET.Element) -> List[str]:
    """Return the text of every ``category`` element below ``xml``, in document order.

    Elements without text are skipped so the list only holds strings and
    can be joined safely later on.
    """
    return [node.text for node in xml.findall(".//category") if node.text is not None]
|
||
"\n",
|
def get_first_issue(xml: ET.Element) -> str:
    """Return the text of the first ``first_date`` element below ``xml``.

    Returns an empty string when no such element exists or when it has no
    text, so the result is always a ``str``.
    """
    date = xml.find(".//first_date")
    if date is None or date.text is None:
        return ""
    return date.text
|
||
"\n",
|
def get_home_page(xml: ET.Element) -> str:
    """Return the text of the first ``homepage`` element below ``xml``.

    Returns an empty string when no such element exists or when it has no
    text, so the result is always a ``str``.
    """
    homepage = xml.find(".//homepage")
    if homepage is None or homepage.text is None:
        return ""
    return homepage.text
|
||
"\n",
|
def get_appearence(xml: ET.Element) -> str:
    """Return the text of the first ``appearence`` element below ``xml``.

    The spelling ``appearence`` is the tag name queried in the XML and is
    kept as is. Returns an empty string when no such element exists or when
    it has no text, so the result is always a ``str``.
    """
    appearence = xml.find(".//appearence")
    if appearence is None or appearence.text is None:
        return ""
    return appearence.text
|
||
"\n",
|
def get_costs(xml: ET.Element) -> str:
    """Return the text of the first ``costs`` element below ``xml``.

    Returns an empty string when no such element exists or when it has no
    text, so the result is always a ``str``.
    """
    costs = xml.find(".//costs")
    if costs is None or costs.text is None:
        return ""
    return costs.text
|
||
"\n",
|
def get_access_conditions(xml: ET.Element) -> str:
    """Return the text of the first ``access_conditions`` element below ``xml``.

    Returns an empty string when no such element exists or when it has no
    text, so the result is always a ``str``.
    """
    ac = xml.find(".//access_conditions")
    if ac is None or ac.text is None:
        return ""
    return ac.text
|
||
"\n",
|
def get_doaj_info(xml: ET.Element) -> dict:
    """Flatten the first ``doaj`` element below ``xml`` into a dict.

    The element's ``url`` attribute is stored under ``'doaj:url'`` and the
    text of each direct child under ``'doaj:<tag>'``, where ``<tag>`` is the
    child's tag name with any ``{namespace}`` prefix removed.

    Returns
    -------
    dict
        The collected key/value pairs; empty when there is no ``doaj``
        element.
    """
    d = {}
    doaj = xml.find(".//doaj")
    # Compare with None explicitly: a childless <doaj url="..."/> element is
    # falsy, and a plain truth test would drop its url attribute.
    if doaj is None:
        return d
    d['doaj:url'] = doaj.attrib.get('url')
    for child in doaj:
        tag = child.tag
        if "}" in tag:
            # ElementTree spells namespaced tags as "{uri}local"; keep "local".
            tag = tag.split("}", 1)[1]
        d[f"doaj:{tag}"] = child.text
    return d
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 12,
|
||
"id": "48726252-4021-4cb5-9a48-15bd03497aa4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def get_publishings_info(xml: ET.Element) -> List[str]:
    """Return the distinct tag names of the children of the first ``publishing`` element.

    Tag names are listed in order of first appearance. The result is an
    empty list when ``xml`` contains no ``publishing`` element.
    """
    publishing_tag = xml.find(".//publishing")
    if publishing_tag is None:
        return []
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(info.tag for info in publishing_tag))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 13,
|
||
"id": "d59b38b2-2f0c-4768-a940-55ec9b8c4316",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def get_detail_tags(xml: ET.Element) -> List[str]:
    """Return the distinct tag names of the children of the first ``detail`` element.

    Tag names are listed in order of first appearance. The result is an
    empty list when ``xml`` contains no ``detail`` element.
    """
    details = xml.find(".//detail")
    if details is None:
        return []
    # dict.fromkeys de-duplicates while keeping insertion order.
    return list(dict.fromkeys(child.tag for child in details))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 14,
|
||
"id": "cba9c62f-20b0-4a8a-903f-8ed8ce9a2acb",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
def get_openapc_info(xml: ET.Element) -> dict:
    """Extract the OpenAPC figures found below ``xml``.

    Looks up the ``period``, ``apc_num_items`` and ``apc_amount_avg``
    elements in the ``https://olap.openapc.net/`` namespace.

    Returns
    -------
    dict
        Keys ``'openapc:period'``, ``'openapc:apc_num_items'``,
        ``'openapc:apc_amount_avg'`` and
        ``'openapc:apc_amount_avg_currency'`` (the ``currency`` attribute of
        the average-amount element). Every value that cannot be found is the
        string ``"NA"``.
    """
    namespace = {
        'openapc': 'https://olap.openapc.net/'
    }

    def text_or_na(node):
        # Each field is checked against its own element, so one missing
        # element does not affect the others.
        return node.text if node is not None else "NA"

    period = xml.find(".//openapc:period", namespaces=namespace)
    n_items = xml.find(".//openapc:apc_num_items", namespaces=namespace)
    amount = xml.find(".//openapc:apc_amount_avg", namespaces=namespace)
    return {
        'openapc:period': text_or_na(period),
        'openapc:apc_num_items': text_or_na(n_items),
        'openapc:apc_amount_avg': text_or_na(amount),
        'openapc:apc_amount_avg_currency': (
            amount.attrib.get("currency", "NA") if amount is not None else "NA"
        ),
    }
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 24,
|
||
"id": "7782bea1-2840-4d02-991f-b77d6ae264c1",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "dedc1909b0f840f38cb0b191c1779cca",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/4953 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
# Collect every tag name that occurs directly below <detail> across all
# journal detail pages.
all_tags = set()
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    if jour_xml is None:
        # fetch() returns None for an unusable response; skip that journal
        # instead of aborting the whole survey.
        continue
    all_tags.update(get_detail_tags(jour_xml))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 15,
|
||
"id": "d2829b8d-8b0b-49d4-9992-e0aee1f97489",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "d021d6f8252a4a3f9751763297094ee4",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/4953 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
# Enrich every journal record in place with the fields read from its EZB
# detail page.
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    journal.update({
        'publisher_name': get_publisher_name(jour_xml),
        'E-ISSNs': get_e_issns(jour_xml),
        'P-ISSNs': get_p_issns(jour_xml),
        'ezb_subjects': get_subjects(jour_xml),
        'ezb_categories': get_categories(jour_xml),
        'first_issue': get_first_issue(jour_xml),
        'home_page': get_home_page(jour_xml),
        'appearence': get_appearence(jour_xml),
        'costs': get_costs(jour_xml),
        'access_conditions': get_access_conditions(jour_xml),
    })
    # The DOAJ and OpenAPC helpers already return ready-made key/value
    # pairs, merged in the same order as before.
    journal.update(get_doaj_info(jour_xml))
    journal.update(get_openapc_info(jour_xml))
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "6af91eb5-3475-4647-abdb-0569aa8b0819",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"application/vnd.jupyter.widget-view+json": {
|
||
"model_id": "6b55b866795c478396d05afe459838d1",
|
||
"version_major": 2,
|
||
"version_minor": 0
|
||
},
|
||
"text/plain": [
|
||
" 0%| | 0/4953 [00:00<?, ?it/s]"
|
||
]
|
||
},
|
||
"metadata": {},
|
||
"output_type": "display_data"
|
||
}
|
||
],
|
||
"source": [
|
# Map each EZB id to the journal's name, home page and the tag names found
# below its <publishing> element. Relies on 'home_page' having been added to
# every journal record beforehand.
journal_publishing = {}
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    if jour_xml is None:
        # fetch() returns None for an unusable response; leave the journal
        # out so a failed download is distinguishable from "no publishing
        # information".
        continue
    journal_publishing[jourid] = {
        'journame': journal['journame'],
        'home_page': journal['home_page'],
        'publishing': get_publishings_info(jour_xml),
    }
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 22,
|
||
"id": "3f158bd2-592f-41cf-ad75-99801acd6033",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"with open(\"ezb-doa-journals.json\", \"w\") as f:\n",
|
||
" json.dump(journals, f)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 16,
|
||
"id": "290dea7b-cf70-42c7-976c-edc065aa2a85",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df = pd.DataFrame(journals)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 26,
|
||
"id": "5b97ace9-c028-4605-8af1-495c22aa5869",
|
||
"metadata": {},
|
||
"outputs": [
|
||
{
|
||
"data": {
|
||
"text/html": [
|
||
"<div>\n",
|
||
"<style scoped>\n",
|
||
" .dataframe tbody tr th:only-of-type {\n",
|
||
" vertical-align: middle;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe tbody tr th {\n",
|
||
" vertical-align: top;\n",
|
||
" }\n",
|
||
"\n",
|
||
" .dataframe thead th {\n",
|
||
" text-align: right;\n",
|
||
" }\n",
|
||
"</style>\n",
|
||
"<table border=\"1\" class=\"dataframe\">\n",
|
||
" <thead>\n",
|
||
" <tr style=\"text-align: right;\">\n",
|
||
" <th></th>\n",
|
||
" <th>ezb-id</th>\n",
|
||
" <th>journame</th>\n",
|
||
" <th>publisher_name</th>\n",
|
||
" <th>E-ISSNs</th>\n",
|
||
" <th>P-ISSNs</th>\n",
|
||
" <th>ezb_subjects</th>\n",
|
||
" <th>ezb_categories</th>\n",
|
||
" <th>first_issue</th>\n",
|
||
" <th>home_page</th>\n",
|
||
" <th>appearence</th>\n",
|
||
" <th>...</th>\n",
|
||
" <th>doaj:journal_plagiarism_screening_policy</th>\n",
|
||
" <th>doaj:plagiarism_information_url</th>\n",
|
||
" <th>doaj:url_for_journal_instructions_for_authors</th>\n",
|
||
" <th>doaj:last_updated_date</th>\n",
|
||
" <th>doaj:average_number_of_weeks_between_article_submission_and_publication</th>\n",
|
||
" <th>openapc:period</th>\n",
|
||
" <th>openapc:apc_num_items</th>\n",
|
||
" <th>openapc:apc_amount_avg</th>\n",
|
||
" <th>openapc:apc_amount_avg_currency</th>\n",
|
||
" <th>doaj:apc_amount</th>\n",
|
||
" </tr>\n",
|
||
" </thead>\n",
|
||
" <tbody>\n",
|
||
" <tr>\n",
|
||
" <th>0</th>\n",
|
||
" <td>496159</td>\n",
|
||
" <td>1616: Anuario de Literatura Comparada</td>\n",
|
||
" <td>Ediciones Universidad Salamanca</td>\n",
|
||
" <td>2445-2262</td>\n",
|
||
" <td>[0210-7287]</td>\n",
|
||
" <td>[Linguistics and Literature, Romance Studies]</td>\n",
|
||
" <td>[Diamond Open Access Journal, Indexed in DOAJ,...</td>\n",
|
||
" <td>2011</td>\n",
|
||
" <td>https://revistas.usal.es/dos/index.php/1616_An...</td>\n",
|
||
" <td>Fulltext, online only</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" <td>http://revistas.usal.es/index.php/1616_Anuario...</td>\n",
|
||
" <td>http://revistas.usal.es/index.php/1616_Anuario...</td>\n",
|
||
" <td>2020-03-09 18:14:02</td>\n",
|
||
" <td>4</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>1</th>\n",
|
||
" <td>134172</td>\n",
|
||
" <td>19 : Interdisciplinary Studies in the Long Nin...</td>\n",
|
||
" <td>School of Arts, Birkbeck College, Univ. of Lon...</td>\n",
|
||
" <td>1755-1560</td>\n",
|
||
" <td>[]</td>\n",
|
||
" <td>[Linguistics and Literature]</td>\n",
|
||
" <td>[Diamond Open Access Journal, DOAJ Seal, Index...</td>\n",
|
||
" <td>2005</td>\n",
|
||
" <td>https://19.bbk.ac.uk/</td>\n",
|
||
" <td>Fulltext, online only</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>Yes</td>\n",
|
||
" <td>https://19.bbk.ac.uk/site/research-integrity/</td>\n",
|
||
" <td>https://19.bbk.ac.uk/site/author-guidelines/</td>\n",
|
||
" <td>2024-06-24 14:43:28</td>\n",
|
||
" <td>26</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>2</th>\n",
|
||
" <td>170939</td>\n",
|
||
" <td>452ºF, The Journal of Literary Theory and Comp...</td>\n",
|
||
" <td>Universitat de Barcelona</td>\n",
|
||
" <td>2013-3294</td>\n",
|
||
" <td>[]</td>\n",
|
||
" <td>[Linguistics and Literature]</td>\n",
|
||
" <td>[Diamond Open Access Journal, Indexed in DOAJ,...</td>\n",
|
||
" <td>2010</td>\n",
|
||
" <td>http://www.452f.com</td>\n",
|
||
" <td>Fulltext, online only</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>http://www.452f.com/index.php/en/manual-estilo16</td>\n",
|
||
" <td>2024-03-11 10:17:21</td>\n",
|
||
" <td>16</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>3</th>\n",
|
||
" <td>33700</td>\n",
|
||
" <td>49th Parallel: An Interdisciplinary Journal of...</td>\n",
|
||
" <td>University of Birmingham, Department of Americ...</td>\n",
|
||
" <td>1753-5794</td>\n",
|
||
" <td>[]</td>\n",
|
||
" <td>[English, American Studies, History, Political...</td>\n",
|
||
" <td>[Diamond Open Access Journal, Indexed in DOAJ,...</td>\n",
|
||
" <td>1999</td>\n",
|
||
" <td>https://49thparalleljournal.org/</td>\n",
|
||
" <td>Fulltext, online only</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>http://49thparalleljournal.org/submissions/</td>\n",
|
||
" <td>2017-04-10 14:45:56</td>\n",
|
||
" <td>24</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" <tr>\n",
|
||
" <th>4</th>\n",
|
||
" <td>461034</td>\n",
|
||
" <td>A&P Continuidad: Publicación Temática de Arqui...</td>\n",
|
||
" <td>Facultad de Arquitectura, Planeamiento y Diseñ...</td>\n",
|
||
" <td>2362-6097</td>\n",
|
||
" <td>[2362-6089]</td>\n",
|
||
" <td>[Architecture, Civil Engineering]</td>\n",
|
||
" <td>[Diamond Open Access Journal, Indexed in DOAJ,...</td>\n",
|
||
" <td>2014</td>\n",
|
||
" <td>https://www.ayp.fapyd.unr.edu.ar/index.php/ayp...</td>\n",
|
||
" <td>Fulltext, online and print</td>\n",
|
||
" <td>...</td>\n",
|
||
" <td>No</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" <td>http://www.ayp.fapyd.unr.edu.ar/index.php/ayp/...</td>\n",
|
||
" <td>2019-08-30 10:04:08</td>\n",
|
||
" <td>12</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NA</td>\n",
|
||
" <td>NaN</td>\n",
|
||
" </tr>\n",
|
||
" </tbody>\n",
|
||
"</table>\n",
|
||
"<p>5 rows × 33 columns</p>\n",
|
||
"</div>"
|
||
],
|
||
"text/plain": [
|
||
" ezb-id journame \\\n",
|
||
"0 496159 1616: Anuario de Literatura Comparada \n",
|
||
"1 134172 19 : Interdisciplinary Studies in the Long Nin... \n",
|
||
"2 170939 452ºF, The Journal of Literary Theory and Comp... \n",
|
||
"3 33700 49th Parallel: An Interdisciplinary Journal of... \n",
|
||
"4 461034 A&P Continuidad: Publicación Temática de Arqui... \n",
|
||
"\n",
|
||
" publisher_name E-ISSNs P-ISSNs \\\n",
|
||
"0 Ediciones Universidad Salamanca 2445-2262 [0210-7287] \n",
|
||
"1 School of Arts, Birkbeck College, Univ. of Lon... 1755-1560 [] \n",
|
||
"2 Universitat de Barcelona 2013-3294 [] \n",
|
||
"3 University of Birmingham, Department of Americ... 1753-5794 [] \n",
|
||
"4 Facultad de Arquitectura, Planeamiento y Diseñ... 2362-6097 [2362-6089] \n",
|
||
"\n",
|
||
" ezb_subjects \\\n",
|
||
"0 [Linguistics and Literature, Romance Studies] \n",
|
||
"1 [Linguistics and Literature] \n",
|
||
"2 [Linguistics and Literature] \n",
|
||
"3 [English, American Studies, History, Political... \n",
|
||
"4 [Architecture, Civil Engineering] \n",
|
||
"\n",
|
||
" ezb_categories first_issue \\\n",
|
||
"0 [Diamond Open Access Journal, Indexed in DOAJ,... 2011 \n",
|
||
"1 [Diamond Open Access Journal, DOAJ Seal, Index... 2005 \n",
|
||
"2 [Diamond Open Access Journal, Indexed in DOAJ,... 2010 \n",
|
||
"3 [Diamond Open Access Journal, Indexed in DOAJ,... 1999 \n",
|
||
"4 [Diamond Open Access Journal, Indexed in DOAJ,... 2014 \n",
|
||
"\n",
|
||
" home_page \\\n",
|
||
"0 https://revistas.usal.es/dos/index.php/1616_An... \n",
|
||
"1 https://19.bbk.ac.uk/ \n",
|
||
"2 http://www.452f.com \n",
|
||
"3 https://49thparalleljournal.org/ \n",
|
||
"4 https://www.ayp.fapyd.unr.edu.ar/index.php/ayp... \n",
|
||
"\n",
|
||
" appearence ... doaj:journal_plagiarism_screening_policy \\\n",
|
||
"0 Fulltext, online only ... Yes \n",
|
||
"1 Fulltext, online only ... Yes \n",
|
||
"2 Fulltext, online only ... No \n",
|
||
"3 Fulltext, online only ... No \n",
|
||
"4 Fulltext, online and print ... No \n",
|
||
"\n",
|
||
" doaj:plagiarism_information_url \\\n",
|
||
"0 http://revistas.usal.es/index.php/1616_Anuario... \n",
|
||
"1 https://19.bbk.ac.uk/site/research-integrity/ \n",
|
||
"2 NaN \n",
|
||
"3 NaN \n",
|
||
"4 NaN \n",
|
||
"\n",
|
||
" doaj:url_for_journal_instructions_for_authors doaj:last_updated_date \\\n",
|
||
"0 http://revistas.usal.es/index.php/1616_Anuario... 2020-03-09 18:14:02 \n",
|
||
"1 https://19.bbk.ac.uk/site/author-guidelines/ 2024-06-24 14:43:28 \n",
|
||
"2 http://www.452f.com/index.php/en/manual-estilo16 2024-03-11 10:17:21 \n",
|
||
"3 http://49thparalleljournal.org/submissions/ 2017-04-10 14:45:56 \n",
|
||
"4 http://www.ayp.fapyd.unr.edu.ar/index.php/ayp/... 2019-08-30 10:04:08 \n",
|
||
"\n",
|
||
" doaj:average_number_of_weeks_between_article_submission_and_publication \\\n",
|
||
"0 4 \n",
|
||
"1 26 \n",
|
||
"2 16 \n",
|
||
"3 24 \n",
|
||
"4 12 \n",
|
||
"\n",
|
||
" openapc:period openapc:apc_num_items openapc:apc_amount_avg \\\n",
|
||
"0 NA NA NA \n",
|
||
"1 NA NA NA \n",
|
||
"2 NA NA NA \n",
|
||
"3 NA NA NA \n",
|
||
"4 NA NA NA \n",
|
||
"\n",
|
||
" openapc:apc_amount_avg_currency doaj:apc_amount \n",
|
||
"0 NA NaN \n",
|
||
"1 NA NaN \n",
|
||
"2 NA NaN \n",
|
||
"3 NA NaN \n",
|
||
"4 NA NaN \n",
|
||
"\n",
|
||
"[5 rows x 33 columns]"
|
||
]
|
||
},
|
||
"execution_count": 26,
|
||
"metadata": {},
|
||
"output_type": "execute_result"
|
||
}
|
||
],
|
||
"source": [
|
||
"df.head()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 23,
|
||
"id": "a1b904d4-68d8-4b4d-8d2c-923aece48519",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"sep = \"/\""
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 25,
|
||
"id": "2dd9b474-9129-4aab-994a-d565f6adb767",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
# Flatten the list-valued columns into `sep`-separated strings.
# Only list values are joined: re-running this cell on already-joined
# strings would otherwise insert `sep` between every character.
for column in ('E-ISSNs', 'P-ISSNs', 'ezb_subjects', 'ezb_categories'):
    df[column] = df[column].apply(
        lambda x: sep.join(x) if isinstance(x, list) else x
    )
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": 32,
|
||
"id": "40f21e86-3b0a-4128-827f-0c0fa19a67d4",
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"df.to_csv(\"ezb_journals.csv\", index=None, sep=\";\")"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.8.8"
|
||
}
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 5
|
||
}
|