diff --git a/ezb-downloader.ipynb b/ezb-downloader.ipynb new file mode 100644 index 0000000..56407bf --- /dev/null +++ b/ezb-downloader.ipynb @@ -0,0 +1,1174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d2717360-0015-4f77-9c1c-a1adf1c003c8", + "metadata": {}, + "outputs": [], + "source": [ + "import requests, json, os\n", + "from typing import List\n", + "import xml.etree.ElementTree as ET\n", + "from tqdm.notebook import tqdm\n", + "import pandas as pd\n", + "from IPython.display import JSON as pretty_print" + ] + }, + { + "cell_type": "markdown", + "id": "64910fc4-cae7-43cc-8fa0-ff4c103ebfe6", + "metadata": { + "tags": [] + }, + "source": [ + "## Fetching and retrieving the id of all the EZB DOA JOURNALS" + ] + }, + { + "cell_type": "markdown", + "id": "d8c8c9bf-ca6c-422a-8309-afdac259b494", + "metadata": { + "tags": [] + }, + "source": [ + "### Parameters" + ] + }, + { + "cell_type": "markdown", + "id": "70f52e18-45c9-43ca-9763-094ec7aae600", + "metadata": {}, + "source": [ + "For what I understood:\n", + "\n", + "- jq_term → filtering term\n", + "- xmloutput → 1 in xml other values returns in html\n", + "- sc → Starting Character of the output list\n", + "- sindex → Starting Index to iterate over the xml output journals\n", + "- hits_per_page → number of items per page. Default or not specified: 50\n", + "\n", + "documentation: https://ezb.ur.de/services/xmloutput.phtml?bibid=AAAAA&colors=1&lang=en\n", + "\n", + "however I don't know what FKW means for *jq_type* (it is not listed in #6.6 of the documentation)\n", + "\n", + "same for the possible *jq_term*. I suppose they are the categories shown in the advanced search > journal categories list (https://ezb.ur.de/search.phtml?bibid=AAAAA&colors=1&lang=en)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0b5bd5d9-166f-4bf0-8649-69082df8b6ec", + "metadata": {}, + "outputs": [], + "source": [ + "jq_type1='FKW'\n", + "jq_term1='Diamond_Open_Access_Journal'\n", + "xmloutput=1\n", + "lang='en'\n", + "sc='A'\n", + "sindex=0" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c18a0c19-9ba0-47c4-88a1-04529be5d8e8", + "metadata": {}, + "outputs": [], + "source": [ + "base_page = f\"https://ezb.ur.de/searchres.phtml?jq_type1={jq_type1}&jq_term1={jq_term1}&xmloutput={xmloutput}&lang={lang}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e1fd5855-fcf9-4058-98ba-333e0b714653", + "metadata": {}, + "outputs": [], + "source": [ + "def fetch(url):\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " return ET.fromstring(response.content)\n", + " else:\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4376fb42-b255-4206-99ca-fae55a8fcf30", + "metadata": {}, + "outputs": [], + "source": [ + "def get_journals_for_page(sc, sindex):\n", + " journals = []\n", + " url = f\"{base_page}&sc={sc}&sindex={sindex}\"\n", + " root = fetch(url)\n", + " if not root: return journals\n", + " alphabetical_order = root.find(\".//alphabetical_order\")\n", + " if alphabetical_order:\n", + " for journal in alphabetical_order.findall(\".//journal\"):\n", + " jourid = journal.attrib.get(\"jourid\", \"unknown\")\n", + " title = journal.find(\"title\").text if journal.find(\"title\") is not None else \"\"\n", + " journals.append({'ezb-id':jourid, 'journame':title})\n", + " return journals" + ] + }, + { + "cell_type": "markdown", + "id": "0cabceab-78e6-414a-a233-62d97e4b3059", + "metadata": {}, + "source": [ + "in the xml output next_fifty property contains the indices for the next pages" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "70bdb93c-77fd-44e1-b57f-7205fb05b605", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_journals_from_section(sc):\n", + " section_journals = []\n", + " root = fetch(f\"{base_page}&sc={sc}\")\n", + " sindinces = [0]\n", + " nexts = root.findall('.//next_fifty')\n", + " for next_fifty in nexts:\n", + " sindex = next_fifty.attrib.get('sindex')\n", + " sindinces.append(sindex)\n", + " for sindex in tqdm(sindinces):\n", + " journals = get_journals_for_page(sc, sindex)\n", + " section_journals.extend(journals)\n", + " return section_journals" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c66d6db4-97e9-4b86-81b8-e26030f92f68", + "metadata": {}, + "outputs": [], + "source": [ + "def iterate_and_extract_journals():\n", + " root = fetch(base_page)\n", + " if not root: return\n", + " doa_journals = []\n", + " s = 0\n", + " other_pages = root.findall(\".//other_pages\")\n", + " sections = []\n", + " for page in other_pages:\n", + " sc = page.attrib.get(\"sc\")\n", + " sections.append(sc)\n", + " #print(sections)\n", + " for sc in tqdm(sections):\n", + " journals = extract_journals_from_section(sc)\n", + " doa_journals.extend(journals) \n", + " return doa_journals" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5081f71e-eeea-4c95-87b7-12fbb8139c38", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "333030eac5e346ec915d61223e70c9b3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/24 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b8cdec9f97c34bb8866d34b63f8b1149", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/12 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "da8ac2def3f148c1b397df8c0e4bb94f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2b7b85e2233e4fd4908a9966fa600886", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/8 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "91583038af8f4c9da77a4ac1ca136ef0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4cd6152440374056bd3b9fa43c262240", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/7 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7dc6080fba74434891ef9f5a8d643290", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e9c722f22fe74ce58fcd3086267bcf6d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "4cb4f66406b746148bebfcaf365d90b2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "384cb6875c3b4837ad5d2c3d43d75f39", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/7 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "99c1fbb7a76d42eeb0098c132c0bf763", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/10 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "293cf3d864fc4ef9ac457a9e22316219", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "63893fa3933845ababd2687ae1af2706", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "52de4965928c4705b4005ace48c49a8f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/5 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "35d27b5ad8034a4e8806509e61aa9d94", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "be8d3ef7d0dd4e38815677b5ef258f0e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d58aea3038c04de1a75d7c02fb8b8026", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/6 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ef1e15697e864d328365d96a1cb04a9f", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "49a27f652259445f8e22401130919c99", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/15 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "324906b86a3246739c0129b8a4e601bf", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/7 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "98368cb86e3548d199dcecbc1fcccae9", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/3 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "9a64a6142404459ba079de75f3a2eb01", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3d5b82b491d647e28732de3d29a0e1c2", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "307af57f828b402199113a06a6e0e3c6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/2 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c6a47b0a70e74feab8cdcb8945b14efe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/1 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "doa_journals = iterate_and_extract_journals()" + ] + }, + { + "cell_type": "markdown", + "id": "51c04a8e-91c6-40d3-9d10-35e8018359dd", + "metadata": {}, + "source": [ + "I noticed that, in ezb search functionality, the journals starting with **Z** appears both in **W** and in **Z** so i'm deduplicating the output" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "aaef2e1e-6055-42f9-96b7-cb138b5c1a25", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "seen_ids = set()\n", + "journals = []\n", + "for doa in doa_journals:\n", + " id_ = doa['ezb-id']\n", + " if not id_ in seen_ids: journals.append(doa), seen_ids.add(id_)" + ] + }, + { + "cell_type": "markdown", + "id": "0feb1f13-73db-4904-874d-3435989b8211", + "metadata": {}, + "source": [ + "Saving journals in a local JSON file" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "e6f443cc-0830-4e8a-b771-8ed1fad1420d", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"ezb_doa_journals.json\", \"w\") as f:\n", + " json.dump(journals, f)" + ] + }, + { + "cell_type": "markdown", + "id": "03bc5bd6-004a-488b-8b16-06a0711ec0e1", + "metadata": { + "tags": [] + }, + "source": [ + "## Retrieving Journals Info" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "e3d0a049-59d4-4528-b468-9b980911c794", + "metadata": {}, + "outputs": [], + "source": [ + "base_url=\"https://ezb.ur.de/detail.phtml?lang=en&xmloutput=1\"" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "c6dd4d70-5c6d-42ad-bd73-91c6b1df9fb0", + "metadata": {}, + "outputs": [], + "source": [ + "def get_publisher_name(xml: ET.ElementTree) -> str:\n", + " publisher = xml.find(f\".//publisher\")\n", + " return publisher.text if not publisher is None else \"\"\n", + "\n", + "def get_e_issns(xml: ET.ElementTree) -> List[str]:\n", + " issns = []\n", + " for e_issn in xml.findall(\".//E_ISSN\"):\n", + " issns.append(e_issn.text)\n", + " return issns\n", + "\n", + "def get_p_issns(xml: ET.ElementTree) -> List[str]:\n", + " issns = []\n", + " for p_issn in xml.findall(\".//P_ISSN\"):\n", + " issns.append(p_issn.text)\n", + " return issns\n", + "\n", + "def get_subjects(xml: ET.ElementTree) -> List[str]:\n", + " subjects = []\n", + " for subject in xml.findall(\".//subject\"):\n", + " subjects.append(subject.text)\n", + " return subjects\n", + "\n", + "def get_categories(xml: ET.ElementTree) -> List[str]:\n", + " categories = []\n", + " for category in xml.findall(\".//category\"):\n", + " categories.append(category.text)\n", + " return categories\n", + "\n", + "def get_first_issue(xml: ET.ElementTree) -> str:\n", + " date = xml.find(\".//first_date\")\n", + " return date.text if not date is None else \"\"\n", + "\n", + "def get_home_page(xml: ET.ElementTree) -> str:\n", + " homepage = xml.find(\".//homepage\")\n", + " return homepage.text if not homepage is None else \"\"\n", + "\n", + "def get_appearence(xml: ET.ElementTree) -> str:\n", + " appearence = xml.find(\".//appearence\")\n", + " return appearence.text if not appearence is None else \"\"\n", + "\n", + "def get_costs(xml: ET.ElementTree) -> str:\n", + " costs = xml.find(\".//costs\") \n", + " return costs.text if not costs is None else \"\"\n", + "\n", + "def get_access_conditions(xml: ET.ElementTree) -> str:\n", + " ac = xml.find(\".//access_conditions\")\n", + " return ac.text if not ac is None else \"\"\n", + "\n", + "def get_doaj_info(xml: ET.ElementTree) -> dict:\n", + " d = {}\n", + " doaj = xml.find(\".//doaj\")\n", + " if doaj:\n", + " d['doaj:url'] = doaj.attrib.get('url')\n", + " for child in doaj:\n", + " tag = child.tag\n", + " if \"}\" in tag:\n", + " tag = tag.split(\"}\", 1)[1]\n", + " d[f\"doaj:{tag}\"]=child.text\n", + " return d" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "48726252-4021-4cb5-9a48-15bd03497aa4", + "metadata": {}, + "outputs": [], + "source": [ + "def get_publishings_info(xml: ET.ElementTree) -> List[object]:\n", + " publishing = []\n", + " publishing_tag = xml.find(\".//publishing\")\n", + " if not publishing_tag is None:\n", + " for info in publishing_tag:\n", + " if not info.tag in publishing: publishing.append(info.tag)\n", + " return publishing" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d59b38b2-2f0c-4768-a940-55ec9b8c4316", + "metadata": {}, + "outputs": [], + "source": [ + "def get_detail_tags(xml: ET.ElementTree) -> List[str]:\n", + " keywords = []\n", + " details = xml.find(\".//detail\")\n", + " if not details is None:\n", + " for child in details:\n", + " if not child.tag in keywords: keywords.append(child.tag)\n", + " return keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "cba9c62f-20b0-4a8a-903f-8ed8ce9a2acb", + "metadata": {}, + "outputs": [], + "source": [ + "def get_openapc_info(xml: ET.ElementTree) -> dict:\n", + " d = {}\n", + " namespace = {\n", + " 'openapc': 'https://olap.openapc.net/'\n", + " }\n", + " period = xml.find(\".//openapc:period\", namespaces=namespace)\n", + " n_items = xml.find(\".//openapc:apc_num_items\", namespaces=namespace)\n", + " amount = xml.find(\".//openapc:apc_amount_avg\", namespaces=namespace)\n", + " d['openapc:period'] = period.text if not period is None else \"NA\"\n", + " d['openapc:apc_num_items'] = n_items.text if not period is None else \"NA\"\n", + " d['openapc:apc_amount_avg'] = amount.text if not amount is None else \"NA\"\n", + " d['openapc:apc_amount_avg_currency'] = amount.attrib.get(\"currency\", \"NA\") if not amount is None else \"NA\"\n", + " \n", + " return d" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7782bea1-2840-4d02-991f-b77d6ae264c1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dedc1909b0f840f38cb0b191c1779cca", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4953 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "all_tags = set()\n", + "for journal in tqdm(journals):\n", + " jourid = journal['ezb-id']\n", + " jour_url = f\"{base_url}&jour_id={jourid}\"\n", + " jour_xml = fetch(jour_url)\n", + " tags = get_detail_tags(jour_xml)\n", + " for t in tags:\n", + " all_tags.add(t)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "d2829b8d-8b0b-49d4-9992-e0aee1f97489", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d021d6f8252a4a3f9751763297094ee4", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4953 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "for journal in tqdm(journals):\n", + " jourid = journal['ezb-id']\n", + " jour_url = f\"{base_url}&jour_id={jourid}\"\n", + " jour_xml = fetch(jour_url)\n", + " journal['publisher_name'] = get_publisher_name(jour_xml)\n", + " journal['E-ISSNs'] = get_e_issns(jour_xml)\n", + " journal['P-ISSNs'] = get_p_issns(jour_xml)\n", + " journal['ezb_subjects'] = get_subjects(jour_xml)\n", + " journal['ezb_categories'] = get_categories(jour_xml)\n", + " journal['first_issue'] = get_first_issue(jour_xml)\n", + " journal['home_page'] = get_home_page(jour_xml)\n", + " journal['appearence'] = get_appearence(jour_xml)\n", + " journal['costs'] = get_costs(jour_xml)\n", + " journal['access_conditions'] = get_access_conditions(jour_xml)\n", + " doaj = get_doaj_info(jour_xml)\n", + " for k,v in doaj.items():\n", + " journal[k] = v\n", + " openapc = get_openapc_info(jour_xml)\n", + " for k,v in openapc.items():\n", + " journal[k] = v" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "6af91eb5-3475-4647-abdb-0569aa8b0819", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6b55b866795c478396d05afe459838d1", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4953 [00:00, ?it/s]" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "journal_publishing = {}\n", + "for journal in tqdm(journals):\n", + " jourid = journal['ezb-id']\n", + " jour_url = f\"{base_url}&jour_id={jourid}\"\n", + " jour_xml = fetch(jour_url)\n", + " publishing = get_publishings_info(jour_xml)\n", + " journal_publishing[jourid] = {'journame': journal['journame'], 'home_page': journal['home_page'], 'publishing': publishing}" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "3f158bd2-592f-41cf-ad75-99801acd6033", + "metadata": {}, + "outputs": [], + "source": [ + "with open(\"ezb-doa-journals.json\", \"w\") as f:\n", + " json.dump(journals, f)" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "290dea7b-cf70-42c7-976c-edc065aa2a85", + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.DataFrame(journals)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "5b97ace9-c028-4605-8af1-495c22aa5869", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | ezb-id | \n", + "journame | \n", + "publisher_name | \n", + "E-ISSNs | \n", + "P-ISSNs | \n", + "ezb_subjects | \n", + "ezb_categories | \n", + "first_issue | \n", + "home_page | \n", + "appearence | \n", + "... | \n", + "doaj:journal_plagiarism_screening_policy | \n", + "doaj:plagiarism_information_url | \n", + "doaj:url_for_journal_instructions_for_authors | \n", + "doaj:last_updated_date | \n", + "doaj:average_number_of_weeks_between_article_submission_and_publication | \n", + "openapc:period | \n", + "openapc:apc_num_items | \n", + "openapc:apc_amount_avg | \n", + "openapc:apc_amount_avg_currency | \n", + "doaj:apc_amount | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "496159 | \n", + "1616: Anuario de Literatura Comparada | \n", + "Ediciones Universidad Salamanca | \n", + "2445-2262 | \n", + "[0210-7287] | \n", + "[Linguistics and Literature, Romance Studies] | \n", + "[Diamond Open Access Journal, Indexed in DOAJ,... | \n", + "2011 | \n", + "https://revistas.usal.es/dos/index.php/1616_An... | \n", + "Fulltext, online only | \n", + "... | \n", + "Yes | \n", + "http://revistas.usal.es/index.php/1616_Anuario... | \n", + "http://revistas.usal.es/index.php/1616_Anuario... | \n", + "2020-03-09 18:14:02 | \n", + "4 | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NaN | \n", + "
1 | \n", + "134172 | \n", + "19 : Interdisciplinary Studies in the Long Nin... | \n", + "School of Arts, Birkbeck College, Univ. of Lon... | \n", + "1755-1560 | \n", + "[] | \n", + "[Linguistics and Literature] | \n", + "[Diamond Open Access Journal, DOAJ Seal, Index... | \n", + "2005 | \n", + "https://19.bbk.ac.uk/ | \n", + "Fulltext, online only | \n", + "... | \n", + "Yes | \n", + "https://19.bbk.ac.uk/site/research-integrity/ | \n", + "https://19.bbk.ac.uk/site/author-guidelines/ | \n", + "2024-06-24 14:43:28 | \n", + "26 | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NaN | \n", + "
2 | \n", + "170939 | \n", + "452ºF, The Journal of Literary Theory and Comp... | \n", + "Universitat de Barcelona | \n", + "2013-3294 | \n", + "[] | \n", + "[Linguistics and Literature] | \n", + "[Diamond Open Access Journal, Indexed in DOAJ,... | \n", + "2010 | \n", + "http://www.452f.com | \n", + "Fulltext, online only | \n", + "... | \n", + "No | \n", + "NaN | \n", + "http://www.452f.com/index.php/en/manual-estilo16 | \n", + "2024-03-11 10:17:21 | \n", + "16 | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NaN | \n", + "
3 | \n", + "33700 | \n", + "49th Parallel: An Interdisciplinary Journal of... | \n", + "University of Birmingham, Department of Americ... | \n", + "1753-5794 | \n", + "[] | \n", + "[English, American Studies, History, Political... | \n", + "[Diamond Open Access Journal, Indexed in DOAJ,... | \n", + "1999 | \n", + "https://49thparalleljournal.org/ | \n", + "Fulltext, online only | \n", + "... | \n", + "No | \n", + "NaN | \n", + "http://49thparalleljournal.org/submissions/ | \n", + "2017-04-10 14:45:56 | \n", + "24 | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NaN | \n", + "
4 | \n", + "461034 | \n", + "A&P Continuidad: Publicación Temática de Arqui... | \n", + "Facultad de Arquitectura, Planeamiento y Diseñ... | \n", + "2362-6097 | \n", + "[2362-6089] | \n", + "[Architecture, Civil Engineering] | \n", + "[Diamond Open Access Journal, Indexed in DOAJ,... | \n", + "2014 | \n", + "https://www.ayp.fapyd.unr.edu.ar/index.php/ayp... | \n", + "Fulltext, online and print | \n", + "... | \n", + "No | \n", + "NaN | \n", + "http://www.ayp.fapyd.unr.edu.ar/index.php/ayp/... | \n", + "2019-08-30 10:04:08 | \n", + "12 | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NA | \n", + "NaN | \n", + "
5 rows × 33 columns
\n", + "