{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "d2717360-0015-4f77-9c1c-a1adf1c003c8", "metadata": {}, "outputs": [], "source": [ "import requests, json, os\n", "from typing import List\n", "import xml.etree.ElementTree as ET\n", "from tqdm.notebook import tqdm\n", "import pandas as pd\n", "from IPython.display import JSON as pretty_print" ] }, { "cell_type": "markdown", "id": "64910fc4-cae7-43cc-8fa0-ff4c103ebfe6", "metadata": { "tags": [] }, "source": [ "## Fetching and retrieving the id of all the EZB DOA JOURNALS" ] }, { "cell_type": "markdown", "id": "d8c8c9bf-ca6c-422a-8309-afdac259b494", "metadata": { "tags": [] }, "source": [ "### Parameters" ] }, { "cell_type": "markdown", "id": "70f52e18-45c9-43ca-9763-094ec7aae600", "metadata": {}, "source": [ "For what I understood:\n", "\n", "- jq_term → filtering term\n", "- xmloutput → 1 in xml other values returns in html\n", "- sc → Starting Character of the output list\n", "- sindex → Starting Index to iterate over the xml output journals\n", "- hits_per_page → number of items per page. Default or not specified: 50\n", "\n", "documentation: https://ezb.ur.de/services/xmloutput.phtml?bibid=AAAAA&colors=1&lang=en\n", "\n", "however I don't know what FKW means for *jq_type* (it is not listed in #6.6 of the documentation)\n", "\n", "same for the possible *jq_term*. I suppose they are the categories shown in the advanced search > journal categories list (https://ezb.ur.de/search.phtml?bibid=AAAAA&colors=1&lang=en)" ] }, { "cell_type": "code", "execution_count": 2, "id": "0b5bd5d9-166f-4bf0-8649-69082df8b6ec", "metadata": {}, "outputs": [], "source": [ "jq_type1='FKW'\n", "jq_term1='Diamond_Open_Access_Journal'\n", "xmloutput=1\n", "lang='en'\n", "sc='A'\n", "sindex=0" ] }, { "cell_type": "code", "execution_count": 3, "id": "c18a0c19-9ba0-47c4-88a1-04529be5d8e8", "metadata": {}, "outputs": [], "source": [ "base_page = f\"https://ezb.ur.de/searchres.phtml?jq_type1={jq_type1}&jq_term1={jq_term1}&xmloutput={xmloutput}&lang={lang}\"" ] }, { "cell_type": "code", "execution_count": 4, "id": "e1fd5855-fcf9-4058-98ba-333e0b714653", "metadata": {}, "outputs": [], "source": [ "def fetch(url):\n", " response = requests.get(url)\n", " if response.status_code == 200:\n", " return ET.fromstring(response.content)\n", " else:\n", " return None" ] }, { "cell_type": "code", "execution_count": 5, "id": "4376fb42-b255-4206-99ca-fae55a8fcf30", "metadata": {}, "outputs": [], "source": [ "def get_journals_for_page(sc, sindex):\n", " journals = []\n", " url = f\"{base_page}&sc={sc}&sindex={sindex}\"\n", " root = fetch(url)\n", " if not root: return journals\n", " alphabetical_order = root.find(\".//alphabetical_order\")\n", " if alphabetical_order:\n", " for journal in alphabetical_order.findall(\".//journal\"):\n", " jourid = journal.attrib.get(\"jourid\", \"unknown\")\n", " title = journal.find(\"title\").text if journal.find(\"title\") is not None else \"\"\n", " journals.append({'ezb-id':jourid, 'journame':title})\n", " return journals" ] }, { "cell_type": "markdown", "id": "0cabceab-78e6-414a-a233-62d97e4b3059", "metadata": {}, "source": [ "in the xml output next_fifty property contains the indices for the next pages" ] }, { "cell_type": "code", "execution_count": 6, "id": "70bdb93c-77fd-44e1-b57f-7205fb05b605", "metadata": {}, "outputs": [], "source": [ "def extract_journals_from_section(sc):\n", " section_journals = []\n", " root = fetch(f\"{base_page}&sc={sc}\")\n", " sindinces = [0]\n", " nexts = root.findall('.//next_fifty')\n", " for next_fifty in nexts:\n", " sindex = next_fifty.attrib.get('sindex')\n", " sindinces.append(sindex)\n", " for sindex in tqdm(sindinces):\n", " journals = get_journals_for_page(sc, sindex)\n", " section_journals.extend(journals)\n", " return section_journals" ] }, { "cell_type": "code", "execution_count": 7, "id": "c66d6db4-97e9-4b86-81b8-e26030f92f68", "metadata": {}, "outputs": [], "source": [ "def iterate_and_extract_journals():\n", " root = fetch(base_page)\n", " if not root: return\n", " doa_journals = []\n", " s = 0\n", " other_pages = root.findall(\".//other_pages\")\n", " sections = []\n", " for page in other_pages:\n", " sc = page.attrib.get(\"sc\")\n", " sections.append(sc)\n", " #print(sections)\n", " for sc in tqdm(sections):\n", " journals = extract_journals_from_section(sc)\n", " doa_journals.extend(journals) \n", " return doa_journals" ] }, { "cell_type": "code", "execution_count": 8, "id": "5081f71e-eeea-4c95-87b7-12fbb8139c38", "metadata": { "scrolled": true, "tags": [] }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "333030eac5e346ec915d61223e70c9b3", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/24 [00:00 str:\n", " publisher = xml.find(f\".//publisher\")\n", " return publisher.text if not publisher is None else \"\"\n", "\n", "def get_e_issns(xml: ET.ElementTree) -> List[str]:\n", " issns = []\n", " for e_issn in xml.findall(\".//E_ISSN\"):\n", " issns.append(e_issn.text)\n", " return issns\n", "\n", "def get_p_issns(xml: ET.ElementTree) -> List[str]:\n", " issns = []\n", " for p_issn in xml.findall(\".//P_ISSN\"):\n", " issns.append(p_issn.text)\n", " return issns\n", "\n", "def get_subjects(xml: ET.ElementTree) -> List[str]:\n", " subjects = []\n", " for subject in xml.findall(\".//subject\"):\n", " subjects.append(subject.text)\n", " return subjects\n", "\n", "def get_categories(xml: ET.ElementTree) -> List[str]:\n", " categories = []\n", " for category in xml.findall(\".//category\"):\n", " categories.append(category.text)\n", " return categories\n", "\n", "def get_first_issue(xml: ET.ElementTree) -> str:\n", " date = xml.find(\".//first_date\")\n", " return date.text if not date is None else \"\"\n", "\n", "def get_home_page(xml: ET.ElementTree) -> str:\n", " homepage = xml.find(\".//homepage\")\n", " return homepage.text if not homepage is None else \"\"\n", "\n", "def get_appearence(xml: ET.ElementTree) -> str:\n", " appearence = xml.find(\".//appearence\")\n", " return appearence.text if not appearence is None else \"\"\n", "\n", "def get_costs(xml: ET.ElementTree) -> str:\n", " costs = xml.find(\".//costs\") \n", " return costs.text if not costs is None else \"\"\n", "\n", "def get_access_conditions(xml: ET.ElementTree) -> str:\n", " ac = xml.find(\".//access_conditions\")\n", " return ac.text if not ac is None else \"\"\n", "\n", "def get_doaj_info(xml: ET.ElementTree) -> dict:\n", " d = {}\n", " doaj = xml.find(\".//doaj\")\n", " if doaj:\n", " d['doaj:url'] = doaj.attrib.get('url')\n", " for child in doaj:\n", " tag = child.tag\n", " if \"}\" in tag:\n", " tag = tag.split(\"}\", 1)[1]\n", " d[f\"doaj:{tag}\"]=child.text\n", " return d" ] }, { "cell_type": "code", "execution_count": 12, "id": "48726252-4021-4cb5-9a48-15bd03497aa4", "metadata": {}, "outputs": [], "source": [ "def get_publishings_info(xml: ET.ElementTree) -> List[object]:\n", " publishing = []\n", " publishing_tag = xml.find(\".//publishing\")\n", " if not publishing_tag is None:\n", " for info in publishing_tag:\n", " if not info.tag in publishing: publishing.append(info.tag)\n", " return publishing" ] }, { "cell_type": "code", "execution_count": 13, "id": "d59b38b2-2f0c-4768-a940-55ec9b8c4316", "metadata": {}, "outputs": [], "source": [ "def get_detail_tags(xml: ET.ElementTree) -> List[str]:\n", " keywords = []\n", " details = xml.find(\".//detail\")\n", " if not details is None:\n", " for child in details:\n", " if not child.tag in keywords: keywords.append(child.tag)\n", " return keywords" ] }, { "cell_type": "code", "execution_count": 14, "id": "cba9c62f-20b0-4a8a-903f-8ed8ce9a2acb", "metadata": {}, "outputs": [], "source": [ "def get_openapc_info(xml: ET.ElementTree) -> dict:\n", " d = {}\n", " namespace = {\n", " 'openapc': 'https://olap.openapc.net/'\n", " }\n", " period = xml.find(\".//openapc:period\", namespaces=namespace)\n", " n_items = xml.find(\".//openapc:apc_num_items\", namespaces=namespace)\n", " amount = xml.find(\".//openapc:apc_amount_avg\", namespaces=namespace)\n", " d['openapc:period'] = period.text if not period is None else \"NA\"\n", " d['openapc:apc_num_items'] = n_items.text if not period is None else \"NA\"\n", " d['openapc:apc_amount_avg'] = amount.text if not amount is None else \"NA\"\n", " d['openapc:apc_amount_avg_currency'] = amount.attrib.get(\"currency\", \"NA\") if not amount is None else \"NA\"\n", " \n", " return d" ] }, { "cell_type": "code", "execution_count": 24, "id": "7782bea1-2840-4d02-991f-b77d6ae264c1", "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "dedc1909b0f840f38cb0b191c1779cca", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/4953 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ezb-idjournamepublisher_nameE-ISSNsP-ISSNsezb_subjectsezb_categoriesfirst_issuehome_pageappearence...doaj:journal_plagiarism_screening_policydoaj:plagiarism_information_urldoaj:url_for_journal_instructions_for_authorsdoaj:last_updated_datedoaj:average_number_of_weeks_between_article_submission_and_publicationopenapc:periodopenapc:apc_num_itemsopenapc:apc_amount_avgopenapc:apc_amount_avg_currencydoaj:apc_amount
04961591616: Anuario de Literatura ComparadaEdiciones Universidad Salamanca2445-2262[0210-7287][Linguistics and Literature, Romance Studies][Diamond Open Access Journal, Indexed in DOAJ,...2011https://revistas.usal.es/dos/index.php/1616_An...Fulltext, online only...Yeshttp://revistas.usal.es/index.php/1616_Anuario...http://revistas.usal.es/index.php/1616_Anuario...2020-03-09 18:14:024NANANANANaN
113417219 : Interdisciplinary Studies in the Long Nin...School of Arts, Birkbeck College, Univ. of Lon...1755-1560[][Linguistics and Literature][Diamond Open Access Journal, DOAJ Seal, Index...2005https://19.bbk.ac.uk/Fulltext, online only...Yeshttps://19.bbk.ac.uk/site/research-integrity/https://19.bbk.ac.uk/site/author-guidelines/2024-06-24 14:43:2826NANANANANaN
2170939452ºF, The Journal of Literary Theory and Comp...Universitat de Barcelona2013-3294[][Linguistics and Literature][Diamond Open Access Journal, Indexed in DOAJ,...2010http://www.452f.comFulltext, online only...NoNaNhttp://www.452f.com/index.php/en/manual-estilo162024-03-11 10:17:2116NANANANANaN
33370049th Parallel: An Interdisciplinary Journal of...University of Birmingham, Department of Americ...1753-5794[][English, American Studies, History, Political...[Diamond Open Access Journal, Indexed in DOAJ,...1999https://49thparalleljournal.org/Fulltext, online only...NoNaNhttp://49thparalleljournal.org/submissions/2017-04-10 14:45:5624NANANANANaN
4461034A&P Continuidad: Publicación Temática de Arqui...Facultad de Arquitectura, Planeamiento y Diseñ...2362-6097[2362-6089][Architecture, Civil Engineering][Diamond Open Access Journal, Indexed in DOAJ,...2014https://www.ayp.fapyd.unr.edu.ar/index.php/ayp...Fulltext, online and print...NoNaNhttp://www.ayp.fapyd.unr.edu.ar/index.php/ayp/...2019-08-30 10:04:0812NANANANANaN
\n", "

5 rows × 33 columns

\n", "" ], "text/plain": [ " ezb-id journame \\\n", "0 496159 1616: Anuario de Literatura Comparada \n", "1 134172 19 : Interdisciplinary Studies in the Long Nin... \n", "2 170939 452ºF, The Journal of Literary Theory and Comp... \n", "3 33700 49th Parallel: An Interdisciplinary Journal of... \n", "4 461034 A&P Continuidad: Publicación Temática de Arqui... \n", "\n", " publisher_name E-ISSNs P-ISSNs \\\n", "0 Ediciones Universidad Salamanca 2445-2262 [0210-7287] \n", "1 School of Arts, Birkbeck College, Univ. of Lon... 1755-1560 [] \n", "2 Universitat de Barcelona 2013-3294 [] \n", "3 University of Birmingham, Department of Americ... 1753-5794 [] \n", "4 Facultad de Arquitectura, Planeamiento y Diseñ... 2362-6097 [2362-6089] \n", "\n", " ezb_subjects \\\n", "0 [Linguistics and Literature, Romance Studies] \n", "1 [Linguistics and Literature] \n", "2 [Linguistics and Literature] \n", "3 [English, American Studies, History, Political... \n", "4 [Architecture, Civil Engineering] \n", "\n", " ezb_categories first_issue \\\n", "0 [Diamond Open Access Journal, Indexed in DOAJ,... 2011 \n", "1 [Diamond Open Access Journal, DOAJ Seal, Index... 2005 \n", "2 [Diamond Open Access Journal, Indexed in DOAJ,... 2010 \n", "3 [Diamond Open Access Journal, Indexed in DOAJ,... 1999 \n", "4 [Diamond Open Access Journal, Indexed in DOAJ,... 2014 \n", "\n", " home_page \\\n", "0 https://revistas.usal.es/dos/index.php/1616_An... \n", "1 https://19.bbk.ac.uk/ \n", "2 http://www.452f.com \n", "3 https://49thparalleljournal.org/ \n", "4 https://www.ayp.fapyd.unr.edu.ar/index.php/ayp... \n", "\n", " appearence ... doaj:journal_plagiarism_screening_policy \\\n", "0 Fulltext, online only ... Yes \n", "1 Fulltext, online only ... Yes \n", "2 Fulltext, online only ... No \n", "3 Fulltext, online only ... No \n", "4 Fulltext, online and print ... No \n", "\n", " doaj:plagiarism_information_url \\\n", "0 http://revistas.usal.es/index.php/1616_Anuario... \n", "1 https://19.bbk.ac.uk/site/research-integrity/ \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "\n", " doaj:url_for_journal_instructions_for_authors doaj:last_updated_date \\\n", "0 http://revistas.usal.es/index.php/1616_Anuario... 2020-03-09 18:14:02 \n", "1 https://19.bbk.ac.uk/site/author-guidelines/ 2024-06-24 14:43:28 \n", "2 http://www.452f.com/index.php/en/manual-estilo16 2024-03-11 10:17:21 \n", "3 http://49thparalleljournal.org/submissions/ 2017-04-10 14:45:56 \n", "4 http://www.ayp.fapyd.unr.edu.ar/index.php/ayp/... 2019-08-30 10:04:08 \n", "\n", " doaj:average_number_of_weeks_between_article_submission_and_publication \\\n", "0 4 \n", "1 26 \n", "2 16 \n", "3 24 \n", "4 12 \n", "\n", " openapc:period openapc:apc_num_items openapc:apc_amount_avg \\\n", "0 NA NA NA \n", "1 NA NA NA \n", "2 NA NA NA \n", "3 NA NA NA \n", "4 NA NA NA \n", "\n", " openapc:apc_amount_avg_currency doaj:apc_amount \n", "0 NA NaN \n", "1 NA NaN \n", "2 NA NaN \n", "3 NA NaN \n", "4 NA NaN \n", "\n", "[5 rows x 33 columns]" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 23, "id": "a1b904d4-68d8-4b4d-8d2c-923aece48519", "metadata": {}, "outputs": [], "source": [ "sep = \"/\"" ] }, { "cell_type": "code", "execution_count": 25, "id": "2dd9b474-9129-4aab-994a-d565f6adb767", "metadata": {}, "outputs": [], "source": [ "df['E-ISSNs'] = df['E-ISSNs'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")\n", "df['P-ISSNs'] = df['P-ISSNs'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")\n", "df['ezb_subjects'] = df['ezb_subjects'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")\n", "df['ezb_categories'] = df['ezb_categories'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")" ] }, { "cell_type": "code", "execution_count": 32, "id": "40f21e86-3b0a-4128-827f-0c0fa19a67d4", "metadata": {}, "outputs": [], "source": [ "df.to_csv(\"ezb_journals.csv\", index=None, sep=\";\")" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.8" } }, "nbformat": 4, "nbformat_minor": 5 }