From 1021cd73b4e2939a78d8df5f9a33732933e2b44d Mon Sep 17 00:00:00 2001 From: Simone Angioni Date: Wed, 11 Dec 2024 13:53:08 +0100 Subject: [PATCH] Upload files to "/" --- ezb-downloader.ipynb | 1174 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1174 insertions(+) create mode 100644 ezb-downloader.ipynb diff --git a/ezb-downloader.ipynb b/ezb-downloader.ipynb new file mode 100644 index 0000000..56407bf --- /dev/null +++ b/ezb-downloader.ipynb @@ -0,0 +1,1174 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d2717360-0015-4f77-9c1c-a1adf1c003c8", + "metadata": {}, + "outputs": [], + "source": [ + "import requests, json, os\n", + "from typing import List\n", + "import xml.etree.ElementTree as ET\n", + "from tqdm.notebook import tqdm\n", + "import pandas as pd\n", + "from IPython.display import JSON as pretty_print" + ] + }, + { + "cell_type": "markdown", + "id": "64910fc4-cae7-43cc-8fa0-ff4c103ebfe6", + "metadata": { + "tags": [] + }, + "source": [ + "## Fetching and retrieving the IDs of all EZB Diamond Open Access (DOA) journals" + ] + }, + { + "cell_type": "markdown", + "id": "d8c8c9bf-ca6c-422a-8309-afdac259b494", + "metadata": { + "tags": [] + }, + "source": [ + "### Parameters" + ] + }, + { + "cell_type": "markdown", + "id": "70f52e18-45c9-43ca-9763-094ec7aae600", + "metadata": {}, + "source": [ + "From what I understood:\n", + "\n", + "- jq_term → filtering term\n", + "- xmloutput → 1 returns XML; any other value returns HTML\n", + "- sc → Starting Character of the output list\n", + "- sindex → Starting Index to iterate over the xml output journals\n", + "- hits_per_page → number of items per page. Default (when not specified): 50\n", + "\n", + "documentation: https://ezb.ur.de/services/xmloutput.phtml?bibid=AAAAA&colors=1&lang=en\n", + "\n", + "However, I don't know what FKW means for *jq_type* (it is not listed in #6.6 of the documentation).\n", + "\n", + "The same goes for the possible *jq_term* values. 
I suppose they are the categories shown in the advanced search > journal categories list (https://ezb.ur.de/search.phtml?bibid=AAAAA&colors=1&lang=en)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "0b5bd5d9-166f-4bf0-8649-69082df8b6ec", + "metadata": {}, + "outputs": [], + "source": [ + "jq_type1='FKW'\n", + "jq_term1='Diamond_Open_Access_Journal'\n", + "xmloutput=1\n", + "lang='en'\n", + "sc='A'\n", + "sindex=0" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "c18a0c19-9ba0-47c4-88a1-04529be5d8e8", + "metadata": {}, + "outputs": [], + "source": [ + "base_page = f\"https://ezb.ur.de/searchres.phtml?jq_type1={jq_type1}&jq_term1={jq_term1}&xmloutput={xmloutput}&lang={lang}\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "e1fd5855-fcf9-4058-98ba-333e0b714653", + "metadata": {}, + "outputs": [], + "source": [ + "def fetch(url):\n", + " response = requests.get(url)\n", + " if response.status_code == 200:\n", + " return ET.fromstring(response.content)\n", + " else:\n", + " return None" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4376fb42-b255-4206-99ca-fae55a8fcf30", + "metadata": {}, + "outputs": [], + "source": [ + "def get_journals_for_page(sc, sindex):\n", + " journals = []\n", + " url = f\"{base_page}&sc={sc}&sindex={sindex}\"\n", + " root = fetch(url)\n", + " if root is None: return journals\n", + " alphabetical_order = root.find(\".//alphabetical_order\")\n", + " if alphabetical_order is not None:\n", + " for journal in alphabetical_order.findall(\".//journal\"):\n", + " jourid = journal.attrib.get(\"jourid\", \"unknown\")\n", + " title = journal.find(\"title\").text if journal.find(\"title\") is not None else \"\"\n", + " journals.append({'ezb-id':jourid, 'journame':title})\n", + " return journals" + ] + }, + { + "cell_type": "markdown", + "id": "0cabceab-78e6-414a-a233-62d97e4b3059", + "metadata": {}, + "source": [ + "in the xml output next_fifty property contains the indices for the next 
pages" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "70bdb93c-77fd-44e1-b57f-7205fb05b605", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_journals_from_section(sc):\n", + " section_journals = []\n", + " root = fetch(f\"{base_page}&sc={sc}\")\n", + " sindices = [0]\n", + " nexts = root.findall('.//next_fifty') if root is not None else []\n", + " for next_fifty in nexts:\n", + " sindex = next_fifty.attrib.get('sindex')\n", + " sindices.append(sindex)\n", + " for sindex in tqdm(sindices):\n", + " journals = get_journals_for_page(sc, sindex)\n", + " section_journals.extend(journals)\n", + " return section_journals" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "c66d6db4-97e9-4b86-81b8-e26030f92f68", + "metadata": {}, + "outputs": [], + "source": [ + "def iterate_and_extract_journals():\n", + " root = fetch(base_page)\n", + " if root is None: return\n", + " doa_journals = []\n", + " s = 0\n", + " other_pages = root.findall(\".//other_pages\")\n", + " sections = []\n", + " for page in other_pages:\n", + " sc = page.attrib.get(\"sc\")\n", + " sections.append(sc)\n", + " #print(sections)\n", + " for sc in tqdm(sections):\n", + " journals = extract_journals_from_section(sc)\n", + " doa_journals.extend(journals) \n", + " return doa_journals" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "5081f71e-eeea-4c95-87b7-12fbb8139c38", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "333030eac5e346ec915d61223e70c9b3", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/24 [00:00 str:\n", + " publisher = xml.find(f\".//publisher\")\n", + " return publisher.text if not publisher is None else \"\"\n", + "\n", + "def get_e_issns(xml: ET.ElementTree) -> List[str]:\n", + " issns = []\n", + " for e_issn in xml.findall(\".//E_ISSN\"):\n", + " issns.append(e_issn.text)\n", + " return issns\n", + "\n", + "def 
get_p_issns(xml: ET.ElementTree) -> List[str]:\n", + " issns = []\n", + " for p_issn in xml.findall(\".//P_ISSN\"):\n", + " issns.append(p_issn.text)\n", + " return issns\n", + "\n", + "def get_subjects(xml: ET.ElementTree) -> List[str]:\n", + " subjects = []\n", + " for subject in xml.findall(\".//subject\"):\n", + " subjects.append(subject.text)\n", + " return subjects\n", + "\n", + "def get_categories(xml: ET.ElementTree) -> List[str]:\n", + " categories = []\n", + " for category in xml.findall(\".//category\"):\n", + " categories.append(category.text)\n", + " return categories\n", + "\n", + "def get_first_issue(xml: ET.ElementTree) -> str:\n", + " date = xml.find(\".//first_date\")\n", + " return date.text if not date is None else \"\"\n", + "\n", + "def get_home_page(xml: ET.ElementTree) -> str:\n", + " homepage = xml.find(\".//homepage\")\n", + " return homepage.text if not homepage is None else \"\"\n", + "\n", + "def get_appearence(xml: ET.ElementTree) -> str:\n", + " appearence = xml.find(\".//appearence\")\n", + " return appearence.text if not appearence is None else \"\"\n", + "\n", + "def get_costs(xml: ET.ElementTree) -> str:\n", + " costs = xml.find(\".//costs\") \n", + " return costs.text if not costs is None else \"\"\n", + "\n", + "def get_access_conditions(xml: ET.ElementTree) -> str:\n", + " ac = xml.find(\".//access_conditions\")\n", + " return ac.text if not ac is None else \"\"\n", + "\n", + "def get_doaj_info(xml: ET.ElementTree) -> dict:\n", + " d = {}\n", + " doaj = xml.find(\".//doaj\")\n", + " if doaj is not None:\n", + " d['doaj:url'] = doaj.attrib.get('url')\n", + " for child in doaj:\n", + " tag = child.tag\n", + " if \"}\" in tag:\n", + " tag = tag.split(\"}\", 1)[1]\n", + " d[f\"doaj:{tag}\"]=child.text\n", + " return d" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "48726252-4021-4cb5-9a48-15bd03497aa4", + "metadata": {}, + "outputs": [], + "source": [ + "def get_publishings_info(xml: ET.ElementTree) -> List[object]:\n", 
+ " publishing = []\n", + " publishing_tag = xml.find(\".//publishing\")\n", + " if not publishing_tag is None:\n", + " for info in publishing_tag:\n", + " if not info.tag in publishing: publishing.append(info.tag)\n", + " return publishing" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "d59b38b2-2f0c-4768-a940-55ec9b8c4316", + "metadata": {}, + "outputs": [], + "source": [ + "def get_detail_tags(xml: ET.ElementTree) -> List[str]:\n", + " keywords = []\n", + " details = xml.find(\".//detail\")\n", + " if not details is None:\n", + " for child in details:\n", + " if not child.tag in keywords: keywords.append(child.tag)\n", + " return keywords" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "cba9c62f-20b0-4a8a-903f-8ed8ce9a2acb", + "metadata": {}, + "outputs": [], + "source": [ + "def get_openapc_info(xml: ET.ElementTree) -> dict:\n", + " d = {}\n", + " namespace = {\n", + " 'openapc': 'https://olap.openapc.net/'\n", + " }\n", + " period = xml.find(\".//openapc:period\", namespaces=namespace)\n", + " n_items = xml.find(\".//openapc:apc_num_items\", namespaces=namespace)\n", + " amount = xml.find(\".//openapc:apc_amount_avg\", namespaces=namespace)\n", + " d['openapc:period'] = period.text if not period is None else \"NA\"\n", + " d['openapc:apc_num_items'] = n_items.text if not n_items is None else \"NA\"\n", + " d['openapc:apc_amount_avg'] = amount.text if not amount is None else \"NA\"\n", + " d['openapc:apc_amount_avg_currency'] = amount.attrib.get(\"currency\", \"NA\") if not amount is None else \"NA\"\n", + " \n", + " return d" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "7782bea1-2840-4d02-991f-b77d6ae264c1", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dedc1909b0f840f38cb0b191c1779cca", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/4953 [00:00\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ezb-idjournamepublisher_nameE-ISSNsP-ISSNsezb_subjectsezb_categoriesfirst_issuehome_pageappearence...doaj:journal_plagiarism_screening_policydoaj:plagiarism_information_urldoaj:url_for_journal_instructions_for_authorsdoaj:last_updated_datedoaj:average_number_of_weeks_between_article_submission_and_publicationopenapc:periodopenapc:apc_num_itemsopenapc:apc_amount_avgopenapc:apc_amount_avg_currencydoaj:apc_amount
04961591616: Anuario de Literatura ComparadaEdiciones Universidad Salamanca2445-2262[0210-7287][Linguistics and Literature, Romance Studies][Diamond Open Access Journal, Indexed in DOAJ,...2011https://revistas.usal.es/dos/index.php/1616_An...Fulltext, online only...Yeshttp://revistas.usal.es/index.php/1616_Anuario...http://revistas.usal.es/index.php/1616_Anuario...2020-03-09 18:14:024NANANANANaN
113417219 : Interdisciplinary Studies in the Long Nin...School of Arts, Birkbeck College, Univ. of Lon...1755-1560[][Linguistics and Literature][Diamond Open Access Journal, DOAJ Seal, Index...2005https://19.bbk.ac.uk/Fulltext, online only...Yeshttps://19.bbk.ac.uk/site/research-integrity/https://19.bbk.ac.uk/site/author-guidelines/2024-06-24 14:43:2826NANANANANaN
2170939452ºF, The Journal of Literary Theory and Comp...Universitat de Barcelona2013-3294[][Linguistics and Literature][Diamond Open Access Journal, Indexed in DOAJ,...2010http://www.452f.comFulltext, online only...NoNaNhttp://www.452f.com/index.php/en/manual-estilo162024-03-11 10:17:2116NANANANANaN
33370049th Parallel: An Interdisciplinary Journal of...University of Birmingham, Department of Americ...1753-5794[][English, American Studies, History, Political...[Diamond Open Access Journal, Indexed in DOAJ,...1999https://49thparalleljournal.org/Fulltext, online only...NoNaNhttp://49thparalleljournal.org/submissions/2017-04-10 14:45:5624NANANANANaN
4461034A&P Continuidad: Publicación Temática de Arqui...Facultad de Arquitectura, Planeamiento y Diseñ...2362-6097[2362-6089][Architecture, Civil Engineering][Diamond Open Access Journal, Indexed in DOAJ,...2014https://www.ayp.fapyd.unr.edu.ar/index.php/ayp...Fulltext, online and print...NoNaNhttp://www.ayp.fapyd.unr.edu.ar/index.php/ayp/...2019-08-30 10:04:0812NANANANANaN
\n", + "

5 rows × 33 columns

\n", + "" + ], + "text/plain": [ + " ezb-id journame \\\n", + "0 496159 1616: Anuario de Literatura Comparada \n", + "1 134172 19 : Interdisciplinary Studies in the Long Nin... \n", + "2 170939 452ºF, The Journal of Literary Theory and Comp... \n", + "3 33700 49th Parallel: An Interdisciplinary Journal of... \n", + "4 461034 A&P Continuidad: Publicación Temática de Arqui... \n", + "\n", + " publisher_name E-ISSNs P-ISSNs \\\n", + "0 Ediciones Universidad Salamanca 2445-2262 [0210-7287] \n", + "1 School of Arts, Birkbeck College, Univ. of Lon... 1755-1560 [] \n", + "2 Universitat de Barcelona 2013-3294 [] \n", + "3 University of Birmingham, Department of Americ... 1753-5794 [] \n", + "4 Facultad de Arquitectura, Planeamiento y Diseñ... 2362-6097 [2362-6089] \n", + "\n", + " ezb_subjects \\\n", + "0 [Linguistics and Literature, Romance Studies] \n", + "1 [Linguistics and Literature] \n", + "2 [Linguistics and Literature] \n", + "3 [English, American Studies, History, Political... \n", + "4 [Architecture, Civil Engineering] \n", + "\n", + " ezb_categories first_issue \\\n", + "0 [Diamond Open Access Journal, Indexed in DOAJ,... 2011 \n", + "1 [Diamond Open Access Journal, DOAJ Seal, Index... 2005 \n", + "2 [Diamond Open Access Journal, Indexed in DOAJ,... 2010 \n", + "3 [Diamond Open Access Journal, Indexed in DOAJ,... 1999 \n", + "4 [Diamond Open Access Journal, Indexed in DOAJ,... 2014 \n", + "\n", + " home_page \\\n", + "0 https://revistas.usal.es/dos/index.php/1616_An... \n", + "1 https://19.bbk.ac.uk/ \n", + "2 http://www.452f.com \n", + "3 https://49thparalleljournal.org/ \n", + "4 https://www.ayp.fapyd.unr.edu.ar/index.php/ayp... \n", + "\n", + " appearence ... doaj:journal_plagiarism_screening_policy \\\n", + "0 Fulltext, online only ... Yes \n", + "1 Fulltext, online only ... Yes \n", + "2 Fulltext, online only ... No \n", + "3 Fulltext, online only ... No \n", + "4 Fulltext, online and print ... 
No \n", + "\n", + " doaj:plagiarism_information_url \\\n", + "0 http://revistas.usal.es/index.php/1616_Anuario... \n", + "1 https://19.bbk.ac.uk/site/research-integrity/ \n", + "2 NaN \n", + "3 NaN \n", + "4 NaN \n", + "\n", + " doaj:url_for_journal_instructions_for_authors doaj:last_updated_date \\\n", + "0 http://revistas.usal.es/index.php/1616_Anuario... 2020-03-09 18:14:02 \n", + "1 https://19.bbk.ac.uk/site/author-guidelines/ 2024-06-24 14:43:28 \n", + "2 http://www.452f.com/index.php/en/manual-estilo16 2024-03-11 10:17:21 \n", + "3 http://49thparalleljournal.org/submissions/ 2017-04-10 14:45:56 \n", + "4 http://www.ayp.fapyd.unr.edu.ar/index.php/ayp/... 2019-08-30 10:04:08 \n", + "\n", + " doaj:average_number_of_weeks_between_article_submission_and_publication \\\n", + "0 4 \n", + "1 26 \n", + "2 16 \n", + "3 24 \n", + "4 12 \n", + "\n", + " openapc:period openapc:apc_num_items openapc:apc_amount_avg \\\n", + "0 NA NA NA \n", + "1 NA NA NA \n", + "2 NA NA NA \n", + "3 NA NA NA \n", + "4 NA NA NA \n", + "\n", + " openapc:apc_amount_avg_currency doaj:apc_amount \n", + "0 NA NaN \n", + "1 NA NaN \n", + "2 NA NaN \n", + "3 NA NaN \n", + "4 NA NaN \n", + "\n", + "[5 rows x 33 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a1b904d4-68d8-4b4d-8d2c-923aece48519", + "metadata": {}, + "outputs": [], + "source": [ + "sep = \"/\"" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "2dd9b474-9129-4aab-994a-d565f6adb767", + "metadata": {}, + "outputs": [], + "source": [ + "df['E-ISSNs'] = df['E-ISSNs'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")\n", + "df['P-ISSNs'] = df['P-ISSNs'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")\n", + "df['ezb_subjects'] = df['ezb_subjects'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")\n", + "df['ezb_categories'] = 
df['ezb_categories'].apply(lambda x: sep.join(x) if len(x) > 0 else \"\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "40f21e86-3b0a-4128-827f-0c0fa19a67d4", + "metadata": {}, + "outputs": [], + "source": [ + "df.to_csv(\"ezb_journals.csv\", index=False, sep=\";\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.8" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}