DiamondOA/ezb-downloader.ipynb

37 KiB
Raw Blame History

In [1]:
import requests, json, os
from typing import List
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
import pandas as pd
from IPython.display import JSON as pretty_print

Fetching and retrieving the id of all the EZB DOA JOURNALS

Parameters

From what I understood:

  • jq_term → filtering term
  • xmloutput → 1 returns the result as XML; any other value returns HTML
  • sc → Starting Character of the output list
  • sindex → Starting Index to iterate over the xml output journals
  • hits_per_page → number of items per page. Default or not specified: 50

documentation: https://ezb.ur.de/services/xmloutput.phtml?bibid=AAAAA&colors=1&lang=en

However, I don't know what FKW means as a jq_type value (it is not listed in #6.6 of the documentation).

The same goes for the possible jq_term values: I suppose they are the categories shown in the advanced search > journal categories list (https://ezb.ur.de/search.phtml?bibid=AAAAA&colors=1&lang=en).

In [2]:
# Query parameters for the EZB search endpoint (see the parameter notes above).
jq_type1 = 'FKW'                          # search field type; meaning of FKW is undocumented
jq_term1 = 'Diamond_Open_Access_Journal'  # filtering term
xmloutput = 1                             # 1 -> XML output
lang = 'en'
sc = 'A'                                  # starting character of the output list
sindex = 0                                # starting index within the list
In [3]:
# Search-result URL shared by every request; the page selectors (&sc=..., &sindex=...)
# are appended by the functions that use it.
base_page = (
    "https://ezb.ur.de/searchres.phtml"
    f"?jq_type1={jq_type1}"
    f"&jq_term1={jq_term1}"
    f"&xmloutput={xmloutput}"
    f"&lang={lang}"
)
In [4]:
def fetch(url, timeout=30):
    """Download ``url`` with HTTP GET and parse the response body as XML.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a stalled connection would block the notebook indefinitely.

    Returns
    -------
    xml.etree.ElementTree.Element or None
        Root element of the parsed document, or ``None`` when the server
        answers with a status code other than 200.

    Raises
    ------
    requests.exceptions.RequestException
        On network errors, including a timeout.
    xml.etree.ElementTree.ParseError
        When the body is not well-formed XML.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return ET.fromstring(response.content)
In [5]:
def get_journals_for_page(sc, sindex):
    """Fetch one result page and return the journals listed on it.

    Parameters
    ----------
    sc : str
        Value for the ``sc`` query parameter (section of the result list).
    sindex : int or str
        Value for the ``sindex`` query parameter (start index of the page).

    Returns
    -------
    list of dict
        One ``{'ezb-id': ..., 'journame': ...}`` entry per ``journal`` element
        found under ``alphabetical_order``; empty when the page cannot be
        fetched or contains no such element.
    """
    url = f"{base_page}&sc={sc}&sindex={sindex}"
    root = fetch(url)
    # Compare with None explicitly: the truth value of an Element depends on
    # whether it has children, and testing it is deprecated.
    if root is None:
        return []
    alphabetical_order = root.find(".//alphabetical_order")
    if alphabetical_order is None:
        return []
    journals = []
    for journal in alphabetical_order.findall(".//journal"):
        title = journal.find("title")  # looked up once instead of twice
        journals.append({
            'ezb-id': journal.attrib.get("jourid", "unknown"),
            'journame': title.text if title is not None else "",
        })
    return journals

In the XML output, each next_fifty element carries the start index (sindex) of one of the following pages.

In [6]:
def extract_journals_from_section(sc):
    """Collect the journals of every page belonging to one section.

    The first page of the section is fetched to discover the start indices of
    the remaining pages (``sindex`` attribute of each ``next_fifty`` element);
    every page, starting with index 0, is then read through
    ``get_journals_for_page``.

    Parameters
    ----------
    sc : str
        Value for the ``sc`` query parameter identifying the section.

    Returns
    -------
    list of dict
        Journals of all pages in the section; empty when the first page
        cannot be fetched.
    """
    section_journals = []
    root = fetch(f"{base_page}&sc={sc}")
    # fetch() returns None on a non-200 answer; without this guard the
    # findall() below would raise AttributeError.
    if root is None:
        return section_journals
    sindices = [0]
    for next_fifty in root.findall('.//next_fifty'):
        sindex = next_fifty.attrib.get('sindex')
        # Skip elements without the attribute instead of requesting "sindex=None".
        if sindex is not None:
            sindices.append(sindex)
    for sindex in tqdm(sindices):
        section_journals.extend(get_journals_for_page(sc, sindex))
    return section_journals
In [7]:
def iterate_and_extract_journals():
    """Crawl every section of the search result and return all journals found.

    The base page is fetched to read the ``sc`` attribute of each
    ``other_pages`` element; each section is then crawled with
    ``extract_journals_from_section``.

    Returns
    -------
    list of dict
        Journals of all sections, in crawl order (duplicates are not removed).
        An empty list — rather than ``None`` — is returned when the base page
        cannot be fetched, so callers can always iterate over the result.
    """
    root = fetch(base_page)
    if root is None:
        return []
    sections = [
        page.attrib.get("sc")
        for page in root.findall(".//other_pages")
        # Ignore entries without an "sc" attribute; they cannot be addressed.
        if page.attrib.get("sc") is not None
    ]
    doa_journals = []
    for sc in tqdm(sections):
        doa_journals.extend(extract_journals_from_section(sc))
    return doa_journals
In [8]:
# Run the full crawl: one request per section plus one per result page.
doa_journals = iterate_and_extract_journals()

I noticed that, in the EZB search functionality, the journals starting with Z appear both in W and in Z, so I'm deduplicating the output.

In [9]:
# Keep only the first occurrence of each ezb-id, preserving crawl order.
seen_ids = set()
journals = []
for doa in doa_journals:
    id_ = doa['ezb-id']
    if id_ in seen_ids:
        continue
    seen_ids.add(id_)
    journals.append(doa)

Saving journals in a local JSON file

In [12]:
# Write the deduplicated journal list to disk as JSON.
with open("ezb_doa_journals.json", "w") as f:
    f.write(json.dumps(journals))

Retrieving Journals Info

In [10]:
# Journal detail endpoint (XML output); "&jour_id=<id>" is appended per journal.
base_url = "https://ezb.ur.de/detail.phtml?lang=en&xmloutput=1"
In [11]:
def get_publisher_name(xml: ET.ElementTree) -> str:
    """Return the text of the first ``publisher`` element, or ``""`` if there is none."""
    node = xml.find(".//publisher")
    if node is None:
        return ""
    return node.text

def get_e_issns(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``E_ISSN`` element, in document order."""
    return [node.text for node in xml.findall(".//E_ISSN")]

def get_p_issns(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``P_ISSN`` element, in document order."""
    return [node.text for node in xml.findall(".//P_ISSN")]

def get_subjects(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``subject`` element, in document order."""
    return [node.text for node in xml.findall(".//subject")]

def get_categories(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``category`` element, in document order."""
    return [node.text for node in xml.findall(".//category")]

def get_first_issue(xml: ET.ElementTree) -> str:
    """Return the text of the first ``first_date`` element, or ``""`` if there is none."""
    first_date = xml.find(".//first_date")
    return "" if first_date is None else first_date.text

def get_home_page(xml: ET.ElementTree) -> str:
    """Return the text of the first ``homepage`` element, or ``""`` if there is none."""
    node = xml.find(".//homepage")
    if node is None:
        return ""
    return node.text

def get_appearence(xml: ET.ElementTree) -> str:
    """Return the text of the first ``appearence`` element, or ``""`` if there is none."""
    node = xml.find(".//appearence")
    return "" if node is None else node.text

def get_costs(xml: ET.ElementTree) -> str:
    """Return the text of the first ``costs`` element, or ``""`` if there is none."""
    node = xml.find(".//costs")
    if node is None:
        return ""
    return node.text

def get_access_conditions(xml: ET.ElementTree) -> str:
    """Return the text of the first ``access_conditions`` element, or ``""`` if there is none."""
    node = xml.find(".//access_conditions")
    return "" if node is None else node.text

def get_doaj_info(xml: ET.ElementTree) -> dict:
    """Extract the DOAJ metadata attached to a journal record.

    Parameters
    ----------
    xml
        Parsed journal document (anything offering ``find``).

    Returns
    -------
    dict
        Empty when the document has no ``doaj`` element. Otherwise it holds
        ``'doaj:url'`` (the element's ``url`` attribute, ``None`` if absent)
        plus one ``'doaj:<tag>'`` entry per child element, mapped to the
        child's text, with any ``{namespace}`` prefix removed from the tag.
    """
    d = {}
    doaj = xml.find(".//doaj")
    # Explicit None check: an Element without children is falsy, so a plain
    # `if doaj:` would drop the url of a childless <doaj url="..."/> element.
    if doaj is None:
        return d
    d['doaj:url'] = doaj.attrib.get('url')
    for child in doaj:
        # ElementTree spells qualified names as "{namespace}local"; keep "local".
        tag = child.tag.split("}", 1)[-1]
        d[f"doaj:{tag}"] = child.text
    return d
In [12]:
def get_publishings_info(xml: ET.ElementTree) -> List[object]:
    """List the distinct child tag names of the first ``publishing`` element.

    Tags are returned in first-seen order; the list is empty when the
    document has no ``publishing`` element.
    """
    publishing_tag = xml.find(".//publishing")
    if publishing_tag is None:
        return []
    # dict.fromkeys drops repeated tags while keeping first-seen order.
    return list(dict.fromkeys(child.tag for child in publishing_tag))
In [13]:
def get_detail_tags(xml: ET.ElementTree) -> List[str]:
    """List the distinct child tag names of the first ``detail`` element.

    Tags are returned in first-seen order; the list is empty when the
    document has no ``detail`` element.
    """
    details = xml.find(".//detail")
    children = () if details is None else details
    # A dict keyed by tag removes duplicates and remembers insertion order.
    unique_tags = dict.fromkeys(child.tag for child in children)
    return list(unique_tags)
In [14]:
def get_openapc_info(xml: ET.ElementTree) -> dict:
    """Extract the OpenAPC figures of a journal record.

    Looks up the ``period``, ``apc_num_items`` and ``apc_amount_avg`` elements
    in the ``https://olap.openapc.net/`` namespace.

    Returns
    -------
    dict
        Keys ``'openapc:period'``, ``'openapc:apc_num_items'``,
        ``'openapc:apc_amount_avg'`` and ``'openapc:apc_amount_avg_currency'``
        (the ``currency`` attribute of the amount element). Each value is
        ``"NA"`` when its own source element (or attribute) is missing.
    """
    namespace = {
        'openapc': 'https://olap.openapc.net/'
    }
    period = xml.find(".//openapc:period", namespaces=namespace)
    n_items = xml.find(".//openapc:apc_num_items", namespaces=namespace)
    amount = xml.find(".//openapc:apc_amount_avg", namespaces=namespace)
    return {
        'openapc:period': period.text if period is not None else "NA",
        # Guard on n_items itself; testing `period` here raised AttributeError
        # (period present, count missing) or hid an existing count.
        'openapc:apc_num_items': n_items.text if n_items is not None else "NA",
        'openapc:apc_amount_avg': amount.text if amount is not None else "NA",
        'openapc:apc_amount_avg_currency': amount.attrib.get("currency", "NA") if amount is not None else "NA",
    }
In [24]:
# Collect the distinct child tag names found under <detail> across all journals.
all_tags = set()
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    # fetch() returns None on a non-200 answer; skip that journal instead of
    # letting get_detail_tags() fail on None and abort the whole survey.
    if jour_xml is None:
        continue
    all_tags.update(get_detail_tags(jour_xml))
In [15]:
# Enrich every journal record (in place) with the fields of its detail page.
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    if jour_xml is None:
        # fetch() returns None on a non-200 answer. Parsing an empty element
        # makes every getter return its "missing" default, so the record keeps
        # the same keys as the others instead of the loop dying half-way.
        print(f"warning: detail page for journal {jourid} not available; filling empty values")
        jour_xml = ET.Element("journal")
    journal['publisher_name'] = get_publisher_name(jour_xml)
    journal['E-ISSNs'] = get_e_issns(jour_xml)
    journal['P-ISSNs'] = get_p_issns(jour_xml)
    journal['ezb_subjects'] = get_subjects(jour_xml)
    journal['ezb_categories'] = get_categories(jour_xml)
    journal['first_issue'] = get_first_issue(jour_xml)
    journal['home_page'] = get_home_page(jour_xml)
    journal['appearence'] = get_appearence(jour_xml)
    journal['costs'] = get_costs(jour_xml)
    journal['access_conditions'] = get_access_conditions(jour_xml)
    # The doaj:* and openapc:* entries are merged straight into the record.
    doaj = get_doaj_info(jour_xml)
    journal.update(doaj)
    openapc = get_openapc_info(jour_xml)
    journal.update(openapc)
In [32]:
# Map each journal id to its name, home page and the tag names found under <publishing>.
journal_publishing = {}
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    # fetch() returns None on a non-200 answer; record an empty list then
    # rather than failing inside get_publishings_info().
    publishing = get_publishings_info(jour_xml) if jour_xml is not None else []
    journal_publishing[jourid] = {
        'journame': journal['journame'],
        'home_page': journal['home_page'],
        'publishing': publishing,
    }
In [22]:
# Write the journal records to disk as JSON.
with open("ezb-doa-journals.json", "w") as f:
    f.write(json.dumps(journals))
In [16]:
# One row per journal record; the dict keys become the columns.
df = pd.DataFrame(journals)
In [26]:
# Preview the first rows of the table.
df.head()
Out[26]:
ezb-id journame publisher_name E-ISSNs P-ISSNs ezb_subjects ezb_categories first_issue home_page appearence ... doaj:journal_plagiarism_screening_policy doaj:plagiarism_information_url doaj:url_for_journal_instructions_for_authors doaj:last_updated_date doaj:average_number_of_weeks_between_article_submission_and_publication openapc:period openapc:apc_num_items openapc:apc_amount_avg openapc:apc_amount_avg_currency doaj:apc_amount
0 496159 1616: Anuario de Literatura Comparada Ediciones Universidad Salamanca 2445-2262 [0210-7287] [Linguistics and Literature, Romance Studies] [Diamond Open Access Journal, Indexed in DOAJ,... 2011 https://revistas.usal.es/dos/index.php/1616_An... Fulltext, online only ... Yes http://revistas.usal.es/index.php/1616_Anuario... http://revistas.usal.es/index.php/1616_Anuario... 2020-03-09 18:14:02 4 NA NA NA NA NaN
1 134172 19 : Interdisciplinary Studies in the Long Nin... School of Arts, Birkbeck College, Univ. of Lon... 1755-1560 [] [Linguistics and Literature] [Diamond Open Access Journal, DOAJ Seal, Index... 2005 https://19.bbk.ac.uk/ Fulltext, online only ... Yes https://19.bbk.ac.uk/site/research-integrity/ https://19.bbk.ac.uk/site/author-guidelines/ 2024-06-24 14:43:28 26 NA NA NA NA NaN
2 170939 452ºF, The Journal of Literary Theory and Comp... Universitat de Barcelona 2013-3294 [] [Linguistics and Literature] [Diamond Open Access Journal, Indexed in DOAJ,... 2010 http://www.452f.com Fulltext, online only ... No NaN http://www.452f.com/index.php/en/manual-estilo16 2024-03-11 10:17:21 16 NA NA NA NA NaN
3 33700 49th Parallel: An Interdisciplinary Journal of... University of Birmingham, Department of Americ... 1753-5794 [] [English, American Studies, History, Political... [Diamond Open Access Journal, Indexed in DOAJ,... 1999 https://49thparalleljournal.org/ Fulltext, online only ... No NaN http://49thparalleljournal.org/submissions/ 2017-04-10 14:45:56 24 NA NA NA NA NaN
4 461034 A&P Continuidad: Publicación Temática de Arqui... Facultad de Arquitectura, Planeamiento y Diseñ... 2362-6097 [2362-6089] [Architecture, Civil Engineering] [Diamond Open Access Journal, Indexed in DOAJ,... 2014 https://www.ayp.fapyd.unr.edu.ar/index.php/ayp... Fulltext, online and print ... No NaN http://www.ayp.fapyd.unr.edu.ar/index.php/ayp/... 2019-08-30 10:04:08 12 NA NA NA NA NaN

5 rows × 33 columns

In [23]:
# Separator used when joining multi-valued fields into a single string.
sep = "/"
In [25]:
# Flatten the list-valued columns into `sep`-separated strings.
# Only list values are joined: str.join on an already-joined string would put
# `sep` between its characters, so this keeps the cell safe to re-run.
# None entries (elements without text) are skipped because join() rejects them.
for column in ('E-ISSNs', 'P-ISSNs', 'ezb_subjects', 'ezb_categories'):
    df[column] = df[column].apply(
        lambda x: sep.join(item for item in x if item is not None) if isinstance(x, list) else x
    )
In [32]:
# Export as semicolon-separated CSV without the row index.
# `index` is documented as a bool; False states the intent explicitly instead of
# relying on None being treated as falsy.
df.to_csv("ezb_journals.csv", index=False, sep=";")