37 KiB
37 KiB
In [1]:
import requests, json, os
from typing import List
import xml.etree.ElementTree as ET
from tqdm.notebook import tqdm
import pandas as pd
from IPython.display import JSON as pretty_print
Fetching and retrieving the IDs of all the EZB DOA journals
Parameters¶
From what I understood:
- jq_term → filtering term
- xmloutput → 1 returns the output as XML; any other value returns HTML
- sc → Starting Character of the output list
- sindex → Starting Index to iterate over the xml output journals
- hits_per_page → number of items per page. Default or not specified: 50
documentation: https://ezb.ur.de/services/xmloutput.phtml?bibid=AAAAA&colors=1&lang=en
However, I don't know what FKW means for jq_type (it is not listed in section 6.6 of the documentation).
The same goes for the possible jq_term values: I suppose they are the categories shown in the advanced search > journal categories list (https://ezb.ur.de/search.phtml?bibid=AAAAA&colors=1&lang=en).
In [2]:
# Query-string parameters for the EZB journal search.
# 'FKW' is the search-field type used with the category term below; its exact
# meaning is not covered by the EZB documentation.
jq_type1, jq_term1 = 'FKW', 'Diamond_Open_Access_Journal'
# xmloutput=1 asks the service for XML; lang selects the response language.
xmloutput, lang = 1, 'en'
# Starting character and starting index of the result listing.
sc, sindex = 'A', 0
In [3]:
# Search URL shared by every listing request; the paging parameters
# (sc, sindex) are appended by the functions that use it.
base_page = (
    "https://ezb.ur.de/searchres.phtml"
    f"?jq_type1={jq_type1}&jq_term1={jq_term1}"
    f"&xmloutput={xmloutput}&lang={lang}"
)
In [4]:
def fetch(url, timeout=30):
    """Download ``url`` with HTTP GET and parse the response body as XML.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Without a timeout a stalled connection would block
            the notebook indefinitely.

    Returns:
        The root ``xml.etree.ElementTree.Element`` of the parsed response when
        the status code is 200, otherwise ``None``.
    """
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None
    return ET.fromstring(response.content)
In [5]:
def get_journals_for_page(sc, sindex):
    """Fetch one result page and return the journals listed on it.

    Args:
        sc: Value for the ``sc`` (starting character) query parameter.
        sindex: Value for the ``sindex`` (starting index) query parameter.

    Returns:
        A list of ``{'ezb-id': ..., 'journame': ...}`` dicts, one per
        ``journal`` element found under ``alphabetical_order``. The list is
        empty when the page cannot be fetched or has no such section.
    """
    journals = []
    url = f"{base_page}&sc={sc}&sindex={sindex}"
    root = fetch(url)
    # Compare with None explicitly: an Element's truth value only reflects
    # whether it has children, and testing it is deprecated.
    if root is None:
        return journals
    alphabetical_order = root.find(".//alphabetical_order")
    if alphabetical_order is None:
        return journals
    for journal in alphabetical_order.findall(".//journal"):
        jourid = journal.attrib.get("jourid", "unknown")
        title = journal.find("title")  # looked up once instead of twice
        journals.append({
            'ezb-id': jourid,
            'journame': title.text if title is not None else "",
        })
    return journals
In the XML output, each next_fifty element carries the start index (sindex) of one of the following pages.
In [6]:
def extract_journals_from_section(sc):
    """Collect the journals from every result page of one section.

    The first page of the section is fetched to discover the start indices of
    the remaining pages (one per ``next_fifty`` element); each page is then
    read with ``get_journals_for_page``.

    Args:
        sc: Value for the ``sc`` (starting character) query parameter.

    Returns:
        A list of journal dicts as produced by ``get_journals_for_page``;
        empty when the section's first page cannot be fetched.
    """
    section_journals = []
    root = fetch(f"{base_page}&sc={sc}")
    # fetch() returns None on a non-200 response; bail out instead of failing
    # on root.findall below.
    if root is None:
        return section_journals
    # The first page starts at index 0.
    start_indices = [0]
    for next_fifty in root.findall('.//next_fifty'):
        sindex = next_fifty.attrib.get('sindex')
        # Skip elements without the attribute rather than requesting "sindex=None".
        if sindex is not None:
            start_indices.append(sindex)
    for sindex in tqdm(start_indices):
        section_journals.extend(get_journals_for_page(sc, sindex))
    return section_journals
In [7]:
def iterate_and_extract_journals():
    """Crawl every section of the search result and gather its journals.

    The section identifiers are read from the ``sc`` attribute of each
    ``other_pages`` element on the base page, and every section is then
    processed with ``extract_journals_from_section``.

    Returns:
        A list of journal dicts (duplicates are not removed here). An empty
        list is returned when the base page cannot be fetched, so callers can
        always iterate over the result.
    """
    doa_journals = []
    root = fetch(base_page)
    # Explicit None check: Element truth testing is deprecated and is false
    # for elements without children.
    if root is None:
        return doa_journals
    sections = [
        page.attrib.get("sc")
        for page in root.findall(".//other_pages")
        if page.attrib.get("sc") is not None  # ignore entries without a section id
    ]
    for sc in tqdm(sections):
        doa_journals.extend(extract_journals_from_section(sc))
    return doa_journals
In [8]:
# Run the full crawl: one request per section plus one per result page.
doa_journals = iterate_and_extract_journals()
I noticed that, in the EZB search functionality, the journals starting with Z appear both in the W and in the Z sections, so I'm deduplicating the output.
In [9]:
# Keep only the first record seen for each EZB id, preserving crawl order.
seen_ids = set()
journals = []
for doa in doa_journals:
    id_ = doa['ezb-id']
    if id_ in seen_ids:
        continue
    journals.append(doa)
    seen_ids.add(id_)
Saving journals in a local JSON file
In [12]:
# Persist the deduplicated journal records as JSON.
with open("ezb_doa_journals.json", "w") as journals_file:
    journals_file.write(json.dumps(journals))
Retrieving Journals Info¶
In [10]:
# Detail endpoint (XML output, English); the journal id is appended as
# "&jour_id=<id>" by the loops that use it.
base_url="https://ezb.ur.de/detail.phtml?lang=en&xmloutput=1"
In [11]:
def get_publisher_name(xml: ET.ElementTree) -> str:
    """Return the text of the first ``publisher`` descendant, or "" if absent."""
    node = xml.find(".//publisher")
    if node is None:
        return ""
    return node.text
def get_e_issns(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``E_ISSN`` descendant, in document order."""
    return [node.text for node in xml.findall(".//E_ISSN")]
def get_p_issns(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``P_ISSN`` descendant, in document order."""
    return [node.text for node in xml.findall(".//P_ISSN")]
def get_subjects(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``subject`` descendant, in document order."""
    return [node.text for node in xml.findall(".//subject")]
def get_categories(xml: ET.ElementTree) -> List[str]:
    """Return the text of every ``category`` descendant, in document order."""
    return [node.text for node in xml.findall(".//category")]
def get_first_issue(xml: ET.ElementTree) -> str:
    """Return the text of the first ``first_date`` descendant, or "" if absent."""
    node = xml.find(".//first_date")
    if node is None:
        return ""
    return node.text
def get_home_page(xml: ET.ElementTree) -> str:
    """Return the text of the first ``homepage`` descendant, or "" if absent."""
    node = xml.find(".//homepage")
    if node is None:
        return ""
    return node.text
def get_appearence(xml: ET.ElementTree) -> str:
    """Return the text of the first ``appearence`` descendant, or "" if absent."""
    node = xml.find(".//appearence")
    if node is None:
        return ""
    return node.text
def get_costs(xml: ET.ElementTree) -> str:
    """Return the text of the first ``costs`` descendant, or "" if absent."""
    node = xml.find(".//costs")
    if node is None:
        return ""
    return node.text
def get_access_conditions(xml: ET.ElementTree) -> str:
    """Return the text of the first ``access_conditions`` descendant, or "" if absent."""
    node = xml.find(".//access_conditions")
    if node is None:
        return ""
    return node.text
def get_doaj_info(xml: ET.ElementTree) -> dict:
    """Extract the DOAJ data attached to a journal record.

    Args:
        xml: Parsed journal detail document (or any element within it).

    Returns:
        A dict with ``'doaj:url'`` (the ``url`` attribute of the first
        ``doaj`` descendant, ``None`` if the attribute is missing) plus one
        ``'doaj:<tag>'`` entry per child element holding its text. Namespace
        prefixes of the form ``{uri}`` are stripped from the child tag names.
        An empty dict is returned when there is no ``doaj`` element.
    """
    d = {}
    doaj = xml.find(".//doaj")
    # Explicit None check: an Element without children is falsy, so a plain
    # truth test would drop the url of a childless <doaj url="..."/>.
    if doaj is None:
        return d
    d['doaj:url'] = doaj.attrib.get('url')
    for child in doaj:
        tag = child.tag
        if "}" in tag:
            # ElementTree spells namespaced tags as "{uri}local"; keep "local".
            tag = tag.split("}", 1)[1]
        d[f"doaj:{tag}"] = child.text
    return d
In [12]:
def get_publishings_info(xml: ET.ElementTree) -> List[object]:
    """Return the distinct tag names of the children of the first ``publishing`` element.

    Tags are listed in order of first appearance; the result is empty when
    there is no ``publishing`` element.
    """
    publishing_tag = xml.find(".//publishing")
    if publishing_tag is None:
        return []
    # dict.fromkeys keeps the first occurrence of each tag and preserves order.
    return list(dict.fromkeys(info.tag for info in publishing_tag))
In [13]:
def get_detail_tags(xml: ET.ElementTree) -> List[str]:
    """Return the distinct tag names of the children of the first ``detail`` element.

    Tags are listed in order of first appearance; the result is empty when
    there is no ``detail`` element.
    """
    details = xml.find(".//detail")
    if details is None:
        return []
    # dict.fromkeys keeps the first occurrence of each tag and preserves order.
    return list(dict.fromkeys(child.tag for child in details))
In [14]:
def get_openapc_info(xml: ET.ElementTree) -> dict:
    """Extract the OpenAPC figures attached to a journal record.

    Looks up the ``period``, ``apc_num_items`` and ``apc_amount_avg`` elements
    in the ``https://olap.openapc.net/`` namespace.

    Args:
        xml: Parsed journal detail document (or any element within it).

    Returns:
        A dict with the keys ``'openapc:period'``, ``'openapc:apc_num_items'``,
        ``'openapc:apc_amount_avg'`` and ``'openapc:apc_amount_avg_currency'``.
        Each value is the element's text (or the ``currency`` attribute of
        ``apc_amount_avg``), and ``"NA"`` when the element or attribute is
        missing.
    """
    d = {}
    namespace = {
        'openapc': 'https://olap.openapc.net/'
    }
    period = xml.find(".//openapc:period", namespaces=namespace)
    n_items = xml.find(".//openapc:apc_num_items", namespaces=namespace)
    amount = xml.find(".//openapc:apc_amount_avg", namespaces=namespace)
    d['openapc:period'] = period.text if period is not None else "NA"
    # Guard on n_items itself; checking `period` here raised AttributeError
    # when only the period was present and reported "NA" when only the item
    # count was present.
    d['openapc:apc_num_items'] = n_items.text if n_items is not None else "NA"
    d['openapc:apc_amount_avg'] = amount.text if amount is not None else "NA"
    d['openapc:apc_amount_avg_currency'] = (
        amount.attrib.get("currency", "NA") if amount is not None else "NA"
    )
    return d
In [24]:
# Survey which tag names occur directly under <detail> across all journals.
all_tags = set()
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    # fetch() returns None for a non-200 response; skip that journal instead
    # of failing inside get_detail_tags and aborting the whole survey.
    if jour_xml is None:
        continue
    all_tags.update(get_detail_tags(jour_xml))
In [15]:
# Enrich every journal record in place with the fields from its detail page.
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    if jour_xml is None:
        # fetch() returns None for a non-200 response. Use an empty element so
        # the getters below yield their defaults ("" / [] / "NA") and one
        # failed request does not abort the crawl or leave a record without
        # the columns later cells rely on.
        jour_xml = ET.Element("journal")
    journal['publisher_name'] = get_publisher_name(jour_xml)
    journal['E-ISSNs'] = get_e_issns(jour_xml)
    journal['P-ISSNs'] = get_p_issns(jour_xml)
    journal['ezb_subjects'] = get_subjects(jour_xml)
    journal['ezb_categories'] = get_categories(jour_xml)
    journal['first_issue'] = get_first_issue(jour_xml)
    journal['home_page'] = get_home_page(jour_xml)
    journal['appearence'] = get_appearence(jour_xml)
    journal['costs'] = get_costs(jour_xml)
    journal['access_conditions'] = get_access_conditions(jour_xml)
    # Merge the prefixed "doaj:*" and "openapc:*" entries into the record.
    journal.update(get_doaj_info(jour_xml))
    journal.update(get_openapc_info(jour_xml))
In [32]:
# Map each EZB id to the journal's name, home page and the tag names found
# under its <publishing> element. Needs 'home_page' to be set on each record.
journal_publishing = {}
for journal in tqdm(journals):
    jourid = journal['ezb-id']
    jour_url = f"{base_url}&jour_id={jourid}"
    jour_xml = fetch(jour_url)
    # fetch() returns None for a non-200 response; record an empty tag list
    # rather than failing inside get_publishings_info.
    publishing = get_publishings_info(jour_xml) if jour_xml is not None else []
    journal_publishing[jourid] = {
        'journame': journal['journame'],
        'home_page': journal['home_page'],
        'publishing': publishing,
    }
In [22]:
# Write the journal records to disk as JSON.
with open("ezb-doa-journals.json", "w") as journals_file:
    journals_file.write(json.dumps(journals))
In [16]:
# One row per journal record; the dict keys become the columns.
df = pd.DataFrame(journals)
In [26]:
# Preview the first rows of the table.
df.head()
Out[26]:
In [23]:
# Separator used when flattening list-valued columns into a single string.
sep = "/"
In [25]:
# Flatten the list-valued columns into `sep`-joined strings for the CSV export.
# Only list/tuple cells are joined, so re-running this cell leaves
# already-joined strings untouched instead of joining their characters, and
# missing cells pass through unchanged. None entries (empty XML elements) are
# skipped because str.join rejects them.
for list_column in ('E-ISSNs', 'P-ISSNs', 'ezb_subjects', 'ezb_categories'):
    df[list_column] = df[list_column].apply(
        lambda values: sep.join(v for v in values if v is not None)
        if isinstance(values, (list, tuple))
        else values
    )
In [32]:
# Export the table as a semicolon-separated CSV; index=None suppresses the row index column.
df.to_csv("ezb_journals.csv", index=None, sep=";")