In [1]:
import json
import numpy as np
import pandas as pd
import requests
import dask.dataframe as dd
import multiprocessing as mp

In [2]:
df = pd.DataFrame.from_dict(json.loads(open('../data/interim/urls.json').read()), orient='index', columns=['redirect'])
df = df.reset_index()
df.rename(columns={'index': 'url'}, inplace=True)
df.drop('redirect', axis=1, inplace=True)
df

Unnamed: 0,url
0,http://sidc.be/silso/home
1,https://bmrb.io/
2,http://www.pac.dfo-mpo.gc.ca/science/oceans/da...
3,https://www.cpc.ncep.noaa.gov/
4,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...
...,...
13352,http://radar.brookes.ac.uk/radar/access/home.do
13353,http://www.ub.uni-siegen.de/opus
13354,http://numerique.bibliotheque.toulouse.fr/cgi-...
13355,https://www.utas.edu.au/library


In [3]:
df.describe()

Unnamed: 0,url
count,13357
unique,13357
top,http://sidc.be/silso/home
freq,1


In [4]:
def resolve(url):
    print(url)
    timeout = 30
    head_final_code = head_url = head_chain = head_codes = head_elapsed = head_error = get_final_code = get_url = get_chain = get_codes = get_elapsed = get_error = ''

    try:
        head = requests.head(url, timeout=timeout, allow_redirects=True)
        head_final_code = head.status_code
        head_url = head.url
        head_chain = [res.url for res in head.history]
        head_codes = [res.status_code for res in head.history]
        head_elapsed = head.elapsed.total_seconds()
    except Exception as e:
        head_error = e
    
    try:
        get = requests.get(url, timeout=timeout, allow_redirects=True)
        get_final_code = get.status_code
        get_url = get.url
        get_chain = [res.url for res in get.history]
        get_codes = [res.status_code for res in get.history]
        get_elapsed = get.elapsed.total_seconds()
    except Exception as e:
        get_error = e

    return dict(url=url,
                head_final_code=head_final_code, 
                head_url=head_url, 
                head_chain=head_chain, 
                head_codes=head_codes, 
                head_elapsed=head_elapsed, 
                head_error=head_error, 
                get_final_code=get_final_code, 
                get_url=get_url, 
                get_chain=get_chain, 
                get_codes=get_codes,
                get_elapsed=get_elapsed, 
                get_error=get_error)

In [5]:
# resolve('https://isoarch.eu/')
# resolve('https://doi.org/10.1007/978-3-030-30760-8_15')

In [6]:
ddf = dd.from_pandas(df, npartitions=mp.cpu_count())

ddf = ddf.map_partitions(lambda df: df.apply((lambda row: resolve(row['url'])), axis=1)).compute(scheduler='processes')
ddf


foo
foo
http://sidc.be/silso/home
http://escholarship.amherst.edu/
http://digitalcommons.fau.edu/
http://dspace.lib.fcu.edu.tw/
https://bmrb.io/
http://www.pac.dfo-mpo.gc.ca/science/oceans/data-donnees/index-eng.html
https://www.cpc.ncep.noaa.gov/
http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-bin/index.cgi?org=as
https://www.geochron.org/
http://ipt.taibif.tw/
https://www.ars-grin.gov/
https://data.seaiceportal.de
http://arabidopsis.info
https://isoarch.eu/
http://digitalcommons.libraries.columbia.edu/
http://210.72.88.198:8080/
http://epi.minsal.cl/bases-de-datos/
https://tbindia.indiancst.com/GPMSTBTransportal/
https://www.chinare.org.cn/en/
https://brainbase.io
https://obis.org/
https://portal.conp.ca
https://aeronet.gsfc.nasa.gov/new_web/data.html
https://arch.library.northwestern.edu/
https://www.cvrgrid.org/
https://codeocean.com
https://disc.gsfc.nasa.gov
http://metabolonote.kazusa.or.jp
https://www.ncbi.nlm.nih.gov/genbank/
https://www.nist.gov/pml/atomic-spectra-database
https:

0        {'url': 'http://sidc.be/silso/home', 'head_fin...
1        {'url': 'https://bmrb.io/', 'head_final_code':...
2        {'url': 'http://www.pac.dfo-mpo.gc.ca/science/...
3        {'url': 'https://www.cpc.ncep.noaa.gov/', 'hea...
4        {'url': 'http://cosmos.bot.kyoto-u.ac.jp/acyto...
                               ...                        
13352    {'url': 'http://radar.brookes.ac.uk/radar/acce...
13353    {'url': 'http://www.ub.uni-siegen.de/opus', 'h...
13354    {'url': 'http://numerique.bibliotheque.toulous...
13355    {'url': 'https://www.utas.edu.au/library', 'he...
13356    {'url': 'http://204.232.200.153:8080/jspui/', ...
Length: 13357, dtype: object

In [13]:
ddf = ddf.apply(pd.Series)
ddf

Unnamed: 0,url,head_final_code,head_redirect,head_chain,head_codes,head_elapsed,head_error,get_final_code,get_redirect,get_chain,get_codes,get_elapsed,get_error
0,http://sidc.be/silso/home,200,https://www.sidc.be/silso/home,[http://sidc.be/silso/home],[302],0.163961,,200,https://www.sidc.be/silso/home,[http://sidc.be/silso/home],[302],0.153598,
1,https://bmrb.io/,200,https://bmrb.io/,[],[],0.544952,,200,https://bmrb.io/,[],[],0.486676,
2,http://www.pac.dfo-mpo.gc.ca/science/oceans/da...,200,https://www.pac.dfo-mpo.gc.ca/science/oceans/d...,[http://www.pac.dfo-mpo.gc.ca/science/oceans/d...,[307],0.684978,,200,https://www.pac.dfo-mpo.gc.ca/science/oceans/d...,[http://www.pac.dfo-mpo.gc.ca/science/oceans/d...,[307],0.513675,
3,https://www.cpc.ncep.noaa.gov/,200,https://www.cpc.ncep.noaa.gov/,[],[],0.50544,,200,https://www.cpc.ncep.noaa.gov/,[],[],0.502208,
4,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,404,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,[],[],1.260253,,404,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,[],[],0.822833,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13352,http://radar.brookes.ac.uk/radar/access/home.do,200,https://radar.brookes.ac.uk/radar/home.do,[http://radar.brookes.ac.uk/radar/access/home....,"[302, 302, 302]",0.071604,,200,https://radar.brookes.ac.uk/radar/home.do,[http://radar.brookes.ac.uk/radar/access/home....,"[302, 302, 302]",0.082388,
13353,http://www.ub.uni-siegen.de/opus,200,https://www.ub.uni-siegen.de/opus/,"[http://www.ub.uni-siegen.de/opus, https://www...","[301, 301]",0.039016,,200,https://www.ub.uni-siegen.de/opus/,"[http://www.ub.uni-siegen.de/opus, https://www...","[301, 301]",0.039533,
13354,http://numerique.bibliotheque.toulouse.fr/cgi-...,,,,,,None: Max retries exceeded with url: /cgi-bin/...,,,,,,None: Max retries exceeded with url: /cgi-bin/...
13355,https://www.utas.edu.au/library,200,https://www.utas.edu.au/library,[],[],0.222858,,200,https://www.utas.edu.au/library,[],[],0.212977,


In [14]:
ddf.apply(pd.Series).describe()

Unnamed: 0,url,head_final_code,head_redirect,head_chain,head_codes,head_elapsed,head_error,get_final_code,get_redirect,get_chain,get_codes,get_elapsed,get_error
count,13357,13357,13357.0,13357,13357,13357.0,13357.0,13357,13357.0,13357,13357,13357.0,13357.0
unique,13357,19,9313.0,4038,60,10865.0,2457.0,18,9329.0,4065,57,10883.0,2438.0
top,http://sidc.be/silso/home,200,,[],[],,,200,,[],[],,
freq,1,9952,2456.0,6593,6593,2456.0,10901.0,10108,2437.0,6583,6583,2437.0,10920.0


In [15]:
ddf.apply(pd.Series).to_csv('../data/processed/redirects_dask.tsv', sep='\t', index=False)

In [None]:
ddf