In [2]:
import json
import numpy as np
import pandas as pd
import requests
import ast


In [2]:
# Load the URL list. urls.json is parsed as a mapping; with orient='index' its keys
# become the frame's index, which is then turned into the 'url' column. The mapped
# values are only loaded into a throwaway 'redirect' column that is dropped again.
with open('../data/interim/urls.json') as urls_file:  # context manager closes the handle
    url_map = json.load(urls_file)

# One chained expression instead of repeated in-place mutation, so re-running the
# cell always rebuilds the same single-column frame from the file.
df = (
    pd.DataFrame.from_dict(url_map, orient='index', columns=['redirect'])
    .reset_index()
    .rename(columns={'index': 'url'})
    .drop('redirect', axis=1)
)
df

Unnamed: 0,url
0,http://sidc.be/silso/home
1,https://bmrb.io/
2,http://www.pac.dfo-mpo.gc.ca/science/oceans/da...
3,https://www.cpc.ncep.noaa.gov/
4,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...
...,...
13352,http://radar.brookes.ac.uk/radar/access/home.do
13353,http://www.ub.uni-siegen.de/opus
13354,http://numerique.bibliotheque.toulouse.fr/cgi-...
13355,https://www.utas.edu.au/library


In [3]:
# Summary of the 'url' column (count / unique / top / freq), e.g. to check for duplicates.
df.describe()

Unnamed: 0,url
count,13357
unique,13357
top,http://sidc.be/silso/home
freq,1


In [8]:
def _probe(method, url, timeout):
    """Send one request via ``requests.<method>`` and summarise where it ended up.

    Args:
        method: Name of the module-level ``requests`` function to call
            (``'head'`` or ``'get'``).
        url: Address to request.
        timeout: Value forwarded as the request's ``timeout`` argument.

    Returns:
        dict with the keys ``final_code``, ``url``, ``chain``, ``codes``,
        ``elapsed`` and ``error``. Every value starts out as ``''``. On success
        the first five are filled from the response; if anything raises,
        ``error`` holds the caught exception object and fields not yet set
        stay ``''``.
    """
    outcome = dict.fromkeys(('final_code', 'url', 'chain', 'codes', 'elapsed', 'error'), '')
    try:
        response = getattr(requests, method)(url, timeout=timeout, allow_redirects=True)
        outcome['final_code'] = response.status_code
        outcome['url'] = response.url
        # response.history lists the responses that were followed before the final one.
        outcome['chain'] = [hop.url for hop in response.history]
        outcome['codes'] = [hop.status_code for hop in response.history]
        outcome['elapsed'] = response.elapsed.total_seconds()
    except Exception as exc:
        # Deliberately broad: one unreachable or misbehaving host must not abort a
        # long batch run, so the failure is recorded instead of propagated.
        outcome['error'] = exc
    return outcome


def resolve(url, timeout=30):
    """Request ``url`` with HEAD and then GET, following redirects, and report both.

    Args:
        url: Address to check.
        timeout: Timeout passed to each of the two requests (default 30, the
            value that was previously hard-coded).

    Returns:
        dict with ``url`` followed by ``head_*`` and ``get_*`` entries for
        ``final_code``, ``url``, ``chain``, ``codes``, ``elapsed`` and
        ``error`` (in that order). Fields that could not be determined are
        ``''``; ``*_error`` is ``''`` on success and the caught exception
        object otherwise. This function itself does not raise for request
        failures.
    """
    print(url)  # progress trace while iterating over many URLs
    result = {'url': url}
    for method in ('head', 'get'):
        for field, value in _probe(method, url, timeout).items():
            result[method + '_' + field] = value
    return result

In [9]:
# Try resolve() on a single DOI link to inspect the shape of the returned dict.
resolve('https://doi.org/10.1007/978-3-030-30760-8_15')

https://doi.org/10.1007/978-3-030-30760-8_15


{'url': 'https://doi.org/10.1007/978-3-030-30760-8_15',
 'head_final_code': 200,
 'head_url': 'https://link.springer.com/chapter/10.1007/978-3-030-30760-8_15',
 'head_chain': ['https://doi.org/10.1007/978-3-030-30760-8_15',
  'http://link.springer.com/10.1007/978-3-030-30760-8_15',
  'https://link.springer.com/chapter/10.1007/978-3-030-30760-8_15',
  'https://idp.springer.com/authorize?response_type=cookie&client_id=springerlink&redirect_uri=https%3A%2F%2Flink.springer.com%2Fchapter%2F10.1007%2F978-3-030-30760-8_15'],
 'head_codes': [302, 301, 303, 302],
 'head_elapsed': 1.312367,
 'head_error': '',
 'get_final_code': 200,
 'get_url': 'https://link.springer.com/chapter/10.1007/978-3-030-30760-8_15',
 'get_chain': ['https://doi.org/10.1007/978-3-030-30760-8_15',
  'http://link.springer.com/10.1007/978-3-030-30760-8_15',
  'https://link.springer.com/chapter/10.1007/978-3-030-30760-8_15',
  'https://idp.springer.com/authorize?response_type=cookie&client_id=springerlink&redirect_uri=https%

In [8]:
# Resolve every URL that has no GET result recorded yet. On a fresh frame the
# 'get_final_code' column does not exist, so all rows are pending; when the cell is
# re-run after an interruption only rows still holding NaN there are processed.
if 'get_final_code' in df.columns:
    pending = df.index[df['get_final_code'].isna()]
else:
    pending = df.index

for label in pending:
    outcome = resolve(df.at[label, 'url'])
    for field, value in outcome.items():
        # Everything is stored through str(), so lists and exception objects
        # end up in the frame as their text form.
        df.at[label, field] = str(value)

http://sidc.be/silso/home
https://bmrb.io/
http://www.pac.dfo-mpo.gc.ca/science/oceans/data-donnees/index-eng.html
https://www.cpc.ncep.noaa.gov/
http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-bin/index.cgi?org=as
https://www.geochron.org/
http://ipt.taibif.tw/
https://www.ars-grin.gov/
https://data.seaiceportal.de
http://arabidopsis.info
https://isoarch.eu/
http://epi.minsal.cl/bases-de-datos/
https://tbindia.indiancst.com/GPMSTBTransportal/
https://www.chinare.org.cn/en/
https://brainbase.io
https://obis.org/
https://portal.conp.ca
https://aeronet.gsfc.nasa.gov/new_web/data.html
https://arch.library.northwestern.edu/
https://www.cvrgrid.org/
https://codeocean.com
https://disc.gsfc.nasa.gov
http://metabolonote.kazusa.or.jp
https://www.ncbi.nlm.nih.gov/genbank/
https://www.nist.gov/pml/atomic-spectra-database
https://www.trackhubregistry.org/
http://www.flyrnai.org/
http://www.hgvd.genome.med.kyoto-u.ac.jp/
https://fantom.gsc.riken.jp/
https://ladsweb.modaps.eosdis.nasa.gov/
http://archi

In [47]:
# Display the frame with the per-URL resolution columns.
# NOTE(review): the saved output lists 'head_redirect'/'get_redirect' columns, while
# resolve() returns 'head_url'/'get_url' -- the output looks stale; re-run to confirm.
df

Unnamed: 0,url,head_final_code,head_redirect,head_chain,head_codes,head_elapsed,head_error,get_final_code,get_redirect,get_chain,get_codes,get_elapsed,get_error
0,http://sidc.be/silso/home,200,https://www.sidc.be/silso/home,['http://sidc.be/silso/home'],[302],0.151548,,200,https://www.sidc.be/silso/home,['http://sidc.be/silso/home'],[302],0.145799,
1,https://bmrb.io/,200,https://bmrb.io/,[],[],0.527131,,200,https://bmrb.io/,[],[],0.487402,
2,http://www.pac.dfo-mpo.gc.ca/science/oceans/da...,200,https://www.pac.dfo-mpo.gc.ca/science/oceans/d...,['http://www.pac.dfo-mpo.gc.ca/science/oceans/...,[307],0.530466,,200,https://www.pac.dfo-mpo.gc.ca/science/oceans/d...,['http://www.pac.dfo-mpo.gc.ca/science/oceans/...,[307],0.510097,
3,https://www.cpc.ncep.noaa.gov/,200,https://www.cpc.ncep.noaa.gov/,[],[],0.507708,,200,https://www.cpc.ncep.noaa.gov/,[],[],0.492151,
4,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,404,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,[],[],0.586088,,404,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,[],[],0.54267,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
13352,http://radar.brookes.ac.uk/radar/access/home.do,200,https://radar.brookes.ac.uk/radar/home.do,['http://radar.brookes.ac.uk/radar/access/home...,"[302, 302, 302]",0.082406,,200,https://radar.brookes.ac.uk/radar/home.do,['http://radar.brookes.ac.uk/radar/access/home...,"[302, 302, 302]",0.088766,
13353,http://www.ub.uni-siegen.de/opus,200,https://www.ub.uni-siegen.de/opus/,"['http://www.ub.uni-siegen.de/opus', 'https://...","[301, 301]",0.056328,,200,https://www.ub.uni-siegen.de/opus/,"['http://www.ub.uni-siegen.de/opus', 'https://...","[301, 301]",0.039649,
13354,http://numerique.bibliotheque.toulouse.fr/cgi-...,,,,,,HTTPConnectionPool(host='numerique.bibliothequ...,,,,,,HTTPConnectionPool(host='numerique.bibliothequ...
13355,https://www.utas.edu.au/library,200,https://www.utas.edu.au/library,[],[],0.223856,,200,https://www.utas.edu.au/library,[],[],0.595001,


In [12]:
# Per-column count / unique / top / freq of the resolution results.
df.describe()

Unnamed: 0,url,head_final_code,head_redirect,head_chain,head_codes,head_elapsed,head_error,get_final_code,get_redirect,get_chain,get_codes,get_elapsed,get_error
count,13357,13357,13357.0,13357,13357,13357.0,13357.0,13357,13357.0,13357,13357,13357.0,13357.0
unique,13357,19,9332.0,4057,60,10894.0,2301.0,18,9354.0,4083,58,10907.0,2303.0
top,http://sidc.be/silso/home,200,,[],[],,,200,,[],[],,
freq,1,9976,2431.0,6596,6596,2431.0,10926.0,10139,2403.0,6595,6595,2403.0,10954.0


In [42]:
# Persist the frame as tab-separated values, leaving out the index column.
df.to_csv('../data/processed/redirects_new.tsv', sep='\t', index=False)

In [20]:
# Spot-check a URL for which HEAD and GET end at different final URLs (see the output).
resolve('http://content.wsulibs.wsu.edu')

http://content.wsulibs.wsu.edu


{'url': 'http://content.wsulibs.wsu.edu',
 'head_final_code': 404,
 'head_url': 'https://www.oclc.org/url/?404;http://content.wsulibs.wsu.edu/',
 'head_chain': ['http://content.wsulibs.wsu.edu/'],
 'head_codes': [302],
 'head_elapsed': 1.345008,
 'head_error': '',
 'get_final_code': 404,
 'get_url': 'https://www.oclc.org/url/notfound',
 'get_chain': ['http://content.wsulibs.wsu.edu/',
  'https://www.oclc.org/url/?404;http://content.wsulibs.wsu.edu/'],
 'get_codes': [302, 302],
 'get_elapsed': 0.192602,
 'get_error': ''}