scholarly_repo_availability/notebooks/03.1-analysis.ipynb

295 KiB
Raw Blame History

In [1]:
import json
import requests
import ast

import numpy as np
import pandas as pd
In [2]:
df = pd.read_csv('../data/processed/redirects_new.tsv', sep='\t')
df = df[df.url.notna()]
df
Out[2]:
url head_final_code head_url head_chain head_codes head_elapsed head_error get_final_code get_url get_chain get_codes get_elapsed get_error
0 http://sidc.be/silso/home 200.0 https://www.sidc.be/silso/home ['http://sidc.be/silso/home'] [302] 0.151548 NaN 200.0 https://www.sidc.be/silso/home ['http://sidc.be/silso/home'] [302] 0.145799 NaN
1 https://bmrb.io/ 200.0 https://bmrb.io/ [] [] 0.527131 NaN 200.0 https://bmrb.io/ [] [] 0.487402 NaN
2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... 200.0 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... ['http://www.pac.dfo-mpo.gc.ca/science/oceans/... [307] 0.530466 NaN 200.0 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... ['http://www.pac.dfo-mpo.gc.ca/science/oceans/... [307] 0.510097 NaN
3 https://www.cpc.ncep.noaa.gov/ 200.0 https://www.cpc.ncep.noaa.gov/ [] [] 0.507708 NaN 200.0 https://www.cpc.ncep.noaa.gov/ [] [] 0.492151 NaN
4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 404.0 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... [] [] 0.586088 NaN 404.0 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... [] [] 0.542670 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
13352 http://radar.brookes.ac.uk/radar/access/home.do 200.0 https://radar.brookes.ac.uk/radar/home.do ['http://radar.brookes.ac.uk/radar/access/home... [302, 302, 302] 0.082406 NaN 200.0 https://radar.brookes.ac.uk/radar/home.do ['http://radar.brookes.ac.uk/radar/access/home... [302, 302, 302] 0.088766 NaN
13353 http://www.ub.uni-siegen.de/opus 200.0 https://www.ub.uni-siegen.de/opus/ ['http://www.ub.uni-siegen.de/opus', 'https://... [301, 301] 0.056328 NaN 200.0 https://www.ub.uni-siegen.de/opus/ ['http://www.ub.uni-siegen.de/opus', 'https://... [301, 301] 0.039649 NaN
13354 http://numerique.bibliotheque.toulouse.fr/cgi-... NaN NaN NaN NaN NaN HTTPConnectionPool(host='numerique.bibliothequ... NaN NaN NaN NaN NaN HTTPConnectionPool(host='numerique.bibliothequ...
13355 https://www.utas.edu.au/library 200.0 https://www.utas.edu.au/library [] [] 0.223856 NaN 200.0 https://www.utas.edu.au/library [] [] 0.595001 NaN
13356 http://204.232.200.153:8080/jspui/ NaN NaN NaN NaN NaN HTTPConnectionPool(host='204.232.200.153', por... NaN NaN NaN NaN NaN HTTPConnectionPool(host='204.232.200.153', por...

13356 rows × 13 columns

In [3]:
df.describe(include='all')
Out[3]:
url head_final_code head_url head_chain head_codes head_elapsed head_error get_final_code get_url get_chain get_codes get_elapsed get_error
count 13356 10926.000000 10926 10926 10926 10926.000000 2430 10954.00000 10954 10954 10954 10954.000000 2402
unique 13356 NaN 9331 4056 59 NaN 2299 NaN 9353 4082 57 NaN 2301
top http://sidc.be/silso/home NaN https://www.unit.no/ugyldig-lenke-til-dokument... [] [] NaN ('Connection aborted.', RemoteDisconnected('Re... NaN https://www.unit.no/ugyldig-lenke-til-dokument... [] [] NaN ('Connection aborted.', RemoteDisconnected('Re...
freq 1 NaN 16 6596 6596 NaN 34 NaN 16 6595 6595 NaN 22
mean NaN 218.406736 NaN NaN NaN 1.124777 NaN 215.73407 NaN NaN NaN 1.125515 NaN
std NaN 61.035428 NaN NaN NaN 3.103610 NaN 56.89142 NaN NaN NaN 3.184153 NaN
min NaN 200.000000 NaN NaN NaN 0.013261 NaN 200.00000 NaN NaN NaN 0.013800 NaN
25% NaN 200.000000 NaN NaN NaN 0.276535 NaN 200.00000 NaN NaN NaN 0.254967 NaN
50% NaN 200.000000 NaN NaN NaN 0.583665 NaN 200.00000 NaN NaN NaN 0.527238 NaN
75% NaN 200.000000 NaN NaN NaN 1.059370 NaN 200.00000 NaN NaN NaN 0.944213 NaN
max NaN 504.000000 NaN NaN NaN 61.235589 NaN 504.00000 NaN NaN NaN 61.500941 NaN

Final status codes

HEAD

In [4]:
df[['url', 'head_final_code']].groupby('head_final_code').count()
Out[4]:
url
head_final_code
200.0 9976
201.0 19
400.0 19
401.0 14
403.0 205
404.0 489
405.0 50
406.0 20
410.0 15
412.0 1
419.0 1
429.0 1
444.0 1
500.0 47
501.0 3
502.0 15
503.0 49
504.0 1
In [5]:
final_head = df[['url', 'head_final_code']]
final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)
final_head.groupby('class').count()
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/3455765756.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)
Out[5]:
url head_final_code
class
200.0 9995 9995
400.0 816 816
500.0 115 115
In [6]:
final_head.groupby('class').count().sum()
Out[6]:
url                10926
head_final_code    10926
dtype: int64
In [7]:
final_head.groupby('class').count().div(df.shape[0]).mul(100)
Out[7]:
url head_final_code
class
200.0 74.835280 74.835280
400.0 6.109614 6.109614
500.0 0.861036 0.861036

GET

In [8]:
df[['url', 'get_final_code']].groupby('get_final_code').count()
Out[8]:
url
get_final_code
200.0 10139
201.0 19
400.0 8
401.0 13
403.0 168
404.0 464
405.0 1
406.0 21
410.0 15
412.0 1
419.0 1
429.0 1
444.0 1
500.0 40
502.0 13
503.0 48
504.0 1
In [9]:
final_get = df[['url', 'get_final_code']]
final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)
final_get.groupby('class').count()
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1946842175.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)
Out[9]:
url get_final_code
class
200.0 10158 10158
400.0 694 694
500.0 102 102
In [10]:
final_get.groupby('class').count().sum()
Out[10]:
url               10954
get_final_code    10954
dtype: int64
In [11]:
final_get.groupby('class').count().div(df.shape[0]).mul(100)
Out[11]:
url get_final_code
class
200.0 76.055705 76.055705
400.0 5.196167 5.196167
500.0 0.763702 0.763702

Errors

In [12]:
head_errors = df[df.head_error.notna()][['url','head_error']]
get_errors = df[df.get_error.notna()][['url','get_error']]
In [13]:
head_errors[head_errors.head_error.str.contains('Max')]
Out[13]:
url head_error
10 https://isoarch.eu/ HTTPSConnectionPool(host='isoarch.eu', port=44...
13 https://www.chinare.org.cn/en/ HTTPSConnectionPool(host='www.chinare.org.cn',...
26 http://www.flyrnai.org/ HTTPSConnectionPool(host='www.flyrnai.org', po...
71 http://alternaria.vbi.vt.edu/index.html HTTPConnectionPool(host='alternaria.vbi.vt.edu...
83 http://www.pazar.info/ HTTPConnectionPool(host='www.pazar.info', port...
... ... ...
13344 http://intranet.thomasdanby.ac.uk/dspace/ HTTPConnectionPool(host='intranet.thomasdanby....
13345 http://dspace.dgsca.unam.mx:8080/dspace/ HTTPConnectionPool(host='dspace.dgsca.unam.mx'...
13347 https://library.for.nau.edu:8443/ HTTPSConnectionPool(host='library.for.nau.edu'...
13354 http://numerique.bibliotheque.toulouse.fr/cgi-... HTTPConnectionPool(host='numerique.bibliothequ...
13356 http://204.232.200.153:8080/jspui/ HTTPConnectionPool(host='204.232.200.153', por...

2242 rows × 2 columns

In [14]:
get_errors[get_errors.get_error.str.contains('Max')]
Out[14]:
url get_error
10 https://isoarch.eu/ HTTPSConnectionPool(host='isoarch.eu', port=44...
13 https://www.chinare.org.cn/en/ HTTPSConnectionPool(host='www.chinare.org.cn',...
26 http://www.flyrnai.org/ HTTPSConnectionPool(host='www.flyrnai.org', po...
57 http://stellabase.org HTTPConnectionPool(host='cnidarians.bu.edu', p...
71 http://alternaria.vbi.vt.edu/index.html HTTPConnectionPool(host='alternaria.vbi.vt.edu...
... ... ...
13344 http://intranet.thomasdanby.ac.uk/dspace/ HTTPConnectionPool(host='intranet.thomasdanby....
13345 http://dspace.dgsca.unam.mx:8080/dspace/ HTTPConnectionPool(host='dspace.dgsca.unam.mx'...
13347 https://library.for.nau.edu:8443/ HTTPSConnectionPool(host='library.for.nau.edu'...
13354 http://numerique.bibliotheque.toulouse.fr/cgi-... HTTPConnectionPool(host='numerique.bibliothequ...
13356 http://204.232.200.153:8080/jspui/ HTTPConnectionPool(host='204.232.200.153', por...

2256 rows × 2 columns

In [15]:
head_errors[head_errors.head_error.str.contains('30 redirect')]
Out[15]:
url head_error
485 https://idr.openmicroscopy.org Exceeded 30 redirects.
1191 http://hgtree.snu.ac.kr Exceeded 30 redirects.
1462 http://cfgp.snu.ac.kr Exceeded 30 redirects.
2526 https://incois.gov.in/tropflux/index.jsp Exceeded 30 redirects.
2550 https://odis.incois.gov.in/ Exceeded 30 redirects.
3872 https://www.projectdatasphere.org/projectdatas... Exceeded 30 redirects.
6141 http://repository.poligran.edu.co/ Exceeded 30 redirects.
10841 http://library.wur.nl/way/ Exceeded 30 redirects.
In [16]:
get_errors[get_errors.get_error.str.contains('30 redirect')]
Out[16]:
url get_error
6141 http://repository.poligran.edu.co/ Exceeded 30 redirects.

Testing Ukranian repositories

In [17]:
get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1742693288.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
Out[17]:
url get_error
4892 http://eprints.isofts.kiev.ua/ ('Connection aborted.', ConnectionResetError(5...
4943 http://www.library.ukma.kiev.ua/dspace/ HTTPConnectionPool(host='www.library.ukma.kiev...
4983 http://repository.ibss.org.ua/dspace/ HTTPConnectionPool(host='repository.ibss.org.u...
5147 http://www.rarelib.undp.org.ua/eng/index.php3 HTTPConnectionPool(host='www.rarelib.undp.org....
5817 http://openarchive.nure.ua/ HTTPSConnectionPool(host='openarchive.nure.ua'...
5827 http://dspace.uabs.edu.ua/jspui/ HTTPConnectionPool(host='dspace.uabs.edu.ua', ...
6135 http://kgmtu.edu.ua/jspui/ HTTPConnectionPool(host='kgmtu.edu.ua', port=8...
6231 http://dspace.uccu.org.ua/ HTTPConnectionPool(host='dspace.uccu.org.ua', ...
6474 http://repo.uipa.edu.ua/jspui/ HTTPConnectionPool(host='repo.uipa.edu.ua', po...
7067 http://ir.asta.edu.ua/jspui/ HTTPConnectionPool(host='ir.asta.edu.ua', port...
7490 http://dspace.hnpu.edu.ua/ HTTPConnectionPool(host='dspace.hnpu.edu.ua', ...
7989 http://repository.mdu.in.ua/jspui/ HTTPConnectionPool(host='repository.mdu.in.ua'...
8085 http://elar.tsatu.edu.ua/ HTTPConnectionPool(host='elar.tsatu.edu.ua', p...
8105 http://dspace.zsea.edu.ua/ HTTPSConnectionPool(host='dspace.znu.edu.ua', ...
8197 http://dspace.knau.kharkov.ua/jspui/ HTTPConnectionPool(host='dspace.knau.kharkov.u...
8199 http://rep.btsau.edu.ua HTTPConnectionPool(host='rep.btsau.edu.ua', po...
8206 https://biblio.onat.edu.ua/ HTTPSConnectionPool(host='biblio.onat.edu.ua',...
8419 https://rep.ksma.ks.ua HTTPSConnectionPool(host='rep.ksma.ks.ua', por...
10016 http://eprints.ksame.kharkov.ua/ HTTPConnectionPool(host='eprints.ksame.kharkov...
10107 http://dspace.uabs.edu.ua/ HTTPConnectionPool(host='dspace.uabs.edu.ua', ...
10799 http://repository.ibss.org.ua/dspace HTTPConnectionPool(host='repository.ibss.org.u...
10906 http://repo.sau.sumy.ua/ HTTPConnectionPool(host='repo.sau.sumy.ua', po...
10976 http://www.dspace.humanities.org.ua/dspace/ HTTPConnectionPool(host='www.dspace.humanities...
11616 http:// visnykagro.knau.kharkov.ua HTTPConnectionPool(host='%20visnykagro.knau.kh...
11803 http://repository.mdu.in.ua/ HTTPConnectionPool(host='repository.mdu.in.ua'...
11831 http://sj-dstu.dp.ua/ HTTPConnectionPool(host='sj-dstu.dp.ua', port=...
12107 http://www.hups.mil.gov.ua/periodic-app/journal HTTPConnectionPool(host='www.hups.mil.gov.ua',...
12746 http://www.e-archive.org.ua/ HTTPConnectionPool(host='www.e-archive.org.ua'...
12753 http://library.khnu.km.ua/jspui/ HTTPConnectionPool(host='library.khnu.km.ua', ...
12916 http://dspace.ukrfa.kharkov.ua/ HTTPConnectionPool(host='dspace.ukrfa.kharkov....
12922 http://dspace.ukrfa.kharkov.ua HTTPConnectionPool(host='dspace.ukrfa.kharkov....
12970 http://evrica.org.ua HTTPConnectionPool(host='evrica.org.ua', port=...
12992 http://dspace.academy.sumy.ua/ HTTPConnectionPool(host='dspace.academy.sumy.u...
13017 http://open-archive.kture.kharkov.ua/ HTTPConnectionPool(host='open-archive.kture.kh...
13034 http://dspace.onu.edu.ua/ HTTPConnectionPool(host='192.168.30.200', port...
13052 http://open-arhive.kture.kharkov.ua HTTPConnectionPool(host='open-arhive.kture.kha...
In [18]:
get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)].count()
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2005047697.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)].count()
Out[18]:
url          36
get_error    36
dtype: int64
In [19]:
df[df.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2148841299.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.
  df[df.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
Out[19]:
url head_final_code head_url head_chain head_codes head_elapsed head_error get_final_code get_url get_chain get_codes get_elapsed get_error
2185 http://inform.ikd.kiev.ua/index.php?path=/en/i... 200.0 http://inform.ikd.kiev.ua/?path=/en/index ['http://inform.ikd.kiev.ua/index.php?path=/en... [301] 0.402488 NaN 200.0 http://inform.ikd.kiev.ua/?path=/en/index ['http://inform.ikd.kiev.ua/index.php?path=/en... [301] 0.452971 NaN
2286 http://wdc.org.ua/ 200.0 http://wdc.org.ua/ [] [] 0.905063 NaN 200.0 http://wdc.org.ua/ [] [] 0.816887 NaN
4892 http://eprints.isofts.kiev.ua/ NaN NaN NaN NaN NaN ('Connection aborted.', ConnectionResetError(5... NaN NaN NaN NaN NaN ('Connection aborted.', ConnectionResetError(5...
4943 http://www.library.ukma.kiev.ua/dspace/ NaN NaN NaN NaN NaN HTTPConnectionPool(host='www.library.ukma.kiev... NaN NaN NaN NaN NaN HTTPConnectionPool(host='www.library.ukma.kiev...
4950 http://www.dspace.ucu.edu.ua/dspace/ 403.0 http://www.dspace.ucu.edu.ua/dspace/ [] [] 0.073373 NaN 403.0 http://www.dspace.ucu.edu.ua/dspace/ [] [] 0.073223 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
13015 http://ir.kneu.edu.ua 200.0 http://ir.kneu.edu.ua/ [] [] 1.609602 NaN 200.0 http://ir.kneu.edu.ua/ [] [] 1.508854 NaN
13017 http://open-archive.kture.kharkov.ua/ NaN NaN NaN NaN NaN HTTPConnectionPool(host='open-archive.kture.kh... NaN NaN NaN NaN NaN HTTPConnectionPool(host='open-archive.kture.kh...
13034 http://dspace.onu.edu.ua/ NaN NaN NaN NaN NaN HTTPConnectionPool(host='192.168.30.200', port... NaN NaN NaN NaN NaN HTTPConnectionPool(host='192.168.30.200', port...
13052 http://open-arhive.kture.kharkov.ua NaN NaN NaN NaN NaN HTTPConnectionPool(host='open-arhive.kture.kha... NaN NaN NaN NaN NaN HTTPConnectionPool(host='open-arhive.kture.kha...
13248 http://www.ekmair.ukma.edu.ua 200.0 http://ekmair.ukma.edu.ua/ ['http://www.ekmair.ukma.edu.ua/'] [301] 0.661039 NaN 200.0 http://ekmair.ukma.edu.ua/ ['http://www.ekmair.ukma.edu.ua/'] [301] 0.755170 NaN

146 rows × 13 columns

Redirects

In [20]:
head_chains = df[df.head_codes.notna()][['url', 'head_url', 'head_final_code', 'head_codes']]
head_chains['len'] = head_chains.head_codes.apply(ast.literal_eval).str.len()
head_chains.groupby('len').count()
Out[20]:
url head_url head_final_code head_codes
len
0 6596 6596 6596 6596
1 3364 3364 3364 3364
2 686 686 686 686
3 233 233 233 233
4 37 37 37 37
5 10 10 10 10
In [21]:
get_chains = df[df.get_codes.notna()][['url', 'get_url', 'get_final_code', 'get_codes']]
get_chains['len'] = get_chains.get_codes.apply(ast.literal_eval).str.len()
get_chains.groupby('len').count()
Out[21]:
url get_url get_final_code get_codes
len
0 6595 6595 6595 6595
1 3376 3376 3376 3376
2 699 699 699 699
3 229 229 229 229
4 43 43 43 43
5 12 12 12 12
In [22]:
data = head_chains.groupby('len').count().join(get_chains.groupby('len').count(), lsuffix='_head', rsuffix='_get')
data.index.rename('# redirects', inplace=True)
data.rename(columns={'url_head': 'HEAD', 'url_get': 'GET'}, inplace=True)
data[['HEAD', 'GET']][1:].plot.bar(rot=0)
Out[22]:
<AxesSubplot:xlabel='# redirects'>
In [23]:
data[['HEAD', 'GET']][1:].sum()
Out[23]:
HEAD    4330
GET     4359
dtype: int64
In [24]:
data[['HEAD', 'GET']][1:].sum().div(df.shape[0]).mul(100)
Out[24]:
HEAD    32.419886
GET     32.637017
dtype: float64
In [25]:
head_chains['class'] = head_chains.head_final_code.floordiv(100).mul(100)
head_chains[(head_chains.len > 0) & (head_chains['class'] != 200)]
In [26]:
head_chains[(head_chains.len > 0) & (head_chains['class'] != 200)].groupby('class').count()
Out[26]:
url head_url head_final_code head_codes len
class
400.0 338 338 338 338 338
500.0 40 40 40 40 40
In [27]:
get_chains['class'] = get_chains.get_final_code.floordiv(100).mul(100)
get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)]
In [28]:
get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)].groupby('class').count()
Out[28]:
url get_url get_final_code get_codes len
class
400.0 307 307 307 307 307
500.0 37 37 37 37 37

Redirect reason

In [50]:
head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') == head_chains.head_url)]
In [51]:
head_chains[(head_chains.len == 1) & (head_chains.url + '/' == head_chains.head_url)]
In [53]:
head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') + '/' == head_chains.head_url)]
In [56]:
get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') == get_chains.get_url)]
In [57]:
get_chains[(get_chains.len == 1) & (get_chains.url + '/' == get_chains.get_url)]
In [58]:
get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') + '/' == get_chains.get_url)]

Redirect mismatch

In [29]:
df[(df.head_url != df.get_url) & (df.head_url.notna()) & (df.get_url.notna())]
Out[29]:
url head_final_code head_url head_chain head_codes head_elapsed head_error get_final_code get_url get_chain get_codes get_elapsed get_error
127 https://models.physiomeproject.org 200.0 https://models.physiomeproject.org/ [] [] 0.658525 NaN 200.0 https://models.physiomeproject.org/welcome ['https://models.physiomeproject.org/'] [302] 0.575681 NaN
371 https://rdp.utas.edu.au 200.0 https://login.microsoftonline.com/15eb5d55-199... ['https://rdp.utas.edu.au/'] [302] 0.297592 NaN 200.0 https://login.microsoftonline.com/15eb5d55-199... ['https://rdp.utas.edu.au/'] [302] 0.285739 NaN
479 https://sciflection.com 200.0 https://sciflection.com/ [] [] 0.176450 NaN 200.0 https://sciflection.com/main ['https://sciflection.com/', 'http://sciflecti... [302, 301] 0.391234 NaN
684 http://www.ntrr-nti.org/ 405.0 http://www.ntrr-nti.org/ [] [] 1.133437 NaN 200.0 https://www.nattrauma.org/research/ntrr/ ['http://www.ntrr-nti.org/'] [301] 2.539869 NaN
697 http://www.jcm.riken.go.jp/ 200.0 https://jcm.brc.riken.jp/en/ ['http://www.jcm.riken.go.jp/', 'https://www.j... [301, 301, 301, 301] 0.506354 NaN 200.0 https://jcm.brc.riken.jp/en/?doing_wp_cron=165... ['http://www.jcm.riken.go.jp/', 'https://www.j... [301, 301, 301, 301, 302] 0.630979 NaN
... ... ... ... ... ... ... ... ... ... ... ... ... ...
13132 http://hik.diva-portal.org/smash/search.jsf 200.0 http://lnu.diva-portal.org/smash/search.jsf?ds... ['http://hik.diva-portal.org/smash/search.jsf'... [301, 302] 0.287171 NaN 200.0 http://lnu.diva-portal.org/smash/search.jsf?ds... ['http://hik.diva-portal.org/smash/search.jsf'... [301, 302] 0.291231 NaN
13133 http://hgo.diva-portal.org/smash/search.jsf 200.0 http://uu.diva-portal.org/smash/search.jsf?dsw... ['http://hgo.diva-portal.org/smash/search.jsf'... [301, 302] 0.292740 NaN 200.0 http://uu.diva-portal.org/smash/search.jsf?dsw... ['http://hgo.diva-portal.org/smash/search.jsf'... [301, 302] 0.289726 NaN
13266 http://www.diva-portal.org/gih/ 200.0 http://gih.diva-portal.org/smash/search.jsf?ds... ['http://www.diva-portal.org/gih/', 'http://gi... [301, 302] 0.301519 NaN 200.0 http://gih.diva-portal.org/smash/search.jsf?ds... ['http://www.diva-portal.org/gih/', 'http://gi... [301, 302] 2.143105 NaN
13283 http://www.compscipreprints.com/ 200.0 http://survey-smiles.com ['http://www.compscipreprints.com/'] [302] 0.257651 NaN 200.0 http://www.compscipreprints.com/ [] [] 0.093911 NaN
13320 http://trinity.cdmhost.com/index.php 404.0 https://www.oclc.org/url/?404;http://trinity.c... ['http://trinity.cdmhost.com/index.php'] [302] 0.589119 NaN 404.0 https://www.oclc.org/url/notfound ['http://trinity.cdmhost.com/index.php', 'http... [302, 302] 0.139563 NaN

146 rows × 13 columns

URL conflation

In [31]:
head_chains.describe(include='all')
Out[31]:
url head_url head_final_code head_codes len class
count 10926 10926 10926.000000 10926 10926.000000 10926.000000
unique 10926 9331 NaN 59 NaN NaN
top http://sidc.be/silso/home https://www.unit.no/ugyldig-lenke-til-dokument... NaN [] NaN NaN
freq 1 16 NaN 6596 NaN NaN
mean NaN NaN 218.406736 NaN 0.515559 218.094454
std NaN NaN 61.035428 NaN 0.749850 60.063069
min NaN NaN 200.000000 NaN 0.000000 200.000000
25% NaN NaN 200.000000 NaN 0.000000 200.000000
50% NaN NaN 200.000000 NaN 0.000000 200.000000
75% NaN NaN 200.000000 NaN 1.000000 200.000000
max NaN NaN 504.000000 NaN 5.000000 500.000000
In [32]:
head_chains.groupby('head_url').count().sort_values('url', ascending=False).head(50)
Out[32]:
url head_final_code head_codes len class
head_url
https://www.unit.no/ugyldig-lenke-til-dokument-i-vitenarkiv 16 16 16 16 16
https://www.nlm.nih.gov/toxnet/index.html 6 6 6 6 6
http://survey-smiles.com 6 6 6 6 6
https://nda.nih.gov/ 5 5 5 5 5
https://digitalrepository.unm.edu/ 4 4 4 4 4
https://ir.vanderbilt.edu/ 4 4 4 4 4
https://ddd.uab.cat/ 4 4 4 4 4
https://intellectum.unisabana.edu.co/ 4 4 4 4 4
https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 4 4 4
https://pure.iiasa.ac.at/ 4 4 4 4 4
http://eprints.nottingham.ac.uk/ 4 4 4 4 4
https://research.usc.edu.au/esploro/ 3 3 3 3 3
https://dataverse.harvard.edu/ 3 3 3 3 3
https://epub.wupperinst.org/home 3 3 3 3 3
https://pure.mpg.de/ 3 3 3 3 3
https://dlib.york.ac.uk/yodl/app/home/index 3 3 3 3 3
https://archiv.ub.uni-marburg.de/ubfind/ 3 3 3 3 3
https://eprints.unife.it/ 3 3 3 3 3
https://dataspace.princeton.edu/ 3 3 3 3 3
https://eprints.lib.hokudai.ac.jp/ 3 3 3 3 3
https://pub.uni-bielefeld.de/ 3 3 3 3 3
https://aquadocs.org/ 3 3 3 3 3
https://dataverse.tdl.org/ 3 3 3 3 3
https://uwspace.uwaterloo.ca/ 3 3 3 3 3
https://research.utwente.nl/ 3 3 3 3 3
https://datadryad.org/stash 3 3 3 3 3
https://er.knutd.edu.ua/ 3 3 3 3 3
https://esango.cput.ac.za/ 3 3 3 3 3
https://repositorio.fucsalud.edu.co/ 3 3 3 3 3
https://repository.up.ac.za/ 3 3 3 3 3
https://repository.usta.edu.co/ 3 3 3 3 3
https://repositori.upf.edu/ 3 3 3 3 3
https://bibliotecadigital.exactas.uba.ar/ 3 3 3 3 3
https://repo.odmu.edu.ua/xmlui/ 3 3 3 3 3
https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 3 3 3
https://data.gov.uk/ 3 3 3 3 3
https://mru.arcabc.ca/ 3 3 3 3 3
https://munin.uit.no/ 3 3 3 3 3
https://red.uao.edu.co/ 3 3 3 3 3
https://research-repository.st-andrews.ac.uk/ 3 3 3 3 3
https://research-repository.uwa.edu.au/ 3 3 3 3 3
https://research.cbs.dk/ 3 3 3 3 3
https://webext.pasteur.fr/genolist/ 3 3 3 3 3
https://prism.ucalgary.ca/ 3 3 3 3 3
https://air.unimi.it/ 3 3 3 3 3
http://rua.ua.es/dspace/ 3 3 3 3 3
https://dial.uclouvain.be/ 3 3 3 3 3
https://sc.lib.miamioh.edu/ 3 3 3 3 3
https://dspace.ankara.edu.tr/xmlui/ 3 3 3 3 3
https://scholar.smu.edu/ 3 3 3 3 3
In [33]:
get_chains.groupby('get_url').count().sort_values('url', ascending=False).head(50)
Out[33]:
url get_final_code get_codes len class
get_url
https://www.unit.no/ugyldig-lenke-til-dokument-i-vitenarkiv 16 16 16 16 16
https://www.oclc.org/url/notfound 7 7 7 7 7
https://www.nlm.nih.gov/toxnet/index.html 6 6 6 6 6
https://nda.nih.gov/ 5 5 5 5 5
https://digitalrepository.unm.edu/ 4 4 4 4 4
https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 4 4 4
https://intellectum.unisabana.edu.co/ 4 4 4 4 4
https://pure.iiasa.ac.at/ 4 4 4 4 4
https://ir.vanderbilt.edu/ 4 4 4 4 4
http://eprints.nottingham.ac.uk/ 4 4 4 4 4
https://ddd.uab.cat/ 4 4 4 4 4
https://mru.arcabc.ca/ 3 3 3 3 3
https://research.usc.edu.au/esploro/ 3 3 3 3 3
https://aquadocs.org/ 3 3 3 3 3
https://uwspace.uwaterloo.ca/ 3 3 3 3 3
https://pub.uni-bielefeld.de/ 3 3 3 3 3
https://dataverse.tdl.org/ 3 3 3 3 3
https://eprints.lib.hokudai.ac.jp/ 3 3 3 3 3
https://archiv.ub.uni-marburg.de/ubfind/ 3 3 3 3 3
https://eprints.unife.it/ 3 3 3 3 3
https://repositorio.fucsalud.edu.co/ 3 3 3 3 3
https://research.utwente.nl/ 3 3 3 3 3
https://repository.up.ac.za/ 3 3 3 3 3
https://pure.mpg.de/ 3 3 3 3 3
https://epub.wupperinst.org/home 3 3 3 3 3
https://archive.researchdata.leeds.ac.uk/ 3 3 3 3 3
https://repository.usta.edu.co/ 3 3 3 3 3
https://dataverse.harvard.edu/ 3 3 3 3 3
https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 3 3 3
https://er.knutd.edu.ua/ 3 3 3 3 3
https://dataspace.princeton.edu/ 3 3 3 3 3
https://repositori.upf.edu/ 3 3 3 3 3
https://bibliotecadigital.exactas.uba.ar/ 3 3 3 3 3
https://prism.ucalgary.ca/ 3 3 3 3 3
https://research.cbs.dk/ 3 3 3 3 3
https://webext.pasteur.fr/genolist/ 3 3 3 3 3
https://ngdc.cncb.ac.cn/idog/ 3 3 3 3 3
https://research-repository.uwa.edu.au/ 3 3 3 3 3
https://research-repository.st-andrews.ac.uk/ 3 3 3 3 3
https://repo.odmu.edu.ua/xmlui/ 3 3 3 3 3
https://red.uao.edu.co/ 3 3 3 3 3
https://data.gov.uk/ 3 3 3 3 3
https://munin.uit.no/ 3 3 3 3 3
https://datadryad.org/stash 3 3 3 3 3
http://rua.ua.es/dspace/ 3 3 3 3 3
http://dspace.uevora.pt/rdpc/ 3 3 3 3 3
https://sc.lib.miamioh.edu/ 3 3 3 3 3
https://papyrus.bib.umontreal.ca/xmlui/ 3 3 3 3 3
https://scholar.smu.edu/ 3 3 3 3 3
https://dro.deakin.edu.au/ 3 3 3 3 3
In [34]:
head_conflation = head_chains.groupby('head_url').aggregate(list)
head_conflation
In [59]:
head_conflation.to_csv('../data/processed/head_conflation.tsv', sep='\t')
In [71]:
head_conflation[head_conflation.url.str.len() > 1]
Out[71]:
url head_final_code head_codes len class
head_url
http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... [200.0, 200.0] 302], [ [1, 0] [200.0, 200.0]
http://160.97.80.9:8080/jspui/ [http://160.97.80.9:8080/jspui/, http://160.97... [404.0, 404.0] ], [302 [0, 1] [400.0, 400.0]
http://acikarsiv.atauni.edu.tr/ [http://acikarsiv.atauni.edu.tr/, http://acika... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]
http://acikarsiv.atilim.edu.tr/ [http://acikarsiv.atilim.edu.tr/, http://acika... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]
http://acikerisim.afsu.edu.tr/xmlui/ [http://acikerisim.afsu.edu.tr, http://acikeri... [200.0, 200.0] 302, 302], [302, 302 [2, 2] [200.0, 200.0]
... ... ... ... ... ...
https://zaguan.unizar.es/ [http://zaguan.unizar.es/, http://zaguan.uniza... [200.0, 200.0] 301], [301 [1, 1] [200.0, 200.0]
https://zbc.ksiaznica.szczecin.pl/dlibra [http://zbc.ksiaznica.szczecin.pl/dlibra, http... [200.0, 200.0] 302], [302, 302 [1, 2] [200.0, 200.0]
https://zenodo.org/ [https://zenodo.org/, http://zenodo.org/] [200.0, 200.0] ], [301 [0, 1] [200.0, 200.0]
https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]
https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]

1456 rows × 5 columns

In [77]:
head_conflation[head_conflation.url.str.len() > 1].url.apply(len).sum()
Out[77]:
3051
In [61]:
get_conflation = get_chains.groupby('get_url').aggregate(list)
get_conflation
In [62]:
get_conflation.to_csv('../data/processed/get_conflation.tsv', sep='\t')
In [72]:
get_conflation[get_conflation.url.str.len() > 1]
Out[72]:
url get_final_code get_codes len class
get_url
http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... [200.0, 200.0] 302], [ [1, 0] [200.0, 200.0]
http://160.97.80.9:8080/jspui/ [http://160.97.80.9:8080/jspui/, http://160.97... [404.0, 404.0] ], [302 [0, 1] [400.0, 400.0]
http://acikarsiv.atauni.edu.tr/ [http://acikarsiv.atauni.edu.tr/, http://acika... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]
http://acikarsiv.atilim.edu.tr/ [http://acikarsiv.atilim.edu.tr/, http://acika... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]
http://acikerisim.afsu.edu.tr/xmlui/ [http://acikerisim.afsu.edu.tr, http://acikeri... [200.0, 200.0] 302, 302], [302, 302 [2, 2] [200.0, 200.0]
... ... ... ... ... ...
https://zaguan.unizar.es/ [http://zaguan.unizar.es/, http://zaguan.uniza... [200.0, 200.0] 301], [301 [1, 1] [200.0, 200.0]
https://zbc.ksiaznica.szczecin.pl/dlibra [http://zbc.ksiaznica.szczecin.pl/dlibra, http... [200.0, 200.0] 302], [302, 302 [1, 2] [200.0, 200.0]
https://zenodo.org/ [https://zenodo.org/, http://zenodo.org/] [200.0, 200.0] ], [301 [0, 1] [200.0, 200.0]
https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]
https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... [200.0, 200.0] ], [ [0, 0] [200.0, 200.0]

1459 rows × 5 columns

In [76]:
get_conflation[get_conflation.url.str.len() > 1].url.apply(len).sum()
Out[76]:
3060