295 KiB
295 KiB
In [1]:
import json
import requests
import ast
import numpy as np
import pandas as pd
In [2]:
df = pd.read_csv('../data/processed/redirects_new.tsv', sep='\t')
df = df[df.url.notna()]
df
Out[2]:
13356 rows × 13 columns
In [3]:
df.describe(include='all')
Out[3]:
url | head_final_code | head_url | head_chain | head_codes | head_elapsed | head_error | get_final_code | get_url | get_chain | get_codes | get_elapsed | get_error | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 13356 | 10926.000000 | 10926 | 10926 | 10926 | 10926.000000 | 2430 | 10954.00000 | 10954 | 10954 | 10954 | 10954.000000 | 2402 |
unique | 13356 | NaN | 9331 | 4056 | 59 | NaN | 2299 | NaN | 9353 | 4082 | 57 | NaN | 2301 |
top | http://sidc.be/silso/home | NaN | https://www.unit.no/ugyldig-lenke-til-dokument... | [] | [] | NaN | ('Connection aborted.', RemoteDisconnected('Re... | NaN | https://www.unit.no/ugyldig-lenke-til-dokument... | [] | [] | NaN | ('Connection aborted.', RemoteDisconnected('Re... |
freq | 1 | NaN | 16 | 6596 | 6596 | NaN | 34 | NaN | 16 | 6595 | 6595 | NaN | 22 |
mean | NaN | 218.406736 | NaN | NaN | NaN | 1.124777 | NaN | 215.73407 | NaN | NaN | NaN | 1.125515 | NaN |
std | NaN | 61.035428 | NaN | NaN | NaN | 3.103610 | NaN | 56.89142 | NaN | NaN | NaN | 3.184153 | NaN |
min | NaN | 200.000000 | NaN | NaN | NaN | 0.013261 | NaN | 200.00000 | NaN | NaN | NaN | 0.013800 | NaN |
25% | NaN | 200.000000 | NaN | NaN | NaN | 0.276535 | NaN | 200.00000 | NaN | NaN | NaN | 0.254967 | NaN |
50% | NaN | 200.000000 | NaN | NaN | NaN | 0.583665 | NaN | 200.00000 | NaN | NaN | NaN | 0.527238 | NaN |
75% | NaN | 200.000000 | NaN | NaN | NaN | 1.059370 | NaN | 200.00000 | NaN | NaN | NaN | 0.944213 | NaN |
max | NaN | 504.000000 | NaN | NaN | NaN | 61.235589 | NaN | 504.00000 | NaN | NaN | NaN | 61.500941 | NaN |
Final status codes¶
HEAD¶
In [4]:
df[['url', 'head_final_code']].groupby('head_final_code').count()
Out[4]:
url | |
---|---|
head_final_code | |
200.0 | 9976 |
201.0 | 19 |
400.0 | 19 |
401.0 | 14 |
403.0 | 205 |
404.0 | 489 |
405.0 | 50 |
406.0 | 20 |
410.0 | 15 |
412.0 | 1 |
419.0 | 1 |
429.0 | 1 |
444.0 | 1 |
500.0 | 47 |
501.0 | 3 |
502.0 | 15 |
503.0 | 49 |
504.0 | 1 |
In [5]:
final_head = df[['url', 'head_final_code']]
final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)
final_head.groupby('class').count()
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/3455765756.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)
Out[5]:
url | head_final_code | |
---|---|---|
class | ||
200.0 | 9995 | 9995 |
400.0 | 816 | 816 |
500.0 | 115 | 115 |
In [6]:
final_head.groupby('class').count().sum()
Out[6]:
url 10926 head_final_code 10926 dtype: int64
In [7]:
final_head.groupby('class').count().div(df.shape[0]).mul(100)
Out[7]:
url | head_final_code | |
---|---|---|
class | ||
200.0 | 74.835280 | 74.835280 |
400.0 | 6.109614 | 6.109614 |
500.0 | 0.861036 | 0.861036 |
GET¶
In [8]:
df[['url', 'get_final_code']].groupby('get_final_code').count()
Out[8]:
url | |
---|---|
get_final_code | |
200.0 | 10139 |
201.0 | 19 |
400.0 | 8 |
401.0 | 13 |
403.0 | 168 |
404.0 | 464 |
405.0 | 1 |
406.0 | 21 |
410.0 | 15 |
412.0 | 1 |
419.0 | 1 |
429.0 | 1 |
444.0 | 1 |
500.0 | 40 |
502.0 | 13 |
503.0 | 48 |
504.0 | 1 |
In [9]:
final_get = df[['url', 'get_final_code']]
final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)
final_get.groupby('class').count()
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1946842175.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)
Out[9]:
url | get_final_code | |
---|---|---|
class | ||
200.0 | 10158 | 10158 |
400.0 | 694 | 694 |
500.0 | 102 | 102 |
In [10]:
final_get.groupby('class').count().sum()
Out[10]:
url 10954 get_final_code 10954 dtype: int64
In [11]:
final_get.groupby('class').count().div(df.shape[0]).mul(100)
Out[11]:
url | get_final_code | |
---|---|---|
class | ||
200.0 | 76.055705 | 76.055705 |
400.0 | 5.196167 | 5.196167 |
500.0 | 0.763702 | 0.763702 |
Errors¶
In [12]:
head_errors = df[df.head_error.notna()][['url','head_error']]
get_errors = df[df.get_error.notna()][['url','get_error']]
In [13]:
head_errors[head_errors.head_error.str.contains('Max')]
Out[13]:
url | head_error | |
---|---|---|
10 | https://isoarch.eu/ | HTTPSConnectionPool(host='isoarch.eu', port=44... |
13 | https://www.chinare.org.cn/en/ | HTTPSConnectionPool(host='www.chinare.org.cn',... |
26 | http://www.flyrnai.org/ | HTTPSConnectionPool(host='www.flyrnai.org', po... |
71 | http://alternaria.vbi.vt.edu/index.html | HTTPConnectionPool(host='alternaria.vbi.vt.edu... |
83 | http://www.pazar.info/ | HTTPConnectionPool(host='www.pazar.info', port... |
... | ... | ... |
13344 | http://intranet.thomasdanby.ac.uk/dspace/ | HTTPConnectionPool(host='intranet.thomasdanby.... |
13345 | http://dspace.dgsca.unam.mx:8080/dspace/ | HTTPConnectionPool(host='dspace.dgsca.unam.mx'... |
13347 | https://library.for.nau.edu:8443/ | HTTPSConnectionPool(host='library.for.nau.edu'... |
13354 | http://numerique.bibliotheque.toulouse.fr/cgi-... | HTTPConnectionPool(host='numerique.bibliothequ... |
13356 | http://204.232.200.153:8080/jspui/ | HTTPConnectionPool(host='204.232.200.153', por... |
2242 rows × 2 columns
In [14]:
get_errors[get_errors.get_error.str.contains('Max')]
Out[14]:
url | get_error | |
---|---|---|
10 | https://isoarch.eu/ | HTTPSConnectionPool(host='isoarch.eu', port=44... |
13 | https://www.chinare.org.cn/en/ | HTTPSConnectionPool(host='www.chinare.org.cn',... |
26 | http://www.flyrnai.org/ | HTTPSConnectionPool(host='www.flyrnai.org', po... |
57 | http://stellabase.org | HTTPConnectionPool(host='cnidarians.bu.edu', p... |
71 | http://alternaria.vbi.vt.edu/index.html | HTTPConnectionPool(host='alternaria.vbi.vt.edu... |
... | ... | ... |
13344 | http://intranet.thomasdanby.ac.uk/dspace/ | HTTPConnectionPool(host='intranet.thomasdanby.... |
13345 | http://dspace.dgsca.unam.mx:8080/dspace/ | HTTPConnectionPool(host='dspace.dgsca.unam.mx'... |
13347 | https://library.for.nau.edu:8443/ | HTTPSConnectionPool(host='library.for.nau.edu'... |
13354 | http://numerique.bibliotheque.toulouse.fr/cgi-... | HTTPConnectionPool(host='numerique.bibliothequ... |
13356 | http://204.232.200.153:8080/jspui/ | HTTPConnectionPool(host='204.232.200.153', por... |
2256 rows × 2 columns
In [15]:
head_errors[head_errors.head_error.str.contains('30 redirect')]
Out[15]:
url | head_error | |
---|---|---|
485 | https://idr.openmicroscopy.org | Exceeded 30 redirects. |
1191 | http://hgtree.snu.ac.kr | Exceeded 30 redirects. |
1462 | http://cfgp.snu.ac.kr | Exceeded 30 redirects. |
2526 | https://incois.gov.in/tropflux/index.jsp | Exceeded 30 redirects. |
2550 | https://odis.incois.gov.in/ | Exceeded 30 redirects. |
3872 | https://www.projectdatasphere.org/projectdatas... | Exceeded 30 redirects. |
6141 | http://repository.poligran.edu.co/ | Exceeded 30 redirects. |
10841 | http://library.wur.nl/way/ | Exceeded 30 redirects. |
In [16]:
get_errors[get_errors.get_error.str.contains('30 redirect')]
Out[16]:
url | get_error | |
---|---|---|
6141 | http://repository.poligran.edu.co/ | Exceeded 30 redirects. |
Testing Ukranian repositories
In [17]:
get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1742693288.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
Out[17]:
url | get_error | |
---|---|---|
4892 | http://eprints.isofts.kiev.ua/ | ('Connection aborted.', ConnectionResetError(5... |
4943 | http://www.library.ukma.kiev.ua/dspace/ | HTTPConnectionPool(host='www.library.ukma.kiev... |
4983 | http://repository.ibss.org.ua/dspace/ | HTTPConnectionPool(host='repository.ibss.org.u... |
5147 | http://www.rarelib.undp.org.ua/eng/index.php3 | HTTPConnectionPool(host='www.rarelib.undp.org.... |
5817 | http://openarchive.nure.ua/ | HTTPSConnectionPool(host='openarchive.nure.ua'... |
5827 | http://dspace.uabs.edu.ua/jspui/ | HTTPConnectionPool(host='dspace.uabs.edu.ua', ... |
6135 | http://kgmtu.edu.ua/jspui/ | HTTPConnectionPool(host='kgmtu.edu.ua', port=8... |
6231 | http://dspace.uccu.org.ua/ | HTTPConnectionPool(host='dspace.uccu.org.ua', ... |
6474 | http://repo.uipa.edu.ua/jspui/ | HTTPConnectionPool(host='repo.uipa.edu.ua', po... |
7067 | http://ir.asta.edu.ua/jspui/ | HTTPConnectionPool(host='ir.asta.edu.ua', port... |
7490 | http://dspace.hnpu.edu.ua/ | HTTPConnectionPool(host='dspace.hnpu.edu.ua', ... |
7989 | http://repository.mdu.in.ua/jspui/ | HTTPConnectionPool(host='repository.mdu.in.ua'... |
8085 | http://elar.tsatu.edu.ua/ | HTTPConnectionPool(host='elar.tsatu.edu.ua', p... |
8105 | http://dspace.zsea.edu.ua/ | HTTPSConnectionPool(host='dspace.znu.edu.ua', ... |
8197 | http://dspace.knau.kharkov.ua/jspui/ | HTTPConnectionPool(host='dspace.knau.kharkov.u... |
8199 | http://rep.btsau.edu.ua | HTTPConnectionPool(host='rep.btsau.edu.ua', po... |
8206 | https://biblio.onat.edu.ua/ | HTTPSConnectionPool(host='biblio.onat.edu.ua',... |
8419 | https://rep.ksma.ks.ua | HTTPSConnectionPool(host='rep.ksma.ks.ua', por... |
10016 | http://eprints.ksame.kharkov.ua/ | HTTPConnectionPool(host='eprints.ksame.kharkov... |
10107 | http://dspace.uabs.edu.ua/ | HTTPConnectionPool(host='dspace.uabs.edu.ua', ... |
10799 | http://repository.ibss.org.ua/dspace | HTTPConnectionPool(host='repository.ibss.org.u... |
10906 | http://repo.sau.sumy.ua/ | HTTPConnectionPool(host='repo.sau.sumy.ua', po... |
10976 | http://www.dspace.humanities.org.ua/dspace/ | HTTPConnectionPool(host='www.dspace.humanities... |
11616 | http:// visnykagro.knau.kharkov.ua | HTTPConnectionPool(host='%20visnykagro.knau.kh... |
11803 | http://repository.mdu.in.ua/ | HTTPConnectionPool(host='repository.mdu.in.ua'... |
11831 | http://sj-dstu.dp.ua/ | HTTPConnectionPool(host='sj-dstu.dp.ua', port=... |
12107 | http://www.hups.mil.gov.ua/periodic-app/journal | HTTPConnectionPool(host='www.hups.mil.gov.ua',... |
12746 | http://www.e-archive.org.ua/ | HTTPConnectionPool(host='www.e-archive.org.ua'... |
12753 | http://library.khnu.km.ua/jspui/ | HTTPConnectionPool(host='library.khnu.km.ua', ... |
12916 | http://dspace.ukrfa.kharkov.ua/ | HTTPConnectionPool(host='dspace.ukrfa.kharkov.... |
12922 | http://dspace.ukrfa.kharkov.ua | HTTPConnectionPool(host='dspace.ukrfa.kharkov.... |
12970 | http://evrica.org.ua | HTTPConnectionPool(host='evrica.org.ua', port=... |
12992 | http://dspace.academy.sumy.ua/ | HTTPConnectionPool(host='dspace.academy.sumy.u... |
13017 | http://open-archive.kture.kharkov.ua/ | HTTPConnectionPool(host='open-archive.kture.kh... |
13034 | http://dspace.onu.edu.ua/ | HTTPConnectionPool(host='192.168.30.200', port... |
13052 | http://open-arhive.kture.kharkov.ua | HTTPConnectionPool(host='open-arhive.kture.kha... |
In [18]:
get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)].count()
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2005047697.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. get_errors[get_errors.url.str.contains('(\.ua$)|(\.ua/)', regex=True)].count()
Out[18]:
url 36 get_error 36 dtype: int64
In [19]:
df[df.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2148841299.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract. df[df.url.str.contains('(\.ua$)|(\.ua/)', regex=True)]
Out[19]:
url | head_final_code | head_url | head_chain | head_codes | head_elapsed | head_error | get_final_code | get_url | get_chain | get_codes | get_elapsed | get_error | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
2185 | http://inform.ikd.kiev.ua/index.php?path=/en/i... | 200.0 | http://inform.ikd.kiev.ua/?path=/en/index | ['http://inform.ikd.kiev.ua/index.php?path=/en... | [301] | 0.402488 | NaN | 200.0 | http://inform.ikd.kiev.ua/?path=/en/index | ['http://inform.ikd.kiev.ua/index.php?path=/en... | [301] | 0.452971 | NaN |
2286 | http://wdc.org.ua/ | 200.0 | http://wdc.org.ua/ | [] | [] | 0.905063 | NaN | 200.0 | http://wdc.org.ua/ | [] | [] | 0.816887 | NaN |
4892 | http://eprints.isofts.kiev.ua/ | NaN | NaN | NaN | NaN | NaN | ('Connection aborted.', ConnectionResetError(5... | NaN | NaN | NaN | NaN | NaN | ('Connection aborted.', ConnectionResetError(5... |
4943 | http://www.library.ukma.kiev.ua/dspace/ | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='www.library.ukma.kiev... | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='www.library.ukma.kiev... |
4950 | http://www.dspace.ucu.edu.ua/dspace/ | 403.0 | http://www.dspace.ucu.edu.ua/dspace/ | [] | [] | 0.073373 | NaN | 403.0 | http://www.dspace.ucu.edu.ua/dspace/ | [] | [] | 0.073223 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
13015 | http://ir.kneu.edu.ua | 200.0 | http://ir.kneu.edu.ua/ | [] | [] | 1.609602 | NaN | 200.0 | http://ir.kneu.edu.ua/ | [] | [] | 1.508854 | NaN |
13017 | http://open-archive.kture.kharkov.ua/ | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='open-archive.kture.kh... | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='open-archive.kture.kh... |
13034 | http://dspace.onu.edu.ua/ | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='192.168.30.200', port... | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='192.168.30.200', port... |
13052 | http://open-arhive.kture.kharkov.ua | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='open-arhive.kture.kha... | NaN | NaN | NaN | NaN | NaN | HTTPConnectionPool(host='open-arhive.kture.kha... |
13248 | http://www.ekmair.ukma.edu.ua | 200.0 | http://ekmair.ukma.edu.ua/ | ['http://www.ekmair.ukma.edu.ua/'] | [301] | 0.661039 | NaN | 200.0 | http://ekmair.ukma.edu.ua/ | ['http://www.ekmair.ukma.edu.ua/'] | [301] | 0.755170 | NaN |
146 rows × 13 columns
Redirects¶
In [20]:
head_chains = df[df.head_codes.notna()][['url', 'head_url', 'head_final_code', 'head_codes']]
head_chains['len'] = head_chains.head_codes.apply(ast.literal_eval).str.len()
head_chains.groupby('len').count()
Out[20]:
url | head_url | head_final_code | head_codes | |
---|---|---|---|---|
len | ||||
0 | 6596 | 6596 | 6596 | 6596 |
1 | 3364 | 3364 | 3364 | 3364 |
2 | 686 | 686 | 686 | 686 |
3 | 233 | 233 | 233 | 233 |
4 | 37 | 37 | 37 | 37 |
5 | 10 | 10 | 10 | 10 |
In [21]:
get_chains = df[df.get_codes.notna()][['url', 'get_url', 'get_final_code', 'get_codes']]
get_chains['len'] = get_chains.get_codes.apply(ast.literal_eval).str.len()
get_chains.groupby('len').count()
Out[21]:
url | get_url | get_final_code | get_codes | |
---|---|---|---|---|
len | ||||
0 | 6595 | 6595 | 6595 | 6595 |
1 | 3376 | 3376 | 3376 | 3376 |
2 | 699 | 699 | 699 | 699 |
3 | 229 | 229 | 229 | 229 |
4 | 43 | 43 | 43 | 43 |
5 | 12 | 12 | 12 | 12 |
In [22]:
data = head_chains.groupby('len').count().join(get_chains.groupby('len').count(), lsuffix='_head', rsuffix='_get')
data.index.rename('# redirects', inplace=True)
data.rename(columns={'url_head': 'HEAD', 'url_get': 'GET'}, inplace=True)
data[['HEAD', 'GET']][1:].plot.bar(rot=0)
Out[22]:
<AxesSubplot:xlabel='# redirects'>
In [23]:
data[['HEAD', 'GET']][1:].sum()
Out[23]:
HEAD 4330 GET 4359 dtype: int64
In [24]:
data[['HEAD', 'GET']][1:].sum().div(df.shape[0]).mul(100)
Out[24]:
HEAD 32.419886 GET 32.637017 dtype: float64
In [25]:
head_chains['class'] = head_chains.head_final_code.floordiv(100).mul(100)
head_chains[(head_chains.len > 0) & (head_chains['class'] != 200)]
Out[25]:
378 rows × 6 columns
In [26]:
head_chains[(head_chains.len > 0) & (head_chains['class'] != 200)].groupby('class').count()
Out[26]:
url | head_url | head_final_code | head_codes | len | |
---|---|---|---|---|---|
class | |||||
400.0 | 338 | 338 | 338 | 338 | 338 |
500.0 | 40 | 40 | 40 | 40 | 40 |
In [27]:
get_chains['class'] = get_chains.get_final_code.floordiv(100).mul(100)
get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)]
Out[27]:
344 rows × 6 columns
In [28]:
get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)].groupby('class').count()
Out[28]:
url | get_url | get_final_code | get_codes | len | |
---|---|---|---|---|---|
class | |||||
400.0 | 307 | 307 | 307 | 307 | 307 |
500.0 | 37 | 37 | 37 | 37 | 37 |
Redirect reason¶
In [36]:
head_chains[head_chains.len == 1]
Out[36]:
3364 rows × 6 columns
In [50]:
head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') == head_chains.head_url)]
Out[50]:
1665 rows × 6 columns
In [51]:
head_chains[(head_chains.len == 1) & (head_chains.url + '/' == head_chains.head_url)]
Out[51]:
url | head_url | head_final_code | head_codes | len | class | |
---|---|---|---|---|---|---|
119 | https://www.biosino.org/node | https://www.biosino.org/node/ | 200.0 | [302] | 1 | 200.0 |
314 | http://metacat.tfri.gov.tw/tfri | http://metacat.tfri.gov.tw/tfri/ | 200.0 | [301] | 1 | 200.0 |
463 | https://nda.nih.gov/abcd | https://nda.nih.gov/abcd/ | 200.0 | [302] | 1 | 200.0 |
551 | https://www1.usgs.gov/obis-usa/ipt | https://www1.usgs.gov/obis-usa/ipt/ | 200.0 | [301] | 1 | 200.0 |
559 | https://www.arm.gov/data | https://www.arm.gov/data/ | 200.0 | [301] | 1 | 200.0 |
... | ... | ... | ... | ... | ... | ... |
12642 | http://repositorio.utec.edu.sv:8080/jspui | http://repositorio.utec.edu.sv:8080/jspui/ | 200.0 | [302] | 1 | 200.0 |
12706 | http://adudspace.adu.edu.tr:8080/jspui | http://adudspace.adu.edu.tr:8080/jspui/ | 200.0 | [302] | 1 | 200.0 |
12874 | http://160.97.80.9:8080/jspui | http://160.97.80.9:8080/jspui/ | 404.0 | [302] | 1 | 400.0 |
13030 | http://library.pdpu.ac.in:8080/xmlui | http://library.pdpu.ac.in:8080/xmlui/ | 200.0 | [302] | 1 | 200.0 |
13267 | http://cab.unime.it/mus | http://cab.unime.it/mus/ | 200.0 | [302] | 1 | 200.0 |
134 rows × 6 columns
In [53]:
head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') + '/' == head_chains.head_url)]
Out[53]:
url | head_url | head_final_code | head_codes | len | class | |
---|---|---|---|---|---|---|
9 | http://arabidopsis.info | https://arabidopsis.info/ | 200.0 | [301] | 1 | 200.0 |
46 | http://pcddb.cryst.bbk.ac.uk | https://pcddb.cryst.bbk.ac.uk/ | 200.0 | [302] | 1 | 200.0 |
113 | http://www.marinedataarchive.eu | https://www.marinedataarchive.eu/ | 200.0 | [301] | 1 | 200.0 |
119 | https://www.biosino.org/node | https://www.biosino.org/node/ | 200.0 | [302] | 1 | 200.0 |
200 | http://www.hydroshare.org | https://www.hydroshare.org/ | 200.0 | [301] | 1 | 200.0 |
... | ... | ... | ... | ... | ... | ... |
13032 | http://repositorio.ipsantarem.pt | https://repositorio.ipsantarem.pt/ | 200.0 | [302] | 1 | 200.0 |
13077 | http://bibliotecadigital.ipb.pt | https://bibliotecadigital.ipb.pt/ | 200.0 | [302] | 1 | 200.0 |
13091 | http://eprints.nmlindia.org | https://eprints.nmlindia.org/ | 200.0 | [301] | 1 | 200.0 |
13124 | http://www.ju.edu.et | https://www.ju.edu.et/ | 200.0 | [301] | 1 | 200.0 |
13137 | http://scholarship.rollins.edu | https://scholarship.rollins.edu/ | 200.0 | [301] | 1 | 200.0 |
487 rows × 6 columns
In [54]:
get_chains[get_chains.len == 1]
Out[54]:
3376 rows × 6 columns
In [56]:
get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') == get_chains.get_url)]
Out[56]:
1668 rows × 6 columns
In [57]:
get_chains[(get_chains.len == 1) & (get_chains.url + '/' == get_chains.get_url)]
Out[57]:
url | get_url | get_final_code | get_codes | len | class | |
---|---|---|---|---|---|---|
119 | https://www.biosino.org/node | https://www.biosino.org/node/ | 200.0 | [302] | 1 | 200.0 |
314 | http://metacat.tfri.gov.tw/tfri | http://metacat.tfri.gov.tw/tfri/ | 200.0 | [301] | 1 | 200.0 |
463 | https://nda.nih.gov/abcd | https://nda.nih.gov/abcd/ | 200.0 | [302] | 1 | 200.0 |
551 | https://www1.usgs.gov/obis-usa/ipt | https://www1.usgs.gov/obis-usa/ipt/ | 200.0 | [301] | 1 | 200.0 |
559 | https://www.arm.gov/data | https://www.arm.gov/data/ | 200.0 | [301] | 1 | 200.0 |
... | ... | ... | ... | ... | ... | ... |
12642 | http://repositorio.utec.edu.sv:8080/jspui | http://repositorio.utec.edu.sv:8080/jspui/ | 200.0 | [302] | 1 | 200.0 |
12706 | http://adudspace.adu.edu.tr:8080/jspui | http://adudspace.adu.edu.tr:8080/jspui/ | 200.0 | [302] | 1 | 200.0 |
12874 | http://160.97.80.9:8080/jspui | http://160.97.80.9:8080/jspui/ | 404.0 | [302] | 1 | 400.0 |
13030 | http://library.pdpu.ac.in:8080/xmlui | http://library.pdpu.ac.in:8080/xmlui/ | 200.0 | [302] | 1 | 200.0 |
13267 | http://cab.unime.it/mus | http://cab.unime.it/mus/ | 200.0 | [302] | 1 | 200.0 |
134 rows × 6 columns
In [58]:
get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') + '/' == get_chains.get_url)]
Out[58]:
url | get_url | get_final_code | get_codes | len | class | |
---|---|---|---|---|---|---|
9 | http://arabidopsis.info | https://arabidopsis.info/ | 200.0 | [301] | 1 | 200.0 |
46 | http://pcddb.cryst.bbk.ac.uk | https://pcddb.cryst.bbk.ac.uk/ | 200.0 | [302] | 1 | 200.0 |
113 | http://www.marinedataarchive.eu | https://www.marinedataarchive.eu/ | 200.0 | [301] | 1 | 200.0 |
119 | https://www.biosino.org/node | https://www.biosino.org/node/ | 200.0 | [302] | 1 | 200.0 |
200 | http://www.hydroshare.org | https://www.hydroshare.org/ | 200.0 | [301] | 1 | 200.0 |
... | ... | ... | ... | ... | ... | ... |
13032 | http://repositorio.ipsantarem.pt | https://repositorio.ipsantarem.pt/ | 200.0 | [302] | 1 | 200.0 |
13077 | http://bibliotecadigital.ipb.pt | https://bibliotecadigital.ipb.pt/ | 200.0 | [302] | 1 | 200.0 |
13091 | http://eprints.nmlindia.org | https://eprints.nmlindia.org/ | 200.0 | [301] | 1 | 200.0 |
13124 | http://www.ju.edu.et | https://www.ju.edu.et/ | 200.0 | [301] | 1 | 200.0 |
13137 | http://scholarship.rollins.edu | https://scholarship.rollins.edu/ | 200.0 | [301] | 1 | 200.0 |
490 rows × 6 columns
Redirect mismatch¶
In [29]:
df[(df.head_url != df.get_url) & (df.head_url.notna()) & (df.get_url.notna())]
Out[29]:
146 rows × 13 columns
URL conflation¶
In [30]:
head_chains
Out[30]:
10926 rows × 6 columns
In [31]:
head_chains.describe(include='all')
Out[31]:
url | head_url | head_final_code | head_codes | len | class | |
---|---|---|---|---|---|---|
count | 10926 | 10926 | 10926.000000 | 10926 | 10926.000000 | 10926.000000 |
unique | 10926 | 9331 | NaN | 59 | NaN | NaN |
top | http://sidc.be/silso/home | https://www.unit.no/ugyldig-lenke-til-dokument... | NaN | [] | NaN | NaN |
freq | 1 | 16 | NaN | 6596 | NaN | NaN |
mean | NaN | NaN | 218.406736 | NaN | 0.515559 | 218.094454 |
std | NaN | NaN | 61.035428 | NaN | 0.749850 | 60.063069 |
min | NaN | NaN | 200.000000 | NaN | 0.000000 | 200.000000 |
25% | NaN | NaN | 200.000000 | NaN | 0.000000 | 200.000000 |
50% | NaN | NaN | 200.000000 | NaN | 0.000000 | 200.000000 |
75% | NaN | NaN | 200.000000 | NaN | 1.000000 | 200.000000 |
max | NaN | NaN | 504.000000 | NaN | 5.000000 | 500.000000 |
In [32]:
head_chains.groupby('head_url').count().sort_values('url', ascending=False).head(50)
Out[32]:
In [33]:
get_chains.groupby('get_url').count().sort_values('url', ascending=False).head(50)
Out[33]:
In [34]:
head_conflation = head_chains.groupby('head_url').aggregate(list)
head_conflation
Out[34]:
9331 rows × 5 columns
In [59]:
head_conflation.to_csv('../data/processed/head_conflation.tsv', sep='\t')
In [71]:
head_conflation[head_conflation.url.str.len() > 1]
Out[71]:
1456 rows × 5 columns
In [77]:
head_conflation[head_conflation.url.str.len() > 1].url.apply(len).sum()
Out[77]:
3051
In [61]:
get_conflation = get_chains.groupby('get_url').aggregate(list)
get_conflation
Out[61]:
9353 rows × 5 columns
In [62]:
get_conflation.to_csv('../data/processed/get_conflation.tsv', sep='\t')
In [72]:
get_conflation[get_conflation.url.str.len() > 1]
Out[72]:
1459 rows × 5 columns
In [76]:
get_conflation[get_conflation.url.str.len() > 1].url.apply(len).sum()
Out[76]:
3060