{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import json\n", "import requests\n", "import ast\n", "\n", "import numpy as np\n", "import pandas as pd\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_urlhead_chainhead_codeshead_elapsedhead_errorget_final_codeget_urlget_chainget_codesget_elapsedget_error
0http://sidc.be/silso/home200.0https://www.sidc.be/silso/home['http://sidc.be/silso/home'][302]0.151548NaN200.0https://www.sidc.be/silso/home['http://sidc.be/silso/home'][302]0.145799NaN
1https://bmrb.io/200.0https://bmrb.io/[][]0.527131NaN200.0https://bmrb.io/[][]0.487402NaN
2http://www.pac.dfo-mpo.gc.ca/science/oceans/da...200.0https://www.pac.dfo-mpo.gc.ca/science/oceans/d...['http://www.pac.dfo-mpo.gc.ca/science/oceans/...[307]0.530466NaN200.0https://www.pac.dfo-mpo.gc.ca/science/oceans/d...['http://www.pac.dfo-mpo.gc.ca/science/oceans/...[307]0.510097NaN
3https://www.cpc.ncep.noaa.gov/200.0https://www.cpc.ncep.noaa.gov/[][]0.507708NaN200.0https://www.cpc.ncep.noaa.gov/[][]0.492151NaN
4http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...404.0http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...[][]0.586088NaN404.0http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...[][]0.542670NaN
..........................................
13352http://radar.brookes.ac.uk/radar/access/home.do200.0https://radar.brookes.ac.uk/radar/home.do['http://radar.brookes.ac.uk/radar/access/home...[302, 302, 302]0.082406NaN200.0https://radar.brookes.ac.uk/radar/home.do['http://radar.brookes.ac.uk/radar/access/home...[302, 302, 302]0.088766NaN
13353http://www.ub.uni-siegen.de/opus200.0https://www.ub.uni-siegen.de/opus/['http://www.ub.uni-siegen.de/opus', 'https://...[301, 301]0.056328NaN200.0https://www.ub.uni-siegen.de/opus/['http://www.ub.uni-siegen.de/opus', 'https://...[301, 301]0.039649NaN
13354http://numerique.bibliotheque.toulouse.fr/cgi-...NaNNaNNaNNaNNaNHTTPConnectionPool(host='numerique.bibliothequ...NaNNaNNaNNaNNaNHTTPConnectionPool(host='numerique.bibliothequ...
13355https://www.utas.edu.au/library200.0https://www.utas.edu.au/library[][]0.223856NaN200.0https://www.utas.edu.au/library[][]0.595001NaN
13356http://204.232.200.153:8080/jspui/NaNNaNNaNNaNNaNHTTPConnectionPool(host='204.232.200.153', por...NaNNaNNaNNaNNaNHTTPConnectionPool(host='204.232.200.153', por...
\n", "

13356 rows × 13 columns

\n", "
" ], "text/plain": [ " url head_final_code \\\n", "0 http://sidc.be/silso/home 200.0 \n", "1 https://bmrb.io/ 200.0 \n", "2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... 200.0 \n", "3 https://www.cpc.ncep.noaa.gov/ 200.0 \n", "4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 404.0 \n", "... ... ... \n", "13352 http://radar.brookes.ac.uk/radar/access/home.do 200.0 \n", "13353 http://www.ub.uni-siegen.de/opus 200.0 \n", "13354 http://numerique.bibliotheque.toulouse.fr/cgi-... NaN \n", "13355 https://www.utas.edu.au/library 200.0 \n", "13356 http://204.232.200.153:8080/jspui/ NaN \n", "\n", " head_url \\\n", "0 https://www.sidc.be/silso/home \n", "1 https://bmrb.io/ \n", "2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... \n", "3 https://www.cpc.ncep.noaa.gov/ \n", "4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... \n", "... ... \n", "13352 https://radar.brookes.ac.uk/radar/home.do \n", "13353 https://www.ub.uni-siegen.de/opus/ \n", "13354 NaN \n", "13355 https://www.utas.edu.au/library \n", "13356 NaN \n", "\n", " head_chain head_codes \\\n", "0 ['http://sidc.be/silso/home'] [302] \n", "1 [] [] \n", "2 ['http://www.pac.dfo-mpo.gc.ca/science/oceans/... [307] \n", "3 [] [] \n", "4 [] [] \n", "... ... ... \n", "13352 ['http://radar.brookes.ac.uk/radar/access/home... [302, 302, 302] \n", "13353 ['http://www.ub.uni-siegen.de/opus', 'https://... [301, 301] \n", "13354 NaN NaN \n", "13355 [] [] \n", "13356 NaN NaN \n", "\n", " head_elapsed head_error \\\n", "0 0.151548 NaN \n", "1 0.527131 NaN \n", "2 0.530466 NaN \n", "3 0.507708 NaN \n", "4 0.586088 NaN \n", "... ... ... \n", "13352 0.082406 NaN \n", "13353 0.056328 NaN \n", "13354 NaN HTTPConnectionPool(host='numerique.bibliothequ... \n", "13355 0.223856 NaN \n", "13356 NaN HTTPConnectionPool(host='204.232.200.153', por... \n", "\n", " get_final_code get_url \\\n", "0 200.0 https://www.sidc.be/silso/home \n", "1 200.0 https://bmrb.io/ \n", "2 200.0 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 
\n", "3 200.0 https://www.cpc.ncep.noaa.gov/ \n", "4 404.0 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... \n", "... ... ... \n", "13352 200.0 https://radar.brookes.ac.uk/radar/home.do \n", "13353 200.0 https://www.ub.uni-siegen.de/opus/ \n", "13354 NaN NaN \n", "13355 200.0 https://www.utas.edu.au/library \n", "13356 NaN NaN \n", "\n", " get_chain get_codes \\\n", "0 ['http://sidc.be/silso/home'] [302] \n", "1 [] [] \n", "2 ['http://www.pac.dfo-mpo.gc.ca/science/oceans/... [307] \n", "3 [] [] \n", "4 [] [] \n", "... ... ... \n", "13352 ['http://radar.brookes.ac.uk/radar/access/home... [302, 302, 302] \n", "13353 ['http://www.ub.uni-siegen.de/opus', 'https://... [301, 301] \n", "13354 NaN NaN \n", "13355 [] [] \n", "13356 NaN NaN \n", "\n", " get_elapsed get_error \n", "0 0.145799 NaN \n", "1 0.487402 NaN \n", "2 0.510097 NaN \n", "3 0.492151 NaN \n", "4 0.542670 NaN \n", "... ... ... \n", "13352 0.088766 NaN \n", "13353 0.039649 NaN \n", "13354 NaN HTTPConnectionPool(host='numerique.bibliothequ... \n", "13355 0.595001 NaN \n", "13356 NaN HTTPConnectionPool(host='204.232.200.153', por... \n", "\n", "[13356 rows x 13 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('../data/processed/redirects_new.tsv', sep='\\t')\n", "df = df[df.url.notna()]\n", "df" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_urlhead_chainhead_codeshead_elapsedhead_errorget_final_codeget_urlget_chainget_codesget_elapsedget_error
count1335610926.00000010926109261092610926.000000243010954.0000010954109541095410954.0000002402
unique13356NaN9331405659NaN2299NaN9353408257NaN2301
tophttp://sidc.be/silso/homeNaNhttps://www.unit.no/ugyldig-lenke-til-dokument...[][]NaN('Connection aborted.', RemoteDisconnected('Re...NaNhttps://www.unit.no/ugyldig-lenke-til-dokument...[][]NaN('Connection aborted.', RemoteDisconnected('Re...
freq1NaN1665966596NaN34NaN1665956595NaN22
meanNaN218.406736NaNNaNNaN1.124777NaN215.73407NaNNaNNaN1.125515NaN
stdNaN61.035428NaNNaNNaN3.103610NaN56.89142NaNNaNNaN3.184153NaN
minNaN200.000000NaNNaNNaN0.013261NaN200.00000NaNNaNNaN0.013800NaN
25%NaN200.000000NaNNaNNaN0.276535NaN200.00000NaNNaNNaN0.254967NaN
50%NaN200.000000NaNNaNNaN0.583665NaN200.00000NaNNaNNaN0.527238NaN
75%NaN200.000000NaNNaNNaN1.059370NaN200.00000NaNNaNNaN0.944213NaN
maxNaN504.000000NaNNaNNaN61.235589NaN504.00000NaNNaNNaN61.500941NaN
\n", "
" ], "text/plain": [ " url head_final_code \\\n", "count 13356 10926.000000 \n", "unique 13356 NaN \n", "top http://sidc.be/silso/home NaN \n", "freq 1 NaN \n", "mean NaN 218.406736 \n", "std NaN 61.035428 \n", "min NaN 200.000000 \n", "25% NaN 200.000000 \n", "50% NaN 200.000000 \n", "75% NaN 200.000000 \n", "max NaN 504.000000 \n", "\n", " head_url head_chain \\\n", "count 10926 10926 \n", "unique 9331 4056 \n", "top https://www.unit.no/ugyldig-lenke-til-dokument... [] \n", "freq 16 6596 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " head_codes head_elapsed \\\n", "count 10926 10926.000000 \n", "unique 59 NaN \n", "top [] NaN \n", "freq 6596 NaN \n", "mean NaN 1.124777 \n", "std NaN 3.103610 \n", "min NaN 0.013261 \n", "25% NaN 0.276535 \n", "50% NaN 0.583665 \n", "75% NaN 1.059370 \n", "max NaN 61.235589 \n", "\n", " head_error get_final_code \\\n", "count 2430 10954.00000 \n", "unique 2299 NaN \n", "top ('Connection aborted.', RemoteDisconnected('Re... NaN \n", "freq 34 NaN \n", "mean NaN 215.73407 \n", "std NaN 56.89142 \n", "min NaN 200.00000 \n", "25% NaN 200.00000 \n", "50% NaN 200.00000 \n", "75% NaN 200.00000 \n", "max NaN 504.00000 \n", "\n", " get_url get_chain get_codes \\\n", "count 10954 10954 10954 \n", "unique 9353 4082 57 \n", "top https://www.unit.no/ugyldig-lenke-til-dokument... [] [] \n", "freq 16 6595 6595 \n", "mean NaN NaN NaN \n", "std NaN NaN NaN \n", "min NaN NaN NaN \n", "25% NaN NaN NaN \n", "50% NaN NaN NaN \n", "75% NaN NaN NaN \n", "max NaN NaN NaN \n", "\n", " get_elapsed get_error \n", "count 10954.000000 2402 \n", "unique NaN 2301 \n", "top NaN ('Connection aborted.', RemoteDisconnected('Re... 
\n", "freq NaN 22 \n", "mean 1.125515 NaN \n", "std 3.184153 NaN \n", "min 0.013800 NaN \n", "25% 0.254967 NaN \n", "50% 0.527238 NaN \n", "75% 0.944213 NaN \n", "max 61.500941 NaN " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe(include='all')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Final status codes" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### HEAD" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
url
head_final_code
200.09976
201.019
400.019
401.014
403.0205
404.0489
405.050
406.020
410.015
412.01
419.01
429.01
444.01
500.047
501.03
502.015
503.049
504.01
\n", "
" ], "text/plain": [ " url\n", "head_final_code \n", "200.0 9976\n", "201.0 19\n", "400.0 19\n", "401.0 14\n", "403.0 205\n", "404.0 489\n", "405.0 50\n", "406.0 20\n", "410.0 15\n", "412.0 1\n", "419.0 1\n", "429.0 1\n", "444.0 1\n", "500.0 47\n", "501.0 3\n", "502.0 15\n", "503.0 49\n", "504.0 1" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[['url', 'head_final_code']].groupby('head_final_code').count()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/3455765756.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_code
class
200.099959995
400.0816816
500.0115115
\n", "
" ], "text/plain": [ " url head_final_code\n", "class \n", "200.0 9995 9995\n", "400.0 816 816\n", "500.0 115 115" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Take an explicit copy of the two columns so that adding 'class' below\n", "# assigns into an independent frame instead of a slice of df\n", "# (assigning into the slice is what raises SettingWithCopyWarning).\n", "final_head = df[['url', 'head_final_code']].copy()\n", "# Bucket each final status code into its hundreds class (e.g. 404 -> 400).\n", "final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)\n", "final_head.groupby('class').count()" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "url 10926\n", "head_final_code 10926\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_head.groupby('class').count().sum()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_code
class
200.074.83528074.835280
400.06.1096146.109614
500.00.8610360.861036
\n", "
" ], "text/plain": [ " url head_final_code\n", "class \n", "200.0 74.835280 74.835280\n", "400.0 6.109614 6.109614\n", "500.0 0.861036 0.861036" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_head.groupby('class').count().div(df.shape[0]).mul(100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### GET" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
url
get_final_code
200.010139
201.019
400.08
401.013
403.0168
404.0464
405.01
406.021
410.015
412.01
419.01
429.01
444.01
500.040
502.013
503.048
504.01
\n", "
" ], "text/plain": [ " url\n", "get_final_code \n", "200.0 10139\n", "201.0 19\n", "400.0 8\n", "401.0 13\n", "403.0 168\n", "404.0 464\n", "405.0 1\n", "406.0 21\n", "410.0 15\n", "412.0 1\n", "419.0 1\n", "429.0 1\n", "444.0 1\n", "500.0 40\n", "502.0 13\n", "503.0 48\n", "504.0 1" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[['url', 'get_final_code']].groupby('get_final_code').count()" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1946842175.py:2: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_final_code
class
200.01015810158
400.0694694
500.0102102
\n", "
" ], "text/plain": [ " url get_final_code\n", "class \n", "200.0 10158 10158\n", "400.0 694 694\n", "500.0 102 102" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Take an explicit copy of the two columns so that adding 'class' below\n", "# assigns into an independent frame instead of a slice of df\n", "# (assigning into the slice is what raises SettingWithCopyWarning).\n", "final_get = df[['url', 'get_final_code']].copy()\n", "# Bucket each final status code into its hundreds class (e.g. 404 -> 400).\n", "final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)\n", "final_get.groupby('class').count()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "url 10954\n", "get_final_code 10954\n", "dtype: int64" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_get.groupby('class').count().sum()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_final_code
class
200.076.05570576.055705
400.05.1961675.196167
500.00.7637020.763702
\n", "
" ], "text/plain": [ " url get_final_code\n", "class \n", "200.0 76.055705 76.055705\n", "400.0 5.196167 5.196167\n", "500.0 0.763702 0.763702" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "final_get.groupby('class').count().div(df.shape[0]).mul(100)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Errors" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "head_errors = df[df.head_error.notna()][['url','head_error']]\n", "get_errors = df[df.get_error.notna()][['url','get_error']]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_error
10https://isoarch.eu/HTTPSConnectionPool(host='isoarch.eu', port=44...
13https://www.chinare.org.cn/en/HTTPSConnectionPool(host='www.chinare.org.cn',...
26http://www.flyrnai.org/HTTPSConnectionPool(host='www.flyrnai.org', po...
71http://alternaria.vbi.vt.edu/index.htmlHTTPConnectionPool(host='alternaria.vbi.vt.edu...
83http://www.pazar.info/HTTPConnectionPool(host='www.pazar.info', port...
.........
13344http://intranet.thomasdanby.ac.uk/dspace/HTTPConnectionPool(host='intranet.thomasdanby....
13345http://dspace.dgsca.unam.mx:8080/dspace/HTTPConnectionPool(host='dspace.dgsca.unam.mx'...
13347https://library.for.nau.edu:8443/HTTPSConnectionPool(host='library.for.nau.edu'...
13354http://numerique.bibliotheque.toulouse.fr/cgi-...HTTPConnectionPool(host='numerique.bibliothequ...
13356http://204.232.200.153:8080/jspui/HTTPConnectionPool(host='204.232.200.153', por...
\n", "

2242 rows × 2 columns

\n", "
" ], "text/plain": [ " url \\\n", "10 https://isoarch.eu/ \n", "13 https://www.chinare.org.cn/en/ \n", "26 http://www.flyrnai.org/ \n", "71 http://alternaria.vbi.vt.edu/index.html \n", "83 http://www.pazar.info/ \n", "... ... \n", "13344 http://intranet.thomasdanby.ac.uk/dspace/ \n", "13345 http://dspace.dgsca.unam.mx:8080/dspace/ \n", "13347 https://library.for.nau.edu:8443/ \n", "13354 http://numerique.bibliotheque.toulouse.fr/cgi-... \n", "13356 http://204.232.200.153:8080/jspui/ \n", "\n", " head_error \n", "10 HTTPSConnectionPool(host='isoarch.eu', port=44... \n", "13 HTTPSConnectionPool(host='www.chinare.org.cn',... \n", "26 HTTPSConnectionPool(host='www.flyrnai.org', po... \n", "71 HTTPConnectionPool(host='alternaria.vbi.vt.edu... \n", "83 HTTPConnectionPool(host='www.pazar.info', port... \n", "... ... \n", "13344 HTTPConnectionPool(host='intranet.thomasdanby.... \n", "13345 HTTPConnectionPool(host='dspace.dgsca.unam.mx'... \n", "13347 HTTPSConnectionPool(host='library.for.nau.edu'... \n", "13354 HTTPConnectionPool(host='numerique.bibliothequ... \n", "13356 HTTPConnectionPool(host='204.232.200.153', por... \n", "\n", "[2242 rows x 2 columns]" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_errors[head_errors.head_error.str.contains('Max')]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_error
10https://isoarch.eu/HTTPSConnectionPool(host='isoarch.eu', port=44...
13https://www.chinare.org.cn/en/HTTPSConnectionPool(host='www.chinare.org.cn',...
26http://www.flyrnai.org/HTTPSConnectionPool(host='www.flyrnai.org', po...
57http://stellabase.orgHTTPConnectionPool(host='cnidarians.bu.edu', p...
71http://alternaria.vbi.vt.edu/index.htmlHTTPConnectionPool(host='alternaria.vbi.vt.edu...
.........
13344http://intranet.thomasdanby.ac.uk/dspace/HTTPConnectionPool(host='intranet.thomasdanby....
13345http://dspace.dgsca.unam.mx:8080/dspace/HTTPConnectionPool(host='dspace.dgsca.unam.mx'...
13347https://library.for.nau.edu:8443/HTTPSConnectionPool(host='library.for.nau.edu'...
13354http://numerique.bibliotheque.toulouse.fr/cgi-...HTTPConnectionPool(host='numerique.bibliothequ...
13356http://204.232.200.153:8080/jspui/HTTPConnectionPool(host='204.232.200.153', por...
\n", "

2256 rows × 2 columns

\n", "
" ], "text/plain": [ " url \\\n", "10 https://isoarch.eu/ \n", "13 https://www.chinare.org.cn/en/ \n", "26 http://www.flyrnai.org/ \n", "57 http://stellabase.org \n", "71 http://alternaria.vbi.vt.edu/index.html \n", "... ... \n", "13344 http://intranet.thomasdanby.ac.uk/dspace/ \n", "13345 http://dspace.dgsca.unam.mx:8080/dspace/ \n", "13347 https://library.for.nau.edu:8443/ \n", "13354 http://numerique.bibliotheque.toulouse.fr/cgi-... \n", "13356 http://204.232.200.153:8080/jspui/ \n", "\n", " get_error \n", "10 HTTPSConnectionPool(host='isoarch.eu', port=44... \n", "13 HTTPSConnectionPool(host='www.chinare.org.cn',... \n", "26 HTTPSConnectionPool(host='www.flyrnai.org', po... \n", "57 HTTPConnectionPool(host='cnidarians.bu.edu', p... \n", "71 HTTPConnectionPool(host='alternaria.vbi.vt.edu... \n", "... ... \n", "13344 HTTPConnectionPool(host='intranet.thomasdanby.... \n", "13345 HTTPConnectionPool(host='dspace.dgsca.unam.mx'... \n", "13347 HTTPSConnectionPool(host='library.for.nau.edu'... \n", "13354 HTTPConnectionPool(host='numerique.bibliothequ... \n", "13356 HTTPConnectionPool(host='204.232.200.153', por... \n", "\n", "[2256 rows x 2 columns]" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_errors[get_errors.get_error.str.contains('Max')]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_error
485https://idr.openmicroscopy.orgExceeded 30 redirects.
1191http://hgtree.snu.ac.krExceeded 30 redirects.
1462http://cfgp.snu.ac.krExceeded 30 redirects.
2526https://incois.gov.in/tropflux/index.jspExceeded 30 redirects.
2550https://odis.incois.gov.in/Exceeded 30 redirects.
3872https://www.projectdatasphere.org/projectdatas...Exceeded 30 redirects.
6141http://repository.poligran.edu.co/Exceeded 30 redirects.
10841http://library.wur.nl/way/Exceeded 30 redirects.
\n", "
" ], "text/plain": [ " url \\\n", "485 https://idr.openmicroscopy.org \n", "1191 http://hgtree.snu.ac.kr \n", "1462 http://cfgp.snu.ac.kr \n", "2526 https://incois.gov.in/tropflux/index.jsp \n", "2550 https://odis.incois.gov.in/ \n", "3872 https://www.projectdatasphere.org/projectdatas... \n", "6141 http://repository.poligran.edu.co/ \n", "10841 http://library.wur.nl/way/ \n", "\n", " head_error \n", "485 Exceeded 30 redirects. \n", "1191 Exceeded 30 redirects. \n", "1462 Exceeded 30 redirects. \n", "2526 Exceeded 30 redirects. \n", "2550 Exceeded 30 redirects. \n", "3872 Exceeded 30 redirects. \n", "6141 Exceeded 30 redirects. \n", "10841 Exceeded 30 redirects. " ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_errors[head_errors.head_error.str.contains('30 redirect')]" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_error
6141http://repository.poligran.edu.co/Exceeded 30 redirects.
\n", "
" ], "text/plain": [ " url get_error\n", "6141 http://repository.poligran.edu.co/ Exceeded 30 redirects." ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_errors[get_errors.get_error.str.contains('30 redirect')]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Testing Ukrainian repositories" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1742693288.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " get_errors[get_errors.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_error
4892http://eprints.isofts.kiev.ua/('Connection aborted.', ConnectionResetError(5...
4943http://www.library.ukma.kiev.ua/dspace/HTTPConnectionPool(host='www.library.ukma.kiev...
4983http://repository.ibss.org.ua/dspace/HTTPConnectionPool(host='repository.ibss.org.u...
5147http://www.rarelib.undp.org.ua/eng/index.php3HTTPConnectionPool(host='www.rarelib.undp.org....
5817http://openarchive.nure.ua/HTTPSConnectionPool(host='openarchive.nure.ua'...
5827http://dspace.uabs.edu.ua/jspui/HTTPConnectionPool(host='dspace.uabs.edu.ua', ...
6135http://kgmtu.edu.ua/jspui/HTTPConnectionPool(host='kgmtu.edu.ua', port=8...
6231http://dspace.uccu.org.ua/HTTPConnectionPool(host='dspace.uccu.org.ua', ...
6474http://repo.uipa.edu.ua/jspui/HTTPConnectionPool(host='repo.uipa.edu.ua', po...
7067http://ir.asta.edu.ua/jspui/HTTPConnectionPool(host='ir.asta.edu.ua', port...
7490http://dspace.hnpu.edu.ua/HTTPConnectionPool(host='dspace.hnpu.edu.ua', ...
7989http://repository.mdu.in.ua/jspui/HTTPConnectionPool(host='repository.mdu.in.ua'...
8085http://elar.tsatu.edu.ua/HTTPConnectionPool(host='elar.tsatu.edu.ua', p...
8105http://dspace.zsea.edu.ua/HTTPSConnectionPool(host='dspace.znu.edu.ua', ...
8197http://dspace.knau.kharkov.ua/jspui/HTTPConnectionPool(host='dspace.knau.kharkov.u...
8199http://rep.btsau.edu.uaHTTPConnectionPool(host='rep.btsau.edu.ua', po...
8206https://biblio.onat.edu.ua/HTTPSConnectionPool(host='biblio.onat.edu.ua',...
8419https://rep.ksma.ks.uaHTTPSConnectionPool(host='rep.ksma.ks.ua', por...
10016http://eprints.ksame.kharkov.ua/HTTPConnectionPool(host='eprints.ksame.kharkov...
10107http://dspace.uabs.edu.ua/HTTPConnectionPool(host='dspace.uabs.edu.ua', ...
10799http://repository.ibss.org.ua/dspaceHTTPConnectionPool(host='repository.ibss.org.u...
10906http://repo.sau.sumy.ua/HTTPConnectionPool(host='repo.sau.sumy.ua', po...
10976http://www.dspace.humanities.org.ua/dspace/HTTPConnectionPool(host='www.dspace.humanities...
11616http:// visnykagro.knau.kharkov.uaHTTPConnectionPool(host='%20visnykagro.knau.kh...
11803http://repository.mdu.in.ua/HTTPConnectionPool(host='repository.mdu.in.ua'...
11831http://sj-dstu.dp.ua/HTTPConnectionPool(host='sj-dstu.dp.ua', port=...
12107http://www.hups.mil.gov.ua/periodic-app/journalHTTPConnectionPool(host='www.hups.mil.gov.ua',...
12746http://www.e-archive.org.ua/HTTPConnectionPool(host='www.e-archive.org.ua'...
12753http://library.khnu.km.ua/jspui/HTTPConnectionPool(host='library.khnu.km.ua', ...
12916http://dspace.ukrfa.kharkov.ua/HTTPConnectionPool(host='dspace.ukrfa.kharkov....
12922http://dspace.ukrfa.kharkov.uaHTTPConnectionPool(host='dspace.ukrfa.kharkov....
12970http://evrica.org.uaHTTPConnectionPool(host='evrica.org.ua', port=...
12992http://dspace.academy.sumy.ua/HTTPConnectionPool(host='dspace.academy.sumy.u...
13017http://open-archive.kture.kharkov.ua/HTTPConnectionPool(host='open-archive.kture.kh...
13034http://dspace.onu.edu.ua/HTTPConnectionPool(host='192.168.30.200', port...
13052http://open-arhive.kture.kharkov.uaHTTPConnectionPool(host='open-arhive.kture.kha...
\n", "
" ], "text/plain": [ " url \\\n", "4892 http://eprints.isofts.kiev.ua/ \n", "4943 http://www.library.ukma.kiev.ua/dspace/ \n", "4983 http://repository.ibss.org.ua/dspace/ \n", "5147 http://www.rarelib.undp.org.ua/eng/index.php3 \n", "5817 http://openarchive.nure.ua/ \n", "5827 http://dspace.uabs.edu.ua/jspui/ \n", "6135 http://kgmtu.edu.ua/jspui/ \n", "6231 http://dspace.uccu.org.ua/ \n", "6474 http://repo.uipa.edu.ua/jspui/ \n", "7067 http://ir.asta.edu.ua/jspui/ \n", "7490 http://dspace.hnpu.edu.ua/ \n", "7989 http://repository.mdu.in.ua/jspui/ \n", "8085 http://elar.tsatu.edu.ua/ \n", "8105 http://dspace.zsea.edu.ua/ \n", "8197 http://dspace.knau.kharkov.ua/jspui/ \n", "8199 http://rep.btsau.edu.ua \n", "8206 https://biblio.onat.edu.ua/ \n", "8419 https://rep.ksma.ks.ua \n", "10016 http://eprints.ksame.kharkov.ua/ \n", "10107 http://dspace.uabs.edu.ua/ \n", "10799 http://repository.ibss.org.ua/dspace \n", "10906 http://repo.sau.sumy.ua/ \n", "10976 http://www.dspace.humanities.org.ua/dspace/ \n", "11616 http:// visnykagro.knau.kharkov.ua \n", "11803 http://repository.mdu.in.ua/ \n", "11831 http://sj-dstu.dp.ua/ \n", "12107 http://www.hups.mil.gov.ua/periodic-app/journal \n", "12746 http://www.e-archive.org.ua/ \n", "12753 http://library.khnu.km.ua/jspui/ \n", "12916 http://dspace.ukrfa.kharkov.ua/ \n", "12922 http://dspace.ukrfa.kharkov.ua \n", "12970 http://evrica.org.ua \n", "12992 http://dspace.academy.sumy.ua/ \n", "13017 http://open-archive.kture.kharkov.ua/ \n", "13034 http://dspace.onu.edu.ua/ \n", "13052 http://open-arhive.kture.kharkov.ua \n", "\n", " get_error \n", "4892 ('Connection aborted.', ConnectionResetError(5... \n", "4943 HTTPConnectionPool(host='www.library.ukma.kiev... \n", "4983 HTTPConnectionPool(host='repository.ibss.org.u... \n", "5147 HTTPConnectionPool(host='www.rarelib.undp.org.... \n", "5817 HTTPSConnectionPool(host='openarchive.nure.ua'... \n", "5827 HTTPConnectionPool(host='dspace.uabs.edu.ua', ... 
\n", "6135 HTTPConnectionPool(host='kgmtu.edu.ua', port=8... \n", "6231 HTTPConnectionPool(host='dspace.uccu.org.ua', ... \n", "6474 HTTPConnectionPool(host='repo.uipa.edu.ua', po... \n", "7067 HTTPConnectionPool(host='ir.asta.edu.ua', port... \n", "7490 HTTPConnectionPool(host='dspace.hnpu.edu.ua', ... \n", "7989 HTTPConnectionPool(host='repository.mdu.in.ua'... \n", "8085 HTTPConnectionPool(host='elar.tsatu.edu.ua', p... \n", "8105 HTTPSConnectionPool(host='dspace.znu.edu.ua', ... \n", "8197 HTTPConnectionPool(host='dspace.knau.kharkov.u... \n", "8199 HTTPConnectionPool(host='rep.btsau.edu.ua', po... \n", "8206 HTTPSConnectionPool(host='biblio.onat.edu.ua',... \n", "8419 HTTPSConnectionPool(host='rep.ksma.ks.ua', por... \n", "10016 HTTPConnectionPool(host='eprints.ksame.kharkov... \n", "10107 HTTPConnectionPool(host='dspace.uabs.edu.ua', ... \n", "10799 HTTPConnectionPool(host='repository.ibss.org.u... \n", "10906 HTTPConnectionPool(host='repo.sau.sumy.ua', po... \n", "10976 HTTPConnectionPool(host='www.dspace.humanities... \n", "11616 HTTPConnectionPool(host='%20visnykagro.knau.kh... \n", "11803 HTTPConnectionPool(host='repository.mdu.in.ua'... \n", "11831 HTTPConnectionPool(host='sj-dstu.dp.ua', port=... \n", "12107 HTTPConnectionPool(host='www.hups.mil.gov.ua',... \n", "12746 HTTPConnectionPool(host='www.e-archive.org.ua'... \n", "12753 HTTPConnectionPool(host='library.khnu.km.ua', ... \n", "12916 HTTPConnectionPool(host='dspace.ukrfa.kharkov.... \n", "12922 HTTPConnectionPool(host='dspace.ukrfa.kharkov.... \n", "12970 HTTPConnectionPool(host='evrica.org.ua', port=... \n", "12992 HTTPConnectionPool(host='dspace.academy.sumy.u... \n", "13017 HTTPConnectionPool(host='open-archive.kture.kh... \n", "13034 HTTPConnectionPool(host='192.168.30.200', port... \n", "13052 HTTPConnectionPool(host='open-arhive.kture.kha... 
" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows of get_errors whose URL has '.ua' at the very end or directly before a '/'.\n", "# Raw string + non-capturing group: same matches, but no match-group UserWarning on the next run.\n", "get_errors[get_errors.url.str.contains(r'\\.ua(?:/|$)', regex=True)]" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2005047697.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " get_errors[get_errors.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)].count()\n" ] }, { "data": { "text/plain": [ "url 36\n", "get_error 36\n", "dtype: int64" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of get_errors rows whose URL matches the '.ua' pattern (end of URL or before a '/').\n", "get_errors[get_errors.url.str.contains(r'\\.ua(?:/|$)', regex=True)].count()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2148841299.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " df[df.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)]\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_urlhead_chainhead_codeshead_elapsedhead_errorget_final_codeget_urlget_chainget_codesget_elapsedget_error
2185http://inform.ikd.kiev.ua/index.php?path=/en/i...200.0http://inform.ikd.kiev.ua/?path=/en/index['http://inform.ikd.kiev.ua/index.php?path=/en...[301]0.402488NaN200.0http://inform.ikd.kiev.ua/?path=/en/index['http://inform.ikd.kiev.ua/index.php?path=/en...[301]0.452971NaN
2286http://wdc.org.ua/200.0http://wdc.org.ua/[][]0.905063NaN200.0http://wdc.org.ua/[][]0.816887NaN
4892http://eprints.isofts.kiev.ua/NaNNaNNaNNaNNaN('Connection aborted.', ConnectionResetError(5...NaNNaNNaNNaNNaN('Connection aborted.', ConnectionResetError(5...
4943http://www.library.ukma.kiev.ua/dspace/NaNNaNNaNNaNNaNHTTPConnectionPool(host='www.library.ukma.kiev...NaNNaNNaNNaNNaNHTTPConnectionPool(host='www.library.ukma.kiev...
4950http://www.dspace.ucu.edu.ua/dspace/403.0http://www.dspace.ucu.edu.ua/dspace/[][]0.073373NaN403.0http://www.dspace.ucu.edu.ua/dspace/[][]0.073223NaN
..........................................
13015http://ir.kneu.edu.ua200.0http://ir.kneu.edu.ua/[][]1.609602NaN200.0http://ir.kneu.edu.ua/[][]1.508854NaN
13017http://open-archive.kture.kharkov.ua/NaNNaNNaNNaNNaNHTTPConnectionPool(host='open-archive.kture.kh...NaNNaNNaNNaNNaNHTTPConnectionPool(host='open-archive.kture.kh...
13034http://dspace.onu.edu.ua/NaNNaNNaNNaNNaNHTTPConnectionPool(host='192.168.30.200', port...NaNNaNNaNNaNNaNHTTPConnectionPool(host='192.168.30.200', port...
13052http://open-arhive.kture.kharkov.uaNaNNaNNaNNaNNaNHTTPConnectionPool(host='open-arhive.kture.kha...NaNNaNNaNNaNNaNHTTPConnectionPool(host='open-arhive.kture.kha...
13248http://www.ekmair.ukma.edu.ua200.0http://ekmair.ukma.edu.ua/['http://www.ekmair.ukma.edu.ua/'][301]0.661039NaN200.0http://ekmair.ukma.edu.ua/['http://www.ekmair.ukma.edu.ua/'][301]0.755170NaN
\n", "

146 rows × 13 columns

\n", "
" ], "text/plain": [ " url head_final_code \\\n", "2185 http://inform.ikd.kiev.ua/index.php?path=/en/i... 200.0 \n", "2286 http://wdc.org.ua/ 200.0 \n", "4892 http://eprints.isofts.kiev.ua/ NaN \n", "4943 http://www.library.ukma.kiev.ua/dspace/ NaN \n", "4950 http://www.dspace.ucu.edu.ua/dspace/ 403.0 \n", "... ... ... \n", "13015 http://ir.kneu.edu.ua 200.0 \n", "13017 http://open-archive.kture.kharkov.ua/ NaN \n", "13034 http://dspace.onu.edu.ua/ NaN \n", "13052 http://open-arhive.kture.kharkov.ua NaN \n", "13248 http://www.ekmair.ukma.edu.ua 200.0 \n", "\n", " head_url \\\n", "2185 http://inform.ikd.kiev.ua/?path=/en/index \n", "2286 http://wdc.org.ua/ \n", "4892 NaN \n", "4943 NaN \n", "4950 http://www.dspace.ucu.edu.ua/dspace/ \n", "... ... \n", "13015 http://ir.kneu.edu.ua/ \n", "13017 NaN \n", "13034 NaN \n", "13052 NaN \n", "13248 http://ekmair.ukma.edu.ua/ \n", "\n", " head_chain head_codes \\\n", "2185 ['http://inform.ikd.kiev.ua/index.php?path=/en... [301] \n", "2286 [] [] \n", "4892 NaN NaN \n", "4943 NaN NaN \n", "4950 [] [] \n", "... ... ... \n", "13015 [] [] \n", "13017 NaN NaN \n", "13034 NaN NaN \n", "13052 NaN NaN \n", "13248 ['http://www.ekmair.ukma.edu.ua/'] [301] \n", "\n", " head_elapsed head_error \\\n", "2185 0.402488 NaN \n", "2286 0.905063 NaN \n", "4892 NaN ('Connection aborted.', ConnectionResetError(5... \n", "4943 NaN HTTPConnectionPool(host='www.library.ukma.kiev... \n", "4950 0.073373 NaN \n", "... ... ... \n", "13015 1.609602 NaN \n", "13017 NaN HTTPConnectionPool(host='open-archive.kture.kh... \n", "13034 NaN HTTPConnectionPool(host='192.168.30.200', port... \n", "13052 NaN HTTPConnectionPool(host='open-arhive.kture.kha... \n", "13248 0.661039 NaN \n", "\n", " get_final_code get_url \\\n", "2185 200.0 http://inform.ikd.kiev.ua/?path=/en/index \n", "2286 200.0 http://wdc.org.ua/ \n", "4892 NaN NaN \n", "4943 NaN NaN \n", "4950 403.0 http://www.dspace.ucu.edu.ua/dspace/ \n", "... ... ... 
\n", "13015 200.0 http://ir.kneu.edu.ua/ \n", "13017 NaN NaN \n", "13034 NaN NaN \n", "13052 NaN NaN \n", "13248 200.0 http://ekmair.ukma.edu.ua/ \n", "\n", " get_chain get_codes \\\n", "2185 ['http://inform.ikd.kiev.ua/index.php?path=/en... [301] \n", "2286 [] [] \n", "4892 NaN NaN \n", "4943 NaN NaN \n", "4950 [] [] \n", "... ... ... \n", "13015 [] [] \n", "13017 NaN NaN \n", "13034 NaN NaN \n", "13052 NaN NaN \n", "13248 ['http://www.ekmair.ukma.edu.ua/'] [301] \n", "\n", " get_elapsed get_error \n", "2185 0.452971 NaN \n", "2286 0.816887 NaN \n", "4892 NaN ('Connection aborted.', ConnectionResetError(5... \n", "4943 NaN HTTPConnectionPool(host='www.library.ukma.kiev... \n", "4950 0.073223 NaN \n", "... ... ... \n", "13015 1.508854 NaN \n", "13017 NaN HTTPConnectionPool(host='open-archive.kture.kh... \n", "13034 NaN HTTPConnectionPool(host='192.168.30.200', port... \n", "13052 NaN HTTPConnectionPool(host='open-arhive.kture.kha... \n", "13248 0.755170 NaN \n", "\n", "[146 rows x 13 columns]" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All rows of df whose URL has '.ua' at the very end or directly before a '/'.\n", "# Raw string + non-capturing group: same matches, but no match-group UserWarning on the next run.\n", "df[df.url.str.contains(r'\\.ua(?:/|$)', regex=True)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Redirects" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codes
len
06596659665966596
13364336433643364
2686686686686
3233233233233
437373737
510101010
\n", "
" ], "text/plain": [ " url head_url head_final_code head_codes\n", "len \n", "0 6596 6596 6596 6596\n", "1 3364 3364 3364 3364\n", "2 686 686 686 686\n", "3 233 233 233 233\n", "4 37 37 37 37\n", "5 10 10 10 10" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows of df that have head_codes recorded, reduced to the columns needed for the redirect analysis.\n", "# One .loc selection plus an explicit copy, so adding columns below never writes into a slice of df.\n", "head_chains = df.loc[df.head_codes.notna(), ['url', 'head_url', 'head_final_code', 'head_codes']].copy()\n", "# head_codes is parsed with ast.literal_eval; 'len' is the number of entries in the parsed value.\n", "head_chains['len'] = head_chains.head_codes.apply(ast.literal_eval).str.len()\n", "head_chains.groupby('len').count()\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codes
len
06595659565956595
13376337633763376
2699699699699
3229229229229
443434343
512121212
\n", "
" ], "text/plain": [ " url get_url get_final_code get_codes\n", "len \n", "0 6595 6595 6595 6595\n", "1 3376 3376 3376 3376\n", "2 699 699 699 699\n", "3 229 229 229 229\n", "4 43 43 43 43\n", "5 12 12 12 12" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows of df that have get_codes recorded, reduced to the columns needed for the redirect analysis.\n", "# One .loc selection plus an explicit copy, so adding columns below never writes into a slice of df.\n", "get_chains = df.loc[df.get_codes.notna(), ['url', 'get_url', 'get_final_code', 'get_codes']].copy()\n", "# get_codes is parsed with ast.literal_eval; 'len' is the number of entries in the parsed value.\n", "get_chains['len'] = get_chains.get_codes.apply(ast.literal_eval).str.len()\n", "get_chains.groupby('len').count()" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEICAYAAACzliQjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYCUlEQVR4nO3df5TV9X3n8ecrCA4rbESdesgMycy6bBNJNgOdBVtNN5BG0HoWTNMg6SY0dUs8B3LUNE2hZ08kSW3iSSttGkMOiSxYI4RKjBNlJRSRhKYIgyKK6DqruA5FGUFIlGBE3vvH/QxccH7cmblzL/J5Pc65536/7+/n+72f7z2c13z53O/9XEUEZmaWh3dUuwNmZlY5Dn0zs4w49M3MMuLQNzPLiEPfzCwjDn0zs4z0GvqSaiRtkfSYpJ2SvpzqyyQ9J2l7ejSluiR9U1KbpB2SJhQda7akZ9Jj9qCdlZmZdemsEtq8DkyJiFclDQU2SfrfadufR8Tdp7S/AhibHpOAxcAkSecBNwHNQADbJLVExCvdvfAFF1wQDQ0NfTohM7Pcbdu27eWIqO1qW6+hH4Vvb72aVoemR0/f6JoO3JH22yzpXEmjgQ8D6yLiAICkdcA0YEV3B2poaKC1tbW3LpqZWRFJz3e3raQxfUlDJG0H9lEI7ofTppvTEM4iSWenWh3wQtHu7anWXd3MzCqkpNCPiDcjogmoByZKej+wAHgv8F+A84C/KEeHJM2R1CqptaOjoxyHNDOzpE9370TEQWADMC0i9kbB68D/AiamZnuAMUW71adad/VTX2NJRDRHRHNtbZdDUmZm1k+9julLqgXeiIiDkoYDHwVukTQ6IvZKEjADeCLt0gLMk7SSwge5h1K7tcBfSxqV2l1O4X8LZmb99sYbb9De3s6RI0eq3ZWKq6mpob6+nqFDh5a8Tyl374wGlksaQuF/Bqsi4j5JD6Y/CAK2A9el9muAK4E24DDwGYCIOCDpq8DW1O4rnR/qmpn1V3t7OyNHjqShoYHCNWgeIoL9+/fT3t5OY2NjyfuVcvfODmB8F/Up3bQPYG4325YCS0vunZlZL44cOZJd4ANI4vzzz6evn336G7lm9raXW+B36s95O/TNzAZoxIgRJ60vW7aMefPmAbBw4ULq6upoamo6/jh48ODxtjfccAN1dXUcO3bspP1ra2sZP348Y8eOZerUqfz85z8vS19LGdM3M3vbaJh/f1mPt/vrvz/gY9x444184QtfeEv92LFj3HPPPYwZM4aN
GzcyefLk49tmzpzJt771LQA2bNjAxz72MTZs2MD73ve+AfXljA/9cvwD2F3zyYF3ZOGhgR/DzM4oDz30EOPGjWPmzJmsWLHipNAvNnnyZObMmcOSJUtYtGjRgF7TwztmZgP0q1/96qThmy996UsnbV+0aNHxbcXBvmLFCmbNmsXVV1/N/fffzxtvvNHta0yYMIGnnnpqwH0946/0zcwG2/Dhw9m+ffvx9WXLlp00b1hXwzu//vWvWbNmDbfeeisjR45k0qRJrF27lquuuqrL1yjcGDlwDn0zsypYu3YtBw8e5AMf+AAAhw8fZvjw4d2G/qOPPjrg8Xxw6JuZVcWKFSv43ve+x6xZswB47bXXaGxs5PDhw29pu3HjRpYsWcKGDRsG/LoOfTOzQbZo0SLuvPPO4+t33XUXDzzwAN/5zneO18455xwuu+wyfvzjHwPwgx/8gE2bNnH48GEaGxtZvXp1Wa70Va5xosHQ3NwcA51P33fvmJ3Zdu3aVZYwfLvq6vwlbYuI5q7a++4dM7OMOPTNzDLi0Dczy4hD38wsIw59M7OMOPTNzDLi+/TNzAbopZde4sYbb2Tz5s2MGjWKYcOG8cUvfpFRo0Yxffr0k37ZasGCBXzta18D4MUXX2TIkCF0/h74li1bGDZs2KD21aFvZmeWhe8s8/F6/o5NRDBjxgxmz57NXXfdBcDzzz9PS0sLo0aN4kMf+hD33XffSfvMnDmzcOiFCxkxYkSX0y4PFg/vmJkNwIMPPsiwYcO47rrrjtfe85738LnPfa6Kveqer/TNzAZg586dTJgwodvtP/vZz2hqajq+vnr1ai666KIK9KxrDn0zszKaO3cumzZtYtiwYXzjG9/ocninmjy8Y2Y2AOPGjeORRx45vn7bbbexfv16Ojo6qtir7vUa+pJqJG2R9JiknZK+nOqNkh6W1CbpB5KGpfrZab0tbW8oOtaCVH9a0tRBOyszswqZMmUKR44cYfHixcdrXU2PfLoo5Ur/dWBKRHwQaAKmSboEuAVYFBH/EXgFuDa1vxZ4JdUXpXZIuhi4BhgHTAO+LWlIGc/FzKziJPGjH/2IjRs30tjYyMSJE5k9eza33HILcGJMv/Nx9913V7W/vY7pR2Hu5VfT6tD0CGAK0Dnn8HJgIbAYmJ6WAe4GviVJqb4yIl4HnpPUBkwE/rUcJ2JmBlRlGvPRo0ezcuXKLrcdOtR9fxYuXDhIPepeSWP6koZI2g7sA9YB/xc4GBFHU5N2oC4t1wEvAKTth4Dzi+td7GNmZhVQUuhHxJsR0QTUU7g6f+9gdUjSHEmtklpP1w9CzMzervp0905EHAQ2AL8NnCupc3ioHtiTlvcAYwDS9ncC+4vrXexT/BpLIqI5Ipo7v5psZmblUcrdO7WSzk3Lw4GPArsohP/HU7PZwL1puSWtk7Y/mD4XaAGuSXf3NAJjgS1lOg8zy9jp/LOvg6k/513Kl7NGA8vTnTbvAFZFxH2SngRWSvor4FHg9tT+duAf0we1ByjcsUNE7JS0CngSOArMjYg3+9xjM7MiNTU17N+/n/PPP5/CPSN5iAj2799PTU1Nn/Yr5e6dHcD4LurPUhjfP7V+BPjDbo51M3Bzn3poZtaD+vp62tvbT9svQw2mmpoa6uvr+7SPp2Ews7e1oUOHnjR1sfXM0zCYmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWWk19CXNEbSBklPStop6fpUXyhpj6Tt6XFl0T4LJLVJelrS1KL6tFRrkzR/cE7JzMy6c1YJbY4CfxYRj0gaCWyTtC5tWxQRf1PcWNLFwDXAOOBdwD9L+k9p823AR4F2YKukloh4shwnYmZmves19CNiL7A3Lf9S0i6groddpgMrI+J14DlJbcDE
tK0tIp4FkLQytXXom5lVSJ/G9CU1AOOBh1NpnqQdkpZKGpVqdcALRbu1p1p39VNfY46kVkmtHR0dfememZn1ouTQlzQCWA3cEBG/ABYDFwFNFP4n8Lfl6FBELImI5ohorq2tLcchzcwsKWVMH0lDKQT+9yPihwAR8VLR9u8C96XVPcCYot3rU40e6mZmVgGl3L0j4HZgV0TcWlQfXdTsauCJtNwCXCPpbEmNwFhgC7AVGCupUdIwCh/2tpTnNMzMrBSlXOlfCnwKeFzS9lT7S2CWpCYggN3AZwEiYqekVRQ+oD0KzI2INwEkzQPWAkOApRGxs2xnYmZmvSrl7p1NgLrYtKaHfW4Gbu6ivqan/czMbHD5G7lmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhnpNfQljZG0QdKTknZKuj7Vz5O0TtIz6XlUqkvSNyW1SdohaULRsWan9s9Imj14p2VmZl0p5Ur/KPBnEXExcAkwV9LFwHxgfUSMBdandYArgLHpMQdYDIU/EsBNwCRgInBT5x8KMzOrjF5DPyL2RsQjafmXwC6gDpgOLE/NlgMz0vJ04I4o2AycK2k0MBVYFxEHIuIVYB0wrZwnY2ZmPevTmL6kBmA88DBwYUTsTZteBC5My3XAC0W7tadad3UzM6uQkkNf0ghgNXBDRPyieFtEBBDl6JCkOZJaJbV2dHSU45BmZpaUFPqShlII/O9HxA9T+aU0bEN63pfqe4AxRbvXp1p39ZNExJKIaI6I5tra2r6ci5mZ9aKUu3cE3A7siohbiza1AJ134MwG7i2qfzrdxXMJcCgNA60FLpc0Kn2Ae3mqmZlZhZxVQptLgU8Bj0vanmp/CXwdWCXpWuB54BNp2xrgSqANOAx8BiAiDkj6KrA1tftKRBwox0mYmVlpeg39iNgEqJvNH+mifQBzuznWUmBpXzpoZmbl42/kmpllxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpYRh76ZWUYc+mZmGXHom5llxKFvZpaRXkNf0lJJ+yQ9UVRbKGmPpO3pcWXRtgWS2iQ9LWlqUX1aqrVJml/+UzEzs96UcqW/DJjWRX1RRDSlxxoASRcD1wDj0j7fljRE0hDgNuAK4GJgVmprZmYVdFZvDSLip5IaSjzedGBlRLwOPCepDZiYtrVFxLMAklamtk/2vctmZtZfAxnTnydpRxr+GZVqdcALRW3aU627upmZVVB/Q38xcBHQBOwF/rZcHZI0R1KrpNaOjo5yHdbMzOhn6EfESxHxZkQcA77LiSGcPcCYoqb1qdZdvatjL4mI5ohorq2t7U/3zMysG/0KfUmji1avBjrv7GkBrpF0tqRGYCywBdgKjJXUKGkYhQ97W/rfbTMz649eP8iVtAL4MHCBpHbgJuDDkpqAAHYDnwWIiJ2SVlH4gPYoMDci3kzHmQesBYYASyNiZ7lPxszMelbK3Tuzuijf3kP7m4Gbu6ivAdb0qXdmZlZW/kaumVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhnpNfQlLZW0T9ITRbXzJK2T9Ex6HpXqkvRNSW2SdkiaULTP7NT+GUmzB+d0zMysJ6Vc6S8Dpp1Smw+sj4ixwPq0DnAFMDY95gCLofBHArgJmARMBG7q/ENhZmaV02voR8RPgQOnlKcDy9PycmBGUf2OKNgMnCtp
NDAVWBcRByLiFWAdb/1DYmZmg6y/Y/oXRsTetPwicGFargNeKGrXnmrd1d9C0hxJrZJaOzo6+tk9MzPryoA/yI2IAKIMfek83pKIaI6I5tra2nId1szM6H/ov5SGbUjP+1J9DzCmqF19qnVXNzOzCupv6LcAnXfgzAbuLap/Ot3FcwlwKA0DrQUulzQqfYB7eaqZmVkFndVbA0krgA8DF0hqp3AXzteBVZKuBZ4HPpGarwGuBNqAw8BnACLigKSvAltTu69ExKkfDpuZ2SDrNfQjYlY3mz7SRdsA5nZznKXA0j71zszMysrfyDUzy4hD38wsIw59M7OMOPTNzDLi0Dczy4hD38wsIw59M7OMOPTNzDLi0Dczy4hD38wsIw59M7OMOPTNzDLi0Dczy0ivs2yanWThO8twjEMDP4aZ9YtDPyMN8+8f8DF215ShI2ZWNR7eMTPLiEPfzCwjDn0zs4w49M3MMuLQNzPLiEPfzCwjAwp9SbslPS5pu6TWVDtP0jpJz6TnUakuSd+U1CZph6QJ5TgBMzMrXTmu9CdHRFNENKf1+cD6iBgLrE/rAFcAY9NjDrC4DK9tZmZ9MBjDO9OB5Wl5OTCjqH5HFGwGzpU0ehBe38zMujHQ0A/gJ5K2SZqTahdGxN60/CJwYVquA14o2rc91czMrEIGOg3DZRGxR9JvAOskPVW8MSJCUvTlgOmPxxyAd7/73QPsnpmZFRvQlX5E7EnP+4B7gInAS53DNul5X2q+BxhTtHt9qp16zCUR0RwRzbW1tQPpnpmZnaLfoS/pHEkjO5eBy4EngBZgdmo2G7g3LbcAn0538VwCHCoaBjIzswoYyPDOhcA9kjqPc1dEPCBpK7BK0rXA88AnUvs1wJVAG3AY+MwAXtvMzPqh36EfEc8CH+yivh/4SBf1AOb29/XMzGzg/I1cM7OMOPTNzDLi0Dczy4hD38wsIw59M7OMOPTNzDLi0Dczy4hD38wsIw59M7OMOPTNzDLi0Dczy4hD38wsIwP9ERWzt6WG+fcP+Bi7az458I4sPDTwY5j1ga/0zcwy4tA3M8uIQ9/MLCMOfTOzjDj0zcwy4tA3M8uIQ9/MLCMOfTOzjDj0zcwyUvHQlzRN0tOS2iTNr/Trm5nlrKLTMEgaAtwGfBRoB7ZKaomIJyvZDzMrs4XvLMMxPCVFJVR67p2JQFtEPAsgaSUwHXDom1VJeeYhKkNHrCIqHfp1wAtF6+3ApAr3wcysSzlMxKeIGLSDv+XFpI8D0yLif6T1TwGTImJeUZs5wJy0+pvA0xXrYPcuAF6udidOE34vTvB7cYLfixNOh/fiPRFR29WGSl/p7wHGFK3Xp9pxEbEEWFLJTvVGUmtENFe7H6cDvxcn+L04we/FCaf7e1Hpu3e2AmMlNUoaBlwDtFS4D2Zm2arolX5EHJU0D1gLDAGWRsTOSvbBzCxnFf/lrIhYA6yp9OsO0Gk13FRlfi9O8Htxgt+LE07r96KiH+SamVl1eRoGM7OMOPR7IGmppH2Snqh2X6pJ0hhJGyQ9KWmnpOur3adqkVQjaYukx9J78eVq96naJA2R9Kik+6rdl2qStFvS45K2S2qtdn+64+GdHkj6XeBV4I6IeH+1+1MtkkYDoyPiEUkjgW3AjBynz5Ak4JyIeFXSUGATcH1EbK5y16pG0ueBZuDfR8RV1e5PtUjaDTRHRLXv0e+Rr/R7EBE/BQ5Uux/VFhF7I+KRtPxLYBeFb1dnJwpeTatD0yPbKydJ9cDvA9+rdl+sNA596xNJDcB44OEqd6Vq0nDGdmAfsC4isn0vgL8Dvggcq3I/TgcB/ETStjSzwGnJoW8lkzQCWA3cEBG/qHZ/qiUi3oyIJgrfKJ8oKcuhP0lXAfsiYlu1+3KauCwiJgBXAHPT8PBpx6FvJUnj16uB70fED6vdn9NBRBwENgDTqtyVarkU+G9pLHslMEXSndXtUvVExJ70vA+4h8Kswqcdh771Kn14eTuwKyJu
rXZ/qklSraRz0/JwCr8N8VRVO1UlEbEgIuojooHClCoPRsR/r3K3qkLSOekmBySdA1wOnJZ3/Tn0eyBpBfCvwG9Kapd0bbX7VCWXAp+icCW3PT2urHanqmQ0sEHSDgpzSa2LiKxvVTQALgQ2SXoM2ALcHxEPVLlPXfItm2ZmGfGVvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6dkaS9DVJkyXNkLSgzMd+NT2/S9LdZTrmDZL+XTmOZdYTh76dqSYBm4H/Cvy0t8aS+vwrchHxbxHx8XIcC7gBcOjboKv4zyWaDSZJ3wCmAo0Uvlh3EfARSXdHxFdOabsMOEJhArl/kXQbcBtQCxwG/jQinpLUCNwFjADuLdq/AbgvIt4v6Y+Bj6U2Q9KX1/4BeD+FmTgXRsS9koYAt1CYuuEY8F1AwLsofOnrZeD3KHwDupnCJF5LI2JROd8ny5dD384oEfHnklYBnwY+DzwUEZf2sEs98DsR8aak9cB1EfGMpEnAt4EpwN8DiyPiDklzezjWBOA/R8QBSX9NYVqCP0nTNmyR9M+pXw1AU0QclXReav95YHJEvCzpt4C6zt9w6Jz2wawcHPp2JpoAPAa8l8Lc/z35pxT4I4DfAf6pMNUQAGen50uBP0jL/0jhSr0r6yKi8/cXLqcwGdkX0noN8G4KV/HfiYijAEXtiz0L/AdJ/wDcD/ykl3MwK5lD384YkpqAZRSu3l+mMEauNPf9b0fEr7rY7bX0/A7gYJoyuSulzFfyWtGygD+IiKdP6WOvB4mIVyR9kMIw1XXAJ4A/KeH1zXrlD3LtjBER21No/x/gYuBBYGpENHUT+MX7/gJ4TtIfQuEvRQpegH+hMIskwB+V2J21wOfSDKVIGp/q64DPdn7YK+m8VP8l0DlL4wXAOyJiNfA/KfzPxawsHPp2RpFUC7wSEceA9/bxd3z/CLg2zZS4E5ie6tdT+FGMxyn9ZyK/SuED3B2SdqZ1KPys4P9L9ceAT6b6EuABSRvSazyU/odyJ1DWW04tb55l08wsI77SNzPLiEPfzCwjDn0zs4w49M3MMuLQNzPLiEPfzCwjDn0zs4w49M3MMvL/AUIHAL2DlXT/AAAAAElFTkSuQmCC", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "data = head_chains.groupby('len').count().join(get_chains.groupby('len').count(), lsuffix='_head', rsuffix='_get')\n", "data.index.rename('# redirects', inplace=True)\n", "data.rename(columns={'url_head': 'HEAD', 'url_get': 'GET'}, inplace=True)\n", "data[['HEAD', 'GET']][1:].plot.bar(rot=0)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "HEAD 4330\n", "GET 4359\n", "dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[['HEAD', 'GET']][1:].sum()" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "HEAD 32.419886\n", "GET 32.637017\n", "dtype: float64" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data[['HEAD', 'GET']][1:].sum().div(df.shape[0]).mul(100)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
11http://epi.minsal.cl/bases-de-datos/http://epi.minsal.cl/bases-de-datos/403.0[302]1400.0
84https://protocols.io/welcomehttps://www.protocols.io:443/welcome400.0[301]1400.0
177http://agris.fao.org/agris-search/index.dohttps://agris.fao.org:443/agris-search/index.do404.0[301]1400.0
228http://oryzatagline.cirad.fr/https://oryzatagline.cirad.fr/403.0[301]1400.0
243http://www.ncbi.nlm.nih.gov/unistshttps://www.ncbi.nlm.nih.gov/unists410.0[301]1400.0
.....................
13262http://www.kumc.edu/archiehttps://www.kumc.edu/archie404.0[302]1400.0
13320http://trinity.cdmhost.com/index.phphttps://www.oclc.org/url/?404;http://trinity.c...404.0[302]1400.0
13340http://calvados.c3sl.ufpr.br/ojs2/index.php/ve...https://revistas.ufpr.br/index.php/veterinary/...404.0[301]1400.0
13346http://www4.furg.br/ojs/biblos/ojs/index.phphttps://sistemas.furg.br/ojs/biblos/ojs/index.php404.0[302]1400.0
13348http://lss.fnal.gov/ird/fermilab_spires.htmlhttps://lss.fnal.gov/ird/fermilab_spires.html404.0[302]1400.0
\n", "

378 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "11 http://epi.minsal.cl/bases-de-datos/ \n", "84 https://protocols.io/welcome \n", "177 http://agris.fao.org/agris-search/index.do \n", "228 http://oryzatagline.cirad.fr/ \n", "243 http://www.ncbi.nlm.nih.gov/unists \n", "... ... \n", "13262 http://www.kumc.edu/archie \n", "13320 http://trinity.cdmhost.com/index.php \n", "13340 http://calvados.c3sl.ufpr.br/ojs2/index.php/ve... \n", "13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n", "13348 http://lss.fnal.gov/ird/fermilab_spires.html \n", "\n", " head_url head_final_code \\\n", "11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n", "84 https://www.protocols.io:443/welcome 400.0 \n", "177 https://agris.fao.org:443/agris-search/index.do 404.0 \n", "228 https://oryzatagline.cirad.fr/ 403.0 \n", "243 https://www.ncbi.nlm.nih.gov/unists 410.0 \n", "... ... ... \n", "13262 https://www.kumc.edu/archie 404.0 \n", "13320 https://www.oclc.org/url/?404;http://trinity.c... 404.0 \n", "13340 https://revistas.ufpr.br/index.php/veterinary/... 404.0 \n", "13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n", "13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n", "\n", " head_codes len class \n", "11 [302] 1 400.0 \n", "84 [301] 1 400.0 \n", "177 [301] 1 400.0 \n", "228 [301] 1 400.0 \n", "243 [301] 1 400.0 \n", "... ... ... ... \n", "13262 [302] 1 400.0 \n", "13320 [302] 1 400.0 \n", "13340 [301] 1 400.0 \n", "13346 [302] 1 400.0 \n", "13348 [302] 1 400.0 \n", "\n", "[378 rows x 6 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Round the final HEAD status down to its hundreds class (e.g. 403.0 -> 400.0).\n", "head_chains['class'] = head_chains.head_final_code // 100 * 100\n", "# Redirected at least once, and the final status class is not 200.\n", "head_chains[head_chains['len'].gt(0) & head_chains['class'].ne(200)]" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslen
class
400.0338338338338338
500.04040404040
\n", "
" ], "text/plain": [ " url head_url head_final_code head_codes len\n", "class \n", "400.0 338 338 338 338 338\n", "500.0 40 40 40 40 40" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Per final status class: count of redirected HEAD requests whose class is not 200.\n", "head_chains[head_chains['len'].gt(0) & head_chains['class'].ne(200)].groupby('class').count()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codeslenclass
11http://epi.minsal.cl/bases-de-datos/http://epi.minsal.cl/bases-de-datos/403.0[302]1400.0
177http://agris.fao.org/agris-search/index.dohttps://agris.fao.org:443/agris-search/index.do404.0[301]1400.0
228http://oryzatagline.cirad.fr/https://oryzatagline.cirad.fr/403.0[301]1400.0
243http://www.ncbi.nlm.nih.gov/unistshttps://www.ncbi.nlm.nih.gov/unists410.0[301]1400.0
288http://www.ncbi.nlm.nih.gov/dbSTS/index.htmlhttps://www.ncbi.nlm.nih.gov/dbSTS/index.html403.0[301]1400.0
.....................
13262http://www.kumc.edu/archiehttps://www.kumc.edu/archie404.0[302]1400.0
13320http://trinity.cdmhost.com/index.phphttps://www.oclc.org/url/notfound404.0[302, 302]2400.0
13340http://calvados.c3sl.ufpr.br/ojs2/index.php/ve...https://revistas.ufpr.br/index.php/veterinary/...404.0[301]1400.0
13346http://www4.furg.br/ojs/biblos/ojs/index.phphttps://sistemas.furg.br/ojs/biblos/ojs/index.php404.0[302]1400.0
13348http://lss.fnal.gov/ird/fermilab_spires.htmlhttps://lss.fnal.gov/ird/fermilab_spires.html404.0[302]1400.0
\n", "

344 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "11 http://epi.minsal.cl/bases-de-datos/ \n", "177 http://agris.fao.org/agris-search/index.do \n", "228 http://oryzatagline.cirad.fr/ \n", "243 http://www.ncbi.nlm.nih.gov/unists \n", "288 http://www.ncbi.nlm.nih.gov/dbSTS/index.html \n", "... ... \n", "13262 http://www.kumc.edu/archie \n", "13320 http://trinity.cdmhost.com/index.php \n", "13340 http://calvados.c3sl.ufpr.br/ojs2/index.php/ve... \n", "13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n", "13348 http://lss.fnal.gov/ird/fermilab_spires.html \n", "\n", " get_url get_final_code \\\n", "11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n", "177 https://agris.fao.org:443/agris-search/index.do 404.0 \n", "228 https://oryzatagline.cirad.fr/ 403.0 \n", "243 https://www.ncbi.nlm.nih.gov/unists 410.0 \n", "288 https://www.ncbi.nlm.nih.gov/dbSTS/index.html 403.0 \n", "... ... ... \n", "13262 https://www.kumc.edu/archie 404.0 \n", "13320 https://www.oclc.org/url/notfound 404.0 \n", "13340 https://revistas.ufpr.br/index.php/veterinary/... 404.0 \n", "13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n", "13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n", "\n", " get_codes len class \n", "11 [302] 1 400.0 \n", "177 [301] 1 400.0 \n", "228 [301] 1 400.0 \n", "243 [301] 1 400.0 \n", "288 [301] 1 400.0 \n", "... ... ... ... \n", "13262 [302] 1 400.0 \n", "13320 [302, 302] 2 400.0 \n", "13340 [301] 1 400.0 \n", "13346 [302] 1 400.0 \n", "13348 [302] 1 400.0 \n", "\n", "[344 rows x 6 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Round the final GET status down to its hundreds class (e.g. 403.0 -> 400.0).\n", "get_chains['class'] = get_chains.get_final_code // 100 * 100\n", "# Redirected at least once, and the final status class is not 200.\n", "get_chains[get_chains['len'].gt(0) & get_chains['class'].ne(200)]" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codeslen
class
400.0307307307307307
500.03737373737
\n", "
" ], "text/plain": [ " url get_url get_final_code get_codes len\n", "class \n", "400.0 307 307 307 307 307\n", "500.0 37 37 37 37 37" ] }, "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)].groupby('class').count()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Redirect reason" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
0http://sidc.be/silso/homehttps://www.sidc.be/silso/home200.0[302]1200.0
2http://www.pac.dfo-mpo.gc.ca/science/oceans/da...https://www.pac.dfo-mpo.gc.ca/science/oceans/d...200.0[307]1200.0
6http://ipt.taibif.tw/https://ipt.taibif.tw/200.0[301]1200.0
9http://arabidopsis.infohttps://arabidopsis.info/200.0[301]1200.0
11http://epi.minsal.cl/bases-de-datos/http://epi.minsal.cl/bases-de-datos/403.0[302]1400.0
.....................
13341http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/https://revistas.ufpr.br/index.php/acta/200.0[301]1200.0
13346http://www4.furg.br/ojs/biblos/ojs/index.phphttps://sistemas.furg.br/ojs/biblos/ojs/index.php404.0[302]1400.0
13348http://lss.fnal.gov/ird/fermilab_spires.htmlhttps://lss.fnal.gov/ird/fermilab_spires.html404.0[302]1400.0
13350http://cloud.ncl.edu.tw/chna/https://cloud.ncl.edu.tw/chna/200.0[302]1200.0
13351http://www.myopenarchive.org/https://www.myopenarchive.org/200.0[301]1200.0
\n", "

3364 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "0 http://sidc.be/silso/home \n", "2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n", "6 http://ipt.taibif.tw/ \n", "9 http://arabidopsis.info \n", "11 http://epi.minsal.cl/bases-de-datos/ \n", "... ... \n", "13341 http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/ \n", "13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n", "13348 http://lss.fnal.gov/ird/fermilab_spires.html \n", "13350 http://cloud.ncl.edu.tw/chna/ \n", "13351 http://www.myopenarchive.org/ \n", "\n", " head_url head_final_code \\\n", "0 https://www.sidc.be/silso/home 200.0 \n", "2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n", "6 https://ipt.taibif.tw/ 200.0 \n", "9 https://arabidopsis.info/ 200.0 \n", "11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n", "... ... ... \n", "13341 https://revistas.ufpr.br/index.php/acta/ 200.0 \n", "13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n", "13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n", "13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n", "13351 https://www.myopenarchive.org/ 200.0 \n", "\n", " head_codes len class \n", "0 [302] 1 200.0 \n", "2 [307] 1 200.0 \n", "6 [301] 1 200.0 \n", "9 [301] 1 200.0 \n", "11 [302] 1 400.0 \n", "... ... ... ... \n", "13341 [301] 1 200.0 \n", "13346 [302] 1 400.0 \n", "13348 [302] 1 400.0 \n", "13350 [302] 1 200.0 \n", "13351 [301] 1 200.0 \n", "\n", "[3364 rows x 6 columns]" ] }, "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains[head_chains.len == 1]" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
2http://www.pac.dfo-mpo.gc.ca/science/oceans/da...https://www.pac.dfo-mpo.gc.ca/science/oceans/d...200.0[307]1200.0
6http://ipt.taibif.tw/https://ipt.taibif.tw/200.0[301]1200.0
27http://www.hgvd.genome.med.kyoto-u.ac.jp/https://www.hgvd.genome.med.kyoto-u.ac.jp/200.0[301]1200.0
30http://archive.researchdata.leeds.ac.uk/https://archive.researchdata.leeds.ac.uk/200.0[302]1200.0
33http://biodb.swu.edu.cn/qprimerdb/https://biodb.swu.edu.cn/qprimerdb/200.0[301]1200.0
.....................
13288http://dspace.sunyconnect.suny.edu/https://dspace.sunyconnect.suny.edu/200.0[301]1200.0
13295http://scholar.google.com/https://scholar.google.com/200.0[302]1200.0
13348http://lss.fnal.gov/ird/fermilab_spires.htmlhttps://lss.fnal.gov/ird/fermilab_spires.html404.0[302]1400.0
13350http://cloud.ncl.edu.tw/chna/https://cloud.ncl.edu.tw/chna/200.0[302]1200.0
13351http://www.myopenarchive.org/https://www.myopenarchive.org/200.0[301]1200.0
\n", "

1665 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n", "6 http://ipt.taibif.tw/ \n", "27 http://www.hgvd.genome.med.kyoto-u.ac.jp/ \n", "30 http://archive.researchdata.leeds.ac.uk/ \n", "33 http://biodb.swu.edu.cn/qprimerdb/ \n", "... ... \n", "13288 http://dspace.sunyconnect.suny.edu/ \n", "13295 http://scholar.google.com/ \n", "13348 http://lss.fnal.gov/ird/fermilab_spires.html \n", "13350 http://cloud.ncl.edu.tw/chna/ \n", "13351 http://www.myopenarchive.org/ \n", "\n", " head_url head_final_code \\\n", "2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n", "6 https://ipt.taibif.tw/ 200.0 \n", "27 https://www.hgvd.genome.med.kyoto-u.ac.jp/ 200.0 \n", "30 https://archive.researchdata.leeds.ac.uk/ 200.0 \n", "33 https://biodb.swu.edu.cn/qprimerdb/ 200.0 \n", "... ... ... \n", "13288 https://dspace.sunyconnect.suny.edu/ 200.0 \n", "13295 https://scholar.google.com/ 200.0 \n", "13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n", "13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n", "13351 https://www.myopenarchive.org/ 200.0 \n", "\n", " head_codes len class \n", "2 [307] 1 200.0 \n", "6 [301] 1 200.0 \n", "27 [301] 1 200.0 \n", "30 [302] 1 200.0 \n", "33 [301] 1 200.0 \n", "... ... ... ... \n", "13288 [301] 1 200.0 \n", "13295 [302] 1 200.0 \n", "13348 [302] 1 400.0 \n", "13350 [302] 1 200.0 \n", "13351 [301] 1 200.0 \n", "\n", "[1665 rows x 6 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') == head_chains.head_url)]" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
119https://www.biosino.org/nodehttps://www.biosino.org/node/200.0[302]1200.0
314http://metacat.tfri.gov.tw/tfrihttp://metacat.tfri.gov.tw/tfri/200.0[301]1200.0
463https://nda.nih.gov/abcdhttps://nda.nih.gov/abcd/200.0[302]1200.0
551https://www1.usgs.gov/obis-usa/ipthttps://www1.usgs.gov/obis-usa/ipt/200.0[301]1200.0
559https://www.arm.gov/datahttps://www.arm.gov/data/200.0[301]1200.0
.....................
12642http://repositorio.utec.edu.sv:8080/jspuihttp://repositorio.utec.edu.sv:8080/jspui/200.0[302]1200.0
12706http://adudspace.adu.edu.tr:8080/jspuihttp://adudspace.adu.edu.tr:8080/jspui/200.0[302]1200.0
12874http://160.97.80.9:8080/jspuihttp://160.97.80.9:8080/jspui/404.0[302]1400.0
13030http://library.pdpu.ac.in:8080/xmluihttp://library.pdpu.ac.in:8080/xmlui/200.0[302]1200.0
13267http://cab.unime.it/mushttp://cab.unime.it/mus/200.0[302]1200.0
\n", "

134 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "119 https://www.biosino.org/node \n", "314 http://metacat.tfri.gov.tw/tfri \n", "463 https://nda.nih.gov/abcd \n", "551 https://www1.usgs.gov/obis-usa/ipt \n", "559 https://www.arm.gov/data \n", "... ... \n", "12642 http://repositorio.utec.edu.sv:8080/jspui \n", "12706 http://adudspace.adu.edu.tr:8080/jspui \n", "12874 http://160.97.80.9:8080/jspui \n", "13030 http://library.pdpu.ac.in:8080/xmlui \n", "13267 http://cab.unime.it/mus \n", "\n", " head_url head_final_code head_codes \\\n", "119 https://www.biosino.org/node/ 200.0 [302] \n", "314 http://metacat.tfri.gov.tw/tfri/ 200.0 [301] \n", "463 https://nda.nih.gov/abcd/ 200.0 [302] \n", "551 https://www1.usgs.gov/obis-usa/ipt/ 200.0 [301] \n", "559 https://www.arm.gov/data/ 200.0 [301] \n", "... ... ... ... \n", "12642 http://repositorio.utec.edu.sv:8080/jspui/ 200.0 [302] \n", "12706 http://adudspace.adu.edu.tr:8080/jspui/ 200.0 [302] \n", "12874 http://160.97.80.9:8080/jspui/ 404.0 [302] \n", "13030 http://library.pdpu.ac.in:8080/xmlui/ 200.0 [302] \n", "13267 http://cab.unime.it/mus/ 200.0 [302] \n", "\n", " len class \n", "119 1 200.0 \n", "314 1 200.0 \n", "463 1 200.0 \n", "551 1 200.0 \n", "559 1 200.0 \n", "... ... ... \n", "12642 1 200.0 \n", "12706 1 200.0 \n", "12874 1 400.0 \n", "13030 1 200.0 \n", "13267 1 200.0 \n", "\n", "[134 rows x 6 columns]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains[(head_chains.len == 1) & (head_chains.url + '/' == head_chains.head_url)]" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
9http://arabidopsis.infohttps://arabidopsis.info/200.0[301]1200.0
46http://pcddb.cryst.bbk.ac.ukhttps://pcddb.cryst.bbk.ac.uk/200.0[302]1200.0
113http://www.marinedataarchive.euhttps://www.marinedataarchive.eu/200.0[301]1200.0
119https://www.biosino.org/nodehttps://www.biosino.org/node/200.0[302]1200.0
200http://www.hydroshare.orghttps://www.hydroshare.org/200.0[301]1200.0
.....................
13032http://repositorio.ipsantarem.pthttps://repositorio.ipsantarem.pt/200.0[302]1200.0
13077http://bibliotecadigital.ipb.pthttps://bibliotecadigital.ipb.pt/200.0[302]1200.0
13091http://eprints.nmlindia.orghttps://eprints.nmlindia.org/200.0[301]1200.0
13124http://www.ju.edu.ethttps://www.ju.edu.et/200.0[301]1200.0
13137http://scholarship.rollins.eduhttps://scholarship.rollins.edu/200.0[301]1200.0
\n", "

487 rows × 6 columns

\n", "
" ], "text/plain": [ " url head_url \\\n", "9 http://arabidopsis.info https://arabidopsis.info/ \n", "46 http://pcddb.cryst.bbk.ac.uk https://pcddb.cryst.bbk.ac.uk/ \n", "113 http://www.marinedataarchive.eu https://www.marinedataarchive.eu/ \n", "119 https://www.biosino.org/node https://www.biosino.org/node/ \n", "200 http://www.hydroshare.org https://www.hydroshare.org/ \n", "... ... ... \n", "13032 http://repositorio.ipsantarem.pt https://repositorio.ipsantarem.pt/ \n", "13077 http://bibliotecadigital.ipb.pt https://bibliotecadigital.ipb.pt/ \n", "13091 http://eprints.nmlindia.org https://eprints.nmlindia.org/ \n", "13124 http://www.ju.edu.et https://www.ju.edu.et/ \n", "13137 http://scholarship.rollins.edu https://scholarship.rollins.edu/ \n", "\n", " head_final_code head_codes len class \n", "9 200.0 [301] 1 200.0 \n", "46 200.0 [302] 1 200.0 \n", "113 200.0 [301] 1 200.0 \n", "119 200.0 [302] 1 200.0 \n", "200 200.0 [301] 1 200.0 \n", "... ... ... ... ... \n", "13032 200.0 [302] 1 200.0 \n", "13077 200.0 [302] 1 200.0 \n", "13091 200.0 [301] 1 200.0 \n", "13124 200.0 [301] 1 200.0 \n", "13137 200.0 [301] 1 200.0 \n", "\n", "[487 rows x 6 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') + '/' == head_chains.head_url)]" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codeslenclass
0http://sidc.be/silso/homehttps://www.sidc.be/silso/home200.0[302]1200.0
2http://www.pac.dfo-mpo.gc.ca/science/oceans/da...https://www.pac.dfo-mpo.gc.ca/science/oceans/d...200.0[307]1200.0
6http://ipt.taibif.tw/https://ipt.taibif.tw/200.0[301]1200.0
9http://arabidopsis.infohttps://arabidopsis.info/200.0[301]1200.0
11http://epi.minsal.cl/bases-de-datos/http://epi.minsal.cl/bases-de-datos/403.0[302]1400.0
.....................
13341http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/https://revistas.ufpr.br/index.php/acta/200.0[301]1200.0
13346http://www4.furg.br/ojs/biblos/ojs/index.phphttps://sistemas.furg.br/ojs/biblos/ojs/index.php404.0[302]1400.0
13348http://lss.fnal.gov/ird/fermilab_spires.htmlhttps://lss.fnal.gov/ird/fermilab_spires.html404.0[302]1400.0
13350http://cloud.ncl.edu.tw/chna/https://cloud.ncl.edu.tw/chna/200.0[302]1200.0
13351http://www.myopenarchive.org/https://www.myopenarchive.org/200.0[301]1200.0
\n", "

3376 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "0 http://sidc.be/silso/home \n", "2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n", "6 http://ipt.taibif.tw/ \n", "9 http://arabidopsis.info \n", "11 http://epi.minsal.cl/bases-de-datos/ \n", "... ... \n", "13341 http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/ \n", "13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n", "13348 http://lss.fnal.gov/ird/fermilab_spires.html \n", "13350 http://cloud.ncl.edu.tw/chna/ \n", "13351 http://www.myopenarchive.org/ \n", "\n", " get_url get_final_code \\\n", "0 https://www.sidc.be/silso/home 200.0 \n", "2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n", "6 https://ipt.taibif.tw/ 200.0 \n", "9 https://arabidopsis.info/ 200.0 \n", "11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n", "... ... ... \n", "13341 https://revistas.ufpr.br/index.php/acta/ 200.0 \n", "13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n", "13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n", "13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n", "13351 https://www.myopenarchive.org/ 200.0 \n", "\n", " get_codes len class \n", "0 [302] 1 200.0 \n", "2 [307] 1 200.0 \n", "6 [301] 1 200.0 \n", "9 [301] 1 200.0 \n", "11 [302] 1 400.0 \n", "... ... ... ... \n", "13341 [301] 1 200.0 \n", "13346 [302] 1 400.0 \n", "13348 [302] 1 400.0 \n", "13350 [302] 1 200.0 \n", "13351 [301] 1 200.0 \n", "\n", "[3376 rows x 6 columns]" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_chains[get_chains.len == 1]" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codeslenclass
2http://www.pac.dfo-mpo.gc.ca/science/oceans/da...https://www.pac.dfo-mpo.gc.ca/science/oceans/d...200.0[307]1200.0
6http://ipt.taibif.tw/https://ipt.taibif.tw/200.0[301]1200.0
27http://www.hgvd.genome.med.kyoto-u.ac.jp/https://www.hgvd.genome.med.kyoto-u.ac.jp/200.0[301]1200.0
30http://archive.researchdata.leeds.ac.uk/https://archive.researchdata.leeds.ac.uk/200.0[302]1200.0
33http://biodb.swu.edu.cn/qprimerdb/https://biodb.swu.edu.cn/qprimerdb/200.0[301]1200.0
.....................
13288http://dspace.sunyconnect.suny.edu/https://dspace.sunyconnect.suny.edu/200.0[301]1200.0
13295http://scholar.google.com/https://scholar.google.com/200.0[302]1200.0
13348http://lss.fnal.gov/ird/fermilab_spires.htmlhttps://lss.fnal.gov/ird/fermilab_spires.html404.0[302]1400.0
13350http://cloud.ncl.edu.tw/chna/https://cloud.ncl.edu.tw/chna/200.0[302]1200.0
13351http://www.myopenarchive.org/https://www.myopenarchive.org/200.0[301]1200.0
\n", "

1668 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n", "6 http://ipt.taibif.tw/ \n", "27 http://www.hgvd.genome.med.kyoto-u.ac.jp/ \n", "30 http://archive.researchdata.leeds.ac.uk/ \n", "33 http://biodb.swu.edu.cn/qprimerdb/ \n", "... ... \n", "13288 http://dspace.sunyconnect.suny.edu/ \n", "13295 http://scholar.google.com/ \n", "13348 http://lss.fnal.gov/ird/fermilab_spires.html \n", "13350 http://cloud.ncl.edu.tw/chna/ \n", "13351 http://www.myopenarchive.org/ \n", "\n", " get_url get_final_code \\\n", "2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n", "6 https://ipt.taibif.tw/ 200.0 \n", "27 https://www.hgvd.genome.med.kyoto-u.ac.jp/ 200.0 \n", "30 https://archive.researchdata.leeds.ac.uk/ 200.0 \n", "33 https://biodb.swu.edu.cn/qprimerdb/ 200.0 \n", "... ... ... \n", "13288 https://dspace.sunyconnect.suny.edu/ 200.0 \n", "13295 https://scholar.google.com/ 200.0 \n", "13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n", "13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n", "13351 https://www.myopenarchive.org/ 200.0 \n", "\n", " get_codes len class \n", "2 [307] 1 200.0 \n", "6 [301] 1 200.0 \n", "27 [301] 1 200.0 \n", "30 [302] 1 200.0 \n", "33 [301] 1 200.0 \n", "... ... ... ... \n", "13288 [301] 1 200.0 \n", "13295 [302] 1 200.0 \n", "13348 [302] 1 400.0 \n", "13350 [302] 1 200.0 \n", "13351 [301] 1 200.0 \n", "\n", "[1668 rows x 6 columns]" ] }, "execution_count": 56, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') == get_chains.get_url)]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codeslenclass
119https://www.biosino.org/nodehttps://www.biosino.org/node/200.0[302]1200.0
314http://metacat.tfri.gov.tw/tfrihttp://metacat.tfri.gov.tw/tfri/200.0[301]1200.0
463https://nda.nih.gov/abcdhttps://nda.nih.gov/abcd/200.0[302]1200.0
551https://www1.usgs.gov/obis-usa/ipthttps://www1.usgs.gov/obis-usa/ipt/200.0[301]1200.0
559https://www.arm.gov/datahttps://www.arm.gov/data/200.0[301]1200.0
.....................
12642http://repositorio.utec.edu.sv:8080/jspuihttp://repositorio.utec.edu.sv:8080/jspui/200.0[302]1200.0
12706http://adudspace.adu.edu.tr:8080/jspuihttp://adudspace.adu.edu.tr:8080/jspui/200.0[302]1200.0
12874http://160.97.80.9:8080/jspuihttp://160.97.80.9:8080/jspui/404.0[302]1400.0
13030http://library.pdpu.ac.in:8080/xmluihttp://library.pdpu.ac.in:8080/xmlui/200.0[302]1200.0
13267http://cab.unime.it/mushttp://cab.unime.it/mus/200.0[302]1200.0
\n", "

134 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "119 https://www.biosino.org/node \n", "314 http://metacat.tfri.gov.tw/tfri \n", "463 https://nda.nih.gov/abcd \n", "551 https://www1.usgs.gov/obis-usa/ipt \n", "559 https://www.arm.gov/data \n", "... ... \n", "12642 http://repositorio.utec.edu.sv:8080/jspui \n", "12706 http://adudspace.adu.edu.tr:8080/jspui \n", "12874 http://160.97.80.9:8080/jspui \n", "13030 http://library.pdpu.ac.in:8080/xmlui \n", "13267 http://cab.unime.it/mus \n", "\n", " get_url get_final_code get_codes \\\n", "119 https://www.biosino.org/node/ 200.0 [302] \n", "314 http://metacat.tfri.gov.tw/tfri/ 200.0 [301] \n", "463 https://nda.nih.gov/abcd/ 200.0 [302] \n", "551 https://www1.usgs.gov/obis-usa/ipt/ 200.0 [301] \n", "559 https://www.arm.gov/data/ 200.0 [301] \n", "... ... ... ... \n", "12642 http://repositorio.utec.edu.sv:8080/jspui/ 200.0 [302] \n", "12706 http://adudspace.adu.edu.tr:8080/jspui/ 200.0 [302] \n", "12874 http://160.97.80.9:8080/jspui/ 404.0 [302] \n", "13030 http://library.pdpu.ac.in:8080/xmlui/ 200.0 [302] \n", "13267 http://cab.unime.it/mus/ 200.0 [302] \n", "\n", " len class \n", "119 1 200.0 \n", "314 1 200.0 \n", "463 1 200.0 \n", "551 1 200.0 \n", "559 1 200.0 \n", "... ... ... \n", "12642 1 200.0 \n", "12706 1 200.0 \n", "12874 1 400.0 \n", "13030 1 200.0 \n", "13267 1 200.0 \n", "\n", "[134 rows x 6 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_chains[(get_chains.len == 1) & (get_chains.url + '/' == get_chains.get_url)]" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_urlget_final_codeget_codeslenclass
9http://arabidopsis.infohttps://arabidopsis.info/200.0[301]1200.0
46http://pcddb.cryst.bbk.ac.ukhttps://pcddb.cryst.bbk.ac.uk/200.0[302]1200.0
113http://www.marinedataarchive.euhttps://www.marinedataarchive.eu/200.0[301]1200.0
119https://www.biosino.org/nodehttps://www.biosino.org/node/200.0[302]1200.0
200http://www.hydroshare.orghttps://www.hydroshare.org/200.0[301]1200.0
.....................
13032http://repositorio.ipsantarem.pthttps://repositorio.ipsantarem.pt/200.0[302]1200.0
13077http://bibliotecadigital.ipb.pthttps://bibliotecadigital.ipb.pt/200.0[302]1200.0
13091http://eprints.nmlindia.orghttps://eprints.nmlindia.org/200.0[301]1200.0
13124http://www.ju.edu.ethttps://www.ju.edu.et/200.0[301]1200.0
13137http://scholarship.rollins.eduhttps://scholarship.rollins.edu/200.0[301]1200.0
\n", "

490 rows × 6 columns

\n", "
" ], "text/plain": [ " url get_url \\\n", "9 http://arabidopsis.info https://arabidopsis.info/ \n", "46 http://pcddb.cryst.bbk.ac.uk https://pcddb.cryst.bbk.ac.uk/ \n", "113 http://www.marinedataarchive.eu https://www.marinedataarchive.eu/ \n", "119 https://www.biosino.org/node https://www.biosino.org/node/ \n", "200 http://www.hydroshare.org https://www.hydroshare.org/ \n", "... ... ... \n", "13032 http://repositorio.ipsantarem.pt https://repositorio.ipsantarem.pt/ \n", "13077 http://bibliotecadigital.ipb.pt https://bibliotecadigital.ipb.pt/ \n", "13091 http://eprints.nmlindia.org https://eprints.nmlindia.org/ \n", "13124 http://www.ju.edu.et https://www.ju.edu.et/ \n", "13137 http://scholarship.rollins.edu https://scholarship.rollins.edu/ \n", "\n", " get_final_code get_codes len class \n", "9 200.0 [301] 1 200.0 \n", "46 200.0 [302] 1 200.0 \n", "113 200.0 [301] 1 200.0 \n", "119 200.0 [302] 1 200.0 \n", "200 200.0 [301] 1 200.0 \n", "... ... ... ... ... \n", "13032 200.0 [302] 1 200.0 \n", "13077 200.0 [302] 1 200.0 \n", "13091 200.0 [301] 1 200.0 \n", "13124 200.0 [301] 1 200.0 \n", "13137 200.0 [301] 1 200.0 \n", "\n", "[490 rows x 6 columns]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') + '/' == get_chains.get_url)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Redirect mismatch" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_urlhead_chainhead_codeshead_elapsedhead_errorget_final_codeget_urlget_chainget_codesget_elapsedget_error
127https://models.physiomeproject.org200.0https://models.physiomeproject.org/[][]0.658525NaN200.0https://models.physiomeproject.org/welcome['https://models.physiomeproject.org/'][302]0.575681NaN
371https://rdp.utas.edu.au200.0https://login.microsoftonline.com/15eb5d55-199...['https://rdp.utas.edu.au/'][302]0.297592NaN200.0https://login.microsoftonline.com/15eb5d55-199...['https://rdp.utas.edu.au/'][302]0.285739NaN
479https://sciflection.com200.0https://sciflection.com/[][]0.176450NaN200.0https://sciflection.com/main['https://sciflection.com/', 'http://sciflecti...[302, 301]0.391234NaN
684http://www.ntrr-nti.org/405.0http://www.ntrr-nti.org/[][]1.133437NaN200.0https://www.nattrauma.org/research/ntrr/['http://www.ntrr-nti.org/'][301]2.539869NaN
697http://www.jcm.riken.go.jp/200.0https://jcm.brc.riken.jp/en/['http://www.jcm.riken.go.jp/', 'https://www.j...[301, 301, 301, 301]0.506354NaN200.0https://jcm.brc.riken.jp/en/?doing_wp_cron=165...['http://www.jcm.riken.go.jp/', 'https://www.j...[301, 301, 301, 301, 302]0.630979NaN
..........................................
13132http://hik.diva-portal.org/smash/search.jsf200.0http://lnu.diva-portal.org/smash/search.jsf?ds...['http://hik.diva-portal.org/smash/search.jsf'...[301, 302]0.287171NaN200.0http://lnu.diva-portal.org/smash/search.jsf?ds...['http://hik.diva-portal.org/smash/search.jsf'...[301, 302]0.291231NaN
13133http://hgo.diva-portal.org/smash/search.jsf200.0http://uu.diva-portal.org/smash/search.jsf?dsw...['http://hgo.diva-portal.org/smash/search.jsf'...[301, 302]0.292740NaN200.0http://uu.diva-portal.org/smash/search.jsf?dsw...['http://hgo.diva-portal.org/smash/search.jsf'...[301, 302]0.289726NaN
13266http://www.diva-portal.org/gih/200.0http://gih.diva-portal.org/smash/search.jsf?ds...['http://www.diva-portal.org/gih/', 'http://gi...[301, 302]0.301519NaN200.0http://gih.diva-portal.org/smash/search.jsf?ds...['http://www.diva-portal.org/gih/', 'http://gi...[301, 302]2.143105NaN
13283http://www.compscipreprints.com/200.0http://survey-smiles.com['http://www.compscipreprints.com/'][302]0.257651NaN200.0http://www.compscipreprints.com/[][]0.093911NaN
13320http://trinity.cdmhost.com/index.php404.0https://www.oclc.org/url/?404;http://trinity.c...['http://trinity.cdmhost.com/index.php'][302]0.589119NaN404.0https://www.oclc.org/url/notfound['http://trinity.cdmhost.com/index.php', 'http...[302, 302]0.139563NaN
\n", "

146 rows × 13 columns

\n", "
" ], "text/plain": [ " url head_final_code \\\n", "127 https://models.physiomeproject.org 200.0 \n", "371 https://rdp.utas.edu.au 200.0 \n", "479 https://sciflection.com 200.0 \n", "684 http://www.ntrr-nti.org/ 405.0 \n", "697 http://www.jcm.riken.go.jp/ 200.0 \n", "... ... ... \n", "13132 http://hik.diva-portal.org/smash/search.jsf 200.0 \n", "13133 http://hgo.diva-portal.org/smash/search.jsf 200.0 \n", "13266 http://www.diva-portal.org/gih/ 200.0 \n", "13283 http://www.compscipreprints.com/ 200.0 \n", "13320 http://trinity.cdmhost.com/index.php 404.0 \n", "\n", " head_url \\\n", "127 https://models.physiomeproject.org/ \n", "371 https://login.microsoftonline.com/15eb5d55-199... \n", "479 https://sciflection.com/ \n", "684 http://www.ntrr-nti.org/ \n", "697 https://jcm.brc.riken.jp/en/ \n", "... ... \n", "13132 http://lnu.diva-portal.org/smash/search.jsf?ds... \n", "13133 http://uu.diva-portal.org/smash/search.jsf?dsw... \n", "13266 http://gih.diva-portal.org/smash/search.jsf?ds... \n", "13283 http://survey-smiles.com \n", "13320 https://www.oclc.org/url/?404;http://trinity.c... \n", "\n", " head_chain \\\n", "127 [] \n", "371 ['https://rdp.utas.edu.au/'] \n", "479 [] \n", "684 [] \n", "697 ['http://www.jcm.riken.go.jp/', 'https://www.j... \n", "... ... \n", "13132 ['http://hik.diva-portal.org/smash/search.jsf'... \n", "13133 ['http://hgo.diva-portal.org/smash/search.jsf'... \n", "13266 ['http://www.diva-portal.org/gih/', 'http://gi... \n", "13283 ['http://www.compscipreprints.com/'] \n", "13320 ['http://trinity.cdmhost.com/index.php'] \n", "\n", " head_codes head_elapsed head_error get_final_code \\\n", "127 [] 0.658525 NaN 200.0 \n", "371 [302] 0.297592 NaN 200.0 \n", "479 [] 0.176450 NaN 200.0 \n", "684 [] 1.133437 NaN 200.0 \n", "697 [301, 301, 301, 301] 0.506354 NaN 200.0 \n", "... ... ... ... ... 
\n", "13132 [301, 302] 0.287171 NaN 200.0 \n", "13133 [301, 302] 0.292740 NaN 200.0 \n", "13266 [301, 302] 0.301519 NaN 200.0 \n", "13283 [302] 0.257651 NaN 200.0 \n", "13320 [302] 0.589119 NaN 404.0 \n", "\n", " get_url \\\n", "127 https://models.physiomeproject.org/welcome \n", "371 https://login.microsoftonline.com/15eb5d55-199... \n", "479 https://sciflection.com/main \n", "684 https://www.nattrauma.org/research/ntrr/ \n", "697 https://jcm.brc.riken.jp/en/?doing_wp_cron=165... \n", "... ... \n", "13132 http://lnu.diva-portal.org/smash/search.jsf?ds... \n", "13133 http://uu.diva-portal.org/smash/search.jsf?dsw... \n", "13266 http://gih.diva-portal.org/smash/search.jsf?ds... \n", "13283 http://www.compscipreprints.com/ \n", "13320 https://www.oclc.org/url/notfound \n", "\n", " get_chain \\\n", "127 ['https://models.physiomeproject.org/'] \n", "371 ['https://rdp.utas.edu.au/'] \n", "479 ['https://sciflection.com/', 'http://sciflecti... \n", "684 ['http://www.ntrr-nti.org/'] \n", "697 ['http://www.jcm.riken.go.jp/', 'https://www.j... \n", "... ... \n", "13132 ['http://hik.diva-portal.org/smash/search.jsf'... \n", "13133 ['http://hgo.diva-portal.org/smash/search.jsf'... \n", "13266 ['http://www.diva-portal.org/gih/', 'http://gi... \n", "13283 [] \n", "13320 ['http://trinity.cdmhost.com/index.php', 'http... \n", "\n", " get_codes get_elapsed get_error \n", "127 [302] 0.575681 NaN \n", "371 [302] 0.285739 NaN \n", "479 [302, 301] 0.391234 NaN \n", "684 [301] 2.539869 NaN \n", "697 [301, 301, 301, 301, 302] 0.630979 NaN \n", "... ... ... ... 
\n", "13132 [301, 302] 0.291231 NaN \n", "13133 [301, 302] 0.289726 NaN \n", "13266 [301, 302] 2.143105 NaN \n", "13283 [] 0.093911 NaN \n", "13320 [302, 302] 0.139563 NaN \n", "\n", "[146 rows x 13 columns]" ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df.head_url != df.get_url) & (df.head_url.notna()) & (df.get_url.notna())]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### URL conflation" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
0http://sidc.be/silso/homehttps://www.sidc.be/silso/home200.0[302]1200.0
1https://bmrb.io/https://bmrb.io/200.0[]0200.0
2http://www.pac.dfo-mpo.gc.ca/science/oceans/da...https://www.pac.dfo-mpo.gc.ca/science/oceans/d...200.0[307]1200.0
3https://www.cpc.ncep.noaa.gov/https://www.cpc.ncep.noaa.gov/200.0[]0200.0
4http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...404.0[]0400.0
.....................
13350http://cloud.ncl.edu.tw/chna/https://cloud.ncl.edu.tw/chna/200.0[302]1200.0
13351http://www.myopenarchive.org/https://www.myopenarchive.org/200.0[301]1200.0
13352http://radar.brookes.ac.uk/radar/access/home.dohttps://radar.brookes.ac.uk/radar/home.do200.0[302, 302, 302]3200.0
13353http://www.ub.uni-siegen.de/opushttps://www.ub.uni-siegen.de/opus/200.0[301, 301]2200.0
13355https://www.utas.edu.au/libraryhttps://www.utas.edu.au/library200.0[]0200.0
\n", "

10926 rows × 6 columns

\n", "
" ], "text/plain": [ " url \\\n", "0 http://sidc.be/silso/home \n", "1 https://bmrb.io/ \n", "2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n", "3 https://www.cpc.ncep.noaa.gov/ \n", "4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... \n", "... ... \n", "13350 http://cloud.ncl.edu.tw/chna/ \n", "13351 http://www.myopenarchive.org/ \n", "13352 http://radar.brookes.ac.uk/radar/access/home.do \n", "13353 http://www.ub.uni-siegen.de/opus \n", "13355 https://www.utas.edu.au/library \n", "\n", " head_url head_final_code \\\n", "0 https://www.sidc.be/silso/home 200.0 \n", "1 https://bmrb.io/ 200.0 \n", "2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n", "3 https://www.cpc.ncep.noaa.gov/ 200.0 \n", "4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 404.0 \n", "... ... ... \n", "13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n", "13351 https://www.myopenarchive.org/ 200.0 \n", "13352 https://radar.brookes.ac.uk/radar/home.do 200.0 \n", "13353 https://www.ub.uni-siegen.de/opus/ 200.0 \n", "13355 https://www.utas.edu.au/library 200.0 \n", "\n", " head_codes len class \n", "0 [302] 1 200.0 \n", "1 [] 0 200.0 \n", "2 [307] 1 200.0 \n", "3 [] 0 200.0 \n", "4 [] 0 400.0 \n", "... ... ... ... \n", "13350 [302] 1 200.0 \n", "13351 [301] 1 200.0 \n", "13352 [302, 302, 302] 3 200.0 \n", "13353 [301, 301] 2 200.0 \n", "13355 [] 0 200.0 \n", "\n", "[10926 rows x 6 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_urlhead_final_codehead_codeslenclass
count109261092610926.0000001092610926.00000010926.000000
unique109269331NaN59NaNNaN
tophttp://sidc.be/silso/homehttps://www.unit.no/ugyldig-lenke-til-dokument...NaN[]NaNNaN
freq116NaN6596NaNNaN
meanNaNNaN218.406736NaN0.515559218.094454
stdNaNNaN61.035428NaN0.74985060.063069
minNaNNaN200.000000NaN0.000000200.000000
25%NaNNaN200.000000NaN0.000000200.000000
50%NaNNaN200.000000NaN0.000000200.000000
75%NaNNaN200.000000NaN1.000000200.000000
maxNaNNaN504.000000NaN5.000000500.000000
\n", "
" ], "text/plain": [ " url \\\n", "count 10926 \n", "unique 10926 \n", "top http://sidc.be/silso/home \n", "freq 1 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " head_url head_final_code \\\n", "count 10926 10926.000000 \n", "unique 9331 NaN \n", "top https://www.unit.no/ugyldig-lenke-til-dokument... NaN \n", "freq 16 NaN \n", "mean NaN 218.406736 \n", "std NaN 61.035428 \n", "min NaN 200.000000 \n", "25% NaN 200.000000 \n", "50% NaN 200.000000 \n", "75% NaN 200.000000 \n", "max NaN 504.000000 \n", "\n", " head_codes len class \n", "count 10926 10926.000000 10926.000000 \n", "unique 59 NaN NaN \n", "top [] NaN NaN \n", "freq 6596 NaN NaN \n", "mean NaN 0.515559 218.094454 \n", "std NaN 0.749850 60.063069 \n", "min NaN 0.000000 200.000000 \n", "25% NaN 0.000000 200.000000 \n", "50% NaN 0.000000 200.000000 \n", "75% NaN 1.000000 200.000000 \n", "max NaN 5.000000 500.000000 " ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_codeslenclass
head_url
https://www.unit.no/ugyldig-lenke-til-dokument-i-vitenarkiv1616161616
https://www.nlm.nih.gov/toxnet/index.html66666
http://survey-smiles.com66666
https://nda.nih.gov/55555
https://digitalrepository.unm.edu/44444
https://ir.vanderbilt.edu/44444
https://ddd.uab.cat/44444
https://intellectum.unisabana.edu.co/44444
https://opus.bibliothek.uni-augsburg.de/opus4/home44444
https://pure.iiasa.ac.at/44444
http://eprints.nottingham.ac.uk/44444
https://research.usc.edu.au/esploro/33333
https://dataverse.harvard.edu/33333
https://epub.wupperinst.org/home33333
https://pure.mpg.de/33333
https://dlib.york.ac.uk/yodl/app/home/index33333
https://archiv.ub.uni-marburg.de/ubfind/33333
https://eprints.unife.it/33333
https://dataspace.princeton.edu/33333
https://eprints.lib.hokudai.ac.jp/33333
https://pub.uni-bielefeld.de/33333
https://aquadocs.org/33333
https://dataverse.tdl.org/33333
https://uwspace.uwaterloo.ca/33333
https://research.utwente.nl/33333
https://datadryad.org/stash33333
https://er.knutd.edu.ua/33333
https://esango.cput.ac.za/33333
https://repositorio.fucsalud.edu.co/33333
https://repository.up.ac.za/33333
https://repository.usta.edu.co/33333
https://repositori.upf.edu/33333
https://bibliotecadigital.exactas.uba.ar/33333
https://repo.odmu.edu.ua/xmlui/33333
https://bcnroc.ajuntament.barcelona.cat/jspui/33333
https://data.gov.uk/33333
https://mru.arcabc.ca/33333
https://munin.uit.no/33333
https://red.uao.edu.co/33333
https://research-repository.st-andrews.ac.uk/33333
https://research-repository.uwa.edu.au/33333
https://research.cbs.dk/33333
https://webext.pasteur.fr/genolist/33333
https://prism.ucalgary.ca/33333
https://air.unimi.it/33333
http://rua.ua.es/dspace/33333
https://dial.uclouvain.be/33333
https://sc.lib.miamioh.edu/33333
https://dspace.ankara.edu.tr/xmlui/33333
https://scholar.smu.edu/33333
\n", "
" ], "text/plain": [ " url head_final_code \\\n", "head_url \n", "https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 \n", "https://www.nlm.nih.gov/toxnet/index.html 6 6 \n", "http://survey-smiles.com 6 6 \n", "https://nda.nih.gov/ 5 5 \n", "https://digitalrepository.unm.edu/ 4 4 \n", "https://ir.vanderbilt.edu/ 4 4 \n", "https://ddd.uab.cat/ 4 4 \n", "https://intellectum.unisabana.edu.co/ 4 4 \n", "https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 \n", "https://pure.iiasa.ac.at/ 4 4 \n", "http://eprints.nottingham.ac.uk/ 4 4 \n", "https://research.usc.edu.au/esploro/ 3 3 \n", "https://dataverse.harvard.edu/ 3 3 \n", "https://epub.wupperinst.org/home 3 3 \n", "https://pure.mpg.de/ 3 3 \n", "https://dlib.york.ac.uk/yodl/app/home/index 3 3 \n", "https://archiv.ub.uni-marburg.de/ubfind/ 3 3 \n", "https://eprints.unife.it/ 3 3 \n", "https://dataspace.princeton.edu/ 3 3 \n", "https://eprints.lib.hokudai.ac.jp/ 3 3 \n", "https://pub.uni-bielefeld.de/ 3 3 \n", "https://aquadocs.org/ 3 3 \n", "https://dataverse.tdl.org/ 3 3 \n", "https://uwspace.uwaterloo.ca/ 3 3 \n", "https://research.utwente.nl/ 3 3 \n", "https://datadryad.org/stash 3 3 \n", "https://er.knutd.edu.ua/ 3 3 \n", "https://esango.cput.ac.za/ 3 3 \n", "https://repositorio.fucsalud.edu.co/ 3 3 \n", "https://repository.up.ac.za/ 3 3 \n", "https://repository.usta.edu.co/ 3 3 \n", "https://repositori.upf.edu/ 3 3 \n", "https://bibliotecadigital.exactas.uba.ar/ 3 3 \n", "https://repo.odmu.edu.ua/xmlui/ 3 3 \n", "https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 \n", "https://data.gov.uk/ 3 3 \n", "https://mru.arcabc.ca/ 3 3 \n", "https://munin.uit.no/ 3 3 \n", "https://red.uao.edu.co/ 3 3 \n", "https://research-repository.st-andrews.ac.uk/ 3 3 \n", "https://research-repository.uwa.edu.au/ 3 3 \n", "https://research.cbs.dk/ 3 3 \n", "https://webext.pasteur.fr/genolist/ 3 3 \n", "https://prism.ucalgary.ca/ 3 3 \n", "https://air.unimi.it/ 3 3 \n", "http://rua.ua.es/dspace/ 3 3 \n", 
"https://dial.uclouvain.be/ 3 3 \n", "https://sc.lib.miamioh.edu/ 3 3 \n", "https://dspace.ankara.edu.tr/xmlui/ 3 3 \n", "https://scholar.smu.edu/ 3 3 \n", "\n", " head_codes len class \n", "head_url \n", "https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 16 \n", "https://www.nlm.nih.gov/toxnet/index.html 6 6 6 \n", "http://survey-smiles.com 6 6 6 \n", "https://nda.nih.gov/ 5 5 5 \n", "https://digitalrepository.unm.edu/ 4 4 4 \n", "https://ir.vanderbilt.edu/ 4 4 4 \n", "https://ddd.uab.cat/ 4 4 4 \n", "https://intellectum.unisabana.edu.co/ 4 4 4 \n", "https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 4 \n", "https://pure.iiasa.ac.at/ 4 4 4 \n", "http://eprints.nottingham.ac.uk/ 4 4 4 \n", "https://research.usc.edu.au/esploro/ 3 3 3 \n", "https://dataverse.harvard.edu/ 3 3 3 \n", "https://epub.wupperinst.org/home 3 3 3 \n", "https://pure.mpg.de/ 3 3 3 \n", "https://dlib.york.ac.uk/yodl/app/home/index 3 3 3 \n", "https://archiv.ub.uni-marburg.de/ubfind/ 3 3 3 \n", "https://eprints.unife.it/ 3 3 3 \n", "https://dataspace.princeton.edu/ 3 3 3 \n", "https://eprints.lib.hokudai.ac.jp/ 3 3 3 \n", "https://pub.uni-bielefeld.de/ 3 3 3 \n", "https://aquadocs.org/ 3 3 3 \n", "https://dataverse.tdl.org/ 3 3 3 \n", "https://uwspace.uwaterloo.ca/ 3 3 3 \n", "https://research.utwente.nl/ 3 3 3 \n", "https://datadryad.org/stash 3 3 3 \n", "https://er.knutd.edu.ua/ 3 3 3 \n", "https://esango.cput.ac.za/ 3 3 3 \n", "https://repositorio.fucsalud.edu.co/ 3 3 3 \n", "https://repository.up.ac.za/ 3 3 3 \n", "https://repository.usta.edu.co/ 3 3 3 \n", "https://repositori.upf.edu/ 3 3 3 \n", "https://bibliotecadigital.exactas.uba.ar/ 3 3 3 \n", "https://repo.odmu.edu.ua/xmlui/ 3 3 3 \n", "https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 3 \n", "https://data.gov.uk/ 3 3 3 \n", "https://mru.arcabc.ca/ 3 3 3 \n", "https://munin.uit.no/ 3 3 3 \n", "https://red.uao.edu.co/ 3 3 3 \n", "https://research-repository.st-andrews.ac.uk/ 3 3 3 \n", 
"https://research-repository.uwa.edu.au/ 3 3 3 \n", "https://research.cbs.dk/ 3 3 3 \n", "https://webext.pasteur.fr/genolist/ 3 3 3 \n", "https://prism.ucalgary.ca/ 3 3 3 \n", "https://air.unimi.it/ 3 3 3 \n", "http://rua.ua.es/dspace/ 3 3 3 \n", "https://dial.uclouvain.be/ 3 3 3 \n", "https://sc.lib.miamioh.edu/ 3 3 3 \n", "https://dspace.ankara.edu.tr/xmlui/ 3 3 3 \n", "https://scholar.smu.edu/ 3 3 3 " ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_chains.groupby('head_url').count().sort_values('url', ascending=False).head(50)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_final_codeget_codeslenclass
get_url
https://www.unit.no/ugyldig-lenke-til-dokument-i-vitenarkiv1616161616
https://www.oclc.org/url/notfound77777
https://www.nlm.nih.gov/toxnet/index.html66666
https://nda.nih.gov/55555
https://digitalrepository.unm.edu/44444
https://opus.bibliothek.uni-augsburg.de/opus4/home44444
https://intellectum.unisabana.edu.co/44444
https://pure.iiasa.ac.at/44444
https://ir.vanderbilt.edu/44444
http://eprints.nottingham.ac.uk/44444
https://ddd.uab.cat/44444
https://mru.arcabc.ca/33333
https://research.usc.edu.au/esploro/33333
https://aquadocs.org/33333
https://uwspace.uwaterloo.ca/33333
https://pub.uni-bielefeld.de/33333
https://dataverse.tdl.org/33333
https://eprints.lib.hokudai.ac.jp/33333
https://archiv.ub.uni-marburg.de/ubfind/33333
https://eprints.unife.it/33333
https://repositorio.fucsalud.edu.co/33333
https://research.utwente.nl/33333
https://repository.up.ac.za/33333
https://pure.mpg.de/33333
https://epub.wupperinst.org/home33333
https://archive.researchdata.leeds.ac.uk/33333
https://repository.usta.edu.co/33333
https://dataverse.harvard.edu/33333
https://bcnroc.ajuntament.barcelona.cat/jspui/33333
https://er.knutd.edu.ua/33333
https://dataspace.princeton.edu/33333
https://repositori.upf.edu/33333
https://bibliotecadigital.exactas.uba.ar/33333
https://prism.ucalgary.ca/33333
https://research.cbs.dk/33333
https://webext.pasteur.fr/genolist/33333
https://ngdc.cncb.ac.cn/idog/33333
https://research-repository.uwa.edu.au/33333
https://research-repository.st-andrews.ac.uk/33333
https://repo.odmu.edu.ua/xmlui/33333
https://red.uao.edu.co/33333
https://data.gov.uk/33333
https://munin.uit.no/33333
https://datadryad.org/stash33333
http://rua.ua.es/dspace/33333
http://dspace.uevora.pt/rdpc/33333
https://sc.lib.miamioh.edu/33333
https://papyrus.bib.umontreal.ca/xmlui/33333
https://scholar.smu.edu/33333
https://dro.deakin.edu.au/33333
\n", "
" ], "text/plain": [ " url get_final_code \\\n", "get_url \n", "https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 \n", "https://www.oclc.org/url/notfound 7 7 \n", "https://www.nlm.nih.gov/toxnet/index.html 6 6 \n", "https://nda.nih.gov/ 5 5 \n", "https://digitalrepository.unm.edu/ 4 4 \n", "https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 \n", "https://intellectum.unisabana.edu.co/ 4 4 \n", "https://pure.iiasa.ac.at/ 4 4 \n", "https://ir.vanderbilt.edu/ 4 4 \n", "http://eprints.nottingham.ac.uk/ 4 4 \n", "https://ddd.uab.cat/ 4 4 \n", "https://mru.arcabc.ca/ 3 3 \n", "https://research.usc.edu.au/esploro/ 3 3 \n", "https://aquadocs.org/ 3 3 \n", "https://uwspace.uwaterloo.ca/ 3 3 \n", "https://pub.uni-bielefeld.de/ 3 3 \n", "https://dataverse.tdl.org/ 3 3 \n", "https://eprints.lib.hokudai.ac.jp/ 3 3 \n", "https://archiv.ub.uni-marburg.de/ubfind/ 3 3 \n", "https://eprints.unife.it/ 3 3 \n", "https://repositorio.fucsalud.edu.co/ 3 3 \n", "https://research.utwente.nl/ 3 3 \n", "https://repository.up.ac.za/ 3 3 \n", "https://pure.mpg.de/ 3 3 \n", "https://epub.wupperinst.org/home 3 3 \n", "https://archive.researchdata.leeds.ac.uk/ 3 3 \n", "https://repository.usta.edu.co/ 3 3 \n", "https://dataverse.harvard.edu/ 3 3 \n", "https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 \n", "https://er.knutd.edu.ua/ 3 3 \n", "https://dataspace.princeton.edu/ 3 3 \n", "https://repositori.upf.edu/ 3 3 \n", "https://bibliotecadigital.exactas.uba.ar/ 3 3 \n", "https://prism.ucalgary.ca/ 3 3 \n", "https://research.cbs.dk/ 3 3 \n", "https://webext.pasteur.fr/genolist/ 3 3 \n", "https://ngdc.cncb.ac.cn/idog/ 3 3 \n", "https://research-repository.uwa.edu.au/ 3 3 \n", "https://research-repository.st-andrews.ac.uk/ 3 3 \n", "https://repo.odmu.edu.ua/xmlui/ 3 3 \n", "https://red.uao.edu.co/ 3 3 \n", "https://data.gov.uk/ 3 3 \n", "https://munin.uit.no/ 3 3 \n", "https://datadryad.org/stash 3 3 \n", "http://rua.ua.es/dspace/ 3 3 \n", "http://dspace.uevora.pt/rdpc/ 3 3 \n", 
"https://sc.lib.miamioh.edu/ 3 3 \n", "https://papyrus.bib.umontreal.ca/xmlui/ 3 3 \n", "https://scholar.smu.edu/ 3 3 \n", "https://dro.deakin.edu.au/ 3 3 \n", "\n", " get_codes len class \n", "get_url \n", "https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 16 \n", "https://www.oclc.org/url/notfound 7 7 7 \n", "https://www.nlm.nih.gov/toxnet/index.html 6 6 6 \n", "https://nda.nih.gov/ 5 5 5 \n", "https://digitalrepository.unm.edu/ 4 4 4 \n", "https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 4 \n", "https://intellectum.unisabana.edu.co/ 4 4 4 \n", "https://pure.iiasa.ac.at/ 4 4 4 \n", "https://ir.vanderbilt.edu/ 4 4 4 \n", "http://eprints.nottingham.ac.uk/ 4 4 4 \n", "https://ddd.uab.cat/ 4 4 4 \n", "https://mru.arcabc.ca/ 3 3 3 \n", "https://research.usc.edu.au/esploro/ 3 3 3 \n", "https://aquadocs.org/ 3 3 3 \n", "https://uwspace.uwaterloo.ca/ 3 3 3 \n", "https://pub.uni-bielefeld.de/ 3 3 3 \n", "https://dataverse.tdl.org/ 3 3 3 \n", "https://eprints.lib.hokudai.ac.jp/ 3 3 3 \n", "https://archiv.ub.uni-marburg.de/ubfind/ 3 3 3 \n", "https://eprints.unife.it/ 3 3 3 \n", "https://repositorio.fucsalud.edu.co/ 3 3 3 \n", "https://research.utwente.nl/ 3 3 3 \n", "https://repository.up.ac.za/ 3 3 3 \n", "https://pure.mpg.de/ 3 3 3 \n", "https://epub.wupperinst.org/home 3 3 3 \n", "https://archive.researchdata.leeds.ac.uk/ 3 3 3 \n", "https://repository.usta.edu.co/ 3 3 3 \n", "https://dataverse.harvard.edu/ 3 3 3 \n", "https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 3 \n", "https://er.knutd.edu.ua/ 3 3 3 \n", "https://dataspace.princeton.edu/ 3 3 3 \n", "https://repositori.upf.edu/ 3 3 3 \n", "https://bibliotecadigital.exactas.uba.ar/ 3 3 3 \n", "https://prism.ucalgary.ca/ 3 3 3 \n", "https://research.cbs.dk/ 3 3 3 \n", "https://webext.pasteur.fr/genolist/ 3 3 3 \n", "https://ngdc.cncb.ac.cn/idog/ 3 3 3 \n", "https://research-repository.uwa.edu.au/ 3 3 3 \n", "https://research-repository.st-andrews.ac.uk/ 3 3 3 \n", "https://repo.odmu.edu.ua/xmlui/ 3 3 3 
\n", "https://red.uao.edu.co/ 3 3 3 \n", "https://data.gov.uk/ 3 3 3 \n", "https://munin.uit.no/ 3 3 3 \n", "https://datadryad.org/stash 3 3 3 \n", "http://rua.ua.es/dspace/ 3 3 3 \n", "http://dspace.uevora.pt/rdpc/ 3 3 3 \n", "https://sc.lib.miamioh.edu/ 3 3 3 \n", "https://papyrus.bib.umontreal.ca/xmlui/ 3 3 3 \n", "https://scholar.smu.edu/ 3 3 3 \n", "https://dro.deakin.edu.au/ 3 3 3 " ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_chains.groupby('get_url').count().sort_values('url', ascending=False).head(50)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_codeslenclass
head_url
http://103.82.172.44:8080/xmlui/[http://103.82.172.44:8080/xmlui/][200.0][[]][0][200.0]
http://109.185.200.51/[http://109.185.200.51/][200.0][[]][0][200.0]
http://119.3.41.228/dnmivd/[http://www.unimd.org/dnmivd/][200.0][[302]][1][200.0]
http://119.78.100.183/[http://119.78.100.183][403.0][[]][0][400.0]
http://125.99.47.158:8090/jspui/[http://125.99.47.158:8090/jspui, http://125.9...[200.0, 200.0][[302], []][1, 0][200.0, 200.0]
..................
https://zks.uni-koeln.de/index.php?s=studien&c=studien_klinische-studien[http://zks.uni-koeln.de/index.php?s=studien&c...[200.0][[301]][1][200.0]
https://zokei.repo.nii.ac.jp/[https://zokei.repo.nii.ac.jp/][200.0][[]][0][200.0]
https://zoltardata.com/[https://zoltardata.com/][200.0][[]][0][200.0]
https://zone.biblio.laurentian.ca/[https://zone.biblio.laurentian.ca, https://zo...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
https://zuscholars.zu.ac.ae/[https://zuscholars.zu.ac.ae, https://zuschola...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
\n", "

9331 rows × 5 columns

\n", "
" ], "text/plain": [ " url \\\n", "head_url \n", "http://103.82.172.44:8080/xmlui/ [http://103.82.172.44:8080/xmlui/] \n", "http://109.185.200.51/ [http://109.185.200.51/] \n", "http://119.3.41.228/dnmivd/ [http://www.unimd.org/dnmivd/] \n", "http://119.78.100.183/ [http://119.78.100.183] \n", "http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n", "... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... [http://zks.uni-koeln.de/index.php?s=studien&c... \n", "https://zokei.repo.nii.ac.jp/ [https://zokei.repo.nii.ac.jp/] \n", "https://zoltardata.com/ [https://zoltardata.com/] \n", "https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n", "https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n", "\n", " head_final_code \\\n", "head_url \n", "http://103.82.172.44:8080/xmlui/ [200.0] \n", "http://109.185.200.51/ [200.0] \n", "http://119.3.41.228/dnmivd/ [200.0] \n", "http://119.78.100.183/ [403.0] \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n", "https://zokei.repo.nii.ac.jp/ [200.0] \n", "https://zoltardata.com/ [200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", " head_codes len \\\n", "head_url \n", "http://103.82.172.44:8080/xmlui/ [[]] [0] \n", "http://109.185.200.51/ [[]] [0] \n", "http://119.3.41.228/dnmivd/ [[302]] [1] \n", "http://119.78.100.183/ [[]] [0] \n", "http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n", "... ... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... 
[[301]] [1] \n", "https://zokei.repo.nii.ac.jp/ [[]] [0] \n", "https://zoltardata.com/ [[]] [0] \n", "https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n", "https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n", "\n", " class \n", "head_url \n", "http://103.82.172.44:8080/xmlui/ [200.0] \n", "http://109.185.200.51/ [200.0] \n", "http://119.3.41.228/dnmivd/ [200.0] \n", "http://119.78.100.183/ [400.0] \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n", "https://zokei.repo.nii.ac.jp/ [200.0] \n", "https://zoltardata.com/ [200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", "[9331 rows x 5 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_conflation = head_chains.groupby('head_url').aggregate(list)\n", "head_conflation" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "head_conflation.to_csv('../data/processed/head_conflation.tsv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlhead_final_codehead_codeslenclass
head_url
http://125.99.47.158:8090/jspui/[http://125.99.47.158:8090/jspui, http://125.9...[200.0, 200.0][[302], []][1, 0][200.0, 200.0]
http://160.97.80.9:8080/jspui/[http://160.97.80.9:8080/jspui/, http://160.97...[404.0, 404.0][[], [302]][0, 1][400.0, 400.0]
http://acikarsiv.atauni.edu.tr/[http://acikarsiv.atauni.edu.tr/, http://acika...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
http://acikarsiv.atilim.edu.tr/[http://acikarsiv.atilim.edu.tr/, http://acika...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
http://acikerisim.afsu.edu.tr/xmlui/[http://acikerisim.afsu.edu.tr, http://acikeri...[200.0, 200.0][[302, 302], [302, 302]][2, 2][200.0, 200.0]
..................
https://zaguan.unizar.es/[http://zaguan.unizar.es/, http://zaguan.uniza...[200.0, 200.0][[301], [301]][1, 1][200.0, 200.0]
https://zbc.ksiaznica.szczecin.pl/dlibra[http://zbc.ksiaznica.szczecin.pl/dlibra, http...[200.0, 200.0][[302], [302, 302]][1, 2][200.0, 200.0]
https://zenodo.org/[https://zenodo.org/, http://zenodo.org/][200.0, 200.0][[], [301]][0, 1][200.0, 200.0]
https://zone.biblio.laurentian.ca/[https://zone.biblio.laurentian.ca, https://zo...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
https://zuscholars.zu.ac.ae/[https://zuscholars.zu.ac.ae, https://zuschola...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
\n", "

1456 rows × 5 columns

\n", "
" ], "text/plain": [ " url \\\n", "head_url \n", "http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n", "http://160.97.80.9:8080/jspui/ [http://160.97.80.9:8080/jspui/, http://160.97... \n", "http://acikarsiv.atauni.edu.tr/ [http://acikarsiv.atauni.edu.tr/, http://acika... \n", "http://acikarsiv.atilim.edu.tr/ [http://acikarsiv.atilim.edu.tr/, http://acika... \n", "http://acikerisim.afsu.edu.tr/xmlui/ [http://acikerisim.afsu.edu.tr, http://acikeri... \n", "... ... \n", "https://zaguan.unizar.es/ [http://zaguan.unizar.es/, http://zaguan.uniza... \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [http://zbc.ksiaznica.szczecin.pl/dlibra, http... \n", "https://zenodo.org/ [https://zenodo.org/, http://zenodo.org/] \n", "https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n", "https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n", "\n", " head_final_code \\\n", "head_url \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "http://160.97.80.9:8080/jspui/ [404.0, 404.0] \n", "http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n", "http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n", "http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n", "... ... \n", "https://zaguan.unizar.es/ [200.0, 200.0] \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n", "https://zenodo.org/ [200.0, 200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", " head_codes len \\\n", "head_url \n", "http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n", "http://160.97.80.9:8080/jspui/ [[], [302]] [0, 1] \n", "http://acikarsiv.atauni.edu.tr/ [[], []] [0, 0] \n", "http://acikarsiv.atilim.edu.tr/ [[], []] [0, 0] \n", "http://acikerisim.afsu.edu.tr/xmlui/ [[302, 302], [302, 302]] [2, 2] \n", "... ... ... 
\n", "https://zaguan.unizar.es/ [[301], [301]] [1, 1] \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [[302], [302, 302]] [1, 2] \n", "https://zenodo.org/ [[], [301]] [0, 1] \n", "https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n", "https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n", "\n", " class \n", "head_url \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "http://160.97.80.9:8080/jspui/ [400.0, 400.0] \n", "http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n", "http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n", "http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n", "... ... \n", "https://zaguan.unizar.es/ [200.0, 200.0] \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n", "https://zenodo.org/ [200.0, 200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", "[1456 rows x 5 columns]" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_conflation[head_conflation.url.str.len() > 1]" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3051" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "head_conflation[head_conflation.url.str.len() > 1].url.apply(len).sum()" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_final_codeget_codeslenclass
get_url
http://103.82.172.44:8080/xmlui/[http://103.82.172.44:8080/xmlui/][200.0][[]][0][200.0]
http://109.185.200.51/[http://109.185.200.51/][200.0][[]][0][200.0]
http://119.3.41.228/dnmivd/[http://www.unimd.org/dnmivd/][200.0][[302]][1][200.0]
http://119.78.100.183/[http://119.78.100.183][200.0][[]][0][200.0]
http://125.99.47.158:8090/jspui/[http://125.99.47.158:8090/jspui, http://125.9...[200.0, 200.0][[302], []][1, 0][200.0, 200.0]
..................
https://zks.uni-koeln.de/index.php?s=studien&c=studien_klinische-studien[http://zks.uni-koeln.de/index.php?s=studien&c...[200.0][[301]][1][200.0]
https://zokei.repo.nii.ac.jp/[https://zokei.repo.nii.ac.jp/][200.0][[]][0][200.0]
https://zoltardata.com/[https://zoltardata.com/][200.0][[]][0][200.0]
https://zone.biblio.laurentian.ca/[https://zone.biblio.laurentian.ca, https://zo...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
https://zuscholars.zu.ac.ae/[https://zuscholars.zu.ac.ae, https://zuschola...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
\n", "

9353 rows × 5 columns

\n", "
" ], "text/plain": [ " url \\\n", "get_url \n", "http://103.82.172.44:8080/xmlui/ [http://103.82.172.44:8080/xmlui/] \n", "http://109.185.200.51/ [http://109.185.200.51/] \n", "http://119.3.41.228/dnmivd/ [http://www.unimd.org/dnmivd/] \n", "http://119.78.100.183/ [http://119.78.100.183] \n", "http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n", "... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... [http://zks.uni-koeln.de/index.php?s=studien&c... \n", "https://zokei.repo.nii.ac.jp/ [https://zokei.repo.nii.ac.jp/] \n", "https://zoltardata.com/ [https://zoltardata.com/] \n", "https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n", "https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n", "\n", " get_final_code \\\n", "get_url \n", "http://103.82.172.44:8080/xmlui/ [200.0] \n", "http://109.185.200.51/ [200.0] \n", "http://119.3.41.228/dnmivd/ [200.0] \n", "http://119.78.100.183/ [200.0] \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n", "https://zokei.repo.nii.ac.jp/ [200.0] \n", "https://zoltardata.com/ [200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", " get_codes len \\\n", "get_url \n", "http://103.82.172.44:8080/xmlui/ [[]] [0] \n", "http://109.185.200.51/ [[]] [0] \n", "http://119.3.41.228/dnmivd/ [[302]] [1] \n", "http://119.78.100.183/ [[]] [0] \n", "http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n", "... ... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... 
[[301]] [1] \n", "https://zokei.repo.nii.ac.jp/ [[]] [0] \n", "https://zoltardata.com/ [[]] [0] \n", "https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n", "https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n", "\n", " class \n", "get_url \n", "http://103.82.172.44:8080/xmlui/ [200.0] \n", "http://109.185.200.51/ [200.0] \n", "http://119.3.41.228/dnmivd/ [200.0] \n", "http://119.78.100.183/ [200.0] \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "... ... \n", "https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n", "https://zokei.repo.nii.ac.jp/ [200.0] \n", "https://zoltardata.com/ [200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", "[9353 rows x 5 columns]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_conflation = get_chains.groupby('get_url').aggregate(list)\n", "get_conflation" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "get_conflation.to_csv('../data/processed/get_conflation.tsv', sep='\\t')" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
urlget_final_codeget_codeslenclass
get_url
http://125.99.47.158:8090/jspui/[http://125.99.47.158:8090/jspui, http://125.9...[200.0, 200.0][[302], []][1, 0][200.0, 200.0]
http://160.97.80.9:8080/jspui/[http://160.97.80.9:8080/jspui/, http://160.97...[404.0, 404.0][[], [302]][0, 1][400.0, 400.0]
http://acikarsiv.atauni.edu.tr/[http://acikarsiv.atauni.edu.tr/, http://acika...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
http://acikarsiv.atilim.edu.tr/[http://acikarsiv.atilim.edu.tr/, http://acika...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
http://acikerisim.afsu.edu.tr/xmlui/[http://acikerisim.afsu.edu.tr, http://acikeri...[200.0, 200.0][[302, 302], [302, 302]][2, 2][200.0, 200.0]
..................
https://zaguan.unizar.es/[http://zaguan.unizar.es/, http://zaguan.uniza...[200.0, 200.0][[301], [301]][1, 1][200.0, 200.0]
https://zbc.ksiaznica.szczecin.pl/dlibra[http://zbc.ksiaznica.szczecin.pl/dlibra, http...[200.0, 200.0][[302], [302, 302]][1, 2][200.0, 200.0]
https://zenodo.org/[https://zenodo.org/, http://zenodo.org/][200.0, 200.0][[], [301]][0, 1][200.0, 200.0]
https://zone.biblio.laurentian.ca/[https://zone.biblio.laurentian.ca, https://zo...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
https://zuscholars.zu.ac.ae/[https://zuscholars.zu.ac.ae, https://zuschola...[200.0, 200.0][[], []][0, 0][200.0, 200.0]
\n", "

1459 rows × 5 columns

\n", "
" ], "text/plain": [ " url \\\n", "get_url \n", "http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n", "http://160.97.80.9:8080/jspui/ [http://160.97.80.9:8080/jspui/, http://160.97... \n", "http://acikarsiv.atauni.edu.tr/ [http://acikarsiv.atauni.edu.tr/, http://acika... \n", "http://acikarsiv.atilim.edu.tr/ [http://acikarsiv.atilim.edu.tr/, http://acika... \n", "http://acikerisim.afsu.edu.tr/xmlui/ [http://acikerisim.afsu.edu.tr, http://acikeri... \n", "... ... \n", "https://zaguan.unizar.es/ [http://zaguan.unizar.es/, http://zaguan.uniza... \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [http://zbc.ksiaznica.szczecin.pl/dlibra, http... \n", "https://zenodo.org/ [https://zenodo.org/, http://zenodo.org/] \n", "https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n", "https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n", "\n", " get_final_code \\\n", "get_url \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "http://160.97.80.9:8080/jspui/ [404.0, 404.0] \n", "http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n", "http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n", "http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n", "... ... \n", "https://zaguan.unizar.es/ [200.0, 200.0] \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n", "https://zenodo.org/ [200.0, 200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", " get_codes len \\\n", "get_url \n", "http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n", "http://160.97.80.9:8080/jspui/ [[], [302]] [0, 1] \n", "http://acikarsiv.atauni.edu.tr/ [[], []] [0, 0] \n", "http://acikarsiv.atilim.edu.tr/ [[], []] [0, 0] \n", "http://acikerisim.afsu.edu.tr/xmlui/ [[302, 302], [302, 302]] [2, 2] \n", "... ... ... 
\n", "https://zaguan.unizar.es/ [[301], [301]] [1, 1] \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [[302], [302, 302]] [1, 2] \n", "https://zenodo.org/ [[], [301]] [0, 1] \n", "https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n", "https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n", "\n", " class \n", "get_url \n", "http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n", "http://160.97.80.9:8080/jspui/ [400.0, 400.0] \n", "http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n", "http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n", "http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n", "... ... \n", "https://zaguan.unizar.es/ [200.0, 200.0] \n", "https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n", "https://zenodo.org/ [200.0, 200.0] \n", "https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n", "https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n", "\n", "[1459 rows x 5 columns]" ] }, "execution_count": 72, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_conflation[get_conflation.url.str.len() > 1]" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "3060" ] }, "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ "get_conflation[get_conflation.url.str.len() > 1].url.apply(len).sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "ccdc3acc266150d74575e7f25ef162b022ec22dae7e3244cf5a4f2ecbaf21c19" }, "kernelspec": { "display_name": "Python 3.9.12 ('data-science')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.12" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }