scholarly_repo_availability/notebooks/03.1-analysis.ipynb

7543 lines
295 KiB
Plaintext
Raw Permalink Normal View History

2022-06-04 17:22:17 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"import requests\n",
"import ast\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_url</th>\n",
" <th>head_chain</th>\n",
" <th>head_codes</th>\n",
" <th>head_elapsed</th>\n",
" <th>head_error</th>\n",
" <th>get_final_code</th>\n",
" <th>get_url</th>\n",
" <th>get_chain</th>\n",
" <th>get_codes</th>\n",
" <th>get_elapsed</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>http://sidc.be/silso/home</td>\n",
" <td>200.0</td>\n",
" <td>https://www.sidc.be/silso/home</td>\n",
" <td>['http://sidc.be/silso/home']</td>\n",
" <td>[302]</td>\n",
" <td>0.151548</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://www.sidc.be/silso/home</td>\n",
" <td>['http://sidc.be/silso/home']</td>\n",
" <td>[302]</td>\n",
" <td>0.145799</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://bmrb.io/</td>\n",
" <td>200.0</td>\n",
" <td>https://bmrb.io/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.527131</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://bmrb.io/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.487402</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.pac.dfo-mpo.gc.ca/science/oceans/da...</td>\n",
" <td>200.0</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>['http://www.pac.dfo-mpo.gc.ca/science/oceans/...</td>\n",
" <td>[307]</td>\n",
" <td>0.530466</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>['http://www.pac.dfo-mpo.gc.ca/science/oceans/...</td>\n",
" <td>[307]</td>\n",
" <td>0.510097</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>https://www.cpc.ncep.noaa.gov/</td>\n",
" <td>200.0</td>\n",
" <td>https://www.cpc.ncep.noaa.gov/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.507708</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://www.cpc.ncep.noaa.gov/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.492151</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...</td>\n",
" <td>404.0</td>\n",
" <td>http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.586088</td>\n",
" <td>NaN</td>\n",
" <td>404.0</td>\n",
" <td>http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.542670</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13352</th>\n",
" <td>http://radar.brookes.ac.uk/radar/access/home.do</td>\n",
" <td>200.0</td>\n",
" <td>https://radar.brookes.ac.uk/radar/home.do</td>\n",
" <td>['http://radar.brookes.ac.uk/radar/access/home...</td>\n",
" <td>[302, 302, 302]</td>\n",
" <td>0.082406</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://radar.brookes.ac.uk/radar/home.do</td>\n",
" <td>['http://radar.brookes.ac.uk/radar/access/home...</td>\n",
" <td>[302, 302, 302]</td>\n",
" <td>0.088766</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13353</th>\n",
" <td>http://www.ub.uni-siegen.de/opus</td>\n",
" <td>200.0</td>\n",
" <td>https://www.ub.uni-siegen.de/opus/</td>\n",
" <td>['http://www.ub.uni-siegen.de/opus', 'https://...</td>\n",
" <td>[301, 301]</td>\n",
" <td>0.056328</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://www.ub.uni-siegen.de/opus/</td>\n",
" <td>['http://www.ub.uni-siegen.de/opus', 'https://...</td>\n",
" <td>[301, 301]</td>\n",
" <td>0.039649</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13354</th>\n",
" <td>http://numerique.bibliotheque.toulouse.fr/cgi-...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='numerique.bibliothequ...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='numerique.bibliothequ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13355</th>\n",
" <td>https://www.utas.edu.au/library</td>\n",
" <td>200.0</td>\n",
" <td>https://www.utas.edu.au/library</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.223856</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://www.utas.edu.au/library</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.595001</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13356</th>\n",
" <td>http://204.232.200.153:8080/jspui/</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='204.232.200.153', por...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='204.232.200.153', por...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>13356 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" url head_final_code \\\n",
"0 http://sidc.be/silso/home 200.0 \n",
"1 https://bmrb.io/ 200.0 \n",
"2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... 200.0 \n",
"3 https://www.cpc.ncep.noaa.gov/ 200.0 \n",
"4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 404.0 \n",
"... ... ... \n",
"13352 http://radar.brookes.ac.uk/radar/access/home.do 200.0 \n",
"13353 http://www.ub.uni-siegen.de/opus 200.0 \n",
"13354 http://numerique.bibliotheque.toulouse.fr/cgi-... NaN \n",
"13355 https://www.utas.edu.au/library 200.0 \n",
"13356 http://204.232.200.153:8080/jspui/ NaN \n",
"\n",
" head_url \\\n",
"0 https://www.sidc.be/silso/home \n",
"1 https://bmrb.io/ \n",
"2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... \n",
"3 https://www.cpc.ncep.noaa.gov/ \n",
"4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... \n",
"... ... \n",
"13352 https://radar.brookes.ac.uk/radar/home.do \n",
"13353 https://www.ub.uni-siegen.de/opus/ \n",
"13354 NaN \n",
"13355 https://www.utas.edu.au/library \n",
"13356 NaN \n",
"\n",
" head_chain head_codes \\\n",
"0 ['http://sidc.be/silso/home'] [302] \n",
"1 [] [] \n",
"2 ['http://www.pac.dfo-mpo.gc.ca/science/oceans/... [307] \n",
"3 [] [] \n",
"4 [] [] \n",
"... ... ... \n",
"13352 ['http://radar.brookes.ac.uk/radar/access/home... [302, 302, 302] \n",
"13353 ['http://www.ub.uni-siegen.de/opus', 'https://... [301, 301] \n",
"13354 NaN NaN \n",
"13355 [] [] \n",
"13356 NaN NaN \n",
"\n",
" head_elapsed head_error \\\n",
"0 0.151548 NaN \n",
"1 0.527131 NaN \n",
"2 0.530466 NaN \n",
"3 0.507708 NaN \n",
"4 0.586088 NaN \n",
"... ... ... \n",
"13352 0.082406 NaN \n",
"13353 0.056328 NaN \n",
"13354 NaN HTTPConnectionPool(host='numerique.bibliothequ... \n",
"13355 0.223856 NaN \n",
"13356 NaN HTTPConnectionPool(host='204.232.200.153', por... \n",
"\n",
" get_final_code get_url \\\n",
"0 200.0 https://www.sidc.be/silso/home \n",
"1 200.0 https://bmrb.io/ \n",
"2 200.0 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... \n",
"3 200.0 https://www.cpc.ncep.noaa.gov/ \n",
"4 404.0 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... \n",
"... ... ... \n",
"13352 200.0 https://radar.brookes.ac.uk/radar/home.do \n",
"13353 200.0 https://www.ub.uni-siegen.de/opus/ \n",
"13354 NaN NaN \n",
"13355 200.0 https://www.utas.edu.au/library \n",
"13356 NaN NaN \n",
"\n",
" get_chain get_codes \\\n",
"0 ['http://sidc.be/silso/home'] [302] \n",
"1 [] [] \n",
"2 ['http://www.pac.dfo-mpo.gc.ca/science/oceans/... [307] \n",
"3 [] [] \n",
"4 [] [] \n",
"... ... ... \n",
"13352 ['http://radar.brookes.ac.uk/radar/access/home... [302, 302, 302] \n",
"13353 ['http://www.ub.uni-siegen.de/opus', 'https://... [301, 301] \n",
"13354 NaN NaN \n",
"13355 [] [] \n",
"13356 NaN NaN \n",
"\n",
" get_elapsed get_error \n",
"0 0.145799 NaN \n",
"1 0.487402 NaN \n",
"2 0.510097 NaN \n",
"3 0.492151 NaN \n",
"4 0.542670 NaN \n",
"... ... ... \n",
"13352 0.088766 NaN \n",
"13353 0.039649 NaN \n",
"13354 NaN HTTPConnectionPool(host='numerique.bibliothequ... \n",
"13355 0.595001 NaN \n",
"13356 NaN HTTPConnectionPool(host='204.232.200.153', por... \n",
"\n",
"[13356 rows x 13 columns]"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the redirect-check results and drop rows that have no URL.\n",
"df = pd.read_csv('../data/processed/redirects_new.tsv', sep='\\t').dropna(subset=['url'])\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_url</th>\n",
" <th>head_chain</th>\n",
" <th>head_codes</th>\n",
" <th>head_elapsed</th>\n",
" <th>head_error</th>\n",
" <th>get_final_code</th>\n",
" <th>get_url</th>\n",
" <th>get_chain</th>\n",
" <th>get_codes</th>\n",
" <th>get_elapsed</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>13356</td>\n",
" <td>10926.000000</td>\n",
" <td>10926</td>\n",
" <td>10926</td>\n",
" <td>10926</td>\n",
" <td>10926.000000</td>\n",
" <td>2430</td>\n",
" <td>10954.00000</td>\n",
" <td>10954</td>\n",
" <td>10954</td>\n",
" <td>10954</td>\n",
" <td>10954.000000</td>\n",
" <td>2402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>13356</td>\n",
" <td>NaN</td>\n",
" <td>9331</td>\n",
" <td>4056</td>\n",
" <td>59</td>\n",
" <td>NaN</td>\n",
" <td>2299</td>\n",
" <td>NaN</td>\n",
" <td>9353</td>\n",
" <td>4082</td>\n",
" <td>57</td>\n",
" <td>NaN</td>\n",
" <td>2301</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>http://sidc.be/silso/home</td>\n",
" <td>NaN</td>\n",
" <td>https://www.unit.no/ugyldig-lenke-til-dokument...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>('Connection aborted.', RemoteDisconnected('Re...</td>\n",
" <td>NaN</td>\n",
" <td>https://www.unit.no/ugyldig-lenke-til-dokument...</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>('Connection aborted.', RemoteDisconnected('Re...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>16</td>\n",
" <td>6596</td>\n",
" <td>6596</td>\n",
" <td>NaN</td>\n",
" <td>34</td>\n",
" <td>NaN</td>\n",
" <td>16</td>\n",
" <td>6595</td>\n",
" <td>6595</td>\n",
" <td>NaN</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>218.406736</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.124777</td>\n",
" <td>NaN</td>\n",
" <td>215.73407</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.125515</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>61.035428</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.103610</td>\n",
" <td>NaN</td>\n",
" <td>56.89142</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>3.184153</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.013261</td>\n",
" <td>NaN</td>\n",
" <td>200.00000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.013800</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.276535</td>\n",
" <td>NaN</td>\n",
" <td>200.00000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.254967</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.583665</td>\n",
" <td>NaN</td>\n",
" <td>200.00000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.527238</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.059370</td>\n",
" <td>NaN</td>\n",
" <td>200.00000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.944213</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>504.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>61.235589</td>\n",
" <td>NaN</td>\n",
" <td>504.00000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>61.500941</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url head_final_code \\\n",
"count 13356 10926.000000 \n",
"unique 13356 NaN \n",
"top http://sidc.be/silso/home NaN \n",
"freq 1 NaN \n",
"mean NaN 218.406736 \n",
"std NaN 61.035428 \n",
"min NaN 200.000000 \n",
"25% NaN 200.000000 \n",
"50% NaN 200.000000 \n",
"75% NaN 200.000000 \n",
"max NaN 504.000000 \n",
"\n",
" head_url head_chain \\\n",
"count 10926 10926 \n",
"unique 9331 4056 \n",
"top https://www.unit.no/ugyldig-lenke-til-dokument... [] \n",
"freq 16 6596 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" head_codes head_elapsed \\\n",
"count 10926 10926.000000 \n",
"unique 59 NaN \n",
"top [] NaN \n",
"freq 6596 NaN \n",
"mean NaN 1.124777 \n",
"std NaN 3.103610 \n",
"min NaN 0.013261 \n",
"25% NaN 0.276535 \n",
"50% NaN 0.583665 \n",
"75% NaN 1.059370 \n",
"max NaN 61.235589 \n",
"\n",
" head_error get_final_code \\\n",
"count 2430 10954.00000 \n",
"unique 2299 NaN \n",
"top ('Connection aborted.', RemoteDisconnected('Re... NaN \n",
"freq 34 NaN \n",
"mean NaN 215.73407 \n",
"std NaN 56.89142 \n",
"min NaN 200.00000 \n",
"25% NaN 200.00000 \n",
"50% NaN 200.00000 \n",
"75% NaN 200.00000 \n",
"max NaN 504.00000 \n",
"\n",
" get_url get_chain get_codes \\\n",
"count 10954 10954 10954 \n",
"unique 9353 4082 57 \n",
"top https://www.unit.no/ugyldig-lenke-til-dokument... [] [] \n",
"freq 16 6595 6595 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
"min NaN NaN NaN \n",
"25% NaN NaN NaN \n",
"50% NaN NaN NaN \n",
"75% NaN NaN NaN \n",
"max NaN NaN NaN \n",
"\n",
" get_elapsed get_error \n",
"count 10954.000000 2402 \n",
"unique NaN 2301 \n",
"top NaN ('Connection aborted.', RemoteDisconnected('Re... \n",
"freq NaN 22 \n",
"mean 1.125515 NaN \n",
"std 3.184153 NaN \n",
"min 0.013800 NaN \n",
"25% 0.254967 NaN \n",
"50% 0.527238 NaN \n",
"75% 0.944213 NaN \n",
"max 61.500941 NaN "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every column (numeric and non-numeric).\n",
"df.describe(include='all')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Final status codes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### HEAD"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" </tr>\n",
" <tr>\n",
" <th>head_final_code</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>200.0</th>\n",
" <td>9976</td>\n",
" </tr>\n",
" <tr>\n",
" <th>201.0</th>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>401.0</th>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>403.0</th>\n",
" <td>205</td>\n",
" </tr>\n",
" <tr>\n",
" <th>404.0</th>\n",
" <td>489</td>\n",
" </tr>\n",
" <tr>\n",
" <th>405.0</th>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406.0</th>\n",
" <td>20</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410.0</th>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>412.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>429.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>444.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>501.0</th>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>502.0</th>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>503.0</th>\n",
" <td>49</td>\n",
" </tr>\n",
" <tr>\n",
" <th>504.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url\n",
"head_final_code \n",
"200.0 9976\n",
"201.0 19\n",
"400.0 19\n",
"401.0 14\n",
"403.0 205\n",
"404.0 489\n",
"405.0 50\n",
"406.0 20\n",
"410.0 15\n",
"412.0 1\n",
"419.0 1\n",
"429.0 1\n",
"444.0 1\n",
"500.0 47\n",
"501.0 3\n",
"502.0 15\n",
"503.0 49\n",
"504.0 1"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of URLs per final HEAD status code.\n",
"df.groupby('head_final_code')[['url']].count()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/3455765756.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" </tr>\n",
" <tr>\n",
" <th>class</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>200.0</th>\n",
" <td>9995</td>\n",
" <td>9995</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>816</td>\n",
" <td>816</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>115</td>\n",
" <td>115</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url head_final_code\n",
"class \n",
"200.0 9995 9995\n",
"400.0 816 816\n",
"500.0 115 115"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Copy the two columns so the new 'class' column is added to an independent\n",
"# frame rather than to a slice of df (this avoids SettingWithCopyWarning).\n",
"final_head = df[['url', 'head_final_code']].copy()\n",
"# Round each final HEAD status code down to its hundreds class, e.g. 404 -> 400.\n",
"final_head['class'] = final_head.head_final_code.floordiv(100).mul(100)\n",
"final_head.groupby('class').count()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"url 10926\n",
"head_final_code 10926\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sum the per-class counts of final HEAD status codes.\n",
"head_class_counts = final_head.groupby('class').count()\n",
"head_class_counts.sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" </tr>\n",
" <tr>\n",
" <th>class</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>200.0</th>\n",
" <td>74.835280</td>\n",
" <td>74.835280</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>6.109614</td>\n",
" <td>6.109614</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>0.861036</td>\n",
" <td>0.861036</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url head_final_code\n",
"class \n",
"200.0 74.835280 74.835280\n",
"400.0 6.109614 6.109614\n",
"500.0 0.861036 0.861036"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-class counts as a percentage of all rows in df.\n",
"final_head.groupby('class').count() / df.shape[0] * 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### GET"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" </tr>\n",
" <tr>\n",
" <th>get_final_code</th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>200.0</th>\n",
" <td>10139</td>\n",
" </tr>\n",
" <tr>\n",
" <th>201.0</th>\n",
" <td>19</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>401.0</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>403.0</th>\n",
" <td>168</td>\n",
" </tr>\n",
" <tr>\n",
" <th>404.0</th>\n",
" <td>464</td>\n",
" </tr>\n",
" <tr>\n",
" <th>405.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>406.0</th>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>410.0</th>\n",
" <td>15</td>\n",
" </tr>\n",
" <tr>\n",
" <th>412.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>419.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>429.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>444.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>502.0</th>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>503.0</th>\n",
" <td>48</td>\n",
" </tr>\n",
" <tr>\n",
" <th>504.0</th>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url\n",
"get_final_code \n",
"200.0 10139\n",
"201.0 19\n",
"400.0 8\n",
"401.0 13\n",
"403.0 168\n",
"404.0 464\n",
"405.0 1\n",
"406.0 21\n",
"410.0 15\n",
"412.0 1\n",
"419.0 1\n",
"429.0 1\n",
"444.0 1\n",
"500.0 40\n",
"502.0 13\n",
"503.0 48\n",
"504.0 1"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of URLs per final GET status code.\n",
"df.groupby('get_final_code')[['url']].count()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1946842175.py:2: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_final_code</th>\n",
" </tr>\n",
" <tr>\n",
" <th>class</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>200.0</th>\n",
" <td>10158</td>\n",
" <td>10158</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>694</td>\n",
" <td>694</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>102</td>\n",
" <td>102</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url get_final_code\n",
"class \n",
"200.0 10158 10158\n",
"400.0 694 694\n",
"500.0 102 102"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Copy the two columns so the new 'class' column is added to an independent\n",
"# frame rather than to a slice of df (this avoids SettingWithCopyWarning).\n",
"final_get = df[['url', 'get_final_code']].copy()\n",
"# Round each final GET status code down to its hundreds class, e.g. 404 -> 400.\n",
"final_get['class'] = final_get.get_final_code.floordiv(100).mul(100)\n",
"final_get.groupby('class').count()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"url 10954\n",
"get_final_code 10954\n",
"dtype: int64"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Sum the per-class counts of final GET status codes.\n",
"get_class_counts = final_get.groupby('class').count()\n",
"get_class_counts.sum()"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_final_code</th>\n",
" </tr>\n",
" <tr>\n",
" <th>class</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>200.0</th>\n",
" <td>76.055705</td>\n",
" <td>76.055705</td>\n",
" </tr>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>5.196167</td>\n",
" <td>5.196167</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>0.763702</td>\n",
" <td>0.763702</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url get_final_code\n",
"class \n",
"200.0 76.055705 76.055705\n",
"400.0 5.196167 5.196167\n",
"500.0 0.763702 0.763702"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Per-class counts as a percentage of all rows in df.\n",
"final_get.groupby('class').count() / df.shape[0] * 100"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Errors"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Rows whose HEAD / GET request recorded an error, with the URL and the message.\n",
"head_errors = df.loc[df.head_error.notna(), ['url', 'head_error']]\n",
"get_errors = df.loc[df.get_error.notna(), ['url', 'get_error']]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>https://isoarch.eu/</td>\n",
" <td>HTTPSConnectionPool(host='isoarch.eu', port=44...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>https://www.chinare.org.cn/en/</td>\n",
" <td>HTTPSConnectionPool(host='www.chinare.org.cn',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>http://www.flyrnai.org/</td>\n",
" <td>HTTPSConnectionPool(host='www.flyrnai.org', po...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>http://alternaria.vbi.vt.edu/index.html</td>\n",
" <td>HTTPConnectionPool(host='alternaria.vbi.vt.edu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>83</th>\n",
" <td>http://www.pazar.info/</td>\n",
" <td>HTTPConnectionPool(host='www.pazar.info', port...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13344</th>\n",
" <td>http://intranet.thomasdanby.ac.uk/dspace/</td>\n",
" <td>HTTPConnectionPool(host='intranet.thomasdanby....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13345</th>\n",
" <td>http://dspace.dgsca.unam.mx:8080/dspace/</td>\n",
" <td>HTTPConnectionPool(host='dspace.dgsca.unam.mx'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13347</th>\n",
" <td>https://library.for.nau.edu:8443/</td>\n",
" <td>HTTPSConnectionPool(host='library.for.nau.edu'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13354</th>\n",
" <td>http://numerique.bibliotheque.toulouse.fr/cgi-...</td>\n",
" <td>HTTPConnectionPool(host='numerique.bibliothequ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13356</th>\n",
" <td>http://204.232.200.153:8080/jspui/</td>\n",
" <td>HTTPConnectionPool(host='204.232.200.153', por...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2242 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"10 https://isoarch.eu/ \n",
"13 https://www.chinare.org.cn/en/ \n",
"26 http://www.flyrnai.org/ \n",
"71 http://alternaria.vbi.vt.edu/index.html \n",
"83 http://www.pazar.info/ \n",
"... ... \n",
"13344 http://intranet.thomasdanby.ac.uk/dspace/ \n",
"13345 http://dspace.dgsca.unam.mx:8080/dspace/ \n",
"13347 https://library.for.nau.edu:8443/ \n",
"13354 http://numerique.bibliotheque.toulouse.fr/cgi-... \n",
"13356 http://204.232.200.153:8080/jspui/ \n",
"\n",
" head_error \n",
"10 HTTPSConnectionPool(host='isoarch.eu', port=44... \n",
"13 HTTPSConnectionPool(host='www.chinare.org.cn',... \n",
"26 HTTPSConnectionPool(host='www.flyrnai.org', po... \n",
"71 HTTPConnectionPool(host='alternaria.vbi.vt.edu... \n",
"83 HTTPConnectionPool(host='www.pazar.info', port... \n",
"... ... \n",
"13344 HTTPConnectionPool(host='intranet.thomasdanby.... \n",
"13345 HTTPConnectionPool(host='dspace.dgsca.unam.mx'... \n",
"13347 HTTPSConnectionPool(host='library.for.nau.edu'... \n",
"13354 HTTPConnectionPool(host='numerique.bibliothequ... \n",
"13356 HTTPConnectionPool(host='204.232.200.153', por... \n",
"\n",
"[2242 rows x 2 columns]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# HEAD errors whose message contains 'Max'.\n",
"head_has_max = head_errors.head_error.str.contains('Max')\n",
"head_errors.loc[head_has_max]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>https://isoarch.eu/</td>\n",
" <td>HTTPSConnectionPool(host='isoarch.eu', port=44...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>https://www.chinare.org.cn/en/</td>\n",
" <td>HTTPSConnectionPool(host='www.chinare.org.cn',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>26</th>\n",
" <td>http://www.flyrnai.org/</td>\n",
" <td>HTTPSConnectionPool(host='www.flyrnai.org', po...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>57</th>\n",
" <td>http://stellabase.org</td>\n",
" <td>HTTPConnectionPool(host='cnidarians.bu.edu', p...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>71</th>\n",
" <td>http://alternaria.vbi.vt.edu/index.html</td>\n",
" <td>HTTPConnectionPool(host='alternaria.vbi.vt.edu...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13344</th>\n",
" <td>http://intranet.thomasdanby.ac.uk/dspace/</td>\n",
" <td>HTTPConnectionPool(host='intranet.thomasdanby....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13345</th>\n",
" <td>http://dspace.dgsca.unam.mx:8080/dspace/</td>\n",
" <td>HTTPConnectionPool(host='dspace.dgsca.unam.mx'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13347</th>\n",
" <td>https://library.for.nau.edu:8443/</td>\n",
" <td>HTTPSConnectionPool(host='library.for.nau.edu'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13354</th>\n",
" <td>http://numerique.bibliotheque.toulouse.fr/cgi-...</td>\n",
" <td>HTTPConnectionPool(host='numerique.bibliothequ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13356</th>\n",
" <td>http://204.232.200.153:8080/jspui/</td>\n",
" <td>HTTPConnectionPool(host='204.232.200.153', por...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>2256 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"10 https://isoarch.eu/ \n",
"13 https://www.chinare.org.cn/en/ \n",
"26 http://www.flyrnai.org/ \n",
"57 http://stellabase.org \n",
"71 http://alternaria.vbi.vt.edu/index.html \n",
"... ... \n",
"13344 http://intranet.thomasdanby.ac.uk/dspace/ \n",
"13345 http://dspace.dgsca.unam.mx:8080/dspace/ \n",
"13347 https://library.for.nau.edu:8443/ \n",
"13354 http://numerique.bibliotheque.toulouse.fr/cgi-... \n",
"13356 http://204.232.200.153:8080/jspui/ \n",
"\n",
" get_error \n",
"10 HTTPSConnectionPool(host='isoarch.eu', port=44... \n",
"13 HTTPSConnectionPool(host='www.chinare.org.cn',... \n",
"26 HTTPSConnectionPool(host='www.flyrnai.org', po... \n",
"57 HTTPConnectionPool(host='cnidarians.bu.edu', p... \n",
"71 HTTPConnectionPool(host='alternaria.vbi.vt.edu... \n",
"... ... \n",
"13344 HTTPConnectionPool(host='intranet.thomasdanby.... \n",
"13345 HTTPConnectionPool(host='dspace.dgsca.unam.mx'... \n",
"13347 HTTPSConnectionPool(host='library.for.nau.edu'... \n",
"13354 HTTPConnectionPool(host='numerique.bibliothequ... \n",
"13356 HTTPConnectionPool(host='204.232.200.153', por... \n",
"\n",
"[2256 rows x 2 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_errors[get_errors.get_error.str.contains('Max')]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>485</th>\n",
" <td>https://idr.openmicroscopy.org</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1191</th>\n",
" <td>http://hgtree.snu.ac.kr</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1462</th>\n",
" <td>http://cfgp.snu.ac.kr</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2526</th>\n",
" <td>https://incois.gov.in/tropflux/index.jsp</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2550</th>\n",
" <td>https://odis.incois.gov.in/</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3872</th>\n",
" <td>https://www.projectdatasphere.org/projectdatas...</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6141</th>\n",
" <td>http://repository.poligran.edu.co/</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10841</th>\n",
" <td>http://library.wur.nl/way/</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"485 https://idr.openmicroscopy.org \n",
"1191 http://hgtree.snu.ac.kr \n",
"1462 http://cfgp.snu.ac.kr \n",
"2526 https://incois.gov.in/tropflux/index.jsp \n",
"2550 https://odis.incois.gov.in/ \n",
"3872 https://www.projectdatasphere.org/projectdatas... \n",
"6141 http://repository.poligran.edu.co/ \n",
"10841 http://library.wur.nl/way/ \n",
"\n",
" head_error \n",
"485 Exceeded 30 redirects. \n",
"1191 Exceeded 30 redirects. \n",
"1462 Exceeded 30 redirects. \n",
"2526 Exceeded 30 redirects. \n",
"2550 Exceeded 30 redirects. \n",
"3872 Exceeded 30 redirects. \n",
"6141 Exceeded 30 redirects. \n",
"10841 Exceeded 30 redirects. "
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_errors[head_errors.head_error.str.contains('30 redirect')]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6141</th>\n",
" <td>http://repository.poligran.edu.co/</td>\n",
" <td>Exceeded 30 redirects.</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url get_error\n",
"6141 http://repository.poligran.edu.co/ Exceeded 30 redirects."
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_errors[get_errors.get_error.str.contains('30 redirect')]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Testing Ukranian repositories"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/1742693288.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
" get_errors[get_errors.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>4892</th>\n",
" <td>http://eprints.isofts.kiev.ua/</td>\n",
" <td>('Connection aborted.', ConnectionResetError(5...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4943</th>\n",
" <td>http://www.library.ukma.kiev.ua/dspace/</td>\n",
" <td>HTTPConnectionPool(host='www.library.ukma.kiev...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4983</th>\n",
" <td>http://repository.ibss.org.ua/dspace/</td>\n",
" <td>HTTPConnectionPool(host='repository.ibss.org.u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5147</th>\n",
" <td>http://www.rarelib.undp.org.ua/eng/index.php3</td>\n",
" <td>HTTPConnectionPool(host='www.rarelib.undp.org....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5817</th>\n",
" <td>http://openarchive.nure.ua/</td>\n",
" <td>HTTPSConnectionPool(host='openarchive.nure.ua'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5827</th>\n",
" <td>http://dspace.uabs.edu.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='dspace.uabs.edu.ua', ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6135</th>\n",
" <td>http://kgmtu.edu.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='kgmtu.edu.ua', port=8...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6231</th>\n",
" <td>http://dspace.uccu.org.ua/</td>\n",
" <td>HTTPConnectionPool(host='dspace.uccu.org.ua', ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6474</th>\n",
" <td>http://repo.uipa.edu.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='repo.uipa.edu.ua', po...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7067</th>\n",
" <td>http://ir.asta.edu.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='ir.asta.edu.ua', port...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7490</th>\n",
" <td>http://dspace.hnpu.edu.ua/</td>\n",
" <td>HTTPConnectionPool(host='dspace.hnpu.edu.ua', ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7989</th>\n",
" <td>http://repository.mdu.in.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='repository.mdu.in.ua'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8085</th>\n",
" <td>http://elar.tsatu.edu.ua/</td>\n",
" <td>HTTPConnectionPool(host='elar.tsatu.edu.ua', p...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8105</th>\n",
" <td>http://dspace.zsea.edu.ua/</td>\n",
" <td>HTTPSConnectionPool(host='dspace.znu.edu.ua', ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8197</th>\n",
" <td>http://dspace.knau.kharkov.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='dspace.knau.kharkov.u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8199</th>\n",
" <td>http://rep.btsau.edu.ua</td>\n",
" <td>HTTPConnectionPool(host='rep.btsau.edu.ua', po...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8206</th>\n",
" <td>https://biblio.onat.edu.ua/</td>\n",
" <td>HTTPSConnectionPool(host='biblio.onat.edu.ua',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8419</th>\n",
" <td>https://rep.ksma.ks.ua</td>\n",
" <td>HTTPSConnectionPool(host='rep.ksma.ks.ua', por...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10016</th>\n",
" <td>http://eprints.ksame.kharkov.ua/</td>\n",
" <td>HTTPConnectionPool(host='eprints.ksame.kharkov...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10107</th>\n",
" <td>http://dspace.uabs.edu.ua/</td>\n",
" <td>HTTPConnectionPool(host='dspace.uabs.edu.ua', ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10799</th>\n",
" <td>http://repository.ibss.org.ua/dspace</td>\n",
" <td>HTTPConnectionPool(host='repository.ibss.org.u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10906</th>\n",
" <td>http://repo.sau.sumy.ua/</td>\n",
" <td>HTTPConnectionPool(host='repo.sau.sumy.ua', po...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10976</th>\n",
" <td>http://www.dspace.humanities.org.ua/dspace/</td>\n",
" <td>HTTPConnectionPool(host='www.dspace.humanities...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11616</th>\n",
" <td>http:// visnykagro.knau.kharkov.ua</td>\n",
" <td>HTTPConnectionPool(host='%20visnykagro.knau.kh...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11803</th>\n",
" <td>http://repository.mdu.in.ua/</td>\n",
" <td>HTTPConnectionPool(host='repository.mdu.in.ua'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11831</th>\n",
" <td>http://sj-dstu.dp.ua/</td>\n",
" <td>HTTPConnectionPool(host='sj-dstu.dp.ua', port=...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12107</th>\n",
" <td>http://www.hups.mil.gov.ua/periodic-app/journal</td>\n",
" <td>HTTPConnectionPool(host='www.hups.mil.gov.ua',...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12746</th>\n",
" <td>http://www.e-archive.org.ua/</td>\n",
" <td>HTTPConnectionPool(host='www.e-archive.org.ua'...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12753</th>\n",
" <td>http://library.khnu.km.ua/jspui/</td>\n",
" <td>HTTPConnectionPool(host='library.khnu.km.ua', ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12916</th>\n",
" <td>http://dspace.ukrfa.kharkov.ua/</td>\n",
" <td>HTTPConnectionPool(host='dspace.ukrfa.kharkov....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12922</th>\n",
" <td>http://dspace.ukrfa.kharkov.ua</td>\n",
" <td>HTTPConnectionPool(host='dspace.ukrfa.kharkov....</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12970</th>\n",
" <td>http://evrica.org.ua</td>\n",
" <td>HTTPConnectionPool(host='evrica.org.ua', port=...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12992</th>\n",
" <td>http://dspace.academy.sumy.ua/</td>\n",
" <td>HTTPConnectionPool(host='dspace.academy.sumy.u...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13017</th>\n",
" <td>http://open-archive.kture.kharkov.ua/</td>\n",
" <td>HTTPConnectionPool(host='open-archive.kture.kh...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13034</th>\n",
" <td>http://dspace.onu.edu.ua/</td>\n",
" <td>HTTPConnectionPool(host='192.168.30.200', port...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13052</th>\n",
" <td>http://open-arhive.kture.kharkov.ua</td>\n",
" <td>HTTPConnectionPool(host='open-arhive.kture.kha...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"4892 http://eprints.isofts.kiev.ua/ \n",
"4943 http://www.library.ukma.kiev.ua/dspace/ \n",
"4983 http://repository.ibss.org.ua/dspace/ \n",
"5147 http://www.rarelib.undp.org.ua/eng/index.php3 \n",
"5817 http://openarchive.nure.ua/ \n",
"5827 http://dspace.uabs.edu.ua/jspui/ \n",
"6135 http://kgmtu.edu.ua/jspui/ \n",
"6231 http://dspace.uccu.org.ua/ \n",
"6474 http://repo.uipa.edu.ua/jspui/ \n",
"7067 http://ir.asta.edu.ua/jspui/ \n",
"7490 http://dspace.hnpu.edu.ua/ \n",
"7989 http://repository.mdu.in.ua/jspui/ \n",
"8085 http://elar.tsatu.edu.ua/ \n",
"8105 http://dspace.zsea.edu.ua/ \n",
"8197 http://dspace.knau.kharkov.ua/jspui/ \n",
"8199 http://rep.btsau.edu.ua \n",
"8206 https://biblio.onat.edu.ua/ \n",
"8419 https://rep.ksma.ks.ua \n",
"10016 http://eprints.ksame.kharkov.ua/ \n",
"10107 http://dspace.uabs.edu.ua/ \n",
"10799 http://repository.ibss.org.ua/dspace \n",
"10906 http://repo.sau.sumy.ua/ \n",
"10976 http://www.dspace.humanities.org.ua/dspace/ \n",
"11616 http:// visnykagro.knau.kharkov.ua \n",
"11803 http://repository.mdu.in.ua/ \n",
"11831 http://sj-dstu.dp.ua/ \n",
"12107 http://www.hups.mil.gov.ua/periodic-app/journal \n",
"12746 http://www.e-archive.org.ua/ \n",
"12753 http://library.khnu.km.ua/jspui/ \n",
"12916 http://dspace.ukrfa.kharkov.ua/ \n",
"12922 http://dspace.ukrfa.kharkov.ua \n",
"12970 http://evrica.org.ua \n",
"12992 http://dspace.academy.sumy.ua/ \n",
"13017 http://open-archive.kture.kharkov.ua/ \n",
"13034 http://dspace.onu.edu.ua/ \n",
"13052 http://open-arhive.kture.kharkov.ua \n",
"\n",
" get_error \n",
"4892 ('Connection aborted.', ConnectionResetError(5... \n",
"4943 HTTPConnectionPool(host='www.library.ukma.kiev... \n",
"4983 HTTPConnectionPool(host='repository.ibss.org.u... \n",
"5147 HTTPConnectionPool(host='www.rarelib.undp.org.... \n",
"5817 HTTPSConnectionPool(host='openarchive.nure.ua'... \n",
"5827 HTTPConnectionPool(host='dspace.uabs.edu.ua', ... \n",
"6135 HTTPConnectionPool(host='kgmtu.edu.ua', port=8... \n",
"6231 HTTPConnectionPool(host='dspace.uccu.org.ua', ... \n",
"6474 HTTPConnectionPool(host='repo.uipa.edu.ua', po... \n",
"7067 HTTPConnectionPool(host='ir.asta.edu.ua', port... \n",
"7490 HTTPConnectionPool(host='dspace.hnpu.edu.ua', ... \n",
"7989 HTTPConnectionPool(host='repository.mdu.in.ua'... \n",
"8085 HTTPConnectionPool(host='elar.tsatu.edu.ua', p... \n",
"8105 HTTPSConnectionPool(host='dspace.znu.edu.ua', ... \n",
"8197 HTTPConnectionPool(host='dspace.knau.kharkov.u... \n",
"8199 HTTPConnectionPool(host='rep.btsau.edu.ua', po... \n",
"8206 HTTPSConnectionPool(host='biblio.onat.edu.ua',... \n",
"8419 HTTPSConnectionPool(host='rep.ksma.ks.ua', por... \n",
"10016 HTTPConnectionPool(host='eprints.ksame.kharkov... \n",
"10107 HTTPConnectionPool(host='dspace.uabs.edu.ua', ... \n",
"10799 HTTPConnectionPool(host='repository.ibss.org.u... \n",
"10906 HTTPConnectionPool(host='repo.sau.sumy.ua', po... \n",
"10976 HTTPConnectionPool(host='www.dspace.humanities... \n",
"11616 HTTPConnectionPool(host='%20visnykagro.knau.kh... \n",
"11803 HTTPConnectionPool(host='repository.mdu.in.ua'... \n",
"11831 HTTPConnectionPool(host='sj-dstu.dp.ua', port=... \n",
"12107 HTTPConnectionPool(host='www.hups.mil.gov.ua',... \n",
"12746 HTTPConnectionPool(host='www.e-archive.org.ua'... \n",
"12753 HTTPConnectionPool(host='library.khnu.km.ua', ... \n",
"12916 HTTPConnectionPool(host='dspace.ukrfa.kharkov.... \n",
"12922 HTTPConnectionPool(host='dspace.ukrfa.kharkov.... \n",
"12970 HTTPConnectionPool(host='evrica.org.ua', port=... \n",
"12992 HTTPConnectionPool(host='dspace.academy.sumy.u... \n",
"13017 HTTPConnectionPool(host='open-archive.kture.kh... \n",
"13034 HTTPConnectionPool(host='192.168.30.200', port... \n",
"13052 HTTPConnectionPool(host='open-arhive.kture.kha... "
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_errors[get_errors.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2005047697.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
" get_errors[get_errors.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)].count()\n"
]
},
{
"data": {
"text/plain": [
"url 36\n",
"get_error 36\n",
"dtype: int64"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_errors[get_errors.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)].count()"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/j8/sgmsw3_510nbtw4brgpr222r0000gn/T/ipykernel_16494/2148841299.py:1: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
" df[df.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_url</th>\n",
" <th>head_chain</th>\n",
" <th>head_codes</th>\n",
" <th>head_elapsed</th>\n",
" <th>head_error</th>\n",
" <th>get_final_code</th>\n",
" <th>get_url</th>\n",
" <th>get_chain</th>\n",
" <th>get_codes</th>\n",
" <th>get_elapsed</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2185</th>\n",
" <td>http://inform.ikd.kiev.ua/index.php?path=/en/i...</td>\n",
" <td>200.0</td>\n",
" <td>http://inform.ikd.kiev.ua/?path=/en/index</td>\n",
" <td>['http://inform.ikd.kiev.ua/index.php?path=/en...</td>\n",
" <td>[301]</td>\n",
" <td>0.402488</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://inform.ikd.kiev.ua/?path=/en/index</td>\n",
" <td>['http://inform.ikd.kiev.ua/index.php?path=/en...</td>\n",
" <td>[301]</td>\n",
" <td>0.452971</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2286</th>\n",
" <td>http://wdc.org.ua/</td>\n",
" <td>200.0</td>\n",
" <td>http://wdc.org.ua/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.905063</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://wdc.org.ua/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.816887</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4892</th>\n",
" <td>http://eprints.isofts.kiev.ua/</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>('Connection aborted.', ConnectionResetError(5...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>('Connection aborted.', ConnectionResetError(5...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4943</th>\n",
" <td>http://www.library.ukma.kiev.ua/dspace/</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='www.library.ukma.kiev...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='www.library.ukma.kiev...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4950</th>\n",
" <td>http://www.dspace.ucu.edu.ua/dspace/</td>\n",
" <td>403.0</td>\n",
" <td>http://www.dspace.ucu.edu.ua/dspace/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.073373</td>\n",
" <td>NaN</td>\n",
" <td>403.0</td>\n",
" <td>http://www.dspace.ucu.edu.ua/dspace/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.073223</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13015</th>\n",
" <td>http://ir.kneu.edu.ua</td>\n",
" <td>200.0</td>\n",
" <td>http://ir.kneu.edu.ua/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>1.609602</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://ir.kneu.edu.ua/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>1.508854</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13017</th>\n",
" <td>http://open-archive.kture.kharkov.ua/</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='open-archive.kture.kh...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='open-archive.kture.kh...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13034</th>\n",
" <td>http://dspace.onu.edu.ua/</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='192.168.30.200', port...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='192.168.30.200', port...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13052</th>\n",
" <td>http://open-arhive.kture.kharkov.ua</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='open-arhive.kture.kha...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HTTPConnectionPool(host='open-arhive.kture.kha...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13248</th>\n",
" <td>http://www.ekmair.ukma.edu.ua</td>\n",
" <td>200.0</td>\n",
" <td>http://ekmair.ukma.edu.ua/</td>\n",
" <td>['http://www.ekmair.ukma.edu.ua/']</td>\n",
" <td>[301]</td>\n",
" <td>0.661039</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://ekmair.ukma.edu.ua/</td>\n",
" <td>['http://www.ekmair.ukma.edu.ua/']</td>\n",
" <td>[301]</td>\n",
" <td>0.755170</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>146 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" url head_final_code \\\n",
"2185 http://inform.ikd.kiev.ua/index.php?path=/en/i... 200.0 \n",
"2286 http://wdc.org.ua/ 200.0 \n",
"4892 http://eprints.isofts.kiev.ua/ NaN \n",
"4943 http://www.library.ukma.kiev.ua/dspace/ NaN \n",
"4950 http://www.dspace.ucu.edu.ua/dspace/ 403.0 \n",
"... ... ... \n",
"13015 http://ir.kneu.edu.ua 200.0 \n",
"13017 http://open-archive.kture.kharkov.ua/ NaN \n",
"13034 http://dspace.onu.edu.ua/ NaN \n",
"13052 http://open-arhive.kture.kharkov.ua NaN \n",
"13248 http://www.ekmair.ukma.edu.ua 200.0 \n",
"\n",
" head_url \\\n",
"2185 http://inform.ikd.kiev.ua/?path=/en/index \n",
"2286 http://wdc.org.ua/ \n",
"4892 NaN \n",
"4943 NaN \n",
"4950 http://www.dspace.ucu.edu.ua/dspace/ \n",
"... ... \n",
"13015 http://ir.kneu.edu.ua/ \n",
"13017 NaN \n",
"13034 NaN \n",
"13052 NaN \n",
"13248 http://ekmair.ukma.edu.ua/ \n",
"\n",
" head_chain head_codes \\\n",
"2185 ['http://inform.ikd.kiev.ua/index.php?path=/en... [301] \n",
"2286 [] [] \n",
"4892 NaN NaN \n",
"4943 NaN NaN \n",
"4950 [] [] \n",
"... ... ... \n",
"13015 [] [] \n",
"13017 NaN NaN \n",
"13034 NaN NaN \n",
"13052 NaN NaN \n",
"13248 ['http://www.ekmair.ukma.edu.ua/'] [301] \n",
"\n",
" head_elapsed head_error \\\n",
"2185 0.402488 NaN \n",
"2286 0.905063 NaN \n",
"4892 NaN ('Connection aborted.', ConnectionResetError(5... \n",
"4943 NaN HTTPConnectionPool(host='www.library.ukma.kiev... \n",
"4950 0.073373 NaN \n",
"... ... ... \n",
"13015 1.609602 NaN \n",
"13017 NaN HTTPConnectionPool(host='open-archive.kture.kh... \n",
"13034 NaN HTTPConnectionPool(host='192.168.30.200', port... \n",
"13052 NaN HTTPConnectionPool(host='open-arhive.kture.kha... \n",
"13248 0.661039 NaN \n",
"\n",
" get_final_code get_url \\\n",
"2185 200.0 http://inform.ikd.kiev.ua/?path=/en/index \n",
"2286 200.0 http://wdc.org.ua/ \n",
"4892 NaN NaN \n",
"4943 NaN NaN \n",
"4950 403.0 http://www.dspace.ucu.edu.ua/dspace/ \n",
"... ... ... \n",
"13015 200.0 http://ir.kneu.edu.ua/ \n",
"13017 NaN NaN \n",
"13034 NaN NaN \n",
"13052 NaN NaN \n",
"13248 200.0 http://ekmair.ukma.edu.ua/ \n",
"\n",
" get_chain get_codes \\\n",
"2185 ['http://inform.ikd.kiev.ua/index.php?path=/en... [301] \n",
"2286 [] [] \n",
"4892 NaN NaN \n",
"4943 NaN NaN \n",
"4950 [] [] \n",
"... ... ... \n",
"13015 [] [] \n",
"13017 NaN NaN \n",
"13034 NaN NaN \n",
"13052 NaN NaN \n",
"13248 ['http://www.ekmair.ukma.edu.ua/'] [301] \n",
"\n",
" get_elapsed get_error \n",
"2185 0.452971 NaN \n",
"2286 0.816887 NaN \n",
"4892 NaN ('Connection aborted.', ConnectionResetError(5... \n",
"4943 NaN HTTPConnectionPool(host='www.library.ukma.kiev... \n",
"4950 0.073223 NaN \n",
"... ... ... \n",
"13015 1.508854 NaN \n",
"13017 NaN HTTPConnectionPool(host='open-archive.kture.kh... \n",
"13034 NaN HTTPConnectionPool(host='192.168.30.200', port... \n",
"13052 NaN HTTPConnectionPool(host='open-arhive.kture.kha... \n",
"13248 0.755170 NaN \n",
"\n",
"[146 rows x 13 columns]"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.url.str.contains('(\\.ua$)|(\\.ua/)', regex=True)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Redirects"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" </tr>\n",
" <tr>\n",
" <th>len</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6596</td>\n",
" <td>6596</td>\n",
" <td>6596</td>\n",
" <td>6596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3364</td>\n",
" <td>3364</td>\n",
" <td>3364</td>\n",
" <td>3364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>686</td>\n",
" <td>686</td>\n",
" <td>686</td>\n",
" <td>686</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>233</td>\n",
" <td>233</td>\n",
" <td>233</td>\n",
" <td>233</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" <td>10</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url head_url head_final_code head_codes\n",
"len \n",
"0 6596 6596 6596 6596\n",
"1 3364 3364 3364 3364\n",
"2 686 686 686 686\n",
"3 233 233 233 233\n",
"4 37 37 37 37\n",
"5 10 10 10 10"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains = df[df.head_codes.notna()][['url', 'head_url', 'head_final_code', 'head_codes']]\n",
"head_chains['len'] = head_chains.head_codes.apply(ast.literal_eval).str.len()\n",
"head_chains.groupby('len').count()\n"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" </tr>\n",
" <tr>\n",
" <th>len</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>6595</td>\n",
" <td>6595</td>\n",
" <td>6595</td>\n",
" <td>6595</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>3376</td>\n",
" <td>3376</td>\n",
" <td>3376</td>\n",
" <td>3376</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>699</td>\n",
" <td>699</td>\n",
" <td>699</td>\n",
" <td>699</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>229</td>\n",
" <td>229</td>\n",
" <td>229</td>\n",
" <td>229</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>43</td>\n",
" <td>43</td>\n",
" <td>43</td>\n",
" <td>43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>12</td>\n",
" <td>12</td>\n",
" <td>12</td>\n",
" <td>12</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url get_url get_final_code get_codes\n",
"len \n",
"0 6595 6595 6595 6595\n",
"1 3376 3376 3376 3376\n",
"2 699 699 699 699\n",
"3 229 229 229 229\n",
"4 43 43 43 43\n",
"5 12 12 12 12"
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains = df[df.get_codes.notna()][['url', 'get_url', 'get_final_code', 'get_codes']]\n",
"get_chains['len'] = get_chains.get_codes.apply(ast.literal_eval).str.len()\n",
"get_chains.groupby('len').count()"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<AxesSubplot:xlabel='# redirects'>"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAEICAYAAACzliQjAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAYCUlEQVR4nO3df5TV9X3n8ecrCA4rbESdesgMycy6bBNJNgOdBVtNN5BG0HoWTNMg6SY0dUs8B3LUNE2hZ08kSW3iSSttGkMOiSxYI4RKjBNlJRSRhKYIgyKK6DqruA5FGUFIlGBE3vvH/QxccH7cmblzL/J5Pc65536/7+/n+72f7z2c13z53O/9XEUEZmaWh3dUuwNmZlY5Dn0zs4w49M3MMuLQNzPLiEPfzCwjDn0zs4z0GvqSaiRtkfSYpJ2SvpzqyyQ9J2l7ejSluiR9U1KbpB2SJhQda7akZ9Jj9qCdlZmZdemsEtq8DkyJiFclDQU2SfrfadufR8Tdp7S/AhibHpOAxcAkSecBNwHNQADbJLVExCvdvfAFF1wQDQ0NfTohM7Pcbdu27eWIqO1qW6+hH4Vvb72aVoemR0/f6JoO3JH22yzpXEmjgQ8D6yLiAICkdcA0YEV3B2poaKC1tbW3LpqZWRFJz3e3raQxfUlDJG0H9lEI7ofTppvTEM4iSWenWh3wQtHu7anWXd3MzCqkpNCPiDcjogmoByZKej+wAHgv8F+A84C/KEeHJM2R1CqptaOjoxyHNDOzpE9370TEQWADMC0i9kbB68D/AiamZnuAMUW71adad/VTX2NJRDRHRHNtbZdDUmZm1k+9julLqgXeiIiDkoYDHwVukTQ6IvZKEjADeCLt0gLMk7SSwge5h1K7tcBfSxqV2l1O4X8LZmb99sYbb9De3s6RI0eq3ZWKq6mpob6+nqFDh5a8Tyl374wGlksaQuF/Bqsi4j5JD6Y/CAK2A9el9muAK4E24DDwGYCIOCDpq8DW1O4rnR/qmpn1V3t7OyNHjqShoYHCNWgeIoL9+/fT3t5OY2NjyfuVcvfODmB8F/Up3bQPYG4325YCS0vunZlZL44cOZJd4ANI4vzzz6evn336G7lm9raXW+B36s95O/TNzAZoxIgRJ60vW7aMefPmAbBw4ULq6upoamo6/jh48ODxtjfccAN1dXUcO3bspP1ra2sZP348Y8eOZerUqfz85z8vS19LGdM3M3vbaJh/f1mPt/vrvz/gY9x444184QtfeEv92LFj3HPPPYwZM4aNGzcyefLk49tmzpzJt771LQA2bNjAxz72MTZs2MD73ve+AfXljA/9cvwD2F3zyYF3ZOGhgR/DzM4oDz30EOPGjWPmzJmsWLHipNAvNnnyZObMmcOSJUtYtGjRgF7TwztmZgP0q1/96qThmy996UsnbV+0aNHxbcXBvmLFCmbNmsXVV1/N/fffzxtvvNHta0yYMIGnnnpqwH0946/0zcwG2/Dhw9m+ffvx9WXLlp00b1hXwzu//vWvWbNmDbfeeisjR45k0qRJrF27lquuuqrL1yjcGDlwDn0zsypYu3YtBw8e5AMf+AAAhw8fZvjw4d2G/qOPPjrg8Xxw6JuZVcWKFSv43ve+x6xZswB47bXXaGxs5PDhw29pu3HjRpYsWcKGDRsG/LoOfTOzQbZo0SLuvPPO4+t33XUXDzzwAN/5zneO18455xwuu+wyfvzjHwPwgx/8gE2bNnH48GEaGxtZvXp1Wa70Va5xosHQ3NwcA51P33fvmJ3Zdu3aVZYwfLvq6vwlbYuI5q7a++4dM7OMOPTNzDLi0Dczy4hD38wsIw59M7OMOPTNzDLi+/TNzAbopZde4sYbb2Tz5s2MGjWKYcOG8cUvfpFRo0Yxffr0k37ZasGCBXzta18D4MUXX2TIkCF0/h74li1bGDZs2KD21aFvZmeWhe8s8/F6/o5NRDBjxgxmz57NXXfdBcDzzz9PS0sLo0aN4k
Mf+hD33XffSfvMnDmzcOiFCxkxYkSX0y4PFg/vmJkNwIMPPsiwYcO47rrrjtfe85738LnPfa6Kveqer/TNzAZg586dTJgwodvtP/vZz2hqajq+vnr1ai666KIK9KxrDn0zszKaO3cumzZtYtiwYXzjG9/ocninmjy8Y2Y2AOPGjeORRx45vn7bbbexfv16Ojo6qtir7vUa+pJqJG2R9JiknZK+nOqNkh6W1CbpB5KGpfrZab0tbW8oOtaCVH9a0tRBOyszswqZMmUKR44cYfHixcdrXU2PfLoo5Ur/dWBKRHwQaAKmSboEuAVYFBH/EXgFuDa1vxZ4JdUXpXZIuhi4BhgHTAO+LWlIGc/FzKziJPGjH/2IjRs30tjYyMSJE5k9eza33HILcGJMv/Nx9913V7W/vY7pR2Hu5VfT6tD0CGAK0Dnn8HJgIbAYmJ6WAe4GviVJqb4yIl4HnpPUBkwE/rUcJ2JmBlRlGvPRo0ezcuXKLrcdOtR9fxYuXDhIPepeSWP6koZI2g7sA9YB/xc4GBFHU5N2oC4t1wEvAKTth4Dzi+td7GNmZhVQUuhHxJsR0QTUU7g6f+9gdUjSHEmtklpP1w9CzMzervp0905EHAQ2AL8NnCupc3ioHtiTlvcAYwDS9ncC+4vrXexT/BpLIqI5Ipo7v5psZmblUcrdO7WSzk3Lw4GPArsohP/HU7PZwL1puSWtk7Y/mD4XaAGuSXf3NAJjgS1lOg8zy9jp/LOvg6k/513Kl7NGA8vTnTbvAFZFxH2SngRWSvor4FHg9tT+duAf0we1ByjcsUNE7JS0CngSOArMjYg3+9xjM7MiNTU17N+/n/PPP5/CPSN5iAj2799PTU1Nn/Yr5e6dHcD4LurPUhjfP7V+BPjDbo51M3Bzn3poZtaD+vp62tvbT9svQw2mmpoa6uvr+7SPp2Ews7e1oUOHnjR1sfXM0zCYmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWXEoW9mlhGHvplZRhz6ZmYZceibmWWk19CXNEbSBklPStop6fpUXyhpj6Tt6XFl0T4LJLVJelrS1KL6tFRrkzR/cE7JzMy6c1YJbY4CfxYRj0gaCWyTtC5tWxQRf1PcWNLFwDXAOOBdwD9L+k9p823AR4F2YKukloh4shwnYmZmves19CNiL7A3Lf9S0i6groddpgMrI+J14DlJbcDEtK0tIp4FkLQytXXom5lVSJ/G9CU1AOOBh1NpnqQdkpZKGpVqdcALRbu1p1p39VNfY46kVkmtHR0dfememZn1ouTQlzQCWA3cEBG/ABYDFwFNFP4n8Lfl6FBELImI5ohorq2tLcchzcwsKWVMH0lDKQT+9yPihwAR8VLR9u8C96XVPcCYot3rU40e6mZmVgGl3L0j4HZgV0TcWlQfXdTsauCJtNwCXCPpbEmNwFhgC7AVGCupUdIwCh/2tpTnNMzMrBSlXOlfCnwKeFzS9lT7S2CWpCYggN3AZwEiYqekVRQ+oD0KzI2INwEkzQPWAkOApRGxs2xnYmZmvSrl7p1NgLrYtKaHfW4Gbu6ivqan/czMbHD5G7lmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhlx6JuZZcShb2aWEYe+mVlGHPpmZhnpNfQljZG0QdKTknZKuj7Vz5O0TtIz6XlUqkvSNyW1SdohaULRsWan9s9Imj14p2VmZl0p5Ur/KPBnEXExcAkwV9LFwHxgfUSMBdandYArgLHpMQdYDIU/EsBNwCRgInBT5x8KMzOrjF5DPyL2RsQjafmXwC6gDpgOLE/NlgMz0vJ04I4o2AycK2k0MBVYFxEHIuIVYB
0wrZwnY2ZmPevTmL6kBmA88DBwYUTsTZteBC5My3XAC0W7tadad3UzM6uQkkNf0ghgNXBDRPyieFtEBBDl6JCkOZJ
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"data = head_chains.groupby('len').count().join(get_chains.groupby('len').count(), lsuffix='_head', rsuffix='_get')\n",
"data.index.rename('# redirects', inplace=True)\n",
"data.rename(columns={'url_head': 'HEAD', 'url_get': 'GET'}, inplace=True)\n",
"data[['HEAD', 'GET']][1:].plot.bar(rot=0)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"HEAD 4330\n",
"GET 4359\n",
"dtype: int64"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[['HEAD', 'GET']][1:].sum()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"HEAD 32.419886\n",
"GET 32.637017\n",
"dtype: float64"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data[['HEAD', 'GET']][1:].sum().div(df.shape[0]).mul(100)"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>403.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>84</th>\n",
" <td>https://protocols.io/welcome</td>\n",
" <td>https://www.protocols.io:443/welcome</td>\n",
" <td>400.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>http://agris.fao.org/agris-search/index.do</td>\n",
" <td>https://agris.fao.org:443/agris-search/index.do</td>\n",
" <td>404.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>228</th>\n",
" <td>http://oryzatagline.cirad.fr/</td>\n",
" <td>https://oryzatagline.cirad.fr/</td>\n",
" <td>403.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>243</th>\n",
" <td>http://www.ncbi.nlm.nih.gov/unists</td>\n",
" <td>https://www.ncbi.nlm.nih.gov/unists</td>\n",
" <td>410.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13262</th>\n",
" <td>http://www.kumc.edu/archie</td>\n",
" <td>https://www.kumc.edu/archie</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13320</th>\n",
" <td>http://trinity.cdmhost.com/index.php</td>\n",
" <td>https://www.oclc.org/url/?404;http://trinity.c...</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13340</th>\n",
" <td>http://calvados.c3sl.ufpr.br/ojs2/index.php/ve...</td>\n",
" <td>https://revistas.ufpr.br/index.php/veterinary/...</td>\n",
" <td>404.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13346</th>\n",
" <td>http://www4.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>https://sistemas.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13348</th>\n",
" <td>http://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>https://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>378 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"11 http://epi.minsal.cl/bases-de-datos/ \n",
"84 https://protocols.io/welcome \n",
"177 http://agris.fao.org/agris-search/index.do \n",
"228 http://oryzatagline.cirad.fr/ \n",
"243 http://www.ncbi.nlm.nih.gov/unists \n",
"... ... \n",
"13262 http://www.kumc.edu/archie \n",
"13320 http://trinity.cdmhost.com/index.php \n",
"13340 http://calvados.c3sl.ufpr.br/ojs2/index.php/ve... \n",
"13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n",
"13348 http://lss.fnal.gov/ird/fermilab_spires.html \n",
"\n",
" head_url head_final_code \\\n",
"11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n",
"84 https://www.protocols.io:443/welcome 400.0 \n",
"177 https://agris.fao.org:443/agris-search/index.do 404.0 \n",
"228 https://oryzatagline.cirad.fr/ 403.0 \n",
"243 https://www.ncbi.nlm.nih.gov/unists 410.0 \n",
"... ... ... \n",
"13262 https://www.kumc.edu/archie 404.0 \n",
"13320 https://www.oclc.org/url/?404;http://trinity.c... 404.0 \n",
"13340 https://revistas.ufpr.br/index.php/veterinary/... 404.0 \n",
"13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n",
"13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n",
"\n",
" head_codes len class \n",
"11 [302] 1 400.0 \n",
"84 [301] 1 400.0 \n",
"177 [301] 1 400.0 \n",
"228 [301] 1 400.0 \n",
"243 [301] 1 400.0 \n",
"... ... ... ... \n",
"13262 [302] 1 400.0 \n",
"13320 [302] 1 400.0 \n",
"13340 [301] 1 400.0 \n",
"13346 [302] 1 400.0 \n",
"13348 [302] 1 400.0 \n",
"\n",
"[378 rows x 6 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains['class'] = head_chains.head_final_code.floordiv(100).mul(100)\n",
"head_chains[(head_chains.len > 0) & (head_chains['class'] != 200)]"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" </tr>\n",
" <tr>\n",
" <th>class</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>338</td>\n",
" <td>338</td>\n",
" <td>338</td>\n",
" <td>338</td>\n",
" <td>338</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" <td>40</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url head_url head_final_code head_codes len\n",
"class \n",
"400.0 338 338 338 338 338\n",
"500.0 40 40 40 40 40"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains[(head_chains.len > 0) & (head_chains['class'] != 200)].groupby('class').count()"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>403.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>177</th>\n",
" <td>http://agris.fao.org/agris-search/index.do</td>\n",
" <td>https://agris.fao.org:443/agris-search/index.do</td>\n",
" <td>404.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>228</th>\n",
" <td>http://oryzatagline.cirad.fr/</td>\n",
" <td>https://oryzatagline.cirad.fr/</td>\n",
" <td>403.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>243</th>\n",
" <td>http://www.ncbi.nlm.nih.gov/unists</td>\n",
" <td>https://www.ncbi.nlm.nih.gov/unists</td>\n",
" <td>410.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>288</th>\n",
" <td>http://www.ncbi.nlm.nih.gov/dbSTS/index.html</td>\n",
" <td>https://www.ncbi.nlm.nih.gov/dbSTS/index.html</td>\n",
" <td>403.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13262</th>\n",
" <td>http://www.kumc.edu/archie</td>\n",
" <td>https://www.kumc.edu/archie</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13320</th>\n",
" <td>http://trinity.cdmhost.com/index.php</td>\n",
" <td>https://www.oclc.org/url/notfound</td>\n",
" <td>404.0</td>\n",
" <td>[302, 302]</td>\n",
" <td>2</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13340</th>\n",
" <td>http://calvados.c3sl.ufpr.br/ojs2/index.php/ve...</td>\n",
" <td>https://revistas.ufpr.br/index.php/veterinary/...</td>\n",
" <td>404.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13346</th>\n",
" <td>http://www4.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>https://sistemas.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13348</th>\n",
" <td>http://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>https://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>344 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"11 http://epi.minsal.cl/bases-de-datos/ \n",
"177 http://agris.fao.org/agris-search/index.do \n",
"228 http://oryzatagline.cirad.fr/ \n",
"243 http://www.ncbi.nlm.nih.gov/unists \n",
"288 http://www.ncbi.nlm.nih.gov/dbSTS/index.html \n",
"... ... \n",
"13262 http://www.kumc.edu/archie \n",
"13320 http://trinity.cdmhost.com/index.php \n",
"13340 http://calvados.c3sl.ufpr.br/ojs2/index.php/ve... \n",
"13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n",
"13348 http://lss.fnal.gov/ird/fermilab_spires.html \n",
"\n",
" get_url get_final_code \\\n",
"11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n",
"177 https://agris.fao.org:443/agris-search/index.do 404.0 \n",
"228 https://oryzatagline.cirad.fr/ 403.0 \n",
"243 https://www.ncbi.nlm.nih.gov/unists 410.0 \n",
"288 https://www.ncbi.nlm.nih.gov/dbSTS/index.html 403.0 \n",
"... ... ... \n",
"13262 https://www.kumc.edu/archie 404.0 \n",
"13320 https://www.oclc.org/url/notfound 404.0 \n",
"13340 https://revistas.ufpr.br/index.php/veterinary/... 404.0 \n",
"13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n",
"13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n",
"\n",
" get_codes len class \n",
"11 [302] 1 400.0 \n",
"177 [301] 1 400.0 \n",
"228 [301] 1 400.0 \n",
"243 [301] 1 400.0 \n",
"288 [301] 1 400.0 \n",
"... ... ... ... \n",
"13262 [302] 1 400.0 \n",
"13320 [302, 302] 2 400.0 \n",
"13340 [301] 1 400.0 \n",
"13346 [302] 1 400.0 \n",
"13348 [302] 1 400.0 \n",
"\n",
"[344 rows x 6 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains['class'] = get_chains.get_final_code.floordiv(100).mul(100)\n",
"get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)]"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" </tr>\n",
" <tr>\n",
" <th>class</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>400.0</th>\n",
" <td>307</td>\n",
" <td>307</td>\n",
" <td>307</td>\n",
" <td>307</td>\n",
" <td>307</td>\n",
" </tr>\n",
" <tr>\n",
" <th>500.0</th>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" <td>37</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url get_url get_final_code get_codes len\n",
"class \n",
"400.0 307 307 307 307 307\n",
"500.0 37 37 37 37 37"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains[(get_chains.len > 0) & (get_chains['class'] != 200)].groupby('class').count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Redirect reason"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>http://sidc.be/silso/home</td>\n",
" <td>https://www.sidc.be/silso/home</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.pac.dfo-mpo.gc.ca/science/oceans/da...</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>200.0</td>\n",
" <td>[307]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>http://ipt.taibif.tw/</td>\n",
" <td>https://ipt.taibif.tw/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>http://arabidopsis.info</td>\n",
" <td>https://arabidopsis.info/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>403.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13341</th>\n",
" <td>http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/</td>\n",
" <td>https://revistas.ufpr.br/index.php/acta/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13346</th>\n",
" <td>http://www4.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>https://sistemas.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13348</th>\n",
" <td>http://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>https://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13350</th>\n",
" <td>http://cloud.ncl.edu.tw/chna/</td>\n",
" <td>https://cloud.ncl.edu.tw/chna/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13351</th>\n",
" <td>http://www.myopenarchive.org/</td>\n",
" <td>https://www.myopenarchive.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3364 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 http://sidc.be/silso/home \n",
"2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n",
"6 http://ipt.taibif.tw/ \n",
"9 http://arabidopsis.info \n",
"11 http://epi.minsal.cl/bases-de-datos/ \n",
"... ... \n",
"13341 http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/ \n",
"13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n",
"13348 http://lss.fnal.gov/ird/fermilab_spires.html \n",
"13350 http://cloud.ncl.edu.tw/chna/ \n",
"13351 http://www.myopenarchive.org/ \n",
"\n",
" head_url head_final_code \\\n",
"0 https://www.sidc.be/silso/home 200.0 \n",
"2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n",
"6 https://ipt.taibif.tw/ 200.0 \n",
"9 https://arabidopsis.info/ 200.0 \n",
"11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n",
"... ... ... \n",
"13341 https://revistas.ufpr.br/index.php/acta/ 200.0 \n",
"13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n",
"13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n",
"13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n",
"13351 https://www.myopenarchive.org/ 200.0 \n",
"\n",
" head_codes len class \n",
"0 [302] 1 200.0 \n",
"2 [307] 1 200.0 \n",
"6 [301] 1 200.0 \n",
"9 [301] 1 200.0 \n",
"11 [302] 1 400.0 \n",
"... ... ... ... \n",
"13341 [301] 1 200.0 \n",
"13346 [302] 1 400.0 \n",
"13348 [302] 1 400.0 \n",
"13350 [302] 1 200.0 \n",
"13351 [301] 1 200.0 \n",
"\n",
"[3364 rows x 6 columns]"
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains[head_chains.len == 1]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.pac.dfo-mpo.gc.ca/science/oceans/da...</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>200.0</td>\n",
" <td>[307]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>http://ipt.taibif.tw/</td>\n",
" <td>https://ipt.taibif.tw/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>http://www.hgvd.genome.med.kyoto-u.ac.jp/</td>\n",
" <td>https://www.hgvd.genome.med.kyoto-u.ac.jp/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>http://archive.researchdata.leeds.ac.uk/</td>\n",
" <td>https://archive.researchdata.leeds.ac.uk/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>http://biodb.swu.edu.cn/qprimerdb/</td>\n",
" <td>https://biodb.swu.edu.cn/qprimerdb/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13288</th>\n",
" <td>http://dspace.sunyconnect.suny.edu/</td>\n",
" <td>https://dspace.sunyconnect.suny.edu/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13295</th>\n",
" <td>http://scholar.google.com/</td>\n",
" <td>https://scholar.google.com/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13348</th>\n",
" <td>http://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>https://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13350</th>\n",
" <td>http://cloud.ncl.edu.tw/chna/</td>\n",
" <td>https://cloud.ncl.edu.tw/chna/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13351</th>\n",
" <td>http://www.myopenarchive.org/</td>\n",
" <td>https://www.myopenarchive.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1665 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n",
"6 http://ipt.taibif.tw/ \n",
"27 http://www.hgvd.genome.med.kyoto-u.ac.jp/ \n",
"30 http://archive.researchdata.leeds.ac.uk/ \n",
"33 http://biodb.swu.edu.cn/qprimerdb/ \n",
"... ... \n",
"13288 http://dspace.sunyconnect.suny.edu/ \n",
"13295 http://scholar.google.com/ \n",
"13348 http://lss.fnal.gov/ird/fermilab_spires.html \n",
"13350 http://cloud.ncl.edu.tw/chna/ \n",
"13351 http://www.myopenarchive.org/ \n",
"\n",
" head_url head_final_code \\\n",
"2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n",
"6 https://ipt.taibif.tw/ 200.0 \n",
"27 https://www.hgvd.genome.med.kyoto-u.ac.jp/ 200.0 \n",
"30 https://archive.researchdata.leeds.ac.uk/ 200.0 \n",
"33 https://biodb.swu.edu.cn/qprimerdb/ 200.0 \n",
"... ... ... \n",
"13288 https://dspace.sunyconnect.suny.edu/ 200.0 \n",
"13295 https://scholar.google.com/ 200.0 \n",
"13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n",
"13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n",
"13351 https://www.myopenarchive.org/ 200.0 \n",
"\n",
" head_codes len class \n",
"2 [307] 1 200.0 \n",
"6 [301] 1 200.0 \n",
"27 [301] 1 200.0 \n",
"30 [302] 1 200.0 \n",
"33 [301] 1 200.0 \n",
"... ... ... ... \n",
"13288 [301] 1 200.0 \n",
"13295 [302] 1 200.0 \n",
"13348 [302] 1 400.0 \n",
"13350 [302] 1 200.0 \n",
"13351 [301] 1 200.0 \n",
"\n",
"[1665 rows x 6 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') == head_chains.head_url)]"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>https://www.biosino.org/node</td>\n",
" <td>https://www.biosino.org/node/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>314</th>\n",
" <td>http://metacat.tfri.gov.tw/tfri</td>\n",
" <td>http://metacat.tfri.gov.tw/tfri/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>463</th>\n",
" <td>https://nda.nih.gov/abcd</td>\n",
" <td>https://nda.nih.gov/abcd/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>551</th>\n",
" <td>https://www1.usgs.gov/obis-usa/ipt</td>\n",
" <td>https://www1.usgs.gov/obis-usa/ipt/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>559</th>\n",
" <td>https://www.arm.gov/data</td>\n",
" <td>https://www.arm.gov/data/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12642</th>\n",
" <td>http://repositorio.utec.edu.sv:8080/jspui</td>\n",
" <td>http://repositorio.utec.edu.sv:8080/jspui/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12706</th>\n",
" <td>http://adudspace.adu.edu.tr:8080/jspui</td>\n",
" <td>http://adudspace.adu.edu.tr:8080/jspui/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12874</th>\n",
" <td>http://160.97.80.9:8080/jspui</td>\n",
" <td>http://160.97.80.9:8080/jspui/</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13030</th>\n",
" <td>http://library.pdpu.ac.in:8080/xmlui</td>\n",
" <td>http://library.pdpu.ac.in:8080/xmlui/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13267</th>\n",
" <td>http://cab.unime.it/mus</td>\n",
" <td>http://cab.unime.it/mus/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>134 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"119 https://www.biosino.org/node \n",
"314 http://metacat.tfri.gov.tw/tfri \n",
"463 https://nda.nih.gov/abcd \n",
"551 https://www1.usgs.gov/obis-usa/ipt \n",
"559 https://www.arm.gov/data \n",
"... ... \n",
"12642 http://repositorio.utec.edu.sv:8080/jspui \n",
"12706 http://adudspace.adu.edu.tr:8080/jspui \n",
"12874 http://160.97.80.9:8080/jspui \n",
"13030 http://library.pdpu.ac.in:8080/xmlui \n",
"13267 http://cab.unime.it/mus \n",
"\n",
" head_url head_final_code head_codes \\\n",
"119 https://www.biosino.org/node/ 200.0 [302] \n",
"314 http://metacat.tfri.gov.tw/tfri/ 200.0 [301] \n",
"463 https://nda.nih.gov/abcd/ 200.0 [302] \n",
"551 https://www1.usgs.gov/obis-usa/ipt/ 200.0 [301] \n",
"559 https://www.arm.gov/data/ 200.0 [301] \n",
"... ... ... ... \n",
"12642 http://repositorio.utec.edu.sv:8080/jspui/ 200.0 [302] \n",
"12706 http://adudspace.adu.edu.tr:8080/jspui/ 200.0 [302] \n",
"12874 http://160.97.80.9:8080/jspui/ 404.0 [302] \n",
"13030 http://library.pdpu.ac.in:8080/xmlui/ 200.0 [302] \n",
"13267 http://cab.unime.it/mus/ 200.0 [302] \n",
"\n",
" len class \n",
"119 1 200.0 \n",
"314 1 200.0 \n",
"463 1 200.0 \n",
"551 1 200.0 \n",
"559 1 200.0 \n",
"... ... ... \n",
"12642 1 200.0 \n",
"12706 1 200.0 \n",
"12874 1 400.0 \n",
"13030 1 200.0 \n",
"13267 1 200.0 \n",
"\n",
"[134 rows x 6 columns]"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains[(head_chains.len == 1) & (head_chains.url + '/' == head_chains.head_url)]"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>http://arabidopsis.info</td>\n",
" <td>https://arabidopsis.info/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>http://pcddb.cryst.bbk.ac.uk</td>\n",
" <td>https://pcddb.cryst.bbk.ac.uk/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>http://www.marinedataarchive.eu</td>\n",
" <td>https://www.marinedataarchive.eu/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>https://www.biosino.org/node</td>\n",
" <td>https://www.biosino.org/node/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>http://www.hydroshare.org</td>\n",
" <td>https://www.hydroshare.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13032</th>\n",
" <td>http://repositorio.ipsantarem.pt</td>\n",
" <td>https://repositorio.ipsantarem.pt/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13077</th>\n",
" <td>http://bibliotecadigital.ipb.pt</td>\n",
" <td>https://bibliotecadigital.ipb.pt/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13091</th>\n",
" <td>http://eprints.nmlindia.org</td>\n",
" <td>https://eprints.nmlindia.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13124</th>\n",
" <td>http://www.ju.edu.et</td>\n",
" <td>https://www.ju.edu.et/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13137</th>\n",
" <td>http://scholarship.rollins.edu</td>\n",
" <td>https://scholarship.rollins.edu/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>487 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url head_url \\\n",
"9 http://arabidopsis.info https://arabidopsis.info/ \n",
"46 http://pcddb.cryst.bbk.ac.uk https://pcddb.cryst.bbk.ac.uk/ \n",
"113 http://www.marinedataarchive.eu https://www.marinedataarchive.eu/ \n",
"119 https://www.biosino.org/node https://www.biosino.org/node/ \n",
"200 http://www.hydroshare.org https://www.hydroshare.org/ \n",
"... ... ... \n",
"13032 http://repositorio.ipsantarem.pt https://repositorio.ipsantarem.pt/ \n",
"13077 http://bibliotecadigital.ipb.pt https://bibliotecadigital.ipb.pt/ \n",
"13091 http://eprints.nmlindia.org https://eprints.nmlindia.org/ \n",
"13124 http://www.ju.edu.et https://www.ju.edu.et/ \n",
"13137 http://scholarship.rollins.edu https://scholarship.rollins.edu/ \n",
"\n",
" head_final_code head_codes len class \n",
"9 200.0 [301] 1 200.0 \n",
"46 200.0 [302] 1 200.0 \n",
"113 200.0 [301] 1 200.0 \n",
"119 200.0 [302] 1 200.0 \n",
"200 200.0 [301] 1 200.0 \n",
"... ... ... ... ... \n",
"13032 200.0 [302] 1 200.0 \n",
"13077 200.0 [302] 1 200.0 \n",
"13091 200.0 [301] 1 200.0 \n",
"13124 200.0 [301] 1 200.0 \n",
"13137 200.0 [301] 1 200.0 \n",
"\n",
"[487 rows x 6 columns]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains[(head_chains.len == 1) & (head_chains.url.str.replace('http://', 'https://') + '/' == head_chains.head_url)]"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>http://sidc.be/silso/home</td>\n",
" <td>https://www.sidc.be/silso/home</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.pac.dfo-mpo.gc.ca/science/oceans/da...</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>200.0</td>\n",
" <td>[307]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>http://ipt.taibif.tw/</td>\n",
" <td>https://ipt.taibif.tw/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>http://arabidopsis.info</td>\n",
" <td>https://arabidopsis.info/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>http://epi.minsal.cl/bases-de-datos/</td>\n",
" <td>403.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13341</th>\n",
" <td>http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/</td>\n",
" <td>https://revistas.ufpr.br/index.php/acta/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13346</th>\n",
" <td>http://www4.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>https://sistemas.furg.br/ojs/biblos/ojs/index.php</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13348</th>\n",
" <td>http://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>https://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13350</th>\n",
" <td>http://cloud.ncl.edu.tw/chna/</td>\n",
" <td>https://cloud.ncl.edu.tw/chna/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13351</th>\n",
" <td>http://www.myopenarchive.org/</td>\n",
" <td>https://www.myopenarchive.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>3376 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 http://sidc.be/silso/home \n",
"2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n",
"6 http://ipt.taibif.tw/ \n",
"9 http://arabidopsis.info \n",
"11 http://epi.minsal.cl/bases-de-datos/ \n",
"... ... \n",
"13341 http://calvados.c3sl.ufpr.br/ojs2/index.php/acta/ \n",
"13346 http://www4.furg.br/ojs/biblos/ojs/index.php \n",
"13348 http://lss.fnal.gov/ird/fermilab_spires.html \n",
"13350 http://cloud.ncl.edu.tw/chna/ \n",
"13351 http://www.myopenarchive.org/ \n",
"\n",
" get_url get_final_code \\\n",
"0 https://www.sidc.be/silso/home 200.0 \n",
"2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n",
"6 https://ipt.taibif.tw/ 200.0 \n",
"9 https://arabidopsis.info/ 200.0 \n",
"11 http://epi.minsal.cl/bases-de-datos/ 403.0 \n",
"... ... ... \n",
"13341 https://revistas.ufpr.br/index.php/acta/ 200.0 \n",
"13346 https://sistemas.furg.br/ojs/biblos/ojs/index.php 404.0 \n",
"13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n",
"13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n",
"13351 https://www.myopenarchive.org/ 200.0 \n",
"\n",
" get_codes len class \n",
"0 [302] 1 200.0 \n",
"2 [307] 1 200.0 \n",
"6 [301] 1 200.0 \n",
"9 [301] 1 200.0 \n",
"11 [302] 1 400.0 \n",
"... ... ... ... \n",
"13341 [301] 1 200.0 \n",
"13346 [302] 1 400.0 \n",
"13348 [302] 1 400.0 \n",
"13350 [302] 1 200.0 \n",
"13351 [301] 1 200.0 \n",
"\n",
"[3376 rows x 6 columns]"
]
},
"execution_count": 54,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains[get_chains.len == 1]"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.pac.dfo-mpo.gc.ca/science/oceans/da...</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>200.0</td>\n",
" <td>[307]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>http://ipt.taibif.tw/</td>\n",
" <td>https://ipt.taibif.tw/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>27</th>\n",
" <td>http://www.hgvd.genome.med.kyoto-u.ac.jp/</td>\n",
" <td>https://www.hgvd.genome.med.kyoto-u.ac.jp/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30</th>\n",
" <td>http://archive.researchdata.leeds.ac.uk/</td>\n",
" <td>https://archive.researchdata.leeds.ac.uk/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>33</th>\n",
" <td>http://biodb.swu.edu.cn/qprimerdb/</td>\n",
" <td>https://biodb.swu.edu.cn/qprimerdb/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13288</th>\n",
" <td>http://dspace.sunyconnect.suny.edu/</td>\n",
" <td>https://dspace.sunyconnect.suny.edu/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13295</th>\n",
" <td>http://scholar.google.com/</td>\n",
" <td>https://scholar.google.com/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13348</th>\n",
" <td>http://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>https://lss.fnal.gov/ird/fermilab_spires.html</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13350</th>\n",
" <td>http://cloud.ncl.edu.tw/chna/</td>\n",
" <td>https://cloud.ncl.edu.tw/chna/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13351</th>\n",
" <td>http://www.myopenarchive.org/</td>\n",
" <td>https://www.myopenarchive.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1668 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n",
"6 http://ipt.taibif.tw/ \n",
"27 http://www.hgvd.genome.med.kyoto-u.ac.jp/ \n",
"30 http://archive.researchdata.leeds.ac.uk/ \n",
"33 http://biodb.swu.edu.cn/qprimerdb/ \n",
"... ... \n",
"13288 http://dspace.sunyconnect.suny.edu/ \n",
"13295 http://scholar.google.com/ \n",
"13348 http://lss.fnal.gov/ird/fermilab_spires.html \n",
"13350 http://cloud.ncl.edu.tw/chna/ \n",
"13351 http://www.myopenarchive.org/ \n",
"\n",
" get_url get_final_code \\\n",
"2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n",
"6 https://ipt.taibif.tw/ 200.0 \n",
"27 https://www.hgvd.genome.med.kyoto-u.ac.jp/ 200.0 \n",
"30 https://archive.researchdata.leeds.ac.uk/ 200.0 \n",
"33 https://biodb.swu.edu.cn/qprimerdb/ 200.0 \n",
"... ... ... \n",
"13288 https://dspace.sunyconnect.suny.edu/ 200.0 \n",
"13295 https://scholar.google.com/ 200.0 \n",
"13348 https://lss.fnal.gov/ird/fermilab_spires.html 404.0 \n",
"13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n",
"13351 https://www.myopenarchive.org/ 200.0 \n",
"\n",
" get_codes len class \n",
"2 [307] 1 200.0 \n",
"6 [301] 1 200.0 \n",
"27 [301] 1 200.0 \n",
"30 [302] 1 200.0 \n",
"33 [301] 1 200.0 \n",
"... ... ... ... \n",
"13288 [301] 1 200.0 \n",
"13295 [302] 1 200.0 \n",
"13348 [302] 1 400.0 \n",
"13350 [302] 1 200.0 \n",
"13351 [301] 1 200.0 \n",
"\n",
"[1668 rows x 6 columns]"
]
},
"execution_count": 56,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') == get_chains.get_url)]"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>https://www.biosino.org/node</td>\n",
" <td>https://www.biosino.org/node/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>314</th>\n",
" <td>http://metacat.tfri.gov.tw/tfri</td>\n",
" <td>http://metacat.tfri.gov.tw/tfri/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>463</th>\n",
" <td>https://nda.nih.gov/abcd</td>\n",
" <td>https://nda.nih.gov/abcd/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>551</th>\n",
" <td>https://www1.usgs.gov/obis-usa/ipt</td>\n",
" <td>https://www1.usgs.gov/obis-usa/ipt/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>559</th>\n",
" <td>https://www.arm.gov/data</td>\n",
" <td>https://www.arm.gov/data/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12642</th>\n",
" <td>http://repositorio.utec.edu.sv:8080/jspui</td>\n",
" <td>http://repositorio.utec.edu.sv:8080/jspui/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12706</th>\n",
" <td>http://adudspace.adu.edu.tr:8080/jspui</td>\n",
" <td>http://adudspace.adu.edu.tr:8080/jspui/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12874</th>\n",
" <td>http://160.97.80.9:8080/jspui</td>\n",
" <td>http://160.97.80.9:8080/jspui/</td>\n",
" <td>404.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13030</th>\n",
" <td>http://library.pdpu.ac.in:8080/xmlui</td>\n",
" <td>http://library.pdpu.ac.in:8080/xmlui/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13267</th>\n",
" <td>http://cab.unime.it/mus</td>\n",
" <td>http://cab.unime.it/mus/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>134 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"119 https://www.biosino.org/node \n",
"314 http://metacat.tfri.gov.tw/tfri \n",
"463 https://nda.nih.gov/abcd \n",
"551 https://www1.usgs.gov/obis-usa/ipt \n",
"559 https://www.arm.gov/data \n",
"... ... \n",
"12642 http://repositorio.utec.edu.sv:8080/jspui \n",
"12706 http://adudspace.adu.edu.tr:8080/jspui \n",
"12874 http://160.97.80.9:8080/jspui \n",
"13030 http://library.pdpu.ac.in:8080/xmlui \n",
"13267 http://cab.unime.it/mus \n",
"\n",
" get_url get_final_code get_codes \\\n",
"119 https://www.biosino.org/node/ 200.0 [302] \n",
"314 http://metacat.tfri.gov.tw/tfri/ 200.0 [301] \n",
"463 https://nda.nih.gov/abcd/ 200.0 [302] \n",
"551 https://www1.usgs.gov/obis-usa/ipt/ 200.0 [301] \n",
"559 https://www.arm.gov/data/ 200.0 [301] \n",
"... ... ... ... \n",
"12642 http://repositorio.utec.edu.sv:8080/jspui/ 200.0 [302] \n",
"12706 http://adudspace.adu.edu.tr:8080/jspui/ 200.0 [302] \n",
"12874 http://160.97.80.9:8080/jspui/ 404.0 [302] \n",
"13030 http://library.pdpu.ac.in:8080/xmlui/ 200.0 [302] \n",
"13267 http://cab.unime.it/mus/ 200.0 [302] \n",
"\n",
" len class \n",
"119 1 200.0 \n",
"314 1 200.0 \n",
"463 1 200.0 \n",
"551 1 200.0 \n",
"559 1 200.0 \n",
"... ... ... \n",
"12642 1 200.0 \n",
"12706 1 200.0 \n",
"12874 1 400.0 \n",
"13030 1 200.0 \n",
"13267 1 200.0 \n",
"\n",
"[134 rows x 6 columns]"
]
},
"execution_count": 57,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains[(get_chains.len == 1) & (get_chains.url + '/' == get_chains.get_url)]"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>http://arabidopsis.info</td>\n",
" <td>https://arabidopsis.info/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>http://pcddb.cryst.bbk.ac.uk</td>\n",
" <td>https://pcddb.cryst.bbk.ac.uk/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>http://www.marinedataarchive.eu</td>\n",
" <td>https://www.marinedataarchive.eu/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>119</th>\n",
" <td>https://www.biosino.org/node</td>\n",
" <td>https://www.biosino.org/node/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>200</th>\n",
" <td>http://www.hydroshare.org</td>\n",
" <td>https://www.hydroshare.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13032</th>\n",
" <td>http://repositorio.ipsantarem.pt</td>\n",
" <td>https://repositorio.ipsantarem.pt/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13077</th>\n",
" <td>http://bibliotecadigital.ipb.pt</td>\n",
" <td>https://bibliotecadigital.ipb.pt/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13091</th>\n",
" <td>http://eprints.nmlindia.org</td>\n",
" <td>https://eprints.nmlindia.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13124</th>\n",
" <td>http://www.ju.edu.et</td>\n",
" <td>https://www.ju.edu.et/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13137</th>\n",
" <td>http://scholarship.rollins.edu</td>\n",
" <td>https://scholarship.rollins.edu/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>490 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url get_url \\\n",
"9 http://arabidopsis.info https://arabidopsis.info/ \n",
"46 http://pcddb.cryst.bbk.ac.uk https://pcddb.cryst.bbk.ac.uk/ \n",
"113 http://www.marinedataarchive.eu https://www.marinedataarchive.eu/ \n",
"119 https://www.biosino.org/node https://www.biosino.org/node/ \n",
"200 http://www.hydroshare.org https://www.hydroshare.org/ \n",
"... ... ... \n",
"13032 http://repositorio.ipsantarem.pt https://repositorio.ipsantarem.pt/ \n",
"13077 http://bibliotecadigital.ipb.pt https://bibliotecadigital.ipb.pt/ \n",
"13091 http://eprints.nmlindia.org https://eprints.nmlindia.org/ \n",
"13124 http://www.ju.edu.et https://www.ju.edu.et/ \n",
"13137 http://scholarship.rollins.edu https://scholarship.rollins.edu/ \n",
"\n",
" get_final_code get_codes len class \n",
"9 200.0 [301] 1 200.0 \n",
"46 200.0 [302] 1 200.0 \n",
"113 200.0 [301] 1 200.0 \n",
"119 200.0 [302] 1 200.0 \n",
"200 200.0 [301] 1 200.0 \n",
"... ... ... ... ... \n",
"13032 200.0 [302] 1 200.0 \n",
"13077 200.0 [302] 1 200.0 \n",
"13091 200.0 [301] 1 200.0 \n",
"13124 200.0 [301] 1 200.0 \n",
"13137 200.0 [301] 1 200.0 \n",
"\n",
"[490 rows x 6 columns]"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains[(get_chains.len == 1) & (get_chains.url.str.replace('http://', 'https://') + '/' == get_chains.get_url)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Redirect mismatch"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_url</th>\n",
" <th>head_chain</th>\n",
" <th>head_codes</th>\n",
" <th>head_elapsed</th>\n",
" <th>head_error</th>\n",
" <th>get_final_code</th>\n",
" <th>get_url</th>\n",
" <th>get_chain</th>\n",
" <th>get_codes</th>\n",
" <th>get_elapsed</th>\n",
" <th>get_error</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>127</th>\n",
" <td>https://models.physiomeproject.org</td>\n",
" <td>200.0</td>\n",
" <td>https://models.physiomeproject.org/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.658525</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://models.physiomeproject.org/welcome</td>\n",
" <td>['https://models.physiomeproject.org/']</td>\n",
" <td>[302]</td>\n",
" <td>0.575681</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>371</th>\n",
" <td>https://rdp.utas.edu.au</td>\n",
" <td>200.0</td>\n",
" <td>https://login.microsoftonline.com/15eb5d55-199...</td>\n",
" <td>['https://rdp.utas.edu.au/']</td>\n",
" <td>[302]</td>\n",
" <td>0.297592</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://login.microsoftonline.com/15eb5d55-199...</td>\n",
" <td>['https://rdp.utas.edu.au/']</td>\n",
" <td>[302]</td>\n",
" <td>0.285739</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>479</th>\n",
" <td>https://sciflection.com</td>\n",
" <td>200.0</td>\n",
" <td>https://sciflection.com/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.176450</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://sciflection.com/main</td>\n",
" <td>['https://sciflection.com/', 'http://sciflecti...</td>\n",
" <td>[302, 301]</td>\n",
" <td>0.391234</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>684</th>\n",
" <td>http://www.ntrr-nti.org/</td>\n",
" <td>405.0</td>\n",
" <td>http://www.ntrr-nti.org/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>1.133437</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://www.nattrauma.org/research/ntrr/</td>\n",
" <td>['http://www.ntrr-nti.org/']</td>\n",
" <td>[301]</td>\n",
" <td>2.539869</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>697</th>\n",
" <td>http://www.jcm.riken.go.jp/</td>\n",
" <td>200.0</td>\n",
" <td>https://jcm.brc.riken.jp/en/</td>\n",
" <td>['http://www.jcm.riken.go.jp/', 'https://www.j...</td>\n",
" <td>[301, 301, 301, 301]</td>\n",
" <td>0.506354</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>https://jcm.brc.riken.jp/en/?doing_wp_cron=165...</td>\n",
" <td>['http://www.jcm.riken.go.jp/', 'https://www.j...</td>\n",
" <td>[301, 301, 301, 301, 302]</td>\n",
" <td>0.630979</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13132</th>\n",
" <td>http://hik.diva-portal.org/smash/search.jsf</td>\n",
" <td>200.0</td>\n",
" <td>http://lnu.diva-portal.org/smash/search.jsf?ds...</td>\n",
" <td>['http://hik.diva-portal.org/smash/search.jsf'...</td>\n",
" <td>[301, 302]</td>\n",
" <td>0.287171</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://lnu.diva-portal.org/smash/search.jsf?ds...</td>\n",
" <td>['http://hik.diva-portal.org/smash/search.jsf'...</td>\n",
" <td>[301, 302]</td>\n",
" <td>0.291231</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13133</th>\n",
" <td>http://hgo.diva-portal.org/smash/search.jsf</td>\n",
" <td>200.0</td>\n",
" <td>http://uu.diva-portal.org/smash/search.jsf?dsw...</td>\n",
" <td>['http://hgo.diva-portal.org/smash/search.jsf'...</td>\n",
" <td>[301, 302]</td>\n",
" <td>0.292740</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://uu.diva-portal.org/smash/search.jsf?dsw...</td>\n",
" <td>['http://hgo.diva-portal.org/smash/search.jsf'...</td>\n",
" <td>[301, 302]</td>\n",
" <td>0.289726</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13266</th>\n",
" <td>http://www.diva-portal.org/gih/</td>\n",
" <td>200.0</td>\n",
" <td>http://gih.diva-portal.org/smash/search.jsf?ds...</td>\n",
" <td>['http://www.diva-portal.org/gih/', 'http://gi...</td>\n",
" <td>[301, 302]</td>\n",
" <td>0.301519</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://gih.diva-portal.org/smash/search.jsf?ds...</td>\n",
" <td>['http://www.diva-portal.org/gih/', 'http://gi...</td>\n",
" <td>[301, 302]</td>\n",
" <td>2.143105</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13283</th>\n",
" <td>http://www.compscipreprints.com/</td>\n",
" <td>200.0</td>\n",
" <td>http://survey-smiles.com</td>\n",
" <td>['http://www.compscipreprints.com/']</td>\n",
" <td>[302]</td>\n",
" <td>0.257651</td>\n",
" <td>NaN</td>\n",
" <td>200.0</td>\n",
" <td>http://www.compscipreprints.com/</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0.093911</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13320</th>\n",
" <td>http://trinity.cdmhost.com/index.php</td>\n",
" <td>404.0</td>\n",
" <td>https://www.oclc.org/url/?404;http://trinity.c...</td>\n",
" <td>['http://trinity.cdmhost.com/index.php']</td>\n",
" <td>[302]</td>\n",
" <td>0.589119</td>\n",
" <td>NaN</td>\n",
" <td>404.0</td>\n",
" <td>https://www.oclc.org/url/notfound</td>\n",
" <td>['http://trinity.cdmhost.com/index.php', 'http...</td>\n",
" <td>[302, 302]</td>\n",
" <td>0.139563</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>146 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" url head_final_code \\\n",
"127 https://models.physiomeproject.org 200.0 \n",
"371 https://rdp.utas.edu.au 200.0 \n",
"479 https://sciflection.com 200.0 \n",
"684 http://www.ntrr-nti.org/ 405.0 \n",
"697 http://www.jcm.riken.go.jp/ 200.0 \n",
"... ... ... \n",
"13132 http://hik.diva-portal.org/smash/search.jsf 200.0 \n",
"13133 http://hgo.diva-portal.org/smash/search.jsf 200.0 \n",
"13266 http://www.diva-portal.org/gih/ 200.0 \n",
"13283 http://www.compscipreprints.com/ 200.0 \n",
"13320 http://trinity.cdmhost.com/index.php 404.0 \n",
"\n",
" head_url \\\n",
"127 https://models.physiomeproject.org/ \n",
"371 https://login.microsoftonline.com/15eb5d55-199... \n",
"479 https://sciflection.com/ \n",
"684 http://www.ntrr-nti.org/ \n",
"697 https://jcm.brc.riken.jp/en/ \n",
"... ... \n",
"13132 http://lnu.diva-portal.org/smash/search.jsf?ds... \n",
"13133 http://uu.diva-portal.org/smash/search.jsf?dsw... \n",
"13266 http://gih.diva-portal.org/smash/search.jsf?ds... \n",
"13283 http://survey-smiles.com \n",
"13320 https://www.oclc.org/url/?404;http://trinity.c... \n",
"\n",
" head_chain \\\n",
"127 [] \n",
"371 ['https://rdp.utas.edu.au/'] \n",
"479 [] \n",
"684 [] \n",
"697 ['http://www.jcm.riken.go.jp/', 'https://www.j... \n",
"... ... \n",
"13132 ['http://hik.diva-portal.org/smash/search.jsf'... \n",
"13133 ['http://hgo.diva-portal.org/smash/search.jsf'... \n",
"13266 ['http://www.diva-portal.org/gih/', 'http://gi... \n",
"13283 ['http://www.compscipreprints.com/'] \n",
"13320 ['http://trinity.cdmhost.com/index.php'] \n",
"\n",
" head_codes head_elapsed head_error get_final_code \\\n",
"127 [] 0.658525 NaN 200.0 \n",
"371 [302] 0.297592 NaN 200.0 \n",
"479 [] 0.176450 NaN 200.0 \n",
"684 [] 1.133437 NaN 200.0 \n",
"697 [301, 301, 301, 301] 0.506354 NaN 200.0 \n",
"... ... ... ... ... \n",
"13132 [301, 302] 0.287171 NaN 200.0 \n",
"13133 [301, 302] 0.292740 NaN 200.0 \n",
"13266 [301, 302] 0.301519 NaN 200.0 \n",
"13283 [302] 0.257651 NaN 200.0 \n",
"13320 [302] 0.589119 NaN 404.0 \n",
"\n",
" get_url \\\n",
"127 https://models.physiomeproject.org/welcome \n",
"371 https://login.microsoftonline.com/15eb5d55-199... \n",
"479 https://sciflection.com/main \n",
"684 https://www.nattrauma.org/research/ntrr/ \n",
"697 https://jcm.brc.riken.jp/en/?doing_wp_cron=165... \n",
"... ... \n",
"13132 http://lnu.diva-portal.org/smash/search.jsf?ds... \n",
"13133 http://uu.diva-portal.org/smash/search.jsf?dsw... \n",
"13266 http://gih.diva-portal.org/smash/search.jsf?ds... \n",
"13283 http://www.compscipreprints.com/ \n",
"13320 https://www.oclc.org/url/notfound \n",
"\n",
" get_chain \\\n",
"127 ['https://models.physiomeproject.org/'] \n",
"371 ['https://rdp.utas.edu.au/'] \n",
"479 ['https://sciflection.com/', 'http://sciflecti... \n",
"684 ['http://www.ntrr-nti.org/'] \n",
"697 ['http://www.jcm.riken.go.jp/', 'https://www.j... \n",
"... ... \n",
"13132 ['http://hik.diva-portal.org/smash/search.jsf'... \n",
"13133 ['http://hgo.diva-portal.org/smash/search.jsf'... \n",
"13266 ['http://www.diva-portal.org/gih/', 'http://gi... \n",
"13283 [] \n",
"13320 ['http://trinity.cdmhost.com/index.php', 'http... \n",
"\n",
" get_codes get_elapsed get_error \n",
"127 [302] 0.575681 NaN \n",
"371 [302] 0.285739 NaN \n",
"479 [302, 301] 0.391234 NaN \n",
"684 [301] 2.539869 NaN \n",
"697 [301, 301, 301, 301, 302] 0.630979 NaN \n",
"... ... ... ... \n",
"13132 [301, 302] 0.291231 NaN \n",
"13133 [301, 302] 0.289726 NaN \n",
"13266 [301, 302] 2.143105 NaN \n",
"13283 [] 0.093911 NaN \n",
"13320 [302, 302] 0.139563 NaN \n",
"\n",
"[146 rows x 13 columns]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[(df.head_url != df.get_url) & (df.head_url.notna()) & (df.get_url.notna())]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### URL conflation"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>http://sidc.be/silso/home</td>\n",
" <td>https://www.sidc.be/silso/home</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>https://bmrb.io/</td>\n",
" <td>https://bmrb.io/</td>\n",
" <td>200.0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>http://www.pac.dfo-mpo.gc.ca/science/oceans/da...</td>\n",
" <td>https://www.pac.dfo-mpo.gc.ca/science/oceans/d...</td>\n",
" <td>200.0</td>\n",
" <td>[307]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>https://www.cpc.ncep.noaa.gov/</td>\n",
" <td>https://www.cpc.ncep.noaa.gov/</td>\n",
" <td>200.0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...</td>\n",
" <td>http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...</td>\n",
" <td>404.0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>400.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13350</th>\n",
" <td>http://cloud.ncl.edu.tw/chna/</td>\n",
" <td>https://cloud.ncl.edu.tw/chna/</td>\n",
" <td>200.0</td>\n",
" <td>[302]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13351</th>\n",
" <td>http://www.myopenarchive.org/</td>\n",
" <td>https://www.myopenarchive.org/</td>\n",
" <td>200.0</td>\n",
" <td>[301]</td>\n",
" <td>1</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13352</th>\n",
" <td>http://radar.brookes.ac.uk/radar/access/home.do</td>\n",
" <td>https://radar.brookes.ac.uk/radar/home.do</td>\n",
" <td>200.0</td>\n",
" <td>[302, 302, 302]</td>\n",
" <td>3</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13353</th>\n",
" <td>http://www.ub.uni-siegen.de/opus</td>\n",
" <td>https://www.ub.uni-siegen.de/opus/</td>\n",
" <td>200.0</td>\n",
" <td>[301, 301]</td>\n",
" <td>2</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13355</th>\n",
" <td>https://www.utas.edu.au/library</td>\n",
" <td>https://www.utas.edu.au/library</td>\n",
" <td>200.0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>200.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10926 rows × 6 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"0 http://sidc.be/silso/home \n",
"1 https://bmrb.io/ \n",
"2 http://www.pac.dfo-mpo.gc.ca/science/oceans/da... \n",
"3 https://www.cpc.ncep.noaa.gov/ \n",
"4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... \n",
"... ... \n",
"13350 http://cloud.ncl.edu.tw/chna/ \n",
"13351 http://www.myopenarchive.org/ \n",
"13352 http://radar.brookes.ac.uk/radar/access/home.do \n",
"13353 http://www.ub.uni-siegen.de/opus \n",
"13355 https://www.utas.edu.au/library \n",
"\n",
" head_url head_final_code \\\n",
"0 https://www.sidc.be/silso/home 200.0 \n",
"1 https://bmrb.io/ 200.0 \n",
"2 https://www.pac.dfo-mpo.gc.ca/science/oceans/d... 200.0 \n",
"3 https://www.cpc.ncep.noaa.gov/ 200.0 \n",
"4 http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 404.0 \n",
"... ... ... \n",
"13350 https://cloud.ncl.edu.tw/chna/ 200.0 \n",
"13351 https://www.myopenarchive.org/ 200.0 \n",
"13352 https://radar.brookes.ac.uk/radar/home.do 200.0 \n",
"13353 https://www.ub.uni-siegen.de/opus/ 200.0 \n",
"13355 https://www.utas.edu.au/library 200.0 \n",
"\n",
" head_codes len class \n",
"0 [302] 1 200.0 \n",
"1 [] 0 200.0 \n",
"2 [307] 1 200.0 \n",
"3 [] 0 200.0 \n",
"4 [] 0 400.0 \n",
"... ... ... ... \n",
"13350 [302] 1 200.0 \n",
"13351 [301] 1 200.0 \n",
"13352 [302, 302, 302] 3 200.0 \n",
"13353 [301, 301] 2 200.0 \n",
"13355 [] 0 200.0 \n",
"\n",
"[10926 rows x 6 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
   ],
   "source": [
    "# Show head_chains: one row per url with its head_url, head_final_code and head_codes.\n",
    "# NOTE(review): `len` looks like the number of entries in head_codes and `class` like the\n",
    "# hundreds bucket of head_final_code (e.g. 404 -> 400) - confirm against the cell that builds them.\n",
    "head_chains"
   ]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>10926</td>\n",
" <td>10926</td>\n",
" <td>10926.000000</td>\n",
" <td>10926</td>\n",
" <td>10926.000000</td>\n",
" <td>10926.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>10926</td>\n",
" <td>9331</td>\n",
" <td>NaN</td>\n",
" <td>59</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>http://sidc.be/silso/home</td>\n",
" <td>https://www.unit.no/ugyldig-lenke-til-dokument...</td>\n",
" <td>NaN</td>\n",
" <td>[]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>16</td>\n",
" <td>NaN</td>\n",
" <td>6596</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>218.406736</td>\n",
" <td>NaN</td>\n",
" <td>0.515559</td>\n",
" <td>218.094454</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>61.035428</td>\n",
" <td>NaN</td>\n",
" <td>0.749850</td>\n",
" <td>60.063069</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>0.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>200.000000</td>\n",
" <td>NaN</td>\n",
" <td>1.000000</td>\n",
" <td>200.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>504.000000</td>\n",
" <td>NaN</td>\n",
" <td>5.000000</td>\n",
" <td>500.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"count 10926 \n",
"unique 10926 \n",
"top http://sidc.be/silso/home \n",
"freq 1 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" head_url head_final_code \\\n",
"count 10926 10926.000000 \n",
"unique 9331 NaN \n",
"top https://www.unit.no/ugyldig-lenke-til-dokument... NaN \n",
"freq 16 NaN \n",
"mean NaN 218.406736 \n",
"std NaN 61.035428 \n",
"min NaN 200.000000 \n",
"25% NaN 200.000000 \n",
"50% NaN 200.000000 \n",
"75% NaN 200.000000 \n",
"max NaN 504.000000 \n",
"\n",
" head_codes len class \n",
"count 10926 10926.000000 10926.000000 \n",
"unique 59 NaN NaN \n",
"top [] NaN NaN \n",
"freq 6596 NaN NaN \n",
"mean NaN 0.515559 218.094454 \n",
"std NaN 0.749850 60.063069 \n",
"min NaN 0.000000 200.000000 \n",
"25% NaN 0.000000 200.000000 \n",
"50% NaN 0.000000 200.000000 \n",
"75% NaN 1.000000 200.000000 \n",
"max NaN 5.000000 500.000000 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
   ],
   "source": [
    "# Summary statistics for every column; include='all' also reports count/unique/top/freq\n",
    "# for the non-numeric columns (url, head_url, head_codes).\n",
    "head_chains.describe(include='all')"
   ]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>head_url</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>https://www.unit.no/ugyldig-lenke-til-dokument-i-vitenarkiv</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://www.nlm.nih.gov/toxnet/index.html</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://survey-smiles.com</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://nda.nih.gov/</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://digitalrepository.unm.edu/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://ir.vanderbilt.edu/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://ddd.uab.cat/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://intellectum.unisabana.edu.co/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://opus.bibliothek.uni-augsburg.de/opus4/home</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://pure.iiasa.ac.at/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://eprints.nottingham.ac.uk/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research.usc.edu.au/esploro/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dataverse.harvard.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://epub.wupperinst.org/home</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://pure.mpg.de/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dlib.york.ac.uk/yodl/app/home/index</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://archiv.ub.uni-marburg.de/ubfind/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://eprints.unife.it/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dataspace.princeton.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://eprints.lib.hokudai.ac.jp/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://pub.uni-bielefeld.de/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://aquadocs.org/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dataverse.tdl.org/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://uwspace.uwaterloo.ca/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research.utwente.nl/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://datadryad.org/stash</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://er.knutd.edu.ua/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://esango.cput.ac.za/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repositorio.fucsalud.edu.co/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repository.up.ac.za/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repository.usta.edu.co/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repositori.upf.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://bibliotecadigital.exactas.uba.ar/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repo.odmu.edu.ua/xmlui/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://bcnroc.ajuntament.barcelona.cat/jspui/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://data.gov.uk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://mru.arcabc.ca/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://munin.uit.no/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://red.uao.edu.co/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research-repository.st-andrews.ac.uk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research-repository.uwa.edu.au/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research.cbs.dk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://webext.pasteur.fr/genolist/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://prism.ucalgary.ca/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://air.unimi.it/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://rua.ua.es/dspace/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dial.uclouvain.be/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://sc.lib.miamioh.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dspace.ankara.edu.tr/xmlui/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://scholar.smu.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url head_final_code \\\n",
"head_url \n",
"https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 \n",
"https://www.nlm.nih.gov/toxnet/index.html 6 6 \n",
"http://survey-smiles.com 6 6 \n",
"https://nda.nih.gov/ 5 5 \n",
"https://digitalrepository.unm.edu/ 4 4 \n",
"https://ir.vanderbilt.edu/ 4 4 \n",
"https://ddd.uab.cat/ 4 4 \n",
"https://intellectum.unisabana.edu.co/ 4 4 \n",
"https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 \n",
"https://pure.iiasa.ac.at/ 4 4 \n",
"http://eprints.nottingham.ac.uk/ 4 4 \n",
"https://research.usc.edu.au/esploro/ 3 3 \n",
"https://dataverse.harvard.edu/ 3 3 \n",
"https://epub.wupperinst.org/home 3 3 \n",
"https://pure.mpg.de/ 3 3 \n",
"https://dlib.york.ac.uk/yodl/app/home/index 3 3 \n",
"https://archiv.ub.uni-marburg.de/ubfind/ 3 3 \n",
"https://eprints.unife.it/ 3 3 \n",
"https://dataspace.princeton.edu/ 3 3 \n",
"https://eprints.lib.hokudai.ac.jp/ 3 3 \n",
"https://pub.uni-bielefeld.de/ 3 3 \n",
"https://aquadocs.org/ 3 3 \n",
"https://dataverse.tdl.org/ 3 3 \n",
"https://uwspace.uwaterloo.ca/ 3 3 \n",
"https://research.utwente.nl/ 3 3 \n",
"https://datadryad.org/stash 3 3 \n",
"https://er.knutd.edu.ua/ 3 3 \n",
"https://esango.cput.ac.za/ 3 3 \n",
"https://repositorio.fucsalud.edu.co/ 3 3 \n",
"https://repository.up.ac.za/ 3 3 \n",
"https://repository.usta.edu.co/ 3 3 \n",
"https://repositori.upf.edu/ 3 3 \n",
"https://bibliotecadigital.exactas.uba.ar/ 3 3 \n",
"https://repo.odmu.edu.ua/xmlui/ 3 3 \n",
"https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 \n",
"https://data.gov.uk/ 3 3 \n",
"https://mru.arcabc.ca/ 3 3 \n",
"https://munin.uit.no/ 3 3 \n",
"https://red.uao.edu.co/ 3 3 \n",
"https://research-repository.st-andrews.ac.uk/ 3 3 \n",
"https://research-repository.uwa.edu.au/ 3 3 \n",
"https://research.cbs.dk/ 3 3 \n",
"https://webext.pasteur.fr/genolist/ 3 3 \n",
"https://prism.ucalgary.ca/ 3 3 \n",
"https://air.unimi.it/ 3 3 \n",
"http://rua.ua.es/dspace/ 3 3 \n",
"https://dial.uclouvain.be/ 3 3 \n",
"https://sc.lib.miamioh.edu/ 3 3 \n",
"https://dspace.ankara.edu.tr/xmlui/ 3 3 \n",
"https://scholar.smu.edu/ 3 3 \n",
"\n",
" head_codes len class \n",
"head_url \n",
"https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 16 \n",
"https://www.nlm.nih.gov/toxnet/index.html 6 6 6 \n",
"http://survey-smiles.com 6 6 6 \n",
"https://nda.nih.gov/ 5 5 5 \n",
"https://digitalrepository.unm.edu/ 4 4 4 \n",
"https://ir.vanderbilt.edu/ 4 4 4 \n",
"https://ddd.uab.cat/ 4 4 4 \n",
"https://intellectum.unisabana.edu.co/ 4 4 4 \n",
"https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 4 \n",
"https://pure.iiasa.ac.at/ 4 4 4 \n",
"http://eprints.nottingham.ac.uk/ 4 4 4 \n",
"https://research.usc.edu.au/esploro/ 3 3 3 \n",
"https://dataverse.harvard.edu/ 3 3 3 \n",
"https://epub.wupperinst.org/home 3 3 3 \n",
"https://pure.mpg.de/ 3 3 3 \n",
"https://dlib.york.ac.uk/yodl/app/home/index 3 3 3 \n",
"https://archiv.ub.uni-marburg.de/ubfind/ 3 3 3 \n",
"https://eprints.unife.it/ 3 3 3 \n",
"https://dataspace.princeton.edu/ 3 3 3 \n",
"https://eprints.lib.hokudai.ac.jp/ 3 3 3 \n",
"https://pub.uni-bielefeld.de/ 3 3 3 \n",
"https://aquadocs.org/ 3 3 3 \n",
"https://dataverse.tdl.org/ 3 3 3 \n",
"https://uwspace.uwaterloo.ca/ 3 3 3 \n",
"https://research.utwente.nl/ 3 3 3 \n",
"https://datadryad.org/stash 3 3 3 \n",
"https://er.knutd.edu.ua/ 3 3 3 \n",
"https://esango.cput.ac.za/ 3 3 3 \n",
"https://repositorio.fucsalud.edu.co/ 3 3 3 \n",
"https://repository.up.ac.za/ 3 3 3 \n",
"https://repository.usta.edu.co/ 3 3 3 \n",
"https://repositori.upf.edu/ 3 3 3 \n",
"https://bibliotecadigital.exactas.uba.ar/ 3 3 3 \n",
"https://repo.odmu.edu.ua/xmlui/ 3 3 3 \n",
"https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 3 \n",
"https://data.gov.uk/ 3 3 3 \n",
"https://mru.arcabc.ca/ 3 3 3 \n",
"https://munin.uit.no/ 3 3 3 \n",
"https://red.uao.edu.co/ 3 3 3 \n",
"https://research-repository.st-andrews.ac.uk/ 3 3 3 \n",
"https://research-repository.uwa.edu.au/ 3 3 3 \n",
"https://research.cbs.dk/ 3 3 3 \n",
"https://webext.pasteur.fr/genolist/ 3 3 3 \n",
"https://prism.ucalgary.ca/ 3 3 3 \n",
"https://air.unimi.it/ 3 3 3 \n",
"http://rua.ua.es/dspace/ 3 3 3 \n",
"https://dial.uclouvain.be/ 3 3 3 \n",
"https://sc.lib.miamioh.edu/ 3 3 3 \n",
"https://dspace.ankara.edu.tr/xmlui/ 3 3 3 \n",
"https://scholar.smu.edu/ 3 3 3 "
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_chains.groupby('head_url').count().sort_values('url', ascending=False).head(50)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>get_url</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>https://www.unit.no/ugyldig-lenke-til-dokument-i-vitenarkiv</th>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" <td>16</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://www.oclc.org/url/notfound</th>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://www.nlm.nih.gov/toxnet/index.html</th>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://nda.nih.gov/</th>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://digitalrepository.unm.edu/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://opus.bibliothek.uni-augsburg.de/opus4/home</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://intellectum.unisabana.edu.co/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://pure.iiasa.ac.at/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://ir.vanderbilt.edu/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://eprints.nottingham.ac.uk/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://ddd.uab.cat/</th>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://mru.arcabc.ca/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research.usc.edu.au/esploro/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://aquadocs.org/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://uwspace.uwaterloo.ca/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://pub.uni-bielefeld.de/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dataverse.tdl.org/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://eprints.lib.hokudai.ac.jp/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://archiv.ub.uni-marburg.de/ubfind/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://eprints.unife.it/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repositorio.fucsalud.edu.co/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research.utwente.nl/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repository.up.ac.za/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://pure.mpg.de/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://epub.wupperinst.org/home</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://archive.researchdata.leeds.ac.uk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repository.usta.edu.co/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dataverse.harvard.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://bcnroc.ajuntament.barcelona.cat/jspui/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://er.knutd.edu.ua/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dataspace.princeton.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repositori.upf.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://bibliotecadigital.exactas.uba.ar/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://prism.ucalgary.ca/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research.cbs.dk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://webext.pasteur.fr/genolist/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://ngdc.cncb.ac.cn/idog/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research-repository.uwa.edu.au/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://research-repository.st-andrews.ac.uk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://repo.odmu.edu.ua/xmlui/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://red.uao.edu.co/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://data.gov.uk/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://munin.uit.no/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://datadryad.org/stash</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://rua.ua.es/dspace/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://dspace.uevora.pt/rdpc/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://sc.lib.miamioh.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://papyrus.bib.umontreal.ca/xmlui/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://scholar.smu.edu/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://dro.deakin.edu.au/</th>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" url get_final_code \\\n",
"get_url \n",
"https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 \n",
"https://www.oclc.org/url/notfound 7 7 \n",
"https://www.nlm.nih.gov/toxnet/index.html 6 6 \n",
"https://nda.nih.gov/ 5 5 \n",
"https://digitalrepository.unm.edu/ 4 4 \n",
"https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 \n",
"https://intellectum.unisabana.edu.co/ 4 4 \n",
"https://pure.iiasa.ac.at/ 4 4 \n",
"https://ir.vanderbilt.edu/ 4 4 \n",
"http://eprints.nottingham.ac.uk/ 4 4 \n",
"https://ddd.uab.cat/ 4 4 \n",
"https://mru.arcabc.ca/ 3 3 \n",
"https://research.usc.edu.au/esploro/ 3 3 \n",
"https://aquadocs.org/ 3 3 \n",
"https://uwspace.uwaterloo.ca/ 3 3 \n",
"https://pub.uni-bielefeld.de/ 3 3 \n",
"https://dataverse.tdl.org/ 3 3 \n",
"https://eprints.lib.hokudai.ac.jp/ 3 3 \n",
"https://archiv.ub.uni-marburg.de/ubfind/ 3 3 \n",
"https://eprints.unife.it/ 3 3 \n",
"https://repositorio.fucsalud.edu.co/ 3 3 \n",
"https://research.utwente.nl/ 3 3 \n",
"https://repository.up.ac.za/ 3 3 \n",
"https://pure.mpg.de/ 3 3 \n",
"https://epub.wupperinst.org/home 3 3 \n",
"https://archive.researchdata.leeds.ac.uk/ 3 3 \n",
"https://repository.usta.edu.co/ 3 3 \n",
"https://dataverse.harvard.edu/ 3 3 \n",
"https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 \n",
"https://er.knutd.edu.ua/ 3 3 \n",
"https://dataspace.princeton.edu/ 3 3 \n",
"https://repositori.upf.edu/ 3 3 \n",
"https://bibliotecadigital.exactas.uba.ar/ 3 3 \n",
"https://prism.ucalgary.ca/ 3 3 \n",
"https://research.cbs.dk/ 3 3 \n",
"https://webext.pasteur.fr/genolist/ 3 3 \n",
"https://ngdc.cncb.ac.cn/idog/ 3 3 \n",
"https://research-repository.uwa.edu.au/ 3 3 \n",
"https://research-repository.st-andrews.ac.uk/ 3 3 \n",
"https://repo.odmu.edu.ua/xmlui/ 3 3 \n",
"https://red.uao.edu.co/ 3 3 \n",
"https://data.gov.uk/ 3 3 \n",
"https://munin.uit.no/ 3 3 \n",
"https://datadryad.org/stash 3 3 \n",
"http://rua.ua.es/dspace/ 3 3 \n",
"http://dspace.uevora.pt/rdpc/ 3 3 \n",
"https://sc.lib.miamioh.edu/ 3 3 \n",
"https://papyrus.bib.umontreal.ca/xmlui/ 3 3 \n",
"https://scholar.smu.edu/ 3 3 \n",
"https://dro.deakin.edu.au/ 3 3 \n",
"\n",
" get_codes len class \n",
"get_url \n",
"https://www.unit.no/ugyldig-lenke-til-dokument-... 16 16 16 \n",
"https://www.oclc.org/url/notfound 7 7 7 \n",
"https://www.nlm.nih.gov/toxnet/index.html 6 6 6 \n",
"https://nda.nih.gov/ 5 5 5 \n",
"https://digitalrepository.unm.edu/ 4 4 4 \n",
"https://opus.bibliothek.uni-augsburg.de/opus4/home 4 4 4 \n",
"https://intellectum.unisabana.edu.co/ 4 4 4 \n",
"https://pure.iiasa.ac.at/ 4 4 4 \n",
"https://ir.vanderbilt.edu/ 4 4 4 \n",
"http://eprints.nottingham.ac.uk/ 4 4 4 \n",
"https://ddd.uab.cat/ 4 4 4 \n",
"https://mru.arcabc.ca/ 3 3 3 \n",
"https://research.usc.edu.au/esploro/ 3 3 3 \n",
"https://aquadocs.org/ 3 3 3 \n",
"https://uwspace.uwaterloo.ca/ 3 3 3 \n",
"https://pub.uni-bielefeld.de/ 3 3 3 \n",
"https://dataverse.tdl.org/ 3 3 3 \n",
"https://eprints.lib.hokudai.ac.jp/ 3 3 3 \n",
"https://archiv.ub.uni-marburg.de/ubfind/ 3 3 3 \n",
"https://eprints.unife.it/ 3 3 3 \n",
"https://repositorio.fucsalud.edu.co/ 3 3 3 \n",
"https://research.utwente.nl/ 3 3 3 \n",
"https://repository.up.ac.za/ 3 3 3 \n",
"https://pure.mpg.de/ 3 3 3 \n",
"https://epub.wupperinst.org/home 3 3 3 \n",
"https://archive.researchdata.leeds.ac.uk/ 3 3 3 \n",
"https://repository.usta.edu.co/ 3 3 3 \n",
"https://dataverse.harvard.edu/ 3 3 3 \n",
"https://bcnroc.ajuntament.barcelona.cat/jspui/ 3 3 3 \n",
"https://er.knutd.edu.ua/ 3 3 3 \n",
"https://dataspace.princeton.edu/ 3 3 3 \n",
"https://repositori.upf.edu/ 3 3 3 \n",
"https://bibliotecadigital.exactas.uba.ar/ 3 3 3 \n",
"https://prism.ucalgary.ca/ 3 3 3 \n",
"https://research.cbs.dk/ 3 3 3 \n",
"https://webext.pasteur.fr/genolist/ 3 3 3 \n",
"https://ngdc.cncb.ac.cn/idog/ 3 3 3 \n",
"https://research-repository.uwa.edu.au/ 3 3 3 \n",
"https://research-repository.st-andrews.ac.uk/ 3 3 3 \n",
"https://repo.odmu.edu.ua/xmlui/ 3 3 3 \n",
"https://red.uao.edu.co/ 3 3 3 \n",
"https://data.gov.uk/ 3 3 3 \n",
"https://munin.uit.no/ 3 3 3 \n",
"https://datadryad.org/stash 3 3 3 \n",
"http://rua.ua.es/dspace/ 3 3 3 \n",
"http://dspace.uevora.pt/rdpc/ 3 3 3 \n",
"https://sc.lib.miamioh.edu/ 3 3 3 \n",
"https://papyrus.bib.umontreal.ca/xmlui/ 3 3 3 \n",
"https://scholar.smu.edu/ 3 3 3 \n",
"https://dro.deakin.edu.au/ 3 3 3 "
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_chains.groupby('get_url').count().sort_values('url', ascending=False).head(50)"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>head_url</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>http://103.82.172.44:8080/xmlui/</th>\n",
" <td>[http://103.82.172.44:8080/xmlui/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://109.185.200.51/</th>\n",
" <td>[http://109.185.200.51/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://119.3.41.228/dnmivd/</th>\n",
" <td>[http://www.unimd.org/dnmivd/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[302]]</td>\n",
" <td>[1]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://119.78.100.183/</th>\n",
" <td>[http://119.78.100.183]</td>\n",
" <td>[403.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[400.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://125.99.47.158:8090/jspui/</th>\n",
" <td>[http://125.99.47.158:8090/jspui, http://125.9...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302], []]</td>\n",
" <td>[1, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zks.uni-koeln.de/index.php?s=studien&amp;c=studien_klinische-studien</th>\n",
" <td>[http://zks.uni-koeln.de/index.php?s=studien&amp;c...</td>\n",
" <td>[200.0]</td>\n",
" <td>[[301]]</td>\n",
" <td>[1]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zokei.repo.nii.ac.jp/</th>\n",
" <td>[https://zokei.repo.nii.ac.jp/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zoltardata.com/</th>\n",
" <td>[https://zoltardata.com/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zone.biblio.laurentian.ca/</th>\n",
" <td>[https://zone.biblio.laurentian.ca, https://zo...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zuscholars.zu.ac.ae/</th>\n",
" <td>[https://zuscholars.zu.ac.ae, https://zuschola...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>9331 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"head_url \n",
"http://103.82.172.44:8080/xmlui/ [http://103.82.172.44:8080/xmlui/] \n",
"http://109.185.200.51/ [http://109.185.200.51/] \n",
"http://119.3.41.228/dnmivd/ [http://www.unimd.org/dnmivd/] \n",
"http://119.78.100.183/ [http://119.78.100.183] \n",
"http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n",
"... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [http://zks.uni-koeln.de/index.php?s=studien&c... \n",
"https://zokei.repo.nii.ac.jp/ [https://zokei.repo.nii.ac.jp/] \n",
"https://zoltardata.com/ [https://zoltardata.com/] \n",
"https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n",
"https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n",
"\n",
" head_final_code \\\n",
"head_url \n",
"http://103.82.172.44:8080/xmlui/ [200.0] \n",
"http://109.185.200.51/ [200.0] \n",
"http://119.3.41.228/dnmivd/ [200.0] \n",
"http://119.78.100.183/ [403.0] \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n",
"https://zokei.repo.nii.ac.jp/ [200.0] \n",
"https://zoltardata.com/ [200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
" head_codes len \\\n",
"head_url \n",
"http://103.82.172.44:8080/xmlui/ [[]] [0] \n",
"http://109.185.200.51/ [[]] [0] \n",
"http://119.3.41.228/dnmivd/ [[302]] [1] \n",
"http://119.78.100.183/ [[]] [0] \n",
"http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n",
"... ... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [[301]] [1] \n",
"https://zokei.repo.nii.ac.jp/ [[]] [0] \n",
"https://zoltardata.com/ [[]] [0] \n",
"https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n",
"https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n",
"\n",
" class \n",
"head_url \n",
"http://103.82.172.44:8080/xmlui/ [200.0] \n",
"http://109.185.200.51/ [200.0] \n",
"http://119.3.41.228/dnmivd/ [200.0] \n",
"http://119.78.100.183/ [400.0] \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n",
"https://zokei.repo.nii.ac.jp/ [200.0] \n",
"https://zoltardata.com/ [200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
"[9331 rows x 5 columns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_conflation = head_chains.groupby('head_url').aggregate(list)\n",
"head_conflation"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"head_conflation.to_csv('../data/processed/head_conflation.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>head_final_code</th>\n",
" <th>head_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>head_url</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>http://125.99.47.158:8090/jspui/</th>\n",
" <td>[http://125.99.47.158:8090/jspui, http://125.9...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302], []]</td>\n",
" <td>[1, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://160.97.80.9:8080/jspui/</th>\n",
" <td>[http://160.97.80.9:8080/jspui/, http://160.97...</td>\n",
" <td>[404.0, 404.0]</td>\n",
" <td>[[], [302]]</td>\n",
" <td>[0, 1]</td>\n",
" <td>[400.0, 400.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://acikarsiv.atauni.edu.tr/</th>\n",
" <td>[http://acikarsiv.atauni.edu.tr/, http://acika...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://acikarsiv.atilim.edu.tr/</th>\n",
" <td>[http://acikarsiv.atilim.edu.tr/, http://acika...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://acikerisim.afsu.edu.tr/xmlui/</th>\n",
" <td>[http://acikerisim.afsu.edu.tr, http://acikeri...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302, 302], [302, 302]]</td>\n",
" <td>[2, 2]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zaguan.unizar.es/</th>\n",
" <td>[http://zaguan.unizar.es/, http://zaguan.uniza...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[301], [301]]</td>\n",
" <td>[1, 1]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zbc.ksiaznica.szczecin.pl/dlibra</th>\n",
" <td>[http://zbc.ksiaznica.szczecin.pl/dlibra, http...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302], [302, 302]]</td>\n",
" <td>[1, 2]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zenodo.org/</th>\n",
" <td>[https://zenodo.org/, http://zenodo.org/]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], [301]]</td>\n",
" <td>[0, 1]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zone.biblio.laurentian.ca/</th>\n",
" <td>[https://zone.biblio.laurentian.ca, https://zo...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zuscholars.zu.ac.ae/</th>\n",
" <td>[https://zuscholars.zu.ac.ae, https://zuschola...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1456 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"head_url \n",
"http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n",
"http://160.97.80.9:8080/jspui/ [http://160.97.80.9:8080/jspui/, http://160.97... \n",
"http://acikarsiv.atauni.edu.tr/ [http://acikarsiv.atauni.edu.tr/, http://acika... \n",
"http://acikarsiv.atilim.edu.tr/ [http://acikarsiv.atilim.edu.tr/, http://acika... \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [http://acikerisim.afsu.edu.tr, http://acikeri... \n",
"... ... \n",
"https://zaguan.unizar.es/ [http://zaguan.unizar.es/, http://zaguan.uniza... \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [http://zbc.ksiaznica.szczecin.pl/dlibra, http... \n",
"https://zenodo.org/ [https://zenodo.org/, http://zenodo.org/] \n",
"https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n",
"https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n",
"\n",
" head_final_code \\\n",
"head_url \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"http://160.97.80.9:8080/jspui/ [404.0, 404.0] \n",
"http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n",
"http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n",
"... ... \n",
"https://zaguan.unizar.es/ [200.0, 200.0] \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n",
"https://zenodo.org/ [200.0, 200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
" head_codes len \\\n",
"head_url \n",
"http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n",
"http://160.97.80.9:8080/jspui/ [[], [302]] [0, 1] \n",
"http://acikarsiv.atauni.edu.tr/ [[], []] [0, 0] \n",
"http://acikarsiv.atilim.edu.tr/ [[], []] [0, 0] \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [[302, 302], [302, 302]] [2, 2] \n",
"... ... ... \n",
"https://zaguan.unizar.es/ [[301], [301]] [1, 1] \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [[302], [302, 302]] [1, 2] \n",
"https://zenodo.org/ [[], [301]] [0, 1] \n",
"https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n",
"https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n",
"\n",
" class \n",
"head_url \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"http://160.97.80.9:8080/jspui/ [400.0, 400.0] \n",
"http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n",
"http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n",
"... ... \n",
"https://zaguan.unizar.es/ [200.0, 200.0] \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n",
"https://zenodo.org/ [200.0, 200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
"[1456 rows x 5 columns]"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_conflation[head_conflation.url.str.len() > 1]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3051"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"head_conflation[head_conflation.url.str.len() > 1].url.apply(len).sum()"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>get_url</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>http://103.82.172.44:8080/xmlui/</th>\n",
" <td>[http://103.82.172.44:8080/xmlui/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://109.185.200.51/</th>\n",
" <td>[http://109.185.200.51/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://119.3.41.228/dnmivd/</th>\n",
" <td>[http://www.unimd.org/dnmivd/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[302]]</td>\n",
" <td>[1]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://119.78.100.183/</th>\n",
" <td>[http://119.78.100.183]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://125.99.47.158:8090/jspui/</th>\n",
" <td>[http://125.99.47.158:8090/jspui, http://125.9...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302], []]</td>\n",
" <td>[1, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zks.uni-koeln.de/index.php?s=studien&amp;c=studien_klinische-studien</th>\n",
" <td>[http://zks.uni-koeln.de/index.php?s=studien&amp;c...</td>\n",
" <td>[200.0]</td>\n",
" <td>[[301]]</td>\n",
" <td>[1]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zokei.repo.nii.ac.jp/</th>\n",
" <td>[https://zokei.repo.nii.ac.jp/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zoltardata.com/</th>\n",
" <td>[https://zoltardata.com/]</td>\n",
" <td>[200.0]</td>\n",
" <td>[[]]</td>\n",
" <td>[0]</td>\n",
" <td>[200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zone.biblio.laurentian.ca/</th>\n",
" <td>[https://zone.biblio.laurentian.ca, https://zo...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zuscholars.zu.ac.ae/</th>\n",
" <td>[https://zuscholars.zu.ac.ae, https://zuschola...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>9353 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"get_url \n",
"http://103.82.172.44:8080/xmlui/ [http://103.82.172.44:8080/xmlui/] \n",
"http://109.185.200.51/ [http://109.185.200.51/] \n",
"http://119.3.41.228/dnmivd/ [http://www.unimd.org/dnmivd/] \n",
"http://119.78.100.183/ [http://119.78.100.183] \n",
"http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n",
"... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [http://zks.uni-koeln.de/index.php?s=studien&c... \n",
"https://zokei.repo.nii.ac.jp/ [https://zokei.repo.nii.ac.jp/] \n",
"https://zoltardata.com/ [https://zoltardata.com/] \n",
"https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n",
"https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n",
"\n",
" get_final_code \\\n",
"get_url \n",
"http://103.82.172.44:8080/xmlui/ [200.0] \n",
"http://109.185.200.51/ [200.0] \n",
"http://119.3.41.228/dnmivd/ [200.0] \n",
"http://119.78.100.183/ [200.0] \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n",
"https://zokei.repo.nii.ac.jp/ [200.0] \n",
"https://zoltardata.com/ [200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
" get_codes len \\\n",
"get_url \n",
"http://103.82.172.44:8080/xmlui/ [[]] [0] \n",
"http://109.185.200.51/ [[]] [0] \n",
"http://119.3.41.228/dnmivd/ [[302]] [1] \n",
"http://119.78.100.183/ [[]] [0] \n",
"http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n",
"... ... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [[301]] [1] \n",
"https://zokei.repo.nii.ac.jp/ [[]] [0] \n",
"https://zoltardata.com/ [[]] [0] \n",
"https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n",
"https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n",
"\n",
" class \n",
"get_url \n",
"http://103.82.172.44:8080/xmlui/ [200.0] \n",
"http://109.185.200.51/ [200.0] \n",
"http://119.3.41.228/dnmivd/ [200.0] \n",
"http://119.78.100.183/ [200.0] \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"... ... \n",
"https://zks.uni-koeln.de/index.php?s=studien&c=... [200.0] \n",
"https://zokei.repo.nii.ac.jp/ [200.0] \n",
"https://zoltardata.com/ [200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
"[9353 rows x 5 columns]"
]
},
"execution_count": 61,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_conflation = get_chains.groupby('get_url').aggregate(list)\n",
"get_conflation"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"get_conflation.to_csv('../data/processed/get_conflation.tsv', sep='\\t')"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>url</th>\n",
" <th>get_final_code</th>\n",
" <th>get_codes</th>\n",
" <th>len</th>\n",
" <th>class</th>\n",
" </tr>\n",
" <tr>\n",
" <th>get_url</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>http://125.99.47.158:8090/jspui/</th>\n",
" <td>[http://125.99.47.158:8090/jspui, http://125.9...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302], []]</td>\n",
" <td>[1, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://160.97.80.9:8080/jspui/</th>\n",
" <td>[http://160.97.80.9:8080/jspui/, http://160.97...</td>\n",
" <td>[404.0, 404.0]</td>\n",
" <td>[[], [302]]</td>\n",
" <td>[0, 1]</td>\n",
" <td>[400.0, 400.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://acikarsiv.atauni.edu.tr/</th>\n",
" <td>[http://acikarsiv.atauni.edu.tr/, http://acika...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://acikarsiv.atilim.edu.tr/</th>\n",
" <td>[http://acikarsiv.atilim.edu.tr/, http://acika...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>http://acikerisim.afsu.edu.tr/xmlui/</th>\n",
" <td>[http://acikerisim.afsu.edu.tr, http://acikeri...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302, 302], [302, 302]]</td>\n",
" <td>[2, 2]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zaguan.unizar.es/</th>\n",
" <td>[http://zaguan.unizar.es/, http://zaguan.uniza...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[301], [301]]</td>\n",
" <td>[1, 1]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zbc.ksiaznica.szczecin.pl/dlibra</th>\n",
" <td>[http://zbc.ksiaznica.szczecin.pl/dlibra, http...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[302], [302, 302]]</td>\n",
" <td>[1, 2]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zenodo.org/</th>\n",
" <td>[https://zenodo.org/, http://zenodo.org/]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], [301]]</td>\n",
" <td>[0, 1]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zone.biblio.laurentian.ca/</th>\n",
" <td>[https://zone.biblio.laurentian.ca, https://zo...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" <tr>\n",
" <th>https://zuscholars.zu.ac.ae/</th>\n",
" <td>[https://zuscholars.zu.ac.ae, https://zuschola...</td>\n",
" <td>[200.0, 200.0]</td>\n",
" <td>[[], []]</td>\n",
" <td>[0, 0]</td>\n",
" <td>[200.0, 200.0]</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1459 rows × 5 columns</p>\n",
"</div>"
],
"text/plain": [
" url \\\n",
"get_url \n",
"http://125.99.47.158:8090/jspui/ [http://125.99.47.158:8090/jspui, http://125.9... \n",
"http://160.97.80.9:8080/jspui/ [http://160.97.80.9:8080/jspui/, http://160.97... \n",
"http://acikarsiv.atauni.edu.tr/ [http://acikarsiv.atauni.edu.tr/, http://acika... \n",
"http://acikarsiv.atilim.edu.tr/ [http://acikarsiv.atilim.edu.tr/, http://acika... \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [http://acikerisim.afsu.edu.tr, http://acikeri... \n",
"... ... \n",
"https://zaguan.unizar.es/ [http://zaguan.unizar.es/, http://zaguan.uniza... \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [http://zbc.ksiaznica.szczecin.pl/dlibra, http... \n",
"https://zenodo.org/ [https://zenodo.org/, http://zenodo.org/] \n",
"https://zone.biblio.laurentian.ca/ [https://zone.biblio.laurentian.ca, https://zo... \n",
"https://zuscholars.zu.ac.ae/ [https://zuscholars.zu.ac.ae, https://zuschola... \n",
"\n",
" get_final_code \\\n",
"get_url \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"http://160.97.80.9:8080/jspui/ [404.0, 404.0] \n",
"http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n",
"http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n",
"... ... \n",
"https://zaguan.unizar.es/ [200.0, 200.0] \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n",
"https://zenodo.org/ [200.0, 200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
" get_codes len \\\n",
"get_url \n",
"http://125.99.47.158:8090/jspui/ [[302], []] [1, 0] \n",
"http://160.97.80.9:8080/jspui/ [[], [302]] [0, 1] \n",
"http://acikarsiv.atauni.edu.tr/ [[], []] [0, 0] \n",
"http://acikarsiv.atilim.edu.tr/ [[], []] [0, 0] \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [[302, 302], [302, 302]] [2, 2] \n",
"... ... ... \n",
"https://zaguan.unizar.es/ [[301], [301]] [1, 1] \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [[302], [302, 302]] [1, 2] \n",
"https://zenodo.org/ [[], [301]] [0, 1] \n",
"https://zone.biblio.laurentian.ca/ [[], []] [0, 0] \n",
"https://zuscholars.zu.ac.ae/ [[], []] [0, 0] \n",
"\n",
" class \n",
"get_url \n",
"http://125.99.47.158:8090/jspui/ [200.0, 200.0] \n",
"http://160.97.80.9:8080/jspui/ [400.0, 400.0] \n",
"http://acikarsiv.atauni.edu.tr/ [200.0, 200.0] \n",
"http://acikarsiv.atilim.edu.tr/ [200.0, 200.0] \n",
"http://acikerisim.afsu.edu.tr/xmlui/ [200.0, 200.0] \n",
"... ... \n",
"https://zaguan.unizar.es/ [200.0, 200.0] \n",
"https://zbc.ksiaznica.szczecin.pl/dlibra [200.0, 200.0] \n",
"https://zenodo.org/ [200.0, 200.0] \n",
"https://zone.biblio.laurentian.ca/ [200.0, 200.0] \n",
"https://zuscholars.zu.ac.ae/ [200.0, 200.0] \n",
"\n",
"[1459 rows x 5 columns]"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_conflation[get_conflation.url.str.len() > 1]"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"3060"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"get_conflation[get_conflation.url.str.len() > 1].url.apply(len).sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"interpreter": {
"hash": "ccdc3acc266150d74575e7f25ef162b022ec22dae7e3244cf5a4f2ecbaf21c19"
},
"kernelspec": {
"display_name": "Python 3.9.12 ('data-science')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
},
"orig_nbformat": 4
},
"nbformat": 4,
"nbformat_minor": 2
}