From 42ff175d05d56c8226447c13cda8df3ef2585259 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Wed, 21 Apr 2021 18:28:38 +0200 Subject: [PATCH] urls vs grid.ac --- notebooks/01-Exploration.ipynb | 459 ++++++++++++++++++++++++++++++++- 1 file changed, 458 insertions(+), 1 deletion(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 939b2d0..885e655 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -63,7 +63,7 @@ "from datetime import datetime\n", "import pytz\n", "\n", - "import tldextract # parse URLs\n", + "import tldextract\n", "\n", "import ssl # needed because nltk.download down here fires an error\n", "try:\n", @@ -10714,6 +10714,463 @@ "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## GRID.ac filtering" + ] + }, + { + "cell_type": "code", + "execution_count": 166, + "metadata": {}, + "outputs": [], + "source": [ + "def extract_domain(link):\n", + " return tldextract.extract(link).registered_domain" + ] + }, + { + "cell_type": "code", + "execution_count": 170, + "metadata": {}, + "outputs": [], + "source": [ + "grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n", + "grid_df['domain'] = grid_df.link.apply(extract_domain)" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linkdomain
grid_id
grid.1001.0http://www.anu.edu.au/anu.edu.au
grid.1002.3http://www.monash.edu/monash.edu
grid.1003.2http://www.uq.edu.au/uq.edu.au
grid.1004.5http://mq.edu.au/mq.edu.au
grid.1005.4https://www.unsw.edu.au/unsw.edu.au
.........
grid.510886.2https://cftni.org/cftni.org
grid.510887.3https://tinybeamfund.org/tinybeamfund.org
grid.510888.chttps://www.tmg-thinktank.com/tmg-thinktank.com
grid.510889.dhttps://www.unicef.org/eapunicef.org
grid.510890.5https://www.whitehouse.gov/whitehouse.gov
\n", + "

98933 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " link domain\n", + "grid_id \n", + "grid.1001.0 http://www.anu.edu.au/ anu.edu.au\n", + "grid.1002.3 http://www.monash.edu/ monash.edu\n", + "grid.1003.2 http://www.uq.edu.au/ uq.edu.au\n", + "grid.1004.5 http://mq.edu.au/ mq.edu.au\n", + "grid.1005.4 https://www.unsw.edu.au/ unsw.edu.au\n", + "... ... ...\n", + "grid.510886.2 https://cftni.org/ cftni.org\n", + "grid.510887.3 https://tinybeamfund.org/ tinybeamfund.org\n", + "grid.510888.c https://www.tmg-thinktank.com/ tmg-thinktank.com\n", + "grid.510889.d https://www.unicef.org/eap unicef.org\n", + "grid.510890.5 https://www.whitehouse.gov/ whitehouse.gov\n", + "\n", + "[98933 rows x 2 columns]" + ] + }, + "execution_count": 171, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_df" + ] + }, + { + "cell_type": "code", + "execution_count": 172, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "link http://www.isti.cnr.it/\n", + "domain cnr.it\n", + "Name: grid.451498.5, dtype: object" + ] + }, + "execution_count": 172, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grid_df.loc['grid.451498.5']" + ] + }, + { + "cell_type": "code", + "execution_count": 178, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidurl_domains
60000-0001-7402-0096kth.se
110000-0001-8377-3508cnpq.br
290000-0002-2638-4108unioviedo.es
460000-0003-1435-6545cnpq.br
460000-0003-1435-6545linkedin.com
.........
109896320000-0001-9133-2366au.dk
109896360000-0002-2906-0299oxytocin.com.au
109896360000-0002-2906-0299linkedin.com
109896400000-0002-1070-2220vpsantanna.com
109896410000-0002-2187-970Xlinkedin.com
\n", + "

1025856 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " orcid url_domains\n", + "6 0000-0001-7402-0096 kth.se\n", + "11 0000-0001-8377-3508 cnpq.br\n", + "29 0000-0002-2638-4108 unioviedo.es\n", + "46 0000-0003-1435-6545 cnpq.br\n", + "46 0000-0003-1435-6545 linkedin.com\n", + "... ... ...\n", + "10989632 0000-0001-9133-2366 au.dk\n", + "10989636 0000-0002-2906-0299 oxytocin.com.au\n", + "10989636 0000-0002-2906-0299 linkedin.com\n", + "10989640 0000-0002-1070-2220 vpsantanna.com\n", + "10989641 0000-0002-2187-970X linkedin.com\n", + "\n", + "[1025856 rows x 2 columns]" + ] + }, + "execution_count": 178, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exp = df[['orcid', 'url_domains']].explode('url_domains')\n", + "exp = exp[exp.url_domains.notna()]\n", + "exp" + ] + }, + { + "cell_type": "code", + "execution_count": 218, + "metadata": {}, + "outputs": [], + "source": [ + "exp['grid'] = exp.url_domains.isin(grid_df.domain)" + ] + }, + { + "cell_type": "code", + "execution_count": 219, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidurl_domainsgrid
60000-0001-7402-0096kth.seTrue
110000-0001-8377-3508cnpq.brTrue
290000-0002-2638-4108unioviedo.esFalse
460000-0003-1435-6545cnpq.brTrue
460000-0003-1435-6545linkedin.comTrue
............
109896320000-0001-9133-2366au.dkTrue
109896360000-0002-2906-0299oxytocin.com.auFalse
109896360000-0002-2906-0299linkedin.comTrue
109896400000-0002-1070-2220vpsantanna.comFalse
109896410000-0002-2187-970Xlinkedin.comTrue
\n", + "

1025856 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " orcid url_domains grid\n", + "6 0000-0001-7402-0096 kth.se True\n", + "11 0000-0001-8377-3508 cnpq.br True\n", + "29 0000-0002-2638-4108 unioviedo.es False\n", + "46 0000-0003-1435-6545 cnpq.br True\n", + "46 0000-0003-1435-6545 linkedin.com True\n", + "... ... ... ...\n", + "10989632 0000-0001-9133-2366 au.dk True\n", + "10989636 0000-0002-2906-0299 oxytocin.com.au False\n", + "10989636 0000-0002-2906-0299 linkedin.com True\n", + "10989640 0000-0002-1070-2220 vpsantanna.com False\n", + "10989641 0000-0002-2187-970X linkedin.com True\n", + "\n", + "[1025856 rows x 3 columns]" + ] + }, + "execution_count": 219, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 201, + "metadata": {}, + "outputs": [], + "source": [ + "non_grid_domains = exp[~exp.url_domains.isin(grid_df.domain)].groupby('url_domains').count().sort_values('orcid', ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 213, + "metadata": {}, + "outputs": [], + "source": [ + "DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',\n", + " 'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']\n", + "for dex in DOMAIN_EXCLUSIONS:\n", + " non_grid_domains.drop(non_grid_domains.filter(like=dex, axis=0).index, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 214, + "metadata": {}, + "outputs": [], + "source": [ + "non_grid_domains.to_csv('../data/processed/non_grid_urls.csv')" + ] + }, { "cell_type": "markdown", "metadata": {},