diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb
index 939b2d0..885e655 100644
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@@ -63,7 +63,7 @@
"from datetime import datetime\n",
"import pytz\n",
"\n",
- "import tldextract # parse URLs\n",
+ "import tldextract\n",
"\n",
"import ssl # needed because nltk.download down here fires an error\n",
"try:\n",
@@ -10714,6 +10714,463 @@
"exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## GRID.ac filtering"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 166,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_domain(link):\n",
+ "    \"\"\"Return the registered domain of the URL `link` (e.g. 'http://www.anu.edu.au/' -> 'anu.edu.au').\"\"\"\n",
+ "    parsed = tldextract.extract(link)\n",
+ "    return parsed.registered_domain"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 170,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Load the GRID links table (indexed by grid_id) and derive the registered domain of every link.\n",
+ "grid_df = (\n",
+ "    pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
+ "    .assign(domain=lambda links: links['link'].apply(extract_domain))\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 171,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " link | \n",
+ " domain | \n",
+ "
\n",
+ " \n",
+ " grid_id | \n",
+ " | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " grid.1001.0 | \n",
+ " http://www.anu.edu.au/ | \n",
+ " anu.edu.au | \n",
+ "
\n",
+ " \n",
+ " grid.1002.3 | \n",
+ " http://www.monash.edu/ | \n",
+ " monash.edu | \n",
+ "
\n",
+ " \n",
+ " grid.1003.2 | \n",
+ " http://www.uq.edu.au/ | \n",
+ " uq.edu.au | \n",
+ "
\n",
+ " \n",
+ " grid.1004.5 | \n",
+ " http://mq.edu.au/ | \n",
+ " mq.edu.au | \n",
+ "
\n",
+ " \n",
+ " grid.1005.4 | \n",
+ " https://www.unsw.edu.au/ | \n",
+ " unsw.edu.au | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " grid.510886.2 | \n",
+ " https://cftni.org/ | \n",
+ " cftni.org | \n",
+ "
\n",
+ " \n",
+ " grid.510887.3 | \n",
+ " https://tinybeamfund.org/ | \n",
+ " tinybeamfund.org | \n",
+ "
\n",
+ " \n",
+ " grid.510888.c | \n",
+ " https://www.tmg-thinktank.com/ | \n",
+ " tmg-thinktank.com | \n",
+ "
\n",
+ " \n",
+ " grid.510889.d | \n",
+ " https://www.unicef.org/eap | \n",
+ " unicef.org | \n",
+ "
\n",
+ " \n",
+ " grid.510890.5 | \n",
+ " https://www.whitehouse.gov/ | \n",
+ " whitehouse.gov | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
98933 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " link domain\n",
+ "grid_id \n",
+ "grid.1001.0 http://www.anu.edu.au/ anu.edu.au\n",
+ "grid.1002.3 http://www.monash.edu/ monash.edu\n",
+ "grid.1003.2 http://www.uq.edu.au/ uq.edu.au\n",
+ "grid.1004.5 http://mq.edu.au/ mq.edu.au\n",
+ "grid.1005.4 https://www.unsw.edu.au/ unsw.edu.au\n",
+ "... ... ...\n",
+ "grid.510886.2 https://cftni.org/ cftni.org\n",
+ "grid.510887.3 https://tinybeamfund.org/ tinybeamfund.org\n",
+ "grid.510888.c https://www.tmg-thinktank.com/ tmg-thinktank.com\n",
+ "grid.510889.d https://www.unicef.org/eap unicef.org\n",
+ "grid.510890.5 https://www.whitehouse.gov/ whitehouse.gov\n",
+ "\n",
+ "[98933 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 171,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grid_df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 172,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "link http://www.isti.cnr.it/\n",
+ "domain cnr.it\n",
+ "Name: grid.451498.5, dtype: object"
+ ]
+ },
+ "execution_count": 172,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "grid_df.loc['grid.451498.5']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 178,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orcid | \n",
+ " url_domains | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6 | \n",
+ " 0000-0001-7402-0096 | \n",
+ " kth.se | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 0000-0001-8377-3508 | \n",
+ " cnpq.br | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " 0000-0002-2638-4108 | \n",
+ " unioviedo.es | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 0000-0003-1435-6545 | \n",
+ " cnpq.br | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 0000-0003-1435-6545 | \n",
+ " linkedin.com | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 10989632 | \n",
+ " 0000-0001-9133-2366 | \n",
+ " au.dk | \n",
+ "
\n",
+ " \n",
+ " 10989636 | \n",
+ " 0000-0002-2906-0299 | \n",
+ " oxytocin.com.au | \n",
+ "
\n",
+ " \n",
+ " 10989636 | \n",
+ " 0000-0002-2906-0299 | \n",
+ " linkedin.com | \n",
+ "
\n",
+ " \n",
+ " 10989640 | \n",
+ " 0000-0002-1070-2220 | \n",
+ " vpsantanna.com | \n",
+ "
\n",
+ " \n",
+ " 10989641 | \n",
+ " 0000-0002-2187-970X | \n",
+ " linkedin.com | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1025856 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orcid url_domains\n",
+ "6 0000-0001-7402-0096 kth.se\n",
+ "11 0000-0001-8377-3508 cnpq.br\n",
+ "29 0000-0002-2638-4108 unioviedo.es\n",
+ "46 0000-0003-1435-6545 cnpq.br\n",
+ "46 0000-0003-1435-6545 linkedin.com\n",
+ "... ... ...\n",
+ "10989632 0000-0001-9133-2366 au.dk\n",
+ "10989636 0000-0002-2906-0299 oxytocin.com.au\n",
+ "10989636 0000-0002-2906-0299 linkedin.com\n",
+ "10989640 0000-0002-1070-2220 vpsantanna.com\n",
+ "10989641 0000-0002-2187-970X linkedin.com\n",
+ "\n",
+ "[1025856 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 178,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Expand `url_domains` into one row per entry (keeping the matching orcid),\n",
+ "# then discard rows that carry no domain.\n",
+ "exp = (\n",
+ "    df[['orcid', 'url_domains']]\n",
+ "    .explode('url_domains')\n",
+ "    .dropna(subset=['url_domains'])\n",
+ ")\n",
+ "exp"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 218,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Flag every row whose domain also occurs among the GRID link domains.\n",
+ "exp = exp.assign(grid=exp['url_domains'].isin(grid_df['domain']))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 219,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orcid | \n",
+ " url_domains | \n",
+ " grid | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 6 | \n",
+ " 0000-0001-7402-0096 | \n",
+ " kth.se | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 0000-0001-8377-3508 | \n",
+ " cnpq.br | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " 0000-0002-2638-4108 | \n",
+ " unioviedo.es | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 0000-0003-1435-6545 | \n",
+ " cnpq.br | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " 0000-0003-1435-6545 | \n",
+ " linkedin.com | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 10989632 | \n",
+ " 0000-0001-9133-2366 | \n",
+ " au.dk | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 10989636 | \n",
+ " 0000-0002-2906-0299 | \n",
+ " oxytocin.com.au | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 10989636 | \n",
+ " 0000-0002-2906-0299 | \n",
+ " linkedin.com | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ " 10989640 | \n",
+ " 0000-0002-1070-2220 | \n",
+ " vpsantanna.com | \n",
+ " False | \n",
+ "
\n",
+ " \n",
+ " 10989641 | \n",
+ " 0000-0002-2187-970X | \n",
+ " linkedin.com | \n",
+ " True | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1025856 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orcid url_domains grid\n",
+ "6 0000-0001-7402-0096 kth.se True\n",
+ "11 0000-0001-8377-3508 cnpq.br True\n",
+ "29 0000-0002-2638-4108 unioviedo.es False\n",
+ "46 0000-0003-1435-6545 cnpq.br True\n",
+ "46 0000-0003-1435-6545 linkedin.com True\n",
+ "... ... ... ...\n",
+ "10989632 0000-0001-9133-2366 au.dk True\n",
+ "10989636 0000-0002-2906-0299 oxytocin.com.au False\n",
+ "10989636 0000-0002-2906-0299 linkedin.com True\n",
+ "10989640 0000-0002-1070-2220 vpsantanna.com False\n",
+ "10989641 0000-0002-2187-970X linkedin.com True\n",
+ "\n",
+ "[1025856 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 219,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 201,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Count rows per domain for the domains that do not appear in GRID, most frequent first.\n",
+ "# Only `url_domains` and `orcid` are selected, so the result always has a single `orcid`\n",
+ "# count column regardless of any extra columns (such as `grid`) already present on `exp`.\n",
+ "non_grid_domains = (\n",
+ "    exp.loc[~exp['url_domains'].isin(grid_df['domain']), ['url_domains', 'orcid']]\n",
+ "    .groupby('url_domains')\n",
+ "    .count()\n",
+ "    .sort_values('orcid', ascending=False)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 213,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Substrings marking domains to leave out; a domain is dropped when it contains any of them.\n",
+ "DOMAIN_EXCLUSIONS = [\n",
+ "    'google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',\n",
+ "    'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv',\n",
+ "]\n",
+ "# Collect every matching index label first, then remove them all in one in-place drop.\n",
+ "excluded_labels = [\n",
+ "    label for label in non_grid_domains.index\n",
+ "    if any(fragment in str(label) for fragment in DOMAIN_EXCLUSIONS)\n",
+ "]\n",
+ "non_grid_domains.drop(excluded_labels, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 214,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Write the remaining non-GRID domain counts to disk.\n",
+ "non_grid_domains.to_csv(path_or_buf='../data/processed/non_grid_urls.csv')"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},