urls vs grid.ac

This commit is contained in:
Andrea Mannocci 2021-04-21 18:28:38 +02:00
parent c7f7a9a62e
commit 42ff175d05
1 changed files with 458 additions and 1 deletions

View File

@ -63,7 +63,7 @@
"from datetime import datetime\n",
"import pytz\n",
"\n",
"import tldextract # parse URLs\n",
"import tldextract\n",
"\n",
"import ssl # needed because nltk.download down here fires an error\n",
"try:\n",
@ -10714,6 +10714,463 @@
"exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## GRID.ac filtering"
]
},
{
"cell_type": "code",
"execution_count": 166,
"metadata": {},
"outputs": [],
"source": [
"def extract_domain(link):\n",
" return tldextract.extract(link).registered_domain"
]
},
{
"cell_type": "code",
"execution_count": 170,
"metadata": {},
"outputs": [],
"source": [
"grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
]
},
{
"cell_type": "code",
"execution_count": 171,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>link</th>\n",
" <th>domain</th>\n",
" </tr>\n",
" <tr>\n",
" <th>grid_id</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>grid.1001.0</th>\n",
" <td>http://www.anu.edu.au/</td>\n",
" <td>anu.edu.au</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.1002.3</th>\n",
" <td>http://www.monash.edu/</td>\n",
" <td>monash.edu</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.1003.2</th>\n",
" <td>http://www.uq.edu.au/</td>\n",
" <td>uq.edu.au</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.1004.5</th>\n",
" <td>http://mq.edu.au/</td>\n",
" <td>mq.edu.au</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.1005.4</th>\n",
" <td>https://www.unsw.edu.au/</td>\n",
" <td>unsw.edu.au</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.510886.2</th>\n",
" <td>https://cftni.org/</td>\n",
" <td>cftni.org</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.510887.3</th>\n",
" <td>https://tinybeamfund.org/</td>\n",
" <td>tinybeamfund.org</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.510888.c</th>\n",
" <td>https://www.tmg-thinktank.com/</td>\n",
" <td>tmg-thinktank.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.510889.d</th>\n",
" <td>https://www.unicef.org/eap</td>\n",
" <td>unicef.org</td>\n",
" </tr>\n",
" <tr>\n",
" <th>grid.510890.5</th>\n",
" <td>https://www.whitehouse.gov/</td>\n",
" <td>whitehouse.gov</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>98933 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" link domain\n",
"grid_id \n",
"grid.1001.0 http://www.anu.edu.au/ anu.edu.au\n",
"grid.1002.3 http://www.monash.edu/ monash.edu\n",
"grid.1003.2 http://www.uq.edu.au/ uq.edu.au\n",
"grid.1004.5 http://mq.edu.au/ mq.edu.au\n",
"grid.1005.4 https://www.unsw.edu.au/ unsw.edu.au\n",
"... ... ...\n",
"grid.510886.2 https://cftni.org/ cftni.org\n",
"grid.510887.3 https://tinybeamfund.org/ tinybeamfund.org\n",
"grid.510888.c https://www.tmg-thinktank.com/ tmg-thinktank.com\n",
"grid.510889.d https://www.unicef.org/eap unicef.org\n",
"grid.510890.5 https://www.whitehouse.gov/ whitehouse.gov\n",
"\n",
"[98933 rows x 2 columns]"
]
},
"execution_count": 171,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid_df"
]
},
{
"cell_type": "code",
"execution_count": 172,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"link http://www.isti.cnr.it/\n",
"domain cnr.it\n",
"Name: grid.451498.5, dtype: object"
]
},
"execution_count": 172,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid_df.loc['grid.451498.5']"
]
},
{
"cell_type": "code",
"execution_count": 178,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>url_domains</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0000-0001-7402-0096</td>\n",
" <td>kth.se</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0000-0001-8377-3508</td>\n",
" <td>cnpq.br</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0000-0002-2638-4108</td>\n",
" <td>unioviedo.es</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>0000-0003-1435-6545</td>\n",
" <td>cnpq.br</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>0000-0003-1435-6545</td>\n",
" <td>linkedin.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989632</th>\n",
" <td>0000-0001-9133-2366</td>\n",
" <td>au.dk</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989636</th>\n",
" <td>0000-0002-2906-0299</td>\n",
" <td>oxytocin.com.au</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989636</th>\n",
" <td>0000-0002-2906-0299</td>\n",
" <td>linkedin.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989640</th>\n",
" <td>0000-0002-1070-2220</td>\n",
" <td>vpsantanna.com</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989641</th>\n",
" <td>0000-0002-2187-970X</td>\n",
" <td>linkedin.com</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1025856 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid url_domains\n",
"6 0000-0001-7402-0096 kth.se\n",
"11 0000-0001-8377-3508 cnpq.br\n",
"29 0000-0002-2638-4108 unioviedo.es\n",
"46 0000-0003-1435-6545 cnpq.br\n",
"46 0000-0003-1435-6545 linkedin.com\n",
"... ... ...\n",
"10989632 0000-0001-9133-2366 au.dk\n",
"10989636 0000-0002-2906-0299 oxytocin.com.au\n",
"10989636 0000-0002-2906-0299 linkedin.com\n",
"10989640 0000-0002-1070-2220 vpsantanna.com\n",
"10989641 0000-0002-2187-970X linkedin.com\n",
"\n",
"[1025856 rows x 2 columns]"
]
},
"execution_count": 178,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp = df[['orcid', 'url_domains']].explode('url_domains')\n",
"exp = exp[exp.url_domains.notna()]\n",
"exp"
]
},
{
"cell_type": "code",
"execution_count": 218,
"metadata": {},
"outputs": [],
"source": [
"exp['grid'] = exp.url_domains.isin(grid_df.domain)"
]
},
{
"cell_type": "code",
"execution_count": 219,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>url_domains</th>\n",
" <th>grid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>0000-0001-7402-0096</td>\n",
" <td>kth.se</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>0000-0001-8377-3508</td>\n",
" <td>cnpq.br</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>29</th>\n",
" <td>0000-0002-2638-4108</td>\n",
" <td>unioviedo.es</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>0000-0003-1435-6545</td>\n",
" <td>cnpq.br</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>46</th>\n",
" <td>0000-0003-1435-6545</td>\n",
" <td>linkedin.com</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989632</th>\n",
" <td>0000-0001-9133-2366</td>\n",
" <td>au.dk</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989636</th>\n",
" <td>0000-0002-2906-0299</td>\n",
" <td>oxytocin.com.au</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989636</th>\n",
" <td>0000-0002-2906-0299</td>\n",
" <td>linkedin.com</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989640</th>\n",
" <td>0000-0002-1070-2220</td>\n",
" <td>vpsantanna.com</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989641</th>\n",
" <td>0000-0002-2187-970X</td>\n",
" <td>linkedin.com</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>1025856 rows × 3 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid url_domains grid\n",
"6 0000-0001-7402-0096 kth.se True\n",
"11 0000-0001-8377-3508 cnpq.br True\n",
"29 0000-0002-2638-4108 unioviedo.es False\n",
"46 0000-0003-1435-6545 cnpq.br True\n",
"46 0000-0003-1435-6545 linkedin.com True\n",
"... ... ... ...\n",
"10989632 0000-0001-9133-2366 au.dk True\n",
"10989636 0000-0002-2906-0299 oxytocin.com.au False\n",
"10989636 0000-0002-2906-0299 linkedin.com True\n",
"10989640 0000-0002-1070-2220 vpsantanna.com False\n",
"10989641 0000-0002-2187-970X linkedin.com True\n",
"\n",
"[1025856 rows x 3 columns]"
]
},
"execution_count": 219,
"metadata": {},
"output_type": "execute_result"
}
],
"source": []
},
{
"cell_type": "code",
"execution_count": 201,
"metadata": {},
"outputs": [],
"source": [
"non_grid_domains = exp[~exp.url_domains.isin(grid_df.domain)].groupby('url_domains').count().sort_values('orcid', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 213,
"metadata": {},
"outputs": [],
"source": [
"DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',\n",
" 'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']\n",
"for dex in DOMAIN_EXCLUSIONS:\n",
" non_grid_domains.drop(non_grid_domains.filter(like=dex, axis=0).index, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 214,
"metadata": {},
"outputs": [],
"source": [
"non_grid_domains.to_csv('../data/processed/non_grid_urls.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},