urls vs grid.ac
This commit is contained in:
parent
c7f7a9a62e
commit
42ff175d05
|
@ -63,7 +63,7 @@
|
|||
"from datetime import datetime\n",
|
||||
"import pytz\n",
|
||||
"\n",
|
||||
"import tldextract # parse URLs\n",
|
||||
"import tldextract\n",
|
||||
"\n",
|
||||
"import ssl # needed because nltk.download down here fires an error\n",
|
||||
"try:\n",
|
||||
|
@ -10714,6 +10714,463 @@
|
|||
"exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## GRID.ac filtering"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 166,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def extract_domain(link):\n",
|
||||
" return tldextract.extract(link).registered_domain"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 170,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
|
||||
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 171,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>link</th>\n",
|
||||
" <th>domain</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid_id</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.1001.0</th>\n",
|
||||
" <td>http://www.anu.edu.au/</td>\n",
|
||||
" <td>anu.edu.au</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.1002.3</th>\n",
|
||||
" <td>http://www.monash.edu/</td>\n",
|
||||
" <td>monash.edu</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.1003.2</th>\n",
|
||||
" <td>http://www.uq.edu.au/</td>\n",
|
||||
" <td>uq.edu.au</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.1004.5</th>\n",
|
||||
" <td>http://mq.edu.au/</td>\n",
|
||||
" <td>mq.edu.au</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.1005.4</th>\n",
|
||||
" <td>https://www.unsw.edu.au/</td>\n",
|
||||
" <td>unsw.edu.au</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.510886.2</th>\n",
|
||||
" <td>https://cftni.org/</td>\n",
|
||||
" <td>cftni.org</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.510887.3</th>\n",
|
||||
" <td>https://tinybeamfund.org/</td>\n",
|
||||
" <td>tinybeamfund.org</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.510888.c</th>\n",
|
||||
" <td>https://www.tmg-thinktank.com/</td>\n",
|
||||
" <td>tmg-thinktank.com</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.510889.d</th>\n",
|
||||
" <td>https://www.unicef.org/eap</td>\n",
|
||||
" <td>unicef.org</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>grid.510890.5</th>\n",
|
||||
" <td>https://www.whitehouse.gov/</td>\n",
|
||||
" <td>whitehouse.gov</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>98933 rows × 2 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" link domain\n",
|
||||
"grid_id \n",
|
||||
"grid.1001.0 http://www.anu.edu.au/ anu.edu.au\n",
|
||||
"grid.1002.3 http://www.monash.edu/ monash.edu\n",
|
||||
"grid.1003.2 http://www.uq.edu.au/ uq.edu.au\n",
|
||||
"grid.1004.5 http://mq.edu.au/ mq.edu.au\n",
|
||||
"grid.1005.4 https://www.unsw.edu.au/ unsw.edu.au\n",
|
||||
"... ... ...\n",
|
||||
"grid.510886.2 https://cftni.org/ cftni.org\n",
|
||||
"grid.510887.3 https://tinybeamfund.org/ tinybeamfund.org\n",
|
||||
"grid.510888.c https://www.tmg-thinktank.com/ tmg-thinktank.com\n",
|
||||
"grid.510889.d https://www.unicef.org/eap unicef.org\n",
|
||||
"grid.510890.5 https://www.whitehouse.gov/ whitehouse.gov\n",
|
||||
"\n",
|
||||
"[98933 rows x 2 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 171,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"grid_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 172,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"link http://www.isti.cnr.it/\n",
|
||||
"domain cnr.it\n",
|
||||
"Name: grid.451498.5, dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 172,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"grid_df.loc['grid.451498.5']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 178,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>orcid</th>\n",
|
||||
" <th>url_domains</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>0000-0001-7402-0096</td>\n",
|
||||
" <td>kth.se</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>0000-0001-8377-3508</td>\n",
|
||||
" <td>cnpq.br</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>0000-0002-2638-4108</td>\n",
|
||||
" <td>unioviedo.es</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>46</th>\n",
|
||||
" <td>0000-0003-1435-6545</td>\n",
|
||||
" <td>cnpq.br</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>46</th>\n",
|
||||
" <td>0000-0003-1435-6545</td>\n",
|
||||
" <td>linkedin.com</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989632</th>\n",
|
||||
" <td>0000-0001-9133-2366</td>\n",
|
||||
" <td>au.dk</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989636</th>\n",
|
||||
" <td>0000-0002-2906-0299</td>\n",
|
||||
" <td>oxytocin.com.au</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989636</th>\n",
|
||||
" <td>0000-0002-2906-0299</td>\n",
|
||||
" <td>linkedin.com</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989640</th>\n",
|
||||
" <td>0000-0002-1070-2220</td>\n",
|
||||
" <td>vpsantanna.com</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989641</th>\n",
|
||||
" <td>0000-0002-2187-970X</td>\n",
|
||||
" <td>linkedin.com</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>1025856 rows × 2 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" orcid url_domains\n",
|
||||
"6 0000-0001-7402-0096 kth.se\n",
|
||||
"11 0000-0001-8377-3508 cnpq.br\n",
|
||||
"29 0000-0002-2638-4108 unioviedo.es\n",
|
||||
"46 0000-0003-1435-6545 cnpq.br\n",
|
||||
"46 0000-0003-1435-6545 linkedin.com\n",
|
||||
"... ... ...\n",
|
||||
"10989632 0000-0001-9133-2366 au.dk\n",
|
||||
"10989636 0000-0002-2906-0299 oxytocin.com.au\n",
|
||||
"10989636 0000-0002-2906-0299 linkedin.com\n",
|
||||
"10989640 0000-0002-1070-2220 vpsantanna.com\n",
|
||||
"10989641 0000-0002-2187-970X linkedin.com\n",
|
||||
"\n",
|
||||
"[1025856 rows x 2 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 178,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"exp = df[['orcid', 'url_domains']].explode('url_domains')\n",
|
||||
"exp = exp[exp.url_domains.notna()]\n",
|
||||
"exp"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 218,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"exp['grid'] = exp.url_domains.isin(grid_df.domain)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 219,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>orcid</th>\n",
|
||||
" <th>url_domains</th>\n",
|
||||
" <th>grid</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>0000-0001-7402-0096</td>\n",
|
||||
" <td>kth.se</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>0000-0001-8377-3508</td>\n",
|
||||
" <td>cnpq.br</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>29</th>\n",
|
||||
" <td>0000-0002-2638-4108</td>\n",
|
||||
" <td>unioviedo.es</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>46</th>\n",
|
||||
" <td>0000-0003-1435-6545</td>\n",
|
||||
" <td>cnpq.br</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>46</th>\n",
|
||||
" <td>0000-0003-1435-6545</td>\n",
|
||||
" <td>linkedin.com</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>...</th>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" <td>...</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989632</th>\n",
|
||||
" <td>0000-0001-9133-2366</td>\n",
|
||||
" <td>au.dk</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989636</th>\n",
|
||||
" <td>0000-0002-2906-0299</td>\n",
|
||||
" <td>oxytocin.com.au</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989636</th>\n",
|
||||
" <td>0000-0002-2906-0299</td>\n",
|
||||
" <td>linkedin.com</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989640</th>\n",
|
||||
" <td>0000-0002-1070-2220</td>\n",
|
||||
" <td>vpsantanna.com</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10989641</th>\n",
|
||||
" <td>0000-0002-2187-970X</td>\n",
|
||||
" <td>linkedin.com</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<p>1025856 rows × 3 columns</p>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" orcid url_domains grid\n",
|
||||
"6 0000-0001-7402-0096 kth.se True\n",
|
||||
"11 0000-0001-8377-3508 cnpq.br True\n",
|
||||
"29 0000-0002-2638-4108 unioviedo.es False\n",
|
||||
"46 0000-0003-1435-6545 cnpq.br True\n",
|
||||
"46 0000-0003-1435-6545 linkedin.com True\n",
|
||||
"... ... ... ...\n",
|
||||
"10989632 0000-0001-9133-2366 au.dk True\n",
|
||||
"10989636 0000-0002-2906-0299 oxytocin.com.au False\n",
|
||||
"10989636 0000-0002-2906-0299 linkedin.com True\n",
|
||||
"10989640 0000-0002-1070-2220 vpsantanna.com False\n",
|
||||
"10989641 0000-0002-2187-970X linkedin.com True\n",
|
||||
"\n",
|
||||
"[1025856 rows x 3 columns]"
|
||||
]
|
||||
},
|
||||
"execution_count": 219,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": []
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 201,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"non_grid_domains = exp[~exp.url_domains.isin(grid_df.domain)].groupby('url_domains').count().sort_values('orcid', ascending=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 213,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',\n",
|
||||
" 'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']\n",
|
||||
"for dex in DOMAIN_EXCLUSIONS:\n",
|
||||
" non_grid_domains.drop(non_grid_domains.filter(like=dex, axis=0).index, inplace=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 214,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"non_grid_domains.to_csv('../data/processed/non_grid_urls.csv')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
|
|
Loading…
Reference in New Issue