fake-orcid-analysis/notebooks/01-Exploration.ipynb

1399 lines
37 KiB
Plaintext
Raw Normal View History

2021-03-18 17:43:00 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-19 12:19:45 +01:00
"# Exploratory analysis"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TODO:\n",
"- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)\n",
"- Enumerate the distinct cases (e.g. author publishing with an empty ORCID profile, author publishing but not on OpenAIRE, etc.)\n",
"- Is the temporal dimension of any use?\n",
"- Can we access private info thanks to the OpenAIRE-ORCID agreement?\n"
]
},
{
"cell_type": "code",
2021-03-23 19:03:37 +01:00
"execution_count": 1,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
2021-03-24 13:33:01 +01:00
"import glob\n",
"\n",
2021-03-18 17:43:00 +01:00
"import pandas as pd\n",
"import ast\n",
"import tldextract\n",
"import numpy\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
2021-03-22 19:08:20 +01:00
"import plotly.express as px\n",
2021-03-18 17:43:00 +01:00
"\n",
"init_notebook_mode(connected=True)\n",
2021-03-23 09:35:35 +01:00
"TOP_N = 0\n",
"TOP_RANGE = [0, 0]\n",
"def set_top_n(n):\n",
"    \"\"\"Set the module-level bar-chart window to the first ``n`` entries.\n",
"\n",
"    Updates the globals ``TOP_N`` (how many leading rows the plotting cells\n",
"    slice) and ``TOP_RANGE`` (the x-axis ``range`` handed to the plot layout).\n",
"    \"\"\"\n",
"    global TOP_N, TOP_RANGE\n",
"    # Extend the axis half a unit past bar 0 and bar n-1 (presumably so the\n",
"    # outermost bars are drawn at full width).\n",
"    axis_start, axis_stop = -.5, n - 1 + .5\n",
"    TOP_N, TOP_RANGE = n, [axis_start, axis_stop]"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notable genuine ORCID iDs for exploratory purposes:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"AM = '0000-0002-5193-7851'\n",
"PP = '0000-0002-8588-4196'\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"Notable anomalies:"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-23 19:03:37 +01:00
"execution_count": 3,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
"JOURNAL = '0000-0003-1815-5732'\n",
2021-03-23 12:13:04 +01:00
"NOINFO = '0000-0001-5009-2052'\n",
"VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n",
2021-03-22 19:08:20 +01:00
"# todo: find group-shared ORCiD, if possible"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"Notable fake ORCID iDs:"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"SCAFFOLD = '0000-0001-5004-7761'\n",
"WHATSAPP = '0000-0001-6997-9470'\n",
"PENIS = '0000-0002-3399-7287'\n",
"BITCOIN = '0000-0002-7518-6845'\n",
"FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment\n",
"CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)\n",
"PLUMBER = '0000-0002-1700-8311' # URL > 10 + works "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the dataset"
]
},
{
"cell_type": "code",
"execution_count": 5,
2021-03-23 19:03:37 +01:00
"metadata": {},
2021-03-18 17:43:00 +01:00
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>given_names</th>\n",
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
2021-03-23 19:03:37 +01:00
" <th>...</th>\n",
2021-03-18 17:43:00 +01:00
" <th>employment</th>\n",
" <th>n_works</th>\n",
" <th>works_source</th>\n",
2021-03-23 19:03:37 +01:00
" <th>activation_date</th>\n",
" <th>last_update_date</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2021-03-24 13:33:01 +01:00
" <th>10000000</th>\n",
" <td>0000-0001-9812-9790</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-24 13:33:01 +01:00
" <td>jonathan</td>\n",
" <td>termaat</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-03-23 19:03:37 +01:00
" <td>...</td>\n",
2021-03-24 13:33:01 +01:00
" <td>[[research co-ordinator, waikato district heal...</td>\n",
2021-03-18 17:43:00 +01:00
" <td>0</td>\n",
" <td>NaN</td>\n",
2021-03-24 13:33:01 +01:00
" <td>2019-04-15t03:08:05.268z</td>\n",
" <td>2019-04-15t03:09:44.443z</td>\n",
2021-03-23 19:03:37 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" <tr>\n",
2021-03-24 13:33:01 +01:00
" <th>10000001</th>\n",
" <td>0000-0002-0572-0598</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-24 13:33:01 +01:00
" <td>jonathan</td>\n",
" <td>jørgensen</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-03-23 19:03:37 +01:00
" <td>...</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
2021-03-24 13:33:01 +01:00
" <td>2019-03-17t20:31:23.753z</td>\n",
" <td>2019-03-17t20:33:50.316z</td>\n",
2021-03-23 19:03:37 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" <tr>\n",
2021-03-24 13:33:01 +01:00
" <th>10000002</th>\n",
" <td>0000-0002-1512-9646</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-24 13:33:01 +01:00
" <td>jonathan</td>\n",
" <td>mkrtchyan</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-03-23 19:03:37 +01:00
" <td>...</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
2021-03-24 13:33:01 +01:00
" <td>1</td>\n",
" <td>[jonathan mkrtchyan]</td>\n",
" <td>2020-08-24t18:47:27.332z</td>\n",
" <td>2020-08-24t18:54:37.398z</td>\n",
" <td>1</td>\n",
2021-03-23 19:03:37 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
2021-03-24 13:33:01 +01:00
" <td>2</td>\n",
" <td>1</td>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" <tr>\n",
2021-03-24 13:33:01 +01:00
" <th>10000003</th>\n",
" <td>0000-0002-2271-4069</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-24 13:33:01 +01:00
" <td>jonathan</td>\n",
" <td>pickard</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-03-23 19:03:37 +01:00
" <td>...</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
2021-03-24 13:33:01 +01:00
" <td>2018-05-03t09:34:25.613z</td>\n",
" <td>2018-05-10t13:05:09.297z</td>\n",
2021-03-23 19:03:37 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" <tr>\n",
2021-03-24 13:33:01 +01:00
" <th>10000004</th>\n",
" <td>0000-0002-3054-9622</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-24 13:33:01 +01:00
" <td>jonathan</td>\n",
" <td>greer</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
2021-03-24 13:33:01 +01:00
" <td>[jonathan s. greer]</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-03-23 19:03:37 +01:00
" <td>...</td>\n",
2021-03-24 13:33:01 +01:00
" <td>[[associate professor of old testament and dir...</td>\n",
" <td>2</td>\n",
" <td>[multidisciplinary digital publishing institut...</td>\n",
" <td>2019-04-09t20:05:25.447z</td>\n",
" <td>2020-02-07t15:55:18.951z</td>\n",
" <td>2</td>\n",
2021-03-23 19:03:37 +01:00
" <td>0</td>\n",
" <td>0</td>\n",
2021-03-24 13:33:01 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2021-03-23 19:03:37 +01:00
"<p>5 rows × 24 columns</p>\n",
2021-03-18 17:43:00 +01:00
"</div>"
],
"text/plain": [
2021-03-24 13:33:01 +01:00
" orcid claimed verified_email \\\n",
"10000000 0000-0001-9812-9790 1 1 \n",
"10000001 0000-0002-0572-0598 1 1 \n",
"10000002 0000-0002-1512-9646 1 1 \n",
"10000003 0000-0002-2271-4069 1 1 \n",
"10000004 0000-0002-3054-9622 1 1 \n",
"\n",
" verified_primary_email given_names family_name biography \\\n",
"10000000 1 jonathan termaat NaN \n",
"10000001 1 jonathan jørgensen NaN \n",
"10000002 1 jonathan mkrtchyan NaN \n",
"10000003 1 jonathan pickard NaN \n",
"10000004 1 jonathan greer NaN \n",
"\n",
" other_names urls primary_email ... \\\n",
"10000000 NaN NaN NaN ... \n",
"10000001 NaN NaN NaN ... \n",
"10000002 NaN NaN NaN ... \n",
"10000003 NaN NaN NaN ... \n",
"10000004 [jonathan s. greer] NaN NaN ... \n",
"\n",
" employment n_works \\\n",
"10000000 [[research co-ordinator, waikato district heal... 0 \n",
"10000001 NaN 0 \n",
"10000002 NaN 1 \n",
"10000003 NaN 0 \n",
"10000004 [[associate professor of old testament and dir... 2 \n",
"\n",
" works_source \\\n",
"10000000 NaN \n",
"10000001 NaN \n",
"10000002 [jonathan mkrtchyan] \n",
"10000003 NaN \n",
"10000004 [multidisciplinary digital publishing institut... \n",
"\n",
" activation_date last_update_date n_doi n_arxiv \\\n",
"10000000 2019-04-15t03:08:05.268z 2019-04-15t03:09:44.443z 0 0 \n",
"10000001 2019-03-17t20:31:23.753z 2019-03-17t20:33:50.316z 0 0 \n",
"10000002 2020-08-24t18:47:27.332z 2020-08-24t18:54:37.398z 1 0 \n",
"10000003 2018-05-03t09:34:25.613z 2018-05-10t13:05:09.297z 0 0 \n",
"10000004 2019-04-09t20:05:25.447z 2020-02-07t15:55:18.951z 2 0 \n",
"\n",
" n_pmc n_other_pids label \n",
"10000000 0 0 0 \n",
"10000001 0 0 0 \n",
"10000002 0 2 1 \n",
"10000003 0 0 0 \n",
"10000004 0 1 1 \n",
2021-03-23 19:03:37 +01:00
"\n",
"[5 rows x 24 columns]"
2021-03-18 17:43:00 +01:00
]
},
"execution_count": 5,
2021-03-18 17:43:00 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-03-24 13:33:01 +01:00
"# Collect the pickled dataset shards. glob.glob() returns paths in arbitrary\n",
"# order, so sort them to make the row order of df reproducible across runs.\n",
"parts = sorted(glob.glob('../data/processed/dataset.pkl.*'))\n",
"assert parts, 'no dataset parts found under ../data/processed/'\n",
"# NOTE(review): unpickling can execute arbitrary code; only load shards produced by this project.\n",
"df = pd.concat(pd.read_pickle(part) for part in parts)\n",
2021-03-23 19:03:37 +01:00
"df.head(5)"
2021-03-18 17:43:00 +01:00
]
},
2021-03-23 12:13:04 +01:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"Notable profiles inspection"
2021-03-23 12:13:04 +01:00
]
},
2021-03-18 17:43:00 +01:00
{
"cell_type": "code",
"execution_count": 6,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>given_names</th>\n",
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
2021-03-23 19:03:37 +01:00
" <th>...</th>\n",
2021-03-18 17:43:00 +01:00
" <th>employment</th>\n",
" <th>n_works</th>\n",
" <th>works_source</th>\n",
2021-03-23 19:03:37 +01:00
" <th>activation_date</th>\n",
" <th>last_update_date</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
2021-03-18 17:43:00 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2021-03-23 19:03:37 +01:00
" <th>1575869</th>\n",
" <td>0000-0002-5193-7851</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" <td>1</td>\n",
2021-03-23 19:03:37 +01:00
" <td>1</td>\n",
" <td>andrea</td>\n",
" <td>mannocci</td>\n",
" <td>data scientist &amp; researcher; scholarly knowled...</td>\n",
2021-03-18 17:43:00 +01:00
" <td>NaN</td>\n",
2021-03-23 19:03:37 +01:00
" <td>[[personal website, https://andremann.github.i...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>...</td>\n",
" <td>[[research associate, istituto di scienza e te...</td>\n",
" <td>37</td>\n",
" <td>[scopus - elsevier, crossref metadata search, ...</td>\n",
" <td>2017-09-12t14:28:33.467z</td>\n",
" <td>2021-03-09t08:32:47.840z</td>\n",
" <td>34</td>\n",
2021-03-18 17:43:00 +01:00
" <td>0</td>\n",
2021-03-23 19:03:37 +01:00
" <td>0</td>\n",
" <td>60</td>\n",
2021-03-18 17:43:00 +01:00
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2021-03-23 19:03:37 +01:00
"<p>1 rows × 24 columns</p>\n",
2021-03-18 17:43:00 +01:00
"</div>"
],
"text/plain": [
2021-03-23 19:03:37 +01:00
" orcid claimed verified_email verified_primary_email \\\n",
"1575869 0000-0002-5193-7851 1 1 1 \n",
"\n",
" given_names family_name \\\n",
"1575869 andrea mannocci \n",
"\n",
" biography other_names \\\n",
"1575869 data scientist & researcher; scholarly knowled... NaN \n",
"\n",
" urls \\\n",
"1575869 [[personal website, https://andremann.github.i... \n",
"\n",
" primary_email ... \\\n",
"1575869 andrea.mannocci@isti.cnr.it ... \n",
"\n",
" employment n_works \\\n",
"1575869 [[research associate, istituto di scienza e te... 37 \n",
"\n",
" works_source \\\n",
"1575869 [scopus - elsevier, crossref metadata search, ... \n",
"\n",
" activation_date last_update_date n_doi n_arxiv \\\n",
"1575869 2017-09-12t14:28:33.467z 2021-03-09t08:32:47.840z 34 0 \n",
"\n",
" n_pmc n_other_pids label \n",
"1575869 0 60 1 \n",
"\n",
"[1 rows x 24 columns]"
2021-03-18 17:43:00 +01:00
]
},
"execution_count": 6,
2021-03-18 17:43:00 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-03-23 19:03:37 +01:00
"df[df['orcid'] == AM]"
2021-03-23 12:13:04 +01:00
]
},
2021-03-18 17:43:00 +01:00
{
"cell_type": "code",
"execution_count": 7,
2021-03-23 12:13:04 +01:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
2021-03-23 19:03:37 +01:00
" <th>claimed</th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>given_names</th>\n",
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
" <th>...</th>\n",
" <th>employment</th>\n",
" <th>n_works</th>\n",
" <th>works_source</th>\n",
" <th>activation_date</th>\n",
" <th>last_update_date</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
2021-03-23 12:13:04 +01:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2021-03-23 19:03:37 +01:00
" <th>6819986</th>\n",
" <td>0000-0001-6997-9470</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>other</td>\n",
" <td>whatsapp</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[[otherwhatsapp, https://otherwhatsapp.com/], ...</td>\n",
" <td>NaN</td>\n",
2021-03-23 12:13:04 +01:00
" <td>...</td>\n",
2021-03-23 19:03:37 +01:00
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2020-10-07t10:37:12.237z</td>\n",
" <td>2020-10-08t02:32:03.935z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-03-23 12:13:04 +01:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
2021-03-23 19:03:37 +01:00
"<p>1 rows × 24 columns</p>\n",
2021-03-23 12:13:04 +01:00
"</div>"
],
"text/plain": [
2021-03-23 19:03:37 +01:00
" orcid claimed verified_email verified_primary_email \\\n",
"6819986 0000-0001-6997-9470 1 1 1 \n",
"\n",
" given_names family_name biography other_names \\\n",
"6819986 other whatsapp NaN NaN \n",
"\n",
" urls primary_email ... \\\n",
"6819986 [[otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n",
"\n",
" employment n_works works_source activation_date \\\n",
"6819986 NaN 0 NaN 2020-10-07t10:37:12.237z \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids label \n",
"6819986 2020-10-08t02:32:03.935z 0 0 0 0 0 \n",
"\n",
"[1 rows x 24 columns]"
]
},
"execution_count": 7,
2021-03-23 19:03:37 +01:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == WHATSAPP]"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 12:13:04 +01:00
"source": [
2021-03-24 13:33:01 +01:00
"df.count() #10916574"
2021-03-23 12:13:04 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 12:13:04 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 12:13:04 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"df['orcid'].describe()"
2021-03-23 12:13:04 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"## Primary email"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 19:03:37 +01:00
"source": [
"df['primary_email'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Duplicate emails"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 19:03:37 +01:00
"source": [
"df['primary_email'].dropna().loc[df['primary_email'].duplicated()]"
2021-03-23 12:13:04 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"df[df['primary_email'] == 'maykin@owasp.org']"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"df[df['primary_email'] == 'opercin@erbakan.edu.tr']"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"df[df['primary_email'] == 'patrick.davey@monash.edu']"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Domain = text after the last '@'. Missing e-mails and values without a domain part\n",
"# become NaN instead of raising IndexError, and the vectorised .str accessor avoids\n",
"# the chained df[mask]['col'] selection.\n",
"df['primary_email_domain'] = df['primary_email'].str.extract('@([^@]+)$', expand=False)"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"df['primary_email_domain'].describe()"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
"top_primary_emails = df[['primary_email_domain', 'orcid']]\\\n",
" .groupby('primary_email_domain')\\\n",
" .count()\\\n",
" .sort_values('orcid', ascending=False)\n",
"top_primary_emails"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 09:35:35 +01:00
"set_top_n(30)\n",
2021-03-18 17:43:00 +01:00
"data = [\n",
" go.Bar(\n",
" x=top_primary_emails[:TOP_N].index,\n",
" y=top_primary_emails[:TOP_N]['orcid']\n",
2021-03-18 17:43:00 +01:00
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top-%s email domains' % TOP_N,\n",
2021-03-18 17:43:00 +01:00
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Other emails"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
"def extract_email_domains(lst):\n",
"    \"\"\"Return the domain of every e-mail address in ``lst``.\n",
"\n",
"    The domain is whatever follows the last '@', so an address whose local\n",
"    part itself contains '@' still yields its real domain. Entries with no\n",
"    '@', or with nothing after it, are skipped instead of raising IndexError.\n",
"\n",
"    Parameters: lst -- iterable of e-mail address strings.\n",
"    Returns: list of domain strings, in input order.\n",
"    \"\"\"\n",
"    domains = []\n",
"    for email in lst:\n",
"        _, at_sign, domain = email.rpartition('@')\n",
"        if at_sign and domain:\n",
"            domains.append(domain)\n",
"    return domains"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Profiles without additional e-mails are dropped before the apply and end up as NaN after index alignment.\n",
"df['other_email_domains'] = df['other_emails'].dropna().apply(extract_email_domains)"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-22 19:08:20 +01:00
"metadata": {
"scrolled": true
},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
"df[df['other_email_domains'].notna()].head()"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
2021-03-22 19:08:20 +01:00
"df['n_emails'] = df['other_emails'].str.len()"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"emails_by_orcid = df.sort_values('n_emails', ascending=False)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 10:20:23 +01:00
"set_top_n(30)\n",
"data = [\n",
" go.Bar(\n",
" x=emails_by_orcid[:TOP_N]['orcid'],\n",
" y=emails_by_orcid[:TOP_N]['n_emails']\n",
2021-03-23 10:20:23 +01:00
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top %s ORCID iDs by email' % TOP_N, \n",
2021-03-23 10:20:23 +01:00
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-22 19:08:20 +01:00
"metadata": {},
"outputs": [],
"source": [
"top_other_emails = df[['orcid', 'other_email_domains']]\\\n",
2021-03-22 19:08:20 +01:00
" .explode('other_email_domains')\\\n",
" .reset_index(drop=True)\\\n",
" .groupby('other_email_domains')\\\n",
" .count()\\\n",
" .sort_values('orcid', ascending=False)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 09:35:35 +01:00
"set_top_n(30)\n",
2021-03-18 17:43:00 +01:00
"data = [\n",
" go.Bar(\n",
" x=top_other_emails[:TOP_N].index,\n",
" y=top_other_emails[:TOP_N]['orcid']\n",
2021-03-18 17:43:00 +01:00
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
2021-03-23 09:35:35 +01:00
" title='Top %s other email domains' % TOP_N, \n",
2021-03-18 17:43:00 +01:00
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Email speculation"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-22 19:08:20 +01:00
"metadata": {
"scrolled": true
},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
"df[df['primary_email'].isna() & df['other_emails'].notna()]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
2021-03-22 19:08:20 +01:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
2021-03-18 17:43:00 +01:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## URLs"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
"def extract_url_domains(lst):\n",
"    \"\"\"Map each URL record in ``lst`` to its registered domain.\n",
"\n",
"    Every record is indexable: position 0 holds a text description of the\n",
"    link and position 1 holds the URL itself. Only the URL is used.\n",
"    Returns a list with one ``registered_domain`` value per record, in order.\n",
"    \"\"\"\n",
"    return [tldextract.extract(record[1]).registered_domain for record in lst]"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Profiles without URLs are dropped before the apply and end up as NaN after index alignment.\n",
"df['url_domains'] = df['urls'].dropna().apply(extract_url_domains)"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
"df[df['url_domains'].notna()].head()"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
"source": [
2021-03-22 19:08:20 +01:00
"df['n_urls'] = df['url_domains'].str.len()"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
"urls_by_orcid = df.sort_values('n_urls', ascending=False)[['orcid', 'n_urls']]\n",
"urls_by_orcid"
2021-03-18 17:43:00 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-23 09:47:47 +01:00
"set_top_n(100)\n",
2021-03-18 17:43:00 +01:00
"data = [\n",
" go.Bar(\n",
" x=urls_by_orcid[:TOP_N]['orcid'],\n",
" y=urls_by_orcid[:TOP_N]['n_urls']\n",
2021-03-18 17:43:00 +01:00
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top %s ORCID iDs with URLs' % TOP_N,\n",
2021-03-23 09:35:35 +01:00
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
2021-03-18 17:43:00 +01:00
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
"outputs": [],
"source": [
"top_urls = df[['orcid', 'url_domains']]\\\n",
2021-03-23 09:47:47 +01:00
" .explode('url_domains')\\\n",
" .reset_index(drop=True)\\\n",
" .groupby('url_domains')\\\n",
" .count()\\\n",
" .sort_values('orcid', ascending=False)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 19:03:37 +01:00
"source": [
2021-03-24 13:33:01 +01:00
"set_top_n(50)\n",
2021-03-23 19:03:37 +01:00
"data = [\n",
" go.Bar(\n",
" x=top_urls[:TOP_N].index,\n",
" y=top_urls[:TOP_N]['orcid']\n",
2021-03-23 19:03:37 +01:00
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top-%s URL domains' % TOP_N,\n",
2021-03-23 19:03:37 +01:00
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## URLs speculation"
]
},
2021-03-23 19:03:37 +01:00
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {
"scrolled": true
},
2021-03-24 13:33:01 +01:00
"outputs": [],
"source": [
"df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n",
"exploded_sources"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": true
},
"outputs": [],
2021-03-23 19:03:37 +01:00
"source": [
"# Keep the rows whose single works source contains the profile's given names.\n",
"# If given_names is missing (NaN), str.find would raise TypeError; such rows are treated as non-matching.\n",
"name_in_source = exploded_sources.apply(\n",
"    lambda row: isinstance(row['given_names'], str) and row['given_names'] in row['works_source'],\n",
"    axis=1\n",
")\n",
"exploded_sources[name_in_source]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Works source"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Paste from Miriam"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## External IDs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"External IDs should come from reliable sources. ORCID registrants cannot add them freely."
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
"outputs": [],
"source": [
"df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()"
]
},
{
2021-03-24 13:33:01 +01:00
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"df.n_ids.describe()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
2021-03-23 19:03:37 +01:00
"source": [
"df[df.n_ids == df.n_ids.max()]"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
"outputs": [],
"source": [
"ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
"outputs": [],
"source": [
"ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 19:03:37 +01:00
"source": [
"ids[ids.provider.notna()].head()"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 09:47:47 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"data = [\n",
" go.Bar(\n",
" x=top_ids_providers.index,\n",
" y=top_ids_providers['orcid']\n",
2021-03-23 19:03:37 +01:00
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='IDs provided by providers',\n",
" xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
2021-03-23 09:47:47 +01:00
]
},
{
2021-03-23 19:03:37 +01:00
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 09:47:47 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"# Distinct values found in the provider column\n",
"ids['provider'].unique()"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"## Keywords"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"This field is problematic because users can pack several keywords into a single entry instead of entering them as separate keywords. Look at this example:"
2021-03-23 09:47:47 +01:00
]
},
{
2021-03-23 19:03:37 +01:00
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 09:47:47 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"# Keywords of the first row whose orcid equals AM\n",
"df.loc[df['orcid'] == AM, 'keywords'].iloc[0]"
2021-03-23 09:47:47 +01:00
]
},
{
2021-03-23 19:03:37 +01:00
"cell_type": "markdown",
2021-03-23 09:47:47 +01:00
"metadata": {},
"source": [
2021-03-23 19:03:37 +01:00
"I did a good job. The following one, instead, is dirty:"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 09:47:47 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"# Keywords of the first row whose orcid equals PP\n",
"df.loc[df['orcid'] == PP, 'keywords'].iloc[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"So the keyword field needs some cleaning"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
"outputs": [],
"source": [
"def fix_keywords(lst):\n",
"    \"\"\"Split comma-packed keywords into individual, de-duplicated keywords.\n",
"\n",
"    Every entry of `lst` is split on commas and each piece is stripped of\n",
"    surrounding whitespace; empty pieces are dropped.\n",
"\n",
"    Args:\n",
"        lst: iterable of keyword strings, each possibly holding several\n",
"            comma-separated keywords.\n",
"\n",
"    Returns:\n",
"        list of unique keywords in first-seen order, so the result is\n",
"        reproducible across kernel restarts (iterating a set of strings\n",
"        depends on the per-process hash seed).\n",
"    \"\"\"\n",
"    seen = {}\n",
"    for entry in lst:\n",
"        for token in entry.split(','):\n",
"            keyword = token.strip()\n",
"            if keyword:  # skip empty pieces, e.g. from trailing commas\n",
"                seen[keyword] = None  # dict keys keep insertion order\n",
"    return list(seen)"
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 19:03:37 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Clean every non-null keyword list; rows without keywords stay NaN after alignment\n",
"df['fixed_keywords'] = df['keywords'].dropna().apply(fix_keywords)"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 09:35:35 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"# Cleaned keywords of the first row whose orcid equals PP\n",
"df.loc[df['orcid'] == PP, 'fixed_keywords'].iloc[0]"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
"outputs": [],
"source": [
2021-03-23 19:03:37 +01:00
"# Count the cleaned keywords: raw entries may pack several comma-separated\n",
"# keywords into one item, so counting the raw `keywords` lists would miscount them.\n",
"df['n_keywords'] = df['fixed_keywords'].str.len()"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 09:47:47 +01:00
"source": [
"# ORCiDs ranked by their keyword count, highest first\n",
"keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values(by='n_keywords', ascending=False)\n",
"keywords_by_orcid"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 12:13:04 +01:00
"source": [
2021-03-23 19:03:37 +01:00
"# Bar chart of the first TOP_N rows of keywords_by_orcid\n",
"set_top_n(100)\n",
"data = [\n",
"    go.Bar(\n",
"        x=keywords_by_orcid['orcid'].head(TOP_N),\n",
"        y=keywords_by_orcid['n_keywords'].head(TOP_N),\n",
"    )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
"    title='Keywords provided by ORCiD',\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}, 'range': TOP_RANGE},\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
2021-03-23 12:13:04 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
"outputs": [],
"source": [
"# Explode the cleaned keywords (not the raw, possibly comma-packed ones) so that\n",
"# every keyword is counted on its own; the index keeps the name 'keywords'.\n",
"top_keywords = (\n",
"    df[['orcid', 'fixed_keywords']]\n",
"    .explode('fixed_keywords')\n",
"    .reset_index(drop=True)\n",
"    .groupby('fixed_keywords')\n",
"    .count()\n",
"    .rename_axis('keywords')\n",
"    .sort_values('orcid', ascending=False)\n",
")"
2021-03-23 09:47:47 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:47:47 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-23 12:13:04 +01:00
"source": [
"# Bar chart of the first TOP_N rows of top_keywords\n",
"set_top_n(50)\n",
"data = [\n",
"    go.Bar(\n",
"        x=top_keywords.index[:TOP_N],\n",
"        y=top_keywords['orcid'].head(TOP_N),\n",
"    )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
"    title='Top-%s keywords occurrence' % TOP_N,\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
2021-03-23 09:35:35 +01:00
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
2021-03-23 09:47:47 +01:00
"## Correlation"
2021-03-23 09:35:35 +01:00
]
},
{
"cell_type": "code",
2021-03-24 13:33:01 +01:00
"execution_count": null,
2021-03-23 09:35:35 +01:00
"metadata": {},
2021-03-24 13:33:01 +01:00
"outputs": [],
2021-03-18 17:43:00 +01:00
"source": [
2021-03-24 13:33:01 +01:00
"# Keep only numeric/boolean columns before correlating: the frame also holds\n",
"# non-numeric columns (e.g. the `keywords` lists), and pandas >= 2.0 no longer\n",
"# drops those silently in DataFrame.corr().\n",
"fig = px.imshow(df.select_dtypes(include=['number', 'bool']).fillna(0).corr())\n",
"fig.show()"
2021-03-18 17:43:00 +01:00
]
},
2021-03-24 13:33:01 +01:00
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Label speculation"
]
},
2021-03-23 19:03:37 +01:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
2021-03-24 13:33:01 +01:00
"source": [
"# Rows whose label equals 1\n",
"df.loc[df['label'] == 1]"
]
2021-03-23 19:03:37 +01:00
},
2021-03-18 17:43:00 +01:00
{
"cell_type": "code",
2021-03-22 19:08:20 +01:00
"execution_count": null,
2021-03-18 17:43:00 +01:00
"metadata": {},
"outputs": [],
2021-03-22 19:08:20 +01:00
"source": []
2021-03-18 17:43:00 +01:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}