fake-orcid-analysis/notebooks/01-Exploration.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Exploratory analysis"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO:\n",
    "- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)\n",
    "- Make casistics (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)\n",
    "- Temporal dimension of any use?\n",
    "- Can we access private info thanks to the OpenAIRE-ORCID agreement?\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "        <script type=\"text/javascript\">\n",
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
       "        if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
       "        if (typeof require !== 'undefined') {\n",
       "        require.undef(\"plotly\");\n",
       "        requirejs.config({\n",
       "            paths: {\n",
       "                'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
       "            }\n",
       "        });\n",
       "        require(['plotly'], function(Plotly) {\n",
       "            window._Plotly = Plotly;\n",
       "        });\n",
       "        }\n",
       "        </script>\n",
       "        "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import glob\n",
    "\n",
    "import pandas as pd\n",
    "import ast\n",
    "import tldextract\n",
    "import numpy\n",
    "\n",
    "import plotly\n",
    "from plotly.offline import iplot, init_notebook_mode\n",
    "import plotly.graph_objs as go\n",
    "import plotly.express as px\n",
    "\n",
    "init_notebook_mode(connected=True)\n",
    "TOP_N = 0\n",
    "TOP_RANGE = [0, 0]\n",
    "def set_top_n(n):\n",
    "    global TOP_N, TOP_RANGE\n",
    "    TOP_N = n\n",
    "    TOP_RANGE = [-.5, n - 1 + .5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notable solid ORCID iDs for explorative purposes:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "AM = '0000-0002-5193-7851'\n",
    "PP = '0000-0002-8588-4196'\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notable anomalies:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "JOURNAL = '0000-0003-1815-5732'\n",
    "NOINFO = '0000-0001-5009-2052'\n",
    "VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n",
    "# todo: find group-shared ORCiD, if possible"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notable fake ORCID iDs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "SCAFFOLD = '0000-0001-5004-7761'\n",
    "WHATSAPP = '0000-0001-6997-9470'\n",
    "PENIS = '0000-0002-3399-7287'\n",
    "BITCOIN = '0000-0002-7518-6845'\n",
    "FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment\n",
    "CANNABIS = '0000-0002-9025-8632'      # URL > 70 + works (REMOVED)\n",
    "PLUMBER = '0000-0002-1700-8311'       # URL > 10 + works "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Load the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verified_email</th>\n",
       "      <th>verified_primary_email</th>\n",
       "      <th>given_names</th>\n",
       "      <th>family_name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other_names</th>\n",
       "      <th>urls</th>\n",
       "      <th>primary_email</th>\n",
       "      <th>...</th>\n",
       "      <th>employment</th>\n",
       "      <th>n_works</th>\n",
       "      <th>works_source</th>\n",
       "      <th>activation_date</th>\n",
       "      <th>last_update_date</th>\n",
       "      <th>n_doi</th>\n",
       "      <th>n_arxiv</th>\n",
       "      <th>n_pmc</th>\n",
       "      <th>n_other_pids</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>10000000</th>\n",
       "      <td>0000-0001-9812-9790</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>jonathan</td>\n",
       "      <td>termaat</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>[[research co-ordinator, waikato district heal...</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2019-04-15t03:08:05.268z</td>\n",
       "      <td>2019-04-15t03:09:44.443z</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10000001</th>\n",
       "      <td>0000-0002-0572-0598</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>jonathan</td>\n",
       "      <td>jørgensen</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2019-03-17t20:31:23.753z</td>\n",
       "      <td>2019-03-17t20:33:50.316z</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10000002</th>\n",
       "      <td>0000-0002-1512-9646</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>jonathan</td>\n",
       "      <td>mkrtchyan</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>1</td>\n",
       "      <td>[jonathan mkrtchyan]</td>\n",
       "      <td>2020-08-24t18:47:27.332z</td>\n",
       "      <td>2020-08-24t18:54:37.398z</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10000003</th>\n",
       "      <td>0000-0002-2271-4069</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>jonathan</td>\n",
       "      <td>pickard</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2018-05-03t09:34:25.613z</td>\n",
       "      <td>2018-05-10t13:05:09.297z</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10000004</th>\n",
       "      <td>0000-0002-3054-9622</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>jonathan</td>\n",
       "      <td>greer</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[jonathan s. greer]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>[[associate professor of old testament and dir...</td>\n",
       "      <td>2</td>\n",
       "      <td>[multidisciplinary digital publishing institut...</td>\n",
       "      <td>2019-04-09t20:05:25.447z</td>\n",
       "      <td>2020-02-07t15:55:18.951z</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        orcid  claimed  verified_email  \\\n",
       "10000000  0000-0001-9812-9790        1               1   \n",
       "10000001  0000-0002-0572-0598        1               1   \n",
       "10000002  0000-0002-1512-9646        1               1   \n",
       "10000003  0000-0002-2271-4069        1               1   \n",
       "10000004  0000-0002-3054-9622        1               1   \n",
       "\n",
       "          verified_primary_email given_names family_name biography  \\\n",
       "10000000                       1    jonathan     termaat       NaN   \n",
       "10000001                       1    jonathan   jørgensen       NaN   \n",
       "10000002                       1    jonathan   mkrtchyan       NaN   \n",
       "10000003                       1    jonathan     pickard       NaN   \n",
       "10000004                       1    jonathan       greer       NaN   \n",
       "\n",
       "                  other_names urls primary_email  ...  \\\n",
       "10000000                  NaN  NaN           NaN  ...   \n",
       "10000001                  NaN  NaN           NaN  ...   \n",
       "10000002                  NaN  NaN           NaN  ...   \n",
       "10000003                  NaN  NaN           NaN  ...   \n",
       "10000004  [jonathan s. greer]  NaN           NaN  ...   \n",
       "\n",
       "                                                 employment n_works  \\\n",
       "10000000  [[research co-ordinator, waikato district heal...       0   \n",
       "10000001                                                NaN       0   \n",
       "10000002                                                NaN       1   \n",
       "10000003                                                NaN       0   \n",
       "10000004  [[associate professor of old testament and dir...       2   \n",
       "\n",
       "                                               works_source  \\\n",
       "10000000                                                NaN   \n",
       "10000001                                                NaN   \n",
       "10000002                               [jonathan mkrtchyan]   \n",
       "10000003                                                NaN   \n",
       "10000004  [multidisciplinary digital publishing institut...   \n",
       "\n",
       "                   activation_date          last_update_date  n_doi n_arxiv  \\\n",
       "10000000  2019-04-15t03:08:05.268z  2019-04-15t03:09:44.443z      0       0   \n",
       "10000001  2019-03-17t20:31:23.753z  2019-03-17t20:33:50.316z      0       0   \n",
       "10000002  2020-08-24t18:47:27.332z  2020-08-24t18:54:37.398z      1       0   \n",
       "10000003  2018-05-03t09:34:25.613z  2018-05-10t13:05:09.297z      0       0   \n",
       "10000004  2019-04-09t20:05:25.447z  2020-02-07t15:55:18.951z      2       0   \n",
       "\n",
       "         n_pmc n_other_pids  label  \n",
       "10000000     0            0      0  \n",
       "10000001     0            0      0  \n",
       "10000002     0            2      1  \n",
       "10000003     0            0      0  \n",
       "10000004     0            1      1  \n",
       "\n",
       "[5 rows x 24 columns]"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "parts = glob.glob('../data/processed/dataset.pkl.*')\n",
    "df = pd.concat((pd.read_pickle(part) for part in parts))\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Notable profiles inspection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verified_email</th>\n",
       "      <th>verified_primary_email</th>\n",
       "      <th>given_names</th>\n",
       "      <th>family_name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other_names</th>\n",
       "      <th>urls</th>\n",
       "      <th>primary_email</th>\n",
       "      <th>...</th>\n",
       "      <th>employment</th>\n",
       "      <th>n_works</th>\n",
       "      <th>works_source</th>\n",
       "      <th>activation_date</th>\n",
       "      <th>last_update_date</th>\n",
       "      <th>n_doi</th>\n",
       "      <th>n_arxiv</th>\n",
       "      <th>n_pmc</th>\n",
       "      <th>n_other_pids</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1575869</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>andrea</td>\n",
       "      <td>mannocci</td>\n",
       "      <td>data scientist &amp; researcher; scholarly knowled...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[[personal website, https://andremann.github.i...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>...</td>\n",
       "      <td>[[research associate, istituto di scienza e te...</td>\n",
       "      <td>37</td>\n",
       "      <td>[scopus - elsevier, crossref metadata search, ...</td>\n",
       "      <td>2017-09-12t14:28:33.467z</td>\n",
       "      <td>2021-03-09t08:32:47.840z</td>\n",
       "      <td>34</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>60</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verified_email  verified_primary_email  \\\n",
       "1575869  0000-0002-5193-7851        1               1                       1   \n",
       "\n",
       "        given_names family_name  \\\n",
       "1575869      andrea    mannocci   \n",
       "\n",
       "                                                 biography other_names  \\\n",
       "1575869  data scientist & researcher; scholarly knowled...         NaN   \n",
       "\n",
       "                                                      urls  \\\n",
       "1575869  [[personal website, https://andremann.github.i...   \n",
       "\n",
       "                       primary_email  ...  \\\n",
       "1575869  andrea.mannocci@isti.cnr.it  ...   \n",
       "\n",
       "                                                employment n_works  \\\n",
       "1575869  [[research associate, istituto di scienza e te...      37   \n",
       "\n",
       "                                              works_source  \\\n",
       "1575869  [scopus - elsevier, crossref metadata search, ...   \n",
       "\n",
       "                  activation_date          last_update_date  n_doi n_arxiv  \\\n",
       "1575869  2017-09-12t14:28:33.467z  2021-03-09t08:32:47.840z     34       0   \n",
       "\n",
       "        n_pmc n_other_pids  label  \n",
       "1575869     0           60      1  \n",
       "\n",
       "[1 rows x 24 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verified_email</th>\n",
       "      <th>verified_primary_email</th>\n",
       "      <th>given_names</th>\n",
       "      <th>family_name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other_names</th>\n",
       "      <th>urls</th>\n",
       "      <th>primary_email</th>\n",
       "      <th>...</th>\n",
       "      <th>employment</th>\n",
       "      <th>n_works</th>\n",
       "      <th>works_source</th>\n",
       "      <th>activation_date</th>\n",
       "      <th>last_update_date</th>\n",
       "      <th>n_doi</th>\n",
       "      <th>n_arxiv</th>\n",
       "      <th>n_pmc</th>\n",
       "      <th>n_other_pids</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>6819986</th>\n",
       "      <td>0000-0001-6997-9470</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>other</td>\n",
       "      <td>whatsapp</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>[[otherwhatsapp, https://otherwhatsapp.com/], ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>0</td>\n",
       "      <td>NaN</td>\n",
       "      <td>2020-10-07t10:37:12.237z</td>\n",
       "      <td>2020-10-08t02:32:03.935z</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1 rows × 24 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verified_email  verified_primary_email  \\\n",
       "6819986  0000-0001-6997-9470        1               1                       1   \n",
       "\n",
       "        given_names family_name biography other_names  \\\n",
       "6819986       other    whatsapp       NaN         NaN   \n",
       "\n",
       "                                                      urls primary_email  ...  \\\n",
       "6819986  [[otherwhatsapp, https://otherwhatsapp.com/], ...           NaN  ...   \n",
       "\n",
       "        employment n_works works_source           activation_date  \\\n",
       "6819986        NaN       0          NaN  2020-10-07t10:37:12.237z   \n",
       "\n",
       "                 last_update_date  n_doi n_arxiv n_pmc n_other_pids  label  \n",
       "6819986  2020-10-08t02:32:03.935z      0       0     0            0      0  \n",
       "\n",
       "[1 rows x 24 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == WHATSAPP]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.count() #10916574"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['orcid'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Primary email"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['primary_email'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Dupe emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['primary_email'] == 'maykin@owasp.org']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['primary_email'] == 'opercin@erbakan.edu.tr']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['primary_email'] == 'patrick.davey@monash.edu']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['primary_email_domain'].describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_primary_emails = df[['primary_email_domain', 'orcid']]\\\n",
    "                .groupby('primary_email_domain')\\\n",
    "                .count()\\\n",
    "                .sort_values('orcid', ascending=False)\n",
    "top_primary_emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_top_n(30)\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=top_primary_emails[:TOP_N].index,\n",
    "        y=top_primary_emails[:TOP_N]['orcid']\n",
    "    )\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='Top-%s email domains' % TOP_N,\n",
    "    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
    ")\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Other emails"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_email_domains(lst):\n",
    "    res = []\n",
    "    for email in lst:\n",
    "        res.append(email.split('@')[1])\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: extract_email_domains(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df[df['other_email_domains'].notna()].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['n_emails'] = df['other_emails'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "emails_by_orcid = df.sort_values('n_emails', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_top_n(30)\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=emails_by_orcid[:TOP_N]['orcid'],\n",
    "        y=emails_by_orcid[:TOP_N]['n_emails']\n",
    "    )\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='Top %s ORCID iDs by email' % TOP_N, \n",
    "    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
    ")\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_other_emails = df[['orcid', 'other_email_domains']]\\\n",
    "                        .explode('other_email_domains')\\\n",
    "                        .reset_index(drop=True)\\\n",
    "                        .groupby('other_email_domains')\\\n",
    "                        .count()\\\n",
    "                        .sort_values('orcid', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_top_n(30)\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=top_other_emails[:TOP_N].index,\n",
    "        y=top_other_emails[:TOP_N]['orcid']\n",
    "    )\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='Top %s other email domains' % TOP_N, \n",
    "    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
    ")\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Email speculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df[df['primary_email'].isna() & df['other_emails'].notna()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## URLs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_url_domains(lst):\n",
    "    domains = []\n",
    "    for e in lst:\n",
    "        # e[0] is a string describing the url\n",
    "        # e[1] is the url\n",
    "        domain = tldextract.extract(e[1])\n",
    "        domains.append(domain.registered_domain)\n",
    "    return domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['url_domains'] = df[df.urls.notna()]['urls'].apply(lambda x: extract_url_domains(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['url_domains'].notna()].head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['n_urls'] = df['url_domains'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "urls_by_orcid = df.sort_values('n_urls', ascending=False)[['orcid', 'n_urls']]\n",
    "urls_by_orcid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_top_n(100)\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=urls_by_orcid[:TOP_N]['orcid'],\n",
    "        y=urls_by_orcid[:TOP_N]['n_urls']\n",
    "    )\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='Top %s ORCID iDs with URLs' % TOP_N,\n",
    "    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
    ")\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "top_urls = df[['orcid', 'url_domains']]\\\n",
    "                .explode('url_domains')\\\n",
    "                .reset_index(drop=True)\\\n",
    "                .groupby('url_domains')\\\n",
    "                .count()\\\n",
    "                .sort_values('orcid', ascending=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "set_top_n(50)\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=top_urls[:TOP_N].index,\n",
    "        y=top_urls[:TOP_N]['orcid']\n",
    "    )\n",
    "]\n",
    "\n",
    "layout = go.Layout(\n",
    "    title='Top-%s URL domains' % TOP_N,\n",
    "    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
    ")\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## URLs speculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n",
    "exploded_sources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Works source"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Paste from Miriam"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## External IDs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "External IDs should come from reliable sources. ORCiD registrants cannot add them freely."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.n_ids.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df.n_ids == df.n_ids.max()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Flatten `external_ids`: one row per entry, keyed by the owning `orcid`.\n",
    "ids = (\n",
    "    df[['orcid', 'external_ids']]\n",
    "    .explode('external_ids')\n",
    "    .reset_index(drop=True)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The first element of each external-ID entry is stored as its provider;\n",
    "# rows without an external ID keep a NaN provider.\n",
    "ids['provider'] = ids.loc[ids['external_ids'].notna(), 'external_ids'].apply(lambda entry: entry[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peek at the first rows that actually have a provider.\n",
    "ids.dropna(subset=['provider']).head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count rows per provider, most frequent provider first.\n",
    "top_ids_providers = (\n",
    "    ids.groupby('provider')\n",
    "    .count()\n",
    "    .sort_values('orcid', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bar chart of the per-provider counts held in `top_ids_providers`.\n",
    "data = [go.Bar(x=top_ids_providers.index, y=top_ids_providers['orcid'])]\n",
    "layout = go.Layout(\n",
    "    title='IDs provided by providers',\n",
    "    xaxis={'tickangle': 45, 'tickfont': {'size': 12}}\n",
    ")\n",
    "\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Distinct provider values (NaN included for rows without an external ID).\n",
    "ids['provider'].unique()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Keywords"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This field is problematic: users can be nasty and cram multiple keywords into a single entry instead of listing them as separate keywords. Look at this example:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw keywords of the profile whose ORCiD equals `AM`.\n",
    "df.loc[df['orcid'] == AM, 'keywords'].values[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "I did a good job. The following one, instead, is dirty:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw keywords of the profile whose ORCiD equals `PP`.\n",
    "df.loc[df['orcid'] == PP, 'keywords'].values[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "So the keyword field needs some cleaning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fix_keywords(lst):\n",
    "    \"\"\"Split comma-separated keyword entries into individual, de-duplicated keywords.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    lst : iterable of str\n",
    "        Raw keyword entries; one entry may pack several keywords separated by commas.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of str\n",
    "        Unique keywords with surrounding whitespace stripped and empty tokens dropped.\n",
    "        The list is sorted so that the result does not depend on set iteration order,\n",
    "        which can change between kernel restarts.\n",
    "    \"\"\"\n",
    "    fixed = {token.strip() for entry in lst for token in entry.split(',')}\n",
    "    # Consecutive or trailing commas produce empty tokens; they are not keywords.\n",
    "    fixed.discard('')\n",
    "    return sorted(fixed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clean the keywords of every profile that has some; the others stay NaN\n",
    "# because the assignment aligns on the index.\n",
    "df['fixed_keywords'] = df.loc[df['keywords'].notna(), 'keywords'].apply(fix_keywords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Cleaned keywords of the profile whose ORCiD equals `PP`.\n",
    "df.loc[df['orcid'] == PP, 'fixed_keywords'].values[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Count keywords on the cleaned column, so that several keywords packed into one\n",
    "# comma-separated entry are counted individually. Profiles without keywords stay NaN.\n",
    "df['n_keywords'] = df['fixed_keywords'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Profiles ranked by keyword count, largest first.\n",
    "keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)\n",
    "keywords_by_orcid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `set_top_n` is defined earlier in the notebook; presumably it updates TOP_N and TOP_RANGE - confirm.\n",
    "set_top_n(100)\n",
    "\n",
    "# Bar chart of the keyword count for the first TOP_N rows of `keywords_by_orcid`.\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=keywords_by_orcid.head(TOP_N)['orcid'],\n",
    "        y=keywords_by_orcid.head(TOP_N)['n_keywords']\n",
    "    )\n",
    "]\n",
    "layout = go.Layout(\n",
    "    title='Keywords provided by ORCiD',\n",
    "    xaxis={'tickangle': 45, 'tickfont': {'size': 12}, 'range': TOP_RANGE}\n",
    ")\n",
    "\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rank keywords by occurrence count. The cleaned `fixed_keywords` column is used so that\n",
    "# comma-packed entries are split into individual keywords before counting; ranking the raw\n",
    "# `keywords` column would treat a whole packed entry as one keyword.\n",
    "top_keywords = (\n",
    "    df[['orcid', 'fixed_keywords']]\n",
    "    .explode('fixed_keywords')\n",
    "    .reset_index(drop=True)\n",
    "    .groupby('fixed_keywords')\n",
    "    .count()\n",
    "    .sort_values('orcid', ascending=False)\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# `set_top_n` is defined earlier in the notebook; presumably it updates TOP_N - confirm.\n",
    "set_top_n(50)\n",
    "\n",
    "# Bar chart of the occurrence counts for the first TOP_N rows of `top_keywords`.\n",
    "data = [\n",
    "    go.Bar(\n",
    "        x=top_keywords.head(TOP_N).index,\n",
    "        y=top_keywords.head(TOP_N)['orcid']\n",
    "    )\n",
    "]\n",
    "layout = go.Layout(\n",
    "    title='Top-%s keywords occurrence' % TOP_N,\n",
    "    xaxis={'tickangle': 45, 'tickfont': {'size': 12}}\n",
    ")\n",
    "\n",
    "fig = go.Figure(data=data, layout=layout)\n",
    "plotly.offline.iplot(fig)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Correlation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Correlate only numeric/boolean columns. The frame also holds string and list columns\n",
    "# (e.g. `orcid`, `keywords`), which make `DataFrame.corr()` raise on pandas >= 2.0, where\n",
    "# `numeric_only` defaults to False; older versions dropped such columns silently.\n",
    "# Missing values are treated as 0, as before.\n",
    "numeric_features = df.fillna(0).select_dtypes(include=['number', 'bool'])\n",
    "fig = px.imshow(numeric_features.corr())\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Label speculation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Profiles whose `label` equals 1.\n",
    "df.loc[df['label'] == 1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}