From 8288d877fa7113f8f5efe25e279fa79f1d581381 Mon Sep 17 00:00:00 2001
From: Andrea Mannocci <andremann@libero.it>
Date: Fri, 26 Mar 2021 09:16:11 +0100
Subject: [PATCH] first attempts with rudimentary ML

---
 notebooks/01-Exploration.ipynb        |   35 +-
 notebooks/03-Feature extraction.ipynb | 2422 -------------------------
 notebooks/03-Machine Learning.ipynb   |  468 +++++
 3 files changed, 501 insertions(+), 2424 deletions(-)
 delete mode 100644 notebooks/03-Feature extraction.ipynb
 create mode 100644 notebooks/03-Machine Learning.ipynb

diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb
index 4a624ae..51d6a70 100644
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@@ -764,7 +764,7 @@
     }
    ],
    "source": [
-    "df.count() #10916574"
+    "df.count()"
    ]
   },
   {
@@ -16260,6 +16260,28 @@
     "fig.show()"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df[['verified_email', \n",
+    "    'verified_primary_email', \n",
+    "    'n_works', \n",
+    "    'n_doi',\n",
+    "    'n_arxiv', \n",
+    "    'n_pmc', \n",
+    "    'n_other_pids', \n",
+    "    'n_emails', \n",
+    "    'n_urls', \n",
+    "    'n_ids', \n",
+    "    'n_keywords', \n",
+    "    'n_employment', \n",
+    "    'n_education', \n",
+    "    'label']].to_pickle('../data/processed/features.pkl')"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -16884,12 +16906,21 @@
     "# (df.n_works > 0) & (df.n_ids > 1)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Serialise "
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
    "metadata": {},
    "outputs": [],
-   "source": []
+   "source": [
+    " "
+   ]
   }
  ],
  "metadata": {
diff --git a/notebooks/03-Feature extraction.ipynb b/notebooks/03-Feature extraction.ipynb
deleted file mode 100644
index e25ef16..0000000
--- a/notebooks/03-Feature extraction.ipynb	
+++ /dev/null
@@ -1,2422 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Todo in data\n",
-    "- Column names -> no space\n",
-    "- If a list is empty, serialise [] in the csv\n",
-    "- If a string is empty, serialise '' in the csv"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import ast\n",
-    "from urllib.parse import urlparse\n",
-    "import tldextract\n",
-    "\n",
-    "import pandas as pd\n",
-    "from sklearn.preprocessing import MultiLabelBinarizer\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "mlb = MultiLabelBinarizer()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Notable Solid ORCID iDs for debug purposes\n",
-    "AM = '0000-0002-5193-7851'\n",
-    "PP = '0000-0002-8588-4196'\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Notable fake ORCID iDs for debug purposes\n",
-    "SCAFFOLD = '0000-0001-5004-7761'\n",
-    "WHATSAPP = '0000-0001-6997-9470'\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header = 0,\n",
-    "                         dtype = {\"orcid\": pd.StringDtype(), \n",
-    "                                  \"claimed\": bool, \n",
-    "                                  \"verifyed email\": bool, \n",
-    "                                  \"verified primary email\": bool,\n",
-    "                                  \"given names\": pd.StringDtype(),\n",
-    "                                  \"family name\": pd.StringDtype(),\n",
-    "                                  \"biography\": pd.StringDtype(),\n",
-    "                                  \"other names\": pd.StringDtype(),\n",
-    "                                  \"researcher urls\": pd.StringDtype(),\n",
-    "                                  \"primary email\": pd.StringDtype(),\n",
-    "                                  \"other emails\": pd.StringDtype(),\n",
-    "                                  \"keywords\": pd.StringDtype(),\n",
-    "                                  \"eternal identifiers\": pd.StringDtype(),\n",
-    "                                  \"education\": pd.StringDtype(),\n",
-    "                                  \"employments\": pd.StringDtype(),\n",
-    "                                  \"number of works\": pd.Int16Dtype(),\n",
-    "                                  \"works source\": pd.StringDtype()})"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>works source</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0000-0001-5000-2053</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Jorge</td>\n",
-       "      <td>Jaramillo Sanchez</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0000-0001-5000-6548</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Wiseman</td>\n",
-       "      <td>Bekelesi</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0000-0001-5000-7962</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>ALICE</td>\n",
-       "      <td>INDIMULI</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0000-0001-5000-8586</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>shim</td>\n",
-       "      <td>ji yun</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0000-0001-5001-0256</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Sandro</td>\n",
-       "      <td>Caramaschi</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                 orcid  claimed  verifyed email  verified primary email  \\\n",
-       "0  0000-0001-5000-2053     True           False                   False   \n",
-       "1  0000-0001-5000-6548     True           False                   False   \n",
-       "2  0000-0001-5000-7962     True            True                    True   \n",
-       "3  0000-0001-5000-8586     True           False                   False   \n",
-       "4  0000-0001-5001-0256     True           False                   False   \n",
-       "\n",
-       "  given names         family name biography other names researcher urls  \\\n",
-       "0      Jorge   Jaramillo Sanchez       <NA>        <NA>            <NA>   \n",
-       "1     Wiseman            Bekelesi      <NA>        <NA>            <NA>   \n",
-       "2       ALICE            INDIMULI      <NA>        <NA>            <NA>   \n",
-       "3        shim              ji yun      <NA>        <NA>            <NA>   \n",
-       "4      Sandro          Caramaschi      <NA>        <NA>            <NA>   \n",
-       "\n",
-       "  primary email other emails keywords external identifiers education  \\\n",
-       "0          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
-       "1          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
-       "2          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
-       "3          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
-       "4          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
-       "\n",
-       "  employments  number of works works source  \n",
-       "0        <NA>                0         <NA>  \n",
-       "1        <NA>                0         <NA>  \n",
-       "2        <NA>                0         <NA>  \n",
-       "3        <NA>                0         <NA>  \n",
-       "4        <NA>                0         <NA>  "
-      ]
-     },
-     "execution_count": 6,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.head(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>works source</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8840413</th>\n",
-       "      <td>0000-0002-5193-7851</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Andrea</td>\n",
-       "      <td>Mannocci</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
-       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
-       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
-       "      <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
-       "      <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
-       "      <td>37</td>\n",
-       "      <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "8840413  0000-0002-5193-7851     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
-       "\n",
-       "                                           researcher urls  \\\n",
-       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
-       "\n",
-       "                       primary email other emails  \\\n",
-       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
-       "\n",
-       "                                                  keywords  \\\n",
-       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
-       "\n",
-       "                          external identifiers  \\\n",
-       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
-       "\n",
-       "                                                 education  \\\n",
-       "8840413  [[\"Information engineering\", \"Ph.D.\", \"Univers...   \n",
-       "\n",
-       "                                               employments  number of works  \\\n",
-       "8840413  [[\"Research Associate\", \"Istituto di Scienza e...               37   \n",
-       "\n",
-       "                                              works source  \n",
-       "8840413  [\"Scopus - Elsevier\", \"Crossref Metadata Searc...  "
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Extracting works source"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def extract_work_source(lst):\n",
-    "    extracted = []\n",
-    "    for s in lst:\n",
-    "        if 'Scopus - Elsevier' in s or 'Crossref' in s:\n",
-    "            extracted.append(s)\n",
-    "    return extracted"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 13,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>Crossref</th>\n",
-       "      <th>Crossref Metadata Search</th>\n",
-       "      <th>Scopus - Elsevier</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8840413</th>\n",
-       "      <td>0000-0002-5193-7851</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Andrea</td>\n",
-       "      <td>Mannocci</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
-       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
-       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
-       "      <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
-       "      <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
-       "      <td>37</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "8840413  0000-0002-5193-7851     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
-       "\n",
-       "                                           researcher urls  \\\n",
-       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
-       "\n",
-       "                       primary email other emails  \\\n",
-       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
-       "\n",
-       "                                                  keywords  \\\n",
-       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
-       "\n",
-       "                          external identifiers  \\\n",
-       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
-       "\n",
-       "                                                 education  \\\n",
-       "8840413  [[\"Information engineering\", \"Ph.D.\", \"Univers...   \n",
-       "\n",
-       "                                               employments  number of works  \\\n",
-       "8840413  [[\"Research Associate\", \"Istituto di Scienza e...               37   \n",
-       "\n",
-       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  \n",
-       "8840413         1                         1                  1  "
-      ]
-     },
-     "execution_count": 13,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Education"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['n_education'] = df['education'].str.len()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 16,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.drop('education', axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>Crossref</th>\n",
-       "      <th>Crossref Metadata Search</th>\n",
-       "      <th>Scopus - Elsevier</th>\n",
-       "      <th>n_education</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8840413</th>\n",
-       "      <td>0000-0002-5193-7851</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Andrea</td>\n",
-       "      <td>Mannocci</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
-       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
-       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
-       "      <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
-       "      <td>37</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "8840413  0000-0002-5193-7851     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
-       "\n",
-       "                                           researcher urls  \\\n",
-       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
-       "\n",
-       "                       primary email other emails  \\\n",
-       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
-       "\n",
-       "                                                  keywords  \\\n",
-       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
-       "\n",
-       "                          external identifiers  \\\n",
-       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
-       "\n",
-       "                                               employments  number of works  \\\n",
-       "8840413  [[\"Research Associate\", \"Istituto di Scienza e...               37   \n",
-       "\n",
-       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  n_education  \n",
-       "8840413         1                         1                  1            4  "
-      ]
-     },
-     "execution_count": 17,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Employment"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['n_employments'] = df['employments'].str.len()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.drop('employments', axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>Crossref</th>\n",
-       "      <th>Crossref Metadata Search</th>\n",
-       "      <th>Scopus - Elsevier</th>\n",
-       "      <th>n_education</th>\n",
-       "      <th>n_employments</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8840413</th>\n",
-       "      <td>0000-0002-5193-7851</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Andrea</td>\n",
-       "      <td>Mannocci</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
-       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
-       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
-       "      <td>37</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>5</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "8840413  0000-0002-5193-7851     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
-       "\n",
-       "                                           researcher urls  \\\n",
-       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
-       "\n",
-       "                       primary email other emails  \\\n",
-       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
-       "\n",
-       "                                                  keywords  \\\n",
-       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
-       "\n",
-       "                          external identifiers  number of works  Crossref  \\\n",
-       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]               37         1   \n",
-       "\n",
-       "         Crossref Metadata Search  Scopus - Elsevier  n_education  \\\n",
-       "8840413                         1                  1            4   \n",
-       "\n",
-       "         n_employments  \n",
-       "8840413              5  "
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# External IDs"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# def extract_ids(lst):\n",
-    "#     extracted = []\n",
-    "#     for id in lst:\n",
-    "#         extracted.append(id[0])\n",
-    "#     return extracted"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 25,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['n_ext_ids'] = df['external identifiers'].str.len()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.drop(['external identifiers'], axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>Crossref</th>\n",
-       "      <th>Crossref Metadata Search</th>\n",
-       "      <th>Scopus - Elsevier</th>\n",
-       "      <th>n_education</th>\n",
-       "      <th>n_employments</th>\n",
-       "      <th>n_ext_ids</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8840413</th>\n",
-       "      <td>0000-0002-5193-7851</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Andrea</td>\n",
-       "      <td>Mannocci</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
-       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
-       "      <td>37</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>5</td>\n",
-       "      <td>1</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "8840413  0000-0002-5193-7851     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
-       "\n",
-       "                                           researcher urls  \\\n",
-       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
-       "\n",
-       "                       primary email other emails  \\\n",
-       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
-       "\n",
-       "                                                  keywords  number of works  \\\n",
-       "8840413  [\"Data science \", \"science of science\", \"schol...               37   \n",
-       "\n",
-       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  n_education  \\\n",
-       "8840413         1                         1                  1            4   \n",
-       "\n",
-       "         n_employments  n_ext_ids  \n",
-       "8840413              5          1  "
-      ]
-     },
-     "execution_count": 28,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Extracting email domains"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['primary email'] = df['primary email'].fillna('')\n",
-    "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 30,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def extract_email_domains(row):\n",
-    "    domains = []\n",
-    "    if len(row['primary email']) > 0:\n",
-    "        domains.append(row['primary email'].split('@')[1])\n",
-    "    for email in row['other emails']:\n",
-    "        domains.append(email.split('@')[1])\n",
-    "    return domains"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 31,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 32,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "34          [seh.ox.ac.uk, bsg.ox.ac.uk]\n",
-       "47                         [foxmail.com]\n",
-       "103                     [fvtm.bu.edu.eg]\n",
-       "297                           [unipa.it]\n",
-       "299                            [nhs.net]\n",
-       "                        ...             \n",
-       "10746811             [gva.es, gmail.com]\n",
-       "10746850                  [cinvestav.mx]\n",
-       "10746920        [gmail.com, hotmail.com]\n",
-       "10746975                       [mail.ru]\n",
-       "10746988                        [ucm.es]\n",
-       "Name: email_domains, Length: 141118, dtype: object"
-      ]
-     },
-     "execution_count": 32,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['email_domains'].str.len() != 0]['email_domains']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Extracting URL domains"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 35,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 36,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def extract_url_domains(lst):\n",
-    "    domains = []\n",
-    "    for e in lst:\n",
-    "        # e[0] is a string describing the url\n",
-    "        # e[1] is the url\n",
-    "        ext = tldextract.extract(e[1])\n",
-    "        domains.append(ext.registered_domain)\n",
-    "    return domains"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 37,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 38,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "5                           [researchgate.net]\n",
-       "14                      [tigerscaffolds.co.nz]\n",
-       "15                         [corticalbrain.com]\n",
-       "29                                   [cnpq.br]\n",
-       "30                                [sksahu.net]\n",
-       "                           ...                \n",
-       "10746945                          [telegra.ph]\n",
-       "10746950    [twitter.com, urbanfoodpolicy.com]\n",
-       "10746955                    [openlearning.com]\n",
-       "10746984                        [panaximco.vn]\n",
-       "10746987                       [swansea.ac.uk]\n",
-       "Name: url_domains, Length: 688572, dtype: object"
-      ]
-     },
-     "execution_count": 38,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['url_domains'].str.len() != 0]['url_domains']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 39,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 40,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>Crossref</th>\n",
-       "      <th>Crossref Metadata Search</th>\n",
-       "      <th>Scopus - Elsevier</th>\n",
-       "      <th>n_education</th>\n",
-       "      <th>n_employments</th>\n",
-       "      <th>n_ext_ids</th>\n",
-       "      <th>email_domains</th>\n",
-       "      <th>url_domains</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>8840413</th>\n",
-       "      <td>0000-0002-5193-7851</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Andrea</td>\n",
-       "      <td>Mannocci</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
-       "      <td>37</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>1</td>\n",
-       "      <td>4</td>\n",
-       "      <td>5</td>\n",
-       "      <td>1</td>\n",
-       "      <td>[isti.cnr.it]</td>\n",
-       "      <td>[github.io, twitter.com, linkedin.com]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "8840413  0000-0002-5193-7851     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
-       "\n",
-       "                                                  keywords  number of works  \\\n",
-       "8840413  [\"Data science \", \"science of science\", \"schol...               37   \n",
-       "\n",
-       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  n_education  \\\n",
-       "8840413         1                         1                  1            4   \n",
-       "\n",
-       "         n_employments  n_ext_ids  email_domains  \\\n",
-       "8840413              5          1  [isti.cnr.it]   \n",
-       "\n",
-       "                                    url_domains  \n",
-       "8840413  [github.io, twitter.com, linkedin.com]  "
-      ]
-     },
-     "execution_count": 40,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == AM]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Fixing keywords"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 18,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 19,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>works source</th>\n",
-       "      <th>email_domains</th>\n",
-       "      <th>url_domains</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>9601705</th>\n",
-       "      <td>0000-0002-8588-4196</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Pedro</td>\n",
-       "      <td>Príncipe</td>\n",
-       "      <td>Pedro Príncipe is an information, documentatio...</td>\n",
-       "      <td>[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[open access, open science, libraries, reposit...</td>\n",
-       "      <td>[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[\"Librarian / Project manager\", \"Universidade...</td>\n",
-       "      <td>5</td>\n",
-       "      <td>[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "9601705  0000-0002-8588-4196     True            True                    True   \n",
-       "\n",
-       "        given names family name  \\\n",
-       "9601705       Pedro    Príncipe   \n",
-       "\n",
-       "                                                 biography  \\\n",
-       "9601705  Pedro Príncipe is an information, documentatio...   \n",
-       "\n",
-       "                                              other names researcher urls  \\\n",
-       "9601705  [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"]              []   \n",
-       "\n",
-       "        primary email other emails  \\\n",
-       "9601705                         []   \n",
-       "\n",
-       "                                                  keywords  \\\n",
-       "9601705  [open access, open science, libraries, reposit...   \n",
-       "\n",
-       "                            external identifiers education  \\\n",
-       "9601705  [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]      <NA>   \n",
-       "\n",
-       "                                               employments  number of works  \\\n",
-       "9601705  [[\"Librarian / Project manager\", \"Universidade...                5   \n",
-       "\n",
-       "                                              works source email_domains  \\\n",
-       "9601705  [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...            []   \n",
-       "\n",
-       "        url_domains  \n",
-       "9601705          []  "
-      ]
-     },
-     "execution_count": 19,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == PP]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 20,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def fix_keywords(lst):\n",
-    "    fixed = []\n",
-    "    for k in lst:\n",
-    "        split = k.split(',')\n",
-    "        fixed.extend(split)\n",
-    "    return fixed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 21,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "['open access',\n",
-       " ' open science',\n",
-       " ' libraries',\n",
-       " ' repositories',\n",
-       " ' social web',\n",
-       " '']"
-      ]
-     },
-     "execution_count": 21,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "test = ['open access, open science, libraries, repositories, social web,']\n",
-    "fix_keywords(test)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>works source</th>\n",
-       "      <th>email_domains</th>\n",
-       "      <th>url_domains</th>\n",
-       "      <th>fixed_keywords</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>9517099</th>\n",
-       "      <td>0000-0001-6997-9470</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>other</td>\n",
-       "      <td>whatsapp</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[[Otherwhatsapp, https://otherwhatsapp.com/], ...</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[otherwhatsapp.com, im-creator.com, facebook.c...</td>\n",
-       "      <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
-       "9517099  0000-0001-6997-9470     True            True                    True   \n",
-       "\n",
-       "        given names family name biography other names  \\\n",
-       "9517099       other    whatsapp      <NA>        <NA>   \n",
-       "\n",
-       "                                           researcher urls primary email  \\\n",
-       "9517099  [[Otherwhatsapp, https://otherwhatsapp.com/], ...                 \n",
-       "\n",
-       "        other emails                                           keywords  \\\n",
-       "9517099           []  [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...   \n",
-       "\n",
-       "        external identifiers education employments  number of works  \\\n",
-       "9517099                  NaN      <NA>        <NA>                0   \n",
-       "\n",
-       "        works source email_domains  \\\n",
-       "9517099         <NA>            []   \n",
-       "\n",
-       "                                               url_domains  \\\n",
-       "9517099  [otherwhatsapp.com, im-creator.com, facebook.c...   \n",
-       "\n",
-       "                                            fixed_keywords  \n",
-       "9517099  [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...  "
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df[df['orcid'] == WHATSAPP]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>works source</th>\n",
-       "      <th>email_domains</th>\n",
-       "      <th>url_domains</th>\n",
-       "      <th>fixed_keywords</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0000-0001-5000-2053</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Jorge</td>\n",
-       "      <td>Jaramillo Sanchez</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0000-0001-5000-6548</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Wiseman</td>\n",
-       "      <td>Bekelesi</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0000-0001-5000-7962</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>ALICE</td>\n",
-       "      <td>INDIMULI</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0000-0001-5000-8586</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>shim</td>\n",
-       "      <td>ji yun</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0000-0001-5001-0256</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Sandro</td>\n",
-       "      <td>Caramaschi</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747035</th>\n",
-       "      <td>0000-0003-4998-1551</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Animesh</td>\n",
-       "      <td>Ghosh</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747036</th>\n",
-       "      <td>0000-0003-4998-4111</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Hawa</td>\n",
-       "      <td>Liberna</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747037</th>\n",
-       "      <td>0000-0003-4998-6045</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Tongyi</td>\n",
-       "      <td>Men</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747038</th>\n",
-       "      <td>0000-0003-4998-8868</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Charldon</td>\n",
-       "      <td>Wilken</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747039</th>\n",
-       "      <td>0000-0003-4999-7916</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Tapas Bapu</td>\n",
-       "      <td>B.R.</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>0</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>10747040 rows × 19 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                        orcid  claimed  verifyed email  \\\n",
-       "0         0000-0001-5000-2053     True           False   \n",
-       "1         0000-0001-5000-6548     True           False   \n",
-       "2         0000-0001-5000-7962     True            True   \n",
-       "3         0000-0001-5000-8586     True           False   \n",
-       "4         0000-0001-5001-0256     True           False   \n",
-       "...                       ...      ...             ...   \n",
-       "10747035  0000-0003-4998-1551     True           False   \n",
-       "10747036  0000-0003-4998-4111     True           False   \n",
-       "10747037  0000-0003-4998-6045     True           False   \n",
-       "10747038  0000-0003-4998-8868     True            True   \n",
-       "10747039  0000-0003-4999-7916     True            True   \n",
-       "\n",
-       "          verified primary email given names         family name biography  \\\n",
-       "0                          False      Jorge   Jaramillo Sanchez       <NA>   \n",
-       "1                          False     Wiseman            Bekelesi      <NA>   \n",
-       "2                           True       ALICE            INDIMULI      <NA>   \n",
-       "3                          False        shim              ji yun      <NA>   \n",
-       "4                          False      Sandro          Caramaschi      <NA>   \n",
-       "...                          ...         ...                 ...       ...   \n",
-       "10747035                   False     Animesh               Ghosh      <NA>   \n",
-       "10747036                   False        Hawa             Liberna      <NA>   \n",
-       "10747037                   False      Tongyi                 Men      <NA>   \n",
-       "10747038                   False    Charldon              Wilken      <NA>   \n",
-       "10747039                    True  Tapas Bapu                B.R.      <NA>   \n",
-       "\n",
-       "         other names researcher urls primary email other emails  \\\n",
-       "0               <NA>              []                         []   \n",
-       "1               <NA>              []                         []   \n",
-       "2               <NA>              []                         []   \n",
-       "3               <NA>              []                         []   \n",
-       "4               <NA>              []                         []   \n",
-       "...              ...             ...           ...          ...   \n",
-       "10747035        <NA>              []                         []   \n",
-       "10747036        <NA>              []                         []   \n",
-       "10747037        <NA>              []                         []   \n",
-       "10747038        <NA>              []                         []   \n",
-       "10747039        <NA>              []                         []   \n",
-       "\n",
-       "         external identifiers education employments  number of works  \\\n",
-       "0                         NaN      <NA>        <NA>                0   \n",
-       "1                         NaN      <NA>        <NA>                0   \n",
-       "2                         NaN      <NA>        <NA>                0   \n",
-       "3                         NaN      <NA>        <NA>                0   \n",
-       "4                         NaN      <NA>        <NA>                0   \n",
-       "...                       ...       ...         ...              ...   \n",
-       "10747035                  NaN      <NA>        <NA>                0   \n",
-       "10747036                  NaN      <NA>        <NA>                0   \n",
-       "10747037                  NaN      <NA>        <NA>                0   \n",
-       "10747038                  NaN      <NA>        <NA>                0   \n",
-       "10747039                  NaN      <NA>        <NA>                0   \n",
-       "\n",
-       "         works source email_domains url_domains fixed_keywords  \n",
-       "0                <NA>            []          []             []  \n",
-       "1                <NA>            []          []             []  \n",
-       "2                <NA>            []          []             []  \n",
-       "3                <NA>            []          []             []  \n",
-       "4                <NA>            []          []             []  \n",
-       "...               ...           ...         ...            ...  \n",
-       "10747035         <NA>            []          []             []  \n",
-       "10747036         <NA>            []          []             []  \n",
-       "10747037         <NA>            []          []             []  \n",
-       "10747038         <NA>            []          []             []  \n",
-       "10747039         <NA>            []          []             []  \n",
-       "\n",
-       "[10747040 rows x 19 columns]"
-      ]
-     },
-     "execution_count": 24,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df.drop('keywords', axis=1, inplace=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Fixes for other columns with lists inside"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 27,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n",
-    "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
-    "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
-    "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
-    "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
-    "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
-    "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
-    "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Feature extraction"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n",
-    "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 29,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>orcid</th>\n",
-       "      <th>claimed</th>\n",
-       "      <th>verifyed email</th>\n",
-       "      <th>verified primary email</th>\n",
-       "      <th>given names</th>\n",
-       "      <th>family name</th>\n",
-       "      <th>biography</th>\n",
-       "      <th>other names</th>\n",
-       "      <th>researcher urls</th>\n",
-       "      <th>primary email</th>\n",
-       "      <th>other emails</th>\n",
-       "      <th>keywords</th>\n",
-       "      <th>external identifiers</th>\n",
-       "      <th>education</th>\n",
-       "      <th>employments</th>\n",
-       "      <th>number of works</th>\n",
-       "      <th>works source</th>\n",
-       "      <th>email_domains</th>\n",
-       "      <th>url_domains</th>\n",
-       "      <th>fixed_keywords</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>0000-0001-5000-2053</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Jorge</td>\n",
-       "      <td>Jaramillo Sanchez</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0000-0001-5000-6548</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Wiseman</td>\n",
-       "      <td>Bekelesi</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>0000-0001-5000-7962</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>ALICE</td>\n",
-       "      <td>INDIMULI</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0000-0001-5000-8586</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>shim</td>\n",
-       "      <td>ji yun</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>0000-0001-5001-0256</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Sandro</td>\n",
-       "      <td>Caramaschi</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>...</th>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "      <td>...</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747035</th>\n",
-       "      <td>0000-0003-4998-1551</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Animesh</td>\n",
-       "      <td>Ghosh</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747036</th>\n",
-       "      <td>0000-0003-4998-4111</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Hawa</td>\n",
-       "      <td>Liberna</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747037</th>\n",
-       "      <td>0000-0003-4998-6045</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Tongyi</td>\n",
-       "      <td>Men</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747038</th>\n",
-       "      <td>0000-0003-4998-8868</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>False</td>\n",
-       "      <td>Charldon</td>\n",
-       "      <td>Wilken</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>10747039</th>\n",
-       "      <td>0000-0003-4999-7916</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>True</td>\n",
-       "      <td>Tapas Bapu</td>\n",
-       "      <td>B.R.</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>&lt;NA&gt;</td>\n",
-       "      <td>[]</td>\n",
-       "      <td></td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>0</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "      <td>[]</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "<p>10747040 rows × 20 columns</p>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                        orcid  claimed  verifyed email  \\\n",
-       "0         0000-0001-5000-2053     True           False   \n",
-       "1         0000-0001-5000-6548     True           False   \n",
-       "2         0000-0001-5000-7962     True            True   \n",
-       "3         0000-0001-5000-8586     True           False   \n",
-       "4         0000-0001-5001-0256     True           False   \n",
-       "...                       ...      ...             ...   \n",
-       "10747035  0000-0003-4998-1551     True           False   \n",
-       "10747036  0000-0003-4998-4111     True           False   \n",
-       "10747037  0000-0003-4998-6045     True           False   \n",
-       "10747038  0000-0003-4998-8868     True            True   \n",
-       "10747039  0000-0003-4999-7916     True            True   \n",
-       "\n",
-       "          verified primary email given names         family name biography  \\\n",
-       "0                          False      Jorge   Jaramillo Sanchez       <NA>   \n",
-       "1                          False     Wiseman            Bekelesi      <NA>   \n",
-       "2                           True       ALICE            INDIMULI      <NA>   \n",
-       "3                          False        shim              ji yun      <NA>   \n",
-       "4                          False      Sandro          Caramaschi      <NA>   \n",
-       "...                          ...         ...                 ...       ...   \n",
-       "10747035                   False     Animesh               Ghosh      <NA>   \n",
-       "10747036                   False        Hawa             Liberna      <NA>   \n",
-       "10747037                   False      Tongyi                 Men      <NA>   \n",
-       "10747038                   False    Charldon              Wilken      <NA>   \n",
-       "10747039                    True  Tapas Bapu                B.R.      <NA>   \n",
-       "\n",
-       "         other names researcher urls primary email other emails keywords  \\\n",
-       "0               <NA>              []                         []       []   \n",
-       "1               <NA>              []                         []       []   \n",
-       "2               <NA>              []                         []       []   \n",
-       "3               <NA>              []                         []       []   \n",
-       "4               <NA>              []                         []       []   \n",
-       "...              ...             ...           ...          ...      ...   \n",
-       "10747035        <NA>              []                         []       []   \n",
-       "10747036        <NA>              []                         []       []   \n",
-       "10747037        <NA>              []                         []       []   \n",
-       "10747038        <NA>              []                         []       []   \n",
-       "10747039        <NA>              []                         []       []   \n",
-       "\n",
-       "         external identifiers education employments  number of works  \\\n",
-       "0                          []        []          []                0   \n",
-       "1                          []        []          []                0   \n",
-       "2                          []        []          []                0   \n",
-       "3                          []        []          []                0   \n",
-       "4                          []        []          []                0   \n",
-       "...                       ...       ...         ...              ...   \n",
-       "10747035                   []        []          []                0   \n",
-       "10747036                   []        []          []                0   \n",
-       "10747037                   []        []          []                0   \n",
-       "10747038                   []        []          []                0   \n",
-       "10747039                   []        []          []                0   \n",
-       "\n",
-       "         works source email_domains url_domains fixed_keywords  \n",
-       "0                  []            []          []             []  \n",
-       "1                  []            []          []             []  \n",
-       "2                  []            []          []             []  \n",
-       "3                  []            []          []             []  \n",
-       "4                  []            []          []             []  \n",
-       "...               ...           ...         ...            ...  \n",
-       "10747035           []            []          []             []  \n",
-       "10747036           []            []          []             []  \n",
-       "10747037           []            []          []             []  \n",
-       "10747038           []            []          []             []  \n",
-       "10747039           []            []          []             []  \n",
-       "\n",
-       "[10747040 rows x 20 columns]"
-      ]
-     },
-     "execution_count": 29,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "df"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.8.3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/03-Machine Learning.ipynb b/notebooks/03-Machine Learning.ipynb
new file mode 100644
index 0000000..5fa2601
--- /dev/null
+++ b/notebooks/03-Machine Learning.ipynb	
@@ -0,0 +1,468 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO (data preparation):\n",
+    "- Rename columns so that their names contain no spaces\n",
+    "- If a list is empty, serialise it as [] in the CSV\n",
+    "- If a string is empty, serialise it as '' in the CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "from urllib.parse import urlparse\n",
+    "import tldextract\n",
+    "\n",
+    "import pandas as pd\n",
+    "from sklearn.preprocessing import MultiLabelBinarizer\n",
+    "from sklearn.svm import OneClassSVM \n",
+    "from sklearn.model_selection import train_test_split"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_pickle('../data/processed/features.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>verified_email</th>\n",
+       "      <th>verified_primary_email</th>\n",
+       "      <th>n_works</th>\n",
+       "      <th>n_doi</th>\n",
+       "      <th>n_arxiv</th>\n",
+       "      <th>n_pmc</th>\n",
+       "      <th>n_other_pids</th>\n",
+       "      <th>n_emails</th>\n",
+       "      <th>n_urls</th>\n",
+       "      <th>n_ids</th>\n",
+       "      <th>n_keywords</th>\n",
+       "      <th>n_employment</th>\n",
+       "      <th>n_education</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989644</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989645</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>7</td>\n",
+       "      <td>7</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989646</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989647</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989648</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>10989649 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          verified_email  verified_primary_email  n_works  n_doi  n_arxiv  \\\n",
+       "0                      0                       0        0      0        0   \n",
+       "1                      1                       1        0      0        0   \n",
+       "2                      1                       1        0      0        0   \n",
+       "3                      1                       1        0      0        0   \n",
+       "4                      1                       1        0      0        0   \n",
+       "...                  ...                     ...      ...    ...      ...   \n",
+       "10989644               1                       1        0      0        0   \n",
+       "10989645               1                       1        7      7        0   \n",
+       "10989646               1                       1        0      0        0   \n",
+       "10989647               1                       1        0      0        0   \n",
+       "10989648               1                       1        0      0        0   \n",
+       "\n",
+       "          n_pmc  n_other_pids  n_emails  n_urls  n_ids  n_keywords  \\\n",
+       "0             0             0       NaN     NaN    NaN         NaN   \n",
+       "1             0             0       NaN     NaN    NaN         NaN   \n",
+       "2             0             0       NaN     NaN    NaN         NaN   \n",
+       "3             0             0       NaN     NaN    NaN         NaN   \n",
+       "4             0             0       NaN     NaN    NaN         NaN   \n",
+       "...         ...           ...       ...     ...    ...         ...   \n",
+       "10989644      0             0       NaN     NaN    NaN         NaN   \n",
+       "10989645      1             0       NaN     NaN    NaN         NaN   \n",
+       "10989646      0             0       NaN     NaN    NaN         NaN   \n",
+       "10989647      0             0       NaN     NaN    NaN         NaN   \n",
+       "10989648      0             0       NaN     NaN    NaN         NaN   \n",
+       "\n",
+       "          n_employment  n_education  label  \n",
+       "0                  NaN          NaN      0  \n",
+       "1                  1.0          NaN      0  \n",
+       "2                  NaN          NaN      0  \n",
+       "3                  1.0          NaN      0  \n",
+       "4                  2.0          NaN      0  \n",
+       "...                ...          ...    ...  \n",
+       "10989644           1.0          2.0      0  \n",
+       "10989645           2.0          2.0      1  \n",
+       "10989646           NaN          NaN      0  \n",
+       "10989647           1.0          2.0      0  \n",
+       "10989648           NaN          NaN      0  \n",
+       "\n",
+       "[10989649 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = df.fillna(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "verified_email            2664886\n",
+       "verified_primary_email    2664886\n",
+       "n_works                   2664886\n",
+       "n_doi                     2664886\n",
+       "n_arxiv                   2664886\n",
+       "n_pmc                     2664886\n",
+       "n_other_pids              2664886\n",
+       "n_emails                  2664886\n",
+       "n_urls                    2664886\n",
+       "n_ids                     2664886\n",
+       "n_keywords                2664886\n",
+       "n_employment              2664886\n",
+       "n_education               2664886\n",
+       "label                     2664886\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df.label == 1].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "verified_email            8324763\n",
+       "verified_primary_email    8324763\n",
+       "n_works                   8324763\n",
+       "n_doi                     8324763\n",
+       "n_arxiv                   8324763\n",
+       "n_pmc                     8324763\n",
+       "n_other_pids              8324763\n",
+       "n_emails                  8324763\n",
+       "n_urls                    8324763\n",
+       "n_ids                     8324763\n",
+       "n_keywords                8324763\n",
+       "n_employment              8324763\n",
+       "n_education               8324763\n",
+       "label                     8324763\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df[df.label == 0].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# split into train/test sets (stratified, so both halves keep the label ratio)\n",
+    "X = df.loc[:, 'verified_email':'n_education']\n",
+    "y = df['label']\n",
+    "trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n",
+    "\n",
+    "# define outlier detection model\n",
+    "model = OneClassSVM(gamma='scale', nu=0.01)\n",
+    "\n",
+    "# fit on label == 1 rows only (the minority class, treated as inliers); trainX stays aligned with trainy\n",
+    "trainX_inliers = trainX[trainy == 1]\n",
+    "model.fit(trainX_inliers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# detect outliers in the test set\n",
+    "yhat = model.predict(testX)\n",
+    "\n",
+    "# mark inliers 1, outliers -1 (the values OneClassSVM.predict returns) in a new\n",
+    "# Series, so the original 0/1 labels in testy stay intact\n",
+    "testy_signed = testy.replace(0, -1)\n",
+    "\n",
+    "# calculate score (F1 for the outlier class, i.e. rows with label 0)\n",
+    "score = f1_score(testy_signed, yhat, pos_label=-1)\n",
+    "print('F1 Score: %.3f' % score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}