fake-orcid-analysis/notebooks/03-Feature extraction.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "TODO (data preparation):\n",
    "- Rename the columns so that their names contain no spaces\n",
    "- If a list is empty, serialise it as [] in the CSV\n",
    "- If a string is empty, serialise it as '' in the CSV"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import ast\n",
    "from urllib.parse import urlparse\n",
    "import tldextract\n",
    "\n",
    "import pandas as pd\n",
    "from sklearn.preprocessing import MultiLabelBinarizer\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binarizer instance; it is fitted further down to turn the extracted works sources\n",
    "# into indicator columns.\n",
    "mlb = MultiLabelBinarizer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notable solid (i.e. non-fake) ORCID iDs for debug purposes.\n",
    "# AM is looked up after each extraction step below to spot-check the new columns.\n",
    "AM = '0000-0002-5193-7851'\n",
    "PP = '0000-0002-8588-4196'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notable fake ORCID iDs for debug purposes.\n",
    "# NOTE(review): the constant names presumably hint at the content of each fake profile - TODO confirm.\n",
    "SCAFFOLD = '0000-0001-5004-7761'\n",
    "WHATSAPP = '0000-0001-6997-9470'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the raw ORCID dump. Text columns use the nullable string dtype so that missing\n",
    "# values are read as pd.NA; the list-valued columns are parsed further down.\n",
    "# NOTE: 'verifyed email' is the spelling used in the TSV header, so it is kept as is.\n",
    "# The key 'external identifiers' used to be misspelled ('eternal identifiers'), which\n",
    "# silently left that column without the intended string dtype.\n",
    "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n",
    "                 dtype={\"orcid\": pd.StringDtype(),\n",
    "                        \"claimed\": bool,\n",
    "                        \"verifyed email\": bool,\n",
    "                        \"verified primary email\": bool,\n",
    "                        \"given names\": pd.StringDtype(),\n",
    "                        \"family name\": pd.StringDtype(),\n",
    "                        \"biography\": pd.StringDtype(),\n",
    "                        \"other names\": pd.StringDtype(),\n",
    "                        \"researcher urls\": pd.StringDtype(),\n",
    "                        \"primary email\": pd.StringDtype(),\n",
    "                        \"other emails\": pd.StringDtype(),\n",
    "                        \"keywords\": pd.StringDtype(),\n",
    "                        \"external identifiers\": pd.StringDtype(),\n",
    "                        \"education\": pd.StringDtype(),\n",
    "                        \"employments\": pd.StringDtype(),\n",
    "                        \"number of works\": pd.Int16Dtype(),\n",
    "                        \"works source\": pd.StringDtype()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000-0001-5000-2053</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Jorge</td>\n",
       "      <td>Jaramillo Sanchez</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000-0001-5000-6548</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Wiseman</td>\n",
       "      <td>Bekelesi</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0000-0001-5000-7962</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>ALICE</td>\n",
       "      <td>INDIMULI</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000-0001-5000-8586</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>shim</td>\n",
       "      <td>ji yun</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000-0001-5001-0256</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Sandro</td>\n",
       "      <td>Caramaschi</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                 orcid  claimed  verifyed email  verified primary email  \\\n",
       "0  0000-0001-5000-2053     True           False                   False   \n",
       "1  0000-0001-5000-6548     True           False                   False   \n",
       "2  0000-0001-5000-7962     True            True                    True   \n",
       "3  0000-0001-5000-8586     True           False                   False   \n",
       "4  0000-0001-5001-0256     True           False                   False   \n",
       "\n",
       "  given names         family name biography other names researcher urls  \\\n",
       "0      Jorge   Jaramillo Sanchez       <NA>        <NA>            <NA>   \n",
       "1     Wiseman            Bekelesi      <NA>        <NA>            <NA>   \n",
       "2       ALICE            INDIMULI      <NA>        <NA>            <NA>   \n",
       "3        shim              ji yun      <NA>        <NA>            <NA>   \n",
       "4      Sandro          Caramaschi      <NA>        <NA>            <NA>   \n",
       "\n",
       "  primary email other emails keywords external identifiers education  \\\n",
       "0          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
       "1          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
       "2          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
       "3          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
       "4          <NA>         <NA>     <NA>                  NaN      <NA>   \n",
       "\n",
       "  employments  number of works works source  \n",
       "0        <NA>                0         <NA>  \n",
       "1        <NA>                0         <NA>  \n",
       "2        <NA>                0         <NA>  \n",
       "3        <NA>                0         <NA>  \n",
       "4        <NA>                0         <NA>  "
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
       "      <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
       "      <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
       "      <td>37</td>\n",
       "      <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
       "\n",
       "                                           researcher urls  \\\n",
       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
       "\n",
       "                       primary email other emails  \\\n",
       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
       "\n",
       "                                                  keywords  \\\n",
       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
       "\n",
       "                          external identifiers  \\\n",
       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
       "\n",
       "                                                 education  \\\n",
       "8840413  [[\"Information engineering\", \"Ph.D.\", \"Univers...   \n",
       "\n",
       "                                               employments  number of works  \\\n",
       "8840413  [[\"Research Associate\", \"Istituto di Scienza e...               37   \n",
       "\n",
       "                                              works source  \n",
       "8840413  [\"Scopus - Elsevier\", \"Crossref Metadata Searc...  "
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extracting works source"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the serialised lists; missing values are treated as empty lists\n",
    "df['works source'] = df['works source'].fillna('[]').apply(ast.literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_work_source(lst):\n",
    "    \"\"\"Return the items of lst that contain 'Scopus - Elsevier' or 'Crossref', in order.\"\"\"\n",
    "    markers = ('Scopus - Elsevier', 'Crossref')\n",
    "    return [source for source in lst if any(marker in source for marker in markers)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Keep only the Scopus / Crossref entries of each profile's works sources\n",
    "df['extracted_works_source'] = df['works source'].apply(extract_work_source)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Indicator columns, one per distinct extracted source. The new frame reuses df's index so\n",
    "# that concat pairs the rows correctly even when df does not carry a default RangeIndex.\n",
    "works_source_flags = pd.DataFrame(mlb.fit_transform(df['extracted_works_source']),\n",
    "                                  columns=mlb.classes_, index=df.index)\n",
    "df = pd.concat([df, works_source_flags], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# The raw and the filtered list columns are not kept as features\n",
    "df.drop(columns=['works source', 'extracted_works_source'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>Crossref</th>\n",
       "      <th>Crossref Metadata Search</th>\n",
       "      <th>Scopus - Elsevier</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
       "      <td>[[\"Information engineering\", \"Ph.D.\", \"Univers...</td>\n",
       "      <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
       "      <td>37</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
       "\n",
       "                                           researcher urls  \\\n",
       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
       "\n",
       "                       primary email other emails  \\\n",
       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
       "\n",
       "                                                  keywords  \\\n",
       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
       "\n",
       "                          external identifiers  \\\n",
       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
       "\n",
       "                                                 education  \\\n",
       "8840413  [[\"Information engineering\", \"Ph.D.\", \"Univers...   \n",
       "\n",
       "                                               employments  number of works  \\\n",
       "8840413  [[\"Research Associate\", \"Istituto di Scienza e...               37   \n",
       "\n",
       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  \n",
       "8840413         1                         1                  1  "
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Education"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the serialised lists; missing values are treated as empty lists\n",
    "df['education'] = df['education'].fillna('[]').apply(ast.literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each cell holds a parsed list at this point, so .str.len() counts the education entries per profile\n",
    "df['n_education'] = df['education'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the count is kept as a feature\n",
    "df.drop(columns=['education'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>Crossref</th>\n",
       "      <th>Crossref Metadata Search</th>\n",
       "      <th>Scopus - Elsevier</th>\n",
       "      <th>n_education</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
       "      <td>[[\"Research Associate\", \"Istituto di Scienza e...</td>\n",
       "      <td>37</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
       "\n",
       "                                           researcher urls  \\\n",
       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
       "\n",
       "                       primary email other emails  \\\n",
       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
       "\n",
       "                                                  keywords  \\\n",
       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
       "\n",
       "                          external identifiers  \\\n",
       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
       "\n",
       "                                               employments  number of works  \\\n",
       "8840413  [[\"Research Associate\", \"Istituto di Scienza e...               37   \n",
       "\n",
       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  n_education  \n",
       "8840413         1                         1                  1            4  "
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Employment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the serialised lists; missing values are treated as empty lists\n",
    "df['employments'] = df['employments'].fillna('[]').apply(ast.literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each cell holds a parsed list at this point, so .str.len() counts the employment entries per profile\n",
    "df['n_employments'] = df['employments'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the count is kept as a feature\n",
    "df.drop(columns=['employments'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>number of works</th>\n",
       "      <th>Crossref</th>\n",
       "      <th>Crossref Metadata Search</th>\n",
       "      <th>Scopus - Elsevier</th>\n",
       "      <th>n_education</th>\n",
       "      <th>n_employments</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
       "      <td>37</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
       "\n",
       "                                           researcher urls  \\\n",
       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
       "\n",
       "                       primary email other emails  \\\n",
       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
       "\n",
       "                                                  keywords  \\\n",
       "8840413  [\"Data science \", \"science of science\", \"schol...   \n",
       "\n",
       "                          external identifiers  number of works  Crossref  \\\n",
       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]               37         1   \n",
       "\n",
       "         Crossref Metadata Search  Scopus - Elsevier  n_education  \\\n",
       "8840413                         1                  1            4   \n",
       "\n",
       "         n_employments  \n",
       "8840413              5  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# External IDs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse the serialised lists; missing values are treated as empty lists\n",
    "df['external identifiers'] = df['external identifiers'].fillna('[]').apply(ast.literal_eval)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def extract_ids(lst):\n",
    "#     extracted = []\n",
    "#     for id in lst:\n",
    "#         extracted.append(id[0])\n",
    "#     return extracted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Each cell holds a parsed list at this point, so .str.len() counts the external identifiers per profile\n",
    "df['n_ext_ids'] = df['external identifiers'].str.len()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Only the count is kept as a feature\n",
    "df.drop(columns=['external identifiers'], inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>number of works</th>\n",
       "      <th>Crossref</th>\n",
       "      <th>Crossref Metadata Search</th>\n",
       "      <th>Scopus - Elsevier</th>\n",
       "      <th>n_education</th>\n",
       "      <th>n_employments</th>\n",
       "      <th>n_ext_ids</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[\"Personal website\", \"https://andremann.githu...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
       "      <td>37</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
       "\n",
       "                                           researcher urls  \\\n",
       "8840413  [[\"Personal website\", \"https://andremann.githu...   \n",
       "\n",
       "                       primary email other emails  \\\n",
       "8840413  andrea.mannocci@isti.cnr.it         <NA>   \n",
       "\n",
       "                                                  keywords  number of works  \\\n",
       "8840413  [\"Data science \", \"science of science\", \"schol...               37   \n",
       "\n",
       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  n_education  \\\n",
       "8840413         1                         1                  1            4   \n",
       "\n",
       "         n_employments  n_ext_ids  \n",
       "8840413              5          1  "
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extracting email domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['primary email'] = df['primary email'].fillna('')\n",
    "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_email_domains(row):\n",
    "    domains = []\n",
    "    if len(row['primary email']) > 0:\n",
    "        domains.append(row['primary email'].split('@')[1])\n",
    "    for email in row['other emails']:\n",
    "        domains.append(email.split('@')[1])\n",
    "    return domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "34          [seh.ox.ac.uk, bsg.ox.ac.uk]\n",
       "47                         [foxmail.com]\n",
       "103                     [fvtm.bu.edu.eg]\n",
       "297                           [unipa.it]\n",
       "299                            [nhs.net]\n",
       "                        ...             \n",
       "10746811             [gva.es, gmail.com]\n",
       "10746850                  [cinvestav.mx]\n",
       "10746920        [gmail.com, hotmail.com]\n",
       "10746975                       [mail.ru]\n",
       "10746988                        [ucm.es]\n",
       "Name: email_domains, Length: 141118, dtype: object"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['email_domains'].str.len() != 0]['email_domains']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Extracting URL domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_url_domains(lst):\n",
    "    domains = []\n",
    "    for e in lst:\n",
    "        # e[0] is a string describing the url\n",
    "        # e[1] is the url\n",
    "        ext = tldextract.extract(e[1])\n",
    "        domains.append(ext.registered_domain)\n",
    "    return domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5                           [researchgate.net]\n",
       "14                      [tigerscaffolds.co.nz]\n",
       "15                         [corticalbrain.com]\n",
       "29                                   [cnpq.br]\n",
       "30                                [sksahu.net]\n",
       "                           ...                \n",
       "10746945                          [telegra.ph]\n",
       "10746950    [twitter.com, urbanfoodpolicy.com]\n",
       "10746955                    [openlearning.com]\n",
       "10746984                        [panaximco.vn]\n",
       "10746987                       [swansea.ac.uk]\n",
       "Name: url_domains, Length: 688572, dtype: object"
      ]
     },
     "execution_count": 38,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['url_domains'].str.len() != 0]['url_domains']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>keywords</th>\n",
       "      <th>number of works</th>\n",
       "      <th>Crossref</th>\n",
       "      <th>Crossref Metadata Search</th>\n",
       "      <th>Scopus - Elsevier</th>\n",
       "      <th>n_education</th>\n",
       "      <th>n_employments</th>\n",
       "      <th>n_ext_ids</th>\n",
       "      <th>email_domains</th>\n",
       "      <th>url_domains</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[\"Data science \", \"science of science\", \"schol...</td>\n",
       "      <td>37</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>[isti.cnr.it]</td>\n",
       "      <td>[github.io, twitter.com, linkedin.com]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci      <NA>        <NA>   \n",
       "\n",
       "                                                  keywords  number of works  \\\n",
       "8840413  [\"Data science \", \"science of science\", \"schol...               37   \n",
       "\n",
       "         Crossref  Crossref Metadata Search  Scopus - Elsevier  n_education  \\\n",
       "8840413         1                         1                  1            4   \n",
       "\n",
       "         n_employments  n_ext_ids  email_domains  \\\n",
       "8840413              5          1  [isti.cnr.it]   \n",
       "\n",
       "                                    url_domains  \n",
       "8840413  [github.io, twitter.com, linkedin.com]  "
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fixing keywords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "      <th>email_domains</th>\n",
       "      <th>url_domains</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9601705</th>\n",
       "      <td>0000-0002-8588-4196</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Pedro</td>\n",
       "      <td>Príncipe</td>\n",
       "      <td>Pedro Príncipe is an information, documentatio...</td>\n",
       "      <td>[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"]</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[open access, open science, libraries, reposit...</td>\n",
       "      <td>[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[\"Librarian / Project manager\", \"Universidade...</td>\n",
       "      <td>5</td>\n",
       "      <td>[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "9601705  0000-0002-8588-4196     True            True                    True   \n",
       "\n",
       "        given names family name  \\\n",
       "9601705       Pedro    Príncipe   \n",
       "\n",
       "                                                 biography  \\\n",
       "9601705  Pedro Príncipe is an information, documentatio...   \n",
       "\n",
       "                                              other names researcher urls  \\\n",
       "9601705  [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"]              []   \n",
       "\n",
       "        primary email other emails  \\\n",
       "9601705                         []   \n",
       "\n",
       "                                                  keywords  \\\n",
       "9601705  [open access, open science, libraries, reposit...   \n",
       "\n",
       "                            external identifiers education  \\\n",
       "9601705  [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]      <NA>   \n",
       "\n",
       "                                               employments  number of works  \\\n",
       "9601705  [[\"Librarian / Project manager\", \"Universidade...                5   \n",
       "\n",
       "                                              works source email_domains  \\\n",
       "9601705  [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...            []   \n",
       "\n",
       "        url_domains  \n",
       "9601705          []  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == PP]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "def fix_keywords(lst):\n",
    "    fixed = []\n",
    "    for k in lst:\n",
    "        split = k.split(',')\n",
    "        fixed.extend(split)\n",
    "    return fixed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['open access',\n",
       " ' open science',\n",
       " ' libraries',\n",
       " ' repositories',\n",
       " ' social web',\n",
       " '']"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = ['open access, open science, libraries, repositories, social web,']\n",
    "fix_keywords(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "      <th>email_domains</th>\n",
       "      <th>url_domains</th>\n",
       "      <th>fixed_keywords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>9517099</th>\n",
       "      <td>0000-0001-6997-9470</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>other</td>\n",
       "      <td>whatsapp</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[[Otherwhatsapp, https://otherwhatsapp.com/], ...</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[otherwhatsapp.com, im-creator.com, facebook.c...</td>\n",
       "      <td>[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "9517099  0000-0001-6997-9470     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "9517099       other    whatsapp      <NA>        <NA>   \n",
       "\n",
       "                                           researcher urls primary email  \\\n",
       "9517099  [[Otherwhatsapp, https://otherwhatsapp.com/], ...                 \n",
       "\n",
       "        other emails                                           keywords  \\\n",
       "9517099           []  [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...   \n",
       "\n",
       "        external identifiers education employments  number of works  \\\n",
       "9517099                  NaN      <NA>        <NA>                0   \n",
       "\n",
       "        works source email_domains  \\\n",
       "9517099         <NA>            []   \n",
       "\n",
       "                                               url_domains  \\\n",
       "9517099  [otherwhatsapp.com, im-creator.com, facebook.c...   \n",
       "\n",
       "                                            fixed_keywords  \n",
       "9517099  [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...  "
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == WHATSAPP]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "      <th>email_domains</th>\n",
       "      <th>url_domains</th>\n",
       "      <th>fixed_keywords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000-0001-5000-2053</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Jorge</td>\n",
       "      <td>Jaramillo Sanchez</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000-0001-5000-6548</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Wiseman</td>\n",
       "      <td>Bekelesi</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0000-0001-5000-7962</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>ALICE</td>\n",
       "      <td>INDIMULI</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000-0001-5000-8586</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>shim</td>\n",
       "      <td>ji yun</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000-0001-5001-0256</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Sandro</td>\n",
       "      <td>Caramaschi</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747035</th>\n",
       "      <td>0000-0003-4998-1551</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Animesh</td>\n",
       "      <td>Ghosh</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747036</th>\n",
       "      <td>0000-0003-4998-4111</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Hawa</td>\n",
       "      <td>Liberna</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747037</th>\n",
       "      <td>0000-0003-4998-6045</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Tongyi</td>\n",
       "      <td>Men</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747038</th>\n",
       "      <td>0000-0003-4998-8868</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>Charldon</td>\n",
       "      <td>Wilken</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747039</th>\n",
       "      <td>0000-0003-4999-7916</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Tapas Bapu</td>\n",
       "      <td>B.R.</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>NaN</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10747040 rows × 19 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        orcid  claimed  verifyed email  \\\n",
       "0         0000-0001-5000-2053     True           False   \n",
       "1         0000-0001-5000-6548     True           False   \n",
       "2         0000-0001-5000-7962     True            True   \n",
       "3         0000-0001-5000-8586     True           False   \n",
       "4         0000-0001-5001-0256     True           False   \n",
       "...                       ...      ...             ...   \n",
       "10747035  0000-0003-4998-1551     True           False   \n",
       "10747036  0000-0003-4998-4111     True           False   \n",
       "10747037  0000-0003-4998-6045     True           False   \n",
       "10747038  0000-0003-4998-8868     True            True   \n",
       "10747039  0000-0003-4999-7916     True            True   \n",
       "\n",
       "          verified primary email given names         family name biography  \\\n",
       "0                          False      Jorge   Jaramillo Sanchez       <NA>   \n",
       "1                          False     Wiseman            Bekelesi      <NA>   \n",
       "2                           True       ALICE            INDIMULI      <NA>   \n",
       "3                          False        shim              ji yun      <NA>   \n",
       "4                          False      Sandro          Caramaschi      <NA>   \n",
       "...                          ...         ...                 ...       ...   \n",
       "10747035                   False     Animesh               Ghosh      <NA>   \n",
       "10747036                   False        Hawa             Liberna      <NA>   \n",
       "10747037                   False      Tongyi                 Men      <NA>   \n",
       "10747038                   False    Charldon              Wilken      <NA>   \n",
       "10747039                    True  Tapas Bapu                B.R.      <NA>   \n",
       "\n",
       "         other names researcher urls primary email other emails  \\\n",
       "0               <NA>              []                         []   \n",
       "1               <NA>              []                         []   \n",
       "2               <NA>              []                         []   \n",
       "3               <NA>              []                         []   \n",
       "4               <NA>              []                         []   \n",
       "...              ...             ...           ...          ...   \n",
       "10747035        <NA>              []                         []   \n",
       "10747036        <NA>              []                         []   \n",
       "10747037        <NA>              []                         []   \n",
       "10747038        <NA>              []                         []   \n",
       "10747039        <NA>              []                         []   \n",
       "\n",
       "         external identifiers education employments  number of works  \\\n",
       "0                         NaN      <NA>        <NA>                0   \n",
       "1                         NaN      <NA>        <NA>                0   \n",
       "2                         NaN      <NA>        <NA>                0   \n",
       "3                         NaN      <NA>        <NA>                0   \n",
       "4                         NaN      <NA>        <NA>                0   \n",
       "...                       ...       ...         ...              ...   \n",
       "10747035                  NaN      <NA>        <NA>                0   \n",
       "10747036                  NaN      <NA>        <NA>                0   \n",
       "10747037                  NaN      <NA>        <NA>                0   \n",
       "10747038                  NaN      <NA>        <NA>                0   \n",
       "10747039                  NaN      <NA>        <NA>                0   \n",
       "\n",
       "         works source email_domains url_domains fixed_keywords  \n",
       "0                <NA>            []          []             []  \n",
       "1                <NA>            []          []             []  \n",
       "2                <NA>            []          []             []  \n",
       "3                <NA>            []          []             []  \n",
       "4                <NA>            []          []             []  \n",
       "...               ...           ...         ...            ...  \n",
       "10747035         <NA>            []          []             []  \n",
       "10747036         <NA>            []          []             []  \n",
       "10747037         <NA>            []          []             []  \n",
       "10747038         <NA>            []          []             []  \n",
       "10747039         <NA>            []          []             []  \n",
       "\n",
       "[10747040 rows x 19 columns]"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.drop('keywords', axis=1, inplace=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Fixes for other columns with lists inside"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
    "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Feature extraction"
   ]
  },
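  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "# NOTE(review): disabled draft. MultiLabelBinarizer.fit_transform returns a 2-D indicator matrix\n",
    "# (one column per distinct label), so the result cannot be stored in a single DataFrame column as written.\n",
    "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n",
    "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])"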
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "      <th>email_domains</th>\n",
       "      <th>url_domains</th>\n",
       "      <th>fixed_keywords</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000-0001-5000-2053</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Jorge</td>\n",
       "      <td>Jaramillo Sanchez</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000-0001-5000-6548</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Wiseman</td>\n",
       "      <td>Bekelesi</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0000-0001-5000-7962</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>ALICE</td>\n",
       "      <td>INDIMULI</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000-0001-5000-8586</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>shim</td>\n",
       "      <td>ji yun</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000-0001-5001-0256</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Sandro</td>\n",
       "      <td>Caramaschi</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747035</th>\n",
       "      <td>0000-0003-4998-1551</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Animesh</td>\n",
       "      <td>Ghosh</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747036</th>\n",
       "      <td>0000-0003-4998-4111</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Hawa</td>\n",
       "      <td>Liberna</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747037</th>\n",
       "      <td>0000-0003-4998-6045</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>Tongyi</td>\n",
       "      <td>Men</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747038</th>\n",
       "      <td>0000-0003-4998-8868</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>Charldon</td>\n",
       "      <td>Wilken</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747039</th>\n",
       "      <td>0000-0003-4999-7916</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Tapas Bapu</td>\n",
       "      <td>B.R.</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10747040 rows × 20 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                        orcid  claimed  verifyed email  \\\n",
       "0         0000-0001-5000-2053     True           False   \n",
       "1         0000-0001-5000-6548     True           False   \n",
       "2         0000-0001-5000-7962     True            True   \n",
       "3         0000-0001-5000-8586     True           False   \n",
       "4         0000-0001-5001-0256     True           False   \n",
       "...                       ...      ...             ...   \n",
       "10747035  0000-0003-4998-1551     True           False   \n",
       "10747036  0000-0003-4998-4111     True           False   \n",
       "10747037  0000-0003-4998-6045     True           False   \n",
       "10747038  0000-0003-4998-8868     True            True   \n",
       "10747039  0000-0003-4999-7916     True            True   \n",
       "\n",
       "          verified primary email given names         family name biography  \\\n",
       "0                          False      Jorge   Jaramillo Sanchez       <NA>   \n",
       "1                          False     Wiseman            Bekelesi      <NA>   \n",
       "2                           True       ALICE            INDIMULI      <NA>   \n",
       "3                          False        shim              ji yun      <NA>   \n",
       "4                          False      Sandro          Caramaschi      <NA>   \n",
       "...                          ...         ...                 ...       ...   \n",
       "10747035                   False     Animesh               Ghosh      <NA>   \n",
       "10747036                   False        Hawa             Liberna      <NA>   \n",
       "10747037                   False      Tongyi                 Men      <NA>   \n",
       "10747038                   False    Charldon              Wilken      <NA>   \n",
       "10747039                    True  Tapas Bapu                B.R.      <NA>   \n",
       "\n",
       "         other names researcher urls primary email other emails keywords  \\\n",
       "0               <NA>              []                         []       []   \n",
       "1               <NA>              []                         []       []   \n",
       "2               <NA>              []                         []       []   \n",
       "3               <NA>              []                         []       []   \n",
       "4               <NA>              []                         []       []   \n",
       "...              ...             ...           ...          ...      ...   \n",
       "10747035        <NA>              []                         []       []   \n",
       "10747036        <NA>              []                         []       []   \n",
       "10747037        <NA>              []                         []       []   \n",
       "10747038        <NA>              []                         []       []   \n",
       "10747039        <NA>              []                         []       []   \n",
       "\n",
       "         external identifiers education employments  number of works  \\\n",
       "0                          []        []          []                0   \n",
       "1                          []        []          []                0   \n",
       "2                          []        []          []                0   \n",
       "3                          []        []          []                0   \n",
       "4                          []        []          []                0   \n",
       "...                       ...       ...         ...              ...   \n",
       "10747035                   []        []          []                0   \n",
       "10747036                   []        []          []                0   \n",
       "10747037                   []        []          []                0   \n",
       "10747038                   []        []          []                0   \n",
       "10747039                   []        []          []                0   \n",
       "\n",
       "         works source email_domains url_domains fixed_keywords  \n",
       "0                  []            []          []             []  \n",
       "1                  []            []          []             []  \n",
       "2                  []            []          []             []  \n",
       "3                  []            []          []             []  \n",
       "4                  []            []          []             []  \n",
       "...               ...           ...         ...            ...  \n",
       "10747035           []            []          []             []  \n",
       "10747036           []            []          []             []  \n",
       "10747037           []            []          []             []  \n",
       "10747038           []            []          []             []  \n",
       "10747039           []            []          []             []  \n",
       "\n",
       "[10747040 rows x 20 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the frame as the cell output (pandas abbreviates it to the first and last rows).\n",
    "# NOTE(review): the saved output still lists a 'keywords' column although an earlier cell drops it;\n",
    "# re-run the notebook top to bottom to refresh the stored outputs.\n",
    "df"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}