fake-orcid-analysis/notebooks/02-Spam filter.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dataset preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import ast\n",
    "import tldextract\n",
    "import numpy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notable Solid ORCID iDs for debug purposes\n",
    "AM = '0000-0002-5193-7851'\n",
    "PP = '0000-0002-8588-4196'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Notable fake ORCID iDs for debug purposes\n",
    "SCAFFOLD = '0000-0001-5004-7761'\n",
    "WHATSAPP = '0000-0001-6997-9470'\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n",
    "                         dtype = {'orcid': pd.StringDtype(), \n",
    "                                  'claimed': bool, \n",
    "                                  'verifyed email': bool, \n",
    "                                  'verified primary email': bool,\n",
    "                                  'given names': pd.StringDtype(),\n",
    "                                  'family name': pd.StringDtype(),\n",
    "                                  'biography': pd.StringDtype(),\n",
    "                                  'other names': pd.StringDtype(),\n",
    "                                  'researcher urls': pd.StringDtype(),\n",
    "                                  'primary email': pd.StringDtype(),\n",
    "                                  'other emails': pd.StringDtype(),\n",
    "                                  'keywords': pd.StringDtype(),\n",
    "                                  'external identifiers': pd.StringDtype(),\n",
    "                                  'education': pd.StringDtype(),\n",
    "                                  'employments': pd.StringDtype(),\n",
    "                                  'number of works': pd.Int16Dtype(),\n",
    "                                  'works source': pd.StringDtype()})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['given names'] = df['given names'].fillna('')\n",
    "df['family name'] = df['family name'].fillna('')\n",
    "df['biography'] = df['biography'].fillna('')\n",
    "df['primary email'] = df['primary email'].fillna('')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_url_domains(lst):\n",
    "    domains = []\n",
    "    for e in lst:\n",
    "        # e[0] is a string describing the url\n",
    "        # e[1] is the url\n",
    "        domain = tldextract.extract(e[1])\n",
    "        domains.append(domain.registered_domain)\n",
    "    return domains"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_education(lst):\n",
    "    educations = []\n",
    "    for e in lst:\n",
    "        # e[0] degree\n",
    "        # e[1] role\n",
    "        # e[2] university\n",
    "        # e[..] city, region, country, id, id_scheme\n",
    "        educations.append(' '.join([e[0], e[1], e[2]]))\n",
    "    return educations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_employment(lst):\n",
    "    res = []\n",
    "    for e in lst:\n",
    "        # e[0] role\n",
    "        # e[1] institute\n",
    "        # e[..] city, region, country, id, id_scheme\n",
    "        res.append(' '.join([e[0], e[1]]))\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "def extract_email_domains(lst):\n",
    "    res = []\n",
    "    for email in lst:\n",
    "        res.append(email.replace('@', ' '))\n",
    "    return res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['concat'] =  df['given names'] + ' ' + df['family name'] + '\\n' + \\\n",
    "                df['other names'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n",
    "                df['primary email'].values[0].replace('@', ' ') + '\\n' + \\\n",
    "                df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\\n' + \\\n",
    "                df['biography'] + '\\n' + \\\n",
    "                df['keywords'].apply(lambda x: ' - '.join(x)) + '\\n' + \\\n",
    "                df['url_domains'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n",
    "                df['education'].apply(lambda x: '\\n'.join(extract_education(x))) + '\\n' + \\\n",
    "                df['employments'].apply(lambda x: '\\n'.join(extract_employment(x)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Andrea Mannocci\n",
      "\n",
      "\n",
      "\n",
      "\n",
      "Data science  - science of science - scholarly knowledge mining - open science - research infrastructures\n",
      "github.io twitter.com linkedin.com\n",
      "Information engineering Ph.D. Università degli Studi di Pisa\n",
      "Telematics engineering M.Sc. Universidad Carlos III de Madrid\n",
      "Computer engineering M.Sc. Università degli Studi di Pisa\n",
      "Computer engineering B.Sc. Università degli Studi di Pisa\n",
      "Research Associate Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n",
      "Research Associate The Open University\n",
      "Research assistant Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n",
      "Research assistant IMDEA Networks\n",
      "Research assistant Syddansk Universitet\n"
     ]
    }
   ],
   "source": [
    "print(df[df['orcid'] == AM]['concat'].values[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.loc[df['number of works'] > 0, 'label'] = True"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "      <th>url_domains</th>\n",
       "      <th>concat</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>8840413</th>\n",
       "      <td>0000-0002-5193-7851</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>Andrea</td>\n",
       "      <td>Mannocci</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[[Personal website, https://andremann.github.i...</td>\n",
       "      <td>andrea.mannocci@isti.cnr.it</td>\n",
       "      <td>[]</td>\n",
       "      <td>[Data science , science of science, scholarly ...</td>\n",
       "      <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
       "      <td>[[Information engineering, Ph.D., Università d...</td>\n",
       "      <td>[[Research Associate, Istituto di Scienza e Te...</td>\n",
       "      <td>37</td>\n",
       "      <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
       "      <td>[github.io, twitter.com, linkedin.com]</td>\n",
       "      <td>0000-0002-5193-7851\n",
       "Andrea Mannocci\n",
       "\n",
       "andrea.ma...</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                       orcid  claimed  verifyed email  verified primary email  \\\n",
       "8840413  0000-0002-5193-7851     True            True                    True   \n",
       "\n",
       "        given names family name biography other names  \\\n",
       "8840413      Andrea    Mannocci                    []   \n",
       "\n",
       "                                           researcher urls  \\\n",
       "8840413  [[Personal website, https://andremann.github.i...   \n",
       "\n",
       "                       primary email other emails  \\\n",
       "8840413  andrea.mannocci@isti.cnr.it           []   \n",
       "\n",
       "                                                  keywords  \\\n",
       "8840413  [Data science , science of science, scholarly ...   \n",
       "\n",
       "                          external identifiers  \\\n",
       "8840413  [[\"Scopus Author ID\", \"55233589900\"]]   \n",
       "\n",
       "                                                 education  \\\n",
       "8840413  [[Information engineering, Ph.D., Università d...   \n",
       "\n",
       "                                               employments  number of works  \\\n",
       "8840413  [[Research Associate, Istituto di Scienza e Te...               37   \n",
       "\n",
       "                                              works source  \\\n",
       "8840413  [\"Scopus - Elsevier\", \"Crossref Metadata Searc...   \n",
       "\n",
       "                                    url_domains  \\\n",
       "8840413  [github.io, twitter.com, linkedin.com]   \n",
       "\n",
       "                                                    concat label  \n",
       "8840413  0000-0002-5193-7851\n",
       "Andrea Mannocci\n",
       "\n",
       "andrea.ma...  True  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == AM]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>orcid</th>\n",
       "      <th>claimed</th>\n",
       "      <th>verifyed email</th>\n",
       "      <th>verified primary email</th>\n",
       "      <th>given names</th>\n",
       "      <th>family name</th>\n",
       "      <th>biography</th>\n",
       "      <th>other names</th>\n",
       "      <th>researcher urls</th>\n",
       "      <th>primary email</th>\n",
       "      <th>other emails</th>\n",
       "      <th>keywords</th>\n",
       "      <th>external identifiers</th>\n",
       "      <th>education</th>\n",
       "      <th>employments</th>\n",
       "      <th>number of works</th>\n",
       "      <th>works source</th>\n",
       "      <th>url_domains</th>\n",
       "      <th>concat</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>0000-0001-5004-7761</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>scaffolding</td>\n",
       "      <td>hire</td>\n",
       "      <td></td>\n",
       "      <td>[The first feature that you have to check in t...</td>\n",
       "      <td>[[scaffolding hire Wellington, https://www.tig...</td>\n",
       "      <td></td>\n",
       "      <td>[]</td>\n",
       "      <td>[scaffolding hire Wellington]</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>&lt;NA&gt;</td>\n",
       "      <td>[tigerscaffolds.co.nz]</td>\n",
       "      <td>0000-0001-5004-7761\n",
       "scaffolding hire\n",
       "The first...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                  orcid  claimed  verifyed email  verified primary email  \\\n",
       "14  0000-0001-5004-7761     True            True                    True   \n",
       "\n",
       "    given names family name biography  \\\n",
       "14  scaffolding        hire             \n",
       "\n",
       "                                          other names  \\\n",
       "14  [The first feature that you have to check in t...   \n",
       "\n",
       "                                      researcher urls primary email  \\\n",
       "14  [[scaffolding hire Wellington, https://www.tig...                 \n",
       "\n",
       "   other emails                       keywords external identifiers education  \\\n",
       "14           []  [scaffolding hire Wellington]                 <NA>        []   \n",
       "\n",
       "   employments  number of works works source             url_domains  \\\n",
       "14          []                0         <NA>  [tigerscaffolds.co.nz]   \n",
       "\n",
       "                                               concat label  \n",
       "14  0000-0001-5004-7761\n",
       "scaffolding hire\n",
       "The first...   NaN  "
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df[df['orcid'] == SCAFFOLD]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>concat</th>\n",
       "      <th>label</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0000-0001-5000-2053\n",
       "Jorge  Jaramillo Sanchez \n",
       "...</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>0000-0001-5000-6548\n",
       "Wiseman Bekelesi</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>0000-0001-5000-7962\n",
       "ALICE INDIMULI</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>0000-0001-5000-8586\n",
       "shim ji yun</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>0000-0001-5001-0256\n",
       "Sandro Caramaschi</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747035</th>\n",
       "      <td>0000-0003-4998-1551\n",
       "Animesh Ghosh</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747036</th>\n",
       "      <td>0000-0003-4998-4111\n",
       "Hawa Liberna</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747037</th>\n",
       "      <td>0000-0003-4998-6045\n",
       "Tongyi Men</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747038</th>\n",
       "      <td>0000-0003-4998-8868\n",
       "Charldon Wilken</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10747039</th>\n",
       "      <td>0000-0003-4999-7916\n",
       "Tapas Bapu B.R.</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10747040 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     concat label\n",
       "0         0000-0001-5000-2053\n",
       "Jorge  Jaramillo Sanchez \n",
       "...   NaN\n",
       "1              0000-0001-5000-6548\n",
       "Wiseman Bekelesi\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "2                0000-0001-5000-7962\n",
       "ALICE INDIMULI\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "3                   0000-0001-5000-8586\n",
       "shim ji yun\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "4             0000-0001-5001-0256\n",
       "Sandro Caramaschi\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "...                                                     ...   ...\n",
       "10747035          0000-0003-4998-1551\n",
       "Animesh Ghosh\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "10747036           0000-0003-4998-4111\n",
       "Hawa Liberna\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "10747037             0000-0003-4998-6045\n",
       "Tongyi Men\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "10747038        0000-0003-4998-8868\n",
       "Charldon Wilken\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "10747039        0000-0003-4999-7916\n",
       "Tapas Bapu B.R.\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "\n",
       "   NaN\n",
       "\n",
       "[10747040 rows x 2 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df = df[['orcid', 'concat', 'label']]\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Pre-trained spam filter as-is"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "ename": "ModuleNotFoundError",
     "evalue": "No module named 'seaborn'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mModuleNotFoundError\u001b[0m                       Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-83-72f6535bcb9f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m \u001b[0;31m# linear algebra\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m \u001b[0;31m# data processing, CSV file I/O (e.g. pd.read_csv)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'seaborn'"
     ]
    }
   ],
   "source": [
    "import string\n",
    "import torch\n",
    "import transformers\n",
    "import numpy as np # linear algebra\n",
    "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "from nltk.corpus import stopwords\n",
    "from sklearn.manifold import TSNE\n",
    "from nltk.tokenize import word_tokenize\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# One-Class SVM"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.svm import OneClassSVM\n",
    "from sklearn.model_selection import train_test_split\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-44-8393c5b42618>:3: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  samples['features'] = vectorizer.fit_transform(samples['concat'])\n"
     ]
    }
   ],
   "source": [
    "samples = df[df['label'] == True]\n",
    "vectorizer = TfidfVectorizer()\n",
    "samples['features'] = vectorizer.fit_transform(samples['concat'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5             (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "13            (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "24            (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "26            (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "29            (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "                                  ...                        \n",
       "10747024      (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "10747026      (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "10747027      (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "10747030      (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "10747034      (0, 983769)\\t0.04916990678988556\\n  (0, 1177...\n",
       "Name: features, Length: 2674451, dtype: object"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "samples['features']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "trainX, testX, trainy, testy = train_test_split(samples['features'], samples['label'], test_size=0.7, random_state=3, stratify=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = OneClassSVM(gamma='scale', nu=0.01)\n",
    "model.fit(trainX)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "could not convert string to float: '0000-0001-5001-4994'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-118-78458a59bca9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_classes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, **params)\u001b[0m\n\u001b[1;32m   1374\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   1375\u001b[0m         \"\"\"\n\u001b[0;32m-> 1376\u001b[0;31m         super().fit(X, np.ones(_num_samples(X)),\n\u001b[0m\u001b[1;32m   1377\u001b[0m                     sample_weight=sample_weight, **params)\n\u001b[1;32m   1378\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moffset_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_intercept_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m    167\u001b[0m             \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    168\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 169\u001b[0;31m             X, y = self._validate_data(X, y, dtype=np.float64,\n\u001b[0m\u001b[1;32m    170\u001b[0m                                        \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'C'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    171\u001b[0m                                        accept_large_sparse=False)\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m    431\u001b[0m                 \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    432\u001b[0m             \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m                 \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    434\u001b[0m             \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     61\u001b[0m             \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     62\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     65\u001b[0m             \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m    812\u001b[0m         \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"y cannot be None\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    813\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m     X = check_array(X, accept_sparse=accept_sparse,\n\u001b[0m\u001b[1;32m    815\u001b[0m                     \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    816\u001b[0m                     \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m     61\u001b[0m             \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     62\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m                 \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     65\u001b[0m             \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m    558\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mhas_pd_integer_array\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    559\u001b[0m         \u001b[0;31m# If there are any pandas integer extension arrays,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 560\u001b[0;31m         \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    562\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'allow-nan'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m   5875\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5876\u001b[0m             \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5877\u001b[0;31m             \u001b[0mnew_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   5878\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   5879\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m    629\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    630\u001b[0m     ) -> \"BlockManager\":\n\u001b[0;32m--> 631\u001b[0;31m         \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    632\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    633\u001b[0m     def convert(\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m    425\u001b[0m                     \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    426\u001b[0m                 \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m                     \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    428\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    429\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/blocks.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m    646\u001b[0m         \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_extension\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    647\u001b[0m             \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m                 \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    649\u001b[0m             \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    650\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m    306\u001b[0m             \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    307\u001b[0m             \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 308\u001b[0;31m             \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    309\u001b[0m             \u001b[0mvalues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    310\u001b[0m             \u001b[0;32mreturn\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: could not convert string to float: '0000-0001-5001-4994'"
     ]
    }
   ],
   "source": [
    "model.fit(df[df['label'] == True])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Evaluation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# BERT"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Language model Databunch\n",
    "from fast_bert.data_lm import BertLMDataBunch\n",
    "# Language model learner\n",
    "from fast_bert.learner_lm import BertLMLearner\n",
    "\n",
    "from pathlib import Path\n",
    "from box import Box\n",
    "\n",
    "import logging\n",
    "logger = logging.getLogger()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Box is a nice wrapper to create an object from a json dict\n",
    "args = Box({\n",
    "    \"seed\": 42,\n",
    "    \"task_name\": 'imdb_reviews_lm',\n",
    "    \"model_name\": 'roberta-base',\n",
    "    \"model_type\": 'roberta',\n",
    "    \"train_batch_size\": 16,\n",
    "    \"learning_rate\": 4e-5,\n",
    "    \"num_train_epochs\": 20,\n",
    "    \"fp16\": True,\n",
    "    \"fp16_opt_level\": \"O2\",\n",
    "    \"warmup_steps\": 1000,\n",
    "    \"logging_steps\": 0,\n",
    "    \"max_seq_length\": 512,\n",
    "    \"multi_gpu\": False\n",
    "})\n",
    "\n",
    "DATA_PATH = Path('../data/processed')\n",
    "LOG_PATH = Path('../logs')\n",
    "MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))\n",
    "\n",
    "DATA_PATH.mkdir(exist_ok=True)\n",
    "MODEL_PATH.mkdir(exist_ok=True)\n",
    "LOG_PATH.mkdir(exist_ok=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "        <style>\n",
       "            /* Turns off some styling */\n",
       "            progress {\n",
       "                /* gets rid of default border in Firefox and Opera. */\n",
       "                border: none;\n",
       "                /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
       "                background-size: auto;\n",
       "            }\n",
       "            .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
       "                background: #F44336;\n",
       "            }\n",
       "        </style>\n",
       "      <progress value='123222' class='' max='9672336' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      1.27% [123222/9672336 00:03<04:04]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "ename": "UnicodeEncodeError",
     "evalue": "'utf-8' codec can't encode characters in position 162-163: surrogates not allowed",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mUnicodeEncodeError\u001b[0m                        Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-35-702486b1b84c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m databunch_lm = BertLMDataBunch.from_raw_corpus(\n\u001b[0m\u001b[1;32m      2\u001b[0m     \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m     \u001b[0mtext_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'concat'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      4\u001b[0m     \u001b[0mtokenizer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m     \u001b[0mbatch_size_per_gpu\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_batch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mfrom_raw_corpus\u001b[0;34m(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)\u001b[0m\n\u001b[1;32m    191\u001b[0m         )\n\u001b[1;32m    192\u001b[0m         \u001b[0;31m# Create train corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 193\u001b[0;31m         \u001b[0mcreate_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mtrain_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogger\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    194\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    195\u001b[0m         \u001b[0;31m# Create val corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mcreate_corpus\u001b[0;34m(text_list, target_path, logger)\u001b[0m\n\u001b[1;32m     87\u001b[0m             \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m             \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mUnicodeEncodeError\u001b[0m: 'utf-8' codec can't encode characters in position 162-163: surrogates not allowed"
     ]
    }
   ],
   "source": [
    "databunch_lm = BertLMDataBunch.from_raw_corpus(\n",
    "    data_dir=DATA_PATH,\n",
    "    text_list=df['concat'],\n",
    "    tokenizer=args.model_name,\n",
    "    batch_size_per_gpu=args.train_batch_size,\n",
    "    max_seq_length=args.max_seq_length,\n",
    "    multi_gpu=args.multi_gpu,\n",
    "    model_type=args.model_type,\n",
    "    logger=logger)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}