fake-orcid-analysis/notebooks/02-Spam filter.ipynb

1097 lines
51 KiB
Plaintext
Raw Normal View History

2021-03-18 17:43:00 +01:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset preprocessing"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import ast\n",
"import tldextract\n",
"import numpy"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [],
"source": [
"# Notable Solid ORCID iDs for debug purposes\n",
"AM = '0000-0002-5193-7851'\n",
"PP = '0000-0002-8588-4196'\n"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [],
"source": [
"# Notable fake ORCID iDs for debug purposes\n",
"SCAFFOLD = '0000-0001-5004-7761'\n",
"WHATSAPP = '0000-0001-6997-9470'\n"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n",
" dtype = {'orcid': pd.StringDtype(), \n",
" 'claimed': bool, \n",
" 'verifyed email': bool, \n",
" 'verified primary email': bool,\n",
" 'given names': pd.StringDtype(),\n",
" 'family name': pd.StringDtype(),\n",
" 'biography': pd.StringDtype(),\n",
" 'other names': pd.StringDtype(),\n",
" 'researcher urls': pd.StringDtype(),\n",
" 'primary email': pd.StringDtype(),\n",
" 'other emails': pd.StringDtype(),\n",
" 'keywords': pd.StringDtype(),\n",
" 'external identifiers': pd.StringDtype(),\n",
" 'education': pd.StringDtype(),\n",
" 'employments': pd.StringDtype(),\n",
" 'number of works': pd.Int16Dtype(),\n",
" 'works source': pd.StringDtype()})"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [],
"source": [
"df['given names'] = df['given names'].fillna('')\n",
"df['family name'] = df['family name'].fillna('')\n",
"df['biography'] = df['biography'].fillna('')\n",
"df['primary email'] = df['primary email'].fillna('')"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [],
"source": [
"df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [],
"source": [
"df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [],
"source": [
"def extract_url_domains(lst):\n",
" domains = []\n",
" for e in lst:\n",
" # e[0] is a string describing the url\n",
" # e[1] is the url\n",
" domain = tldextract.extract(e[1])\n",
" domains.append(domain.registered_domain)\n",
" return domains"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"def extract_education(lst):\n",
" educations = []\n",
" for e in lst:\n",
" # e[0] degree\n",
" # e[1] role\n",
" # e[2] university\n",
" # e[..] city, region, country, id, id_scheme\n",
" educations.append(' '.join([e[0], e[1], e[2]]))\n",
" return educations"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"def extract_employment(lst):\n",
" res = []\n",
" for e in lst:\n",
" # e[0] role\n",
" # e[1] institute\n",
" # e[..] city, region, country, id, id_scheme\n",
" res.append(' '.join([e[0], e[1]]))\n",
" return res"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"def extract_email_domains(lst):\n",
" res = []\n",
" for email in lst:\n",
" res.append(email.replace('@', ' '))\n",
" return res"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"df['concat'] = df['given names'] + ' ' + df['family name'] + '\\n' + \\\n",
" df['other names'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n",
" df['primary email'].values[0].replace('@', ' ') + '\\n' + \\\n",
" df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\\n' + \\\n",
" df['biography'] + '\\n' + \\\n",
" df['keywords'].apply(lambda x: ' - '.join(x)) + '\\n' + \\\n",
" df['url_domains'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n",
" df['education'].apply(lambda x: '\\n'.join(extract_education(x))) + '\\n' + \\\n",
" df['employments'].apply(lambda x: '\\n'.join(extract_employment(x)))"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Andrea Mannocci\n",
"\n",
"\n",
"\n",
"\n",
"Data science - science of science - scholarly knowledge mining - open science - research infrastructures\n",
"github.io twitter.com linkedin.com\n",
"Information engineering Ph.D. Università degli Studi di Pisa\n",
"Telematics engineering M.Sc. Universidad Carlos III de Madrid\n",
"Computer engineering M.Sc. Università degli Studi di Pisa\n",
"Computer engineering B.Sc. Università degli Studi di Pisa\n",
"Research Associate Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n",
"Research Associate The Open University\n",
"Research assistant Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n",
"Research assistant IMDEA Networks\n",
"Research assistant Syddansk Universitet\n"
]
}
],
"source": [
"print(df[df['orcid'] == AM]['concat'].values[0])"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"df.loc[df['number of works'] > 0, 'label'] = True"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>url_domains</th>\n",
" <th>concat</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8840413</th>\n",
" <td>0000-0002-5193-7851</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>Andrea</td>\n",
" <td>Mannocci</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[[Personal website, https://andremann.github.i...</td>\n",
" <td>andrea.mannocci@isti.cnr.it</td>\n",
" <td>[]</td>\n",
" <td>[Data science , science of science, scholarly ...</td>\n",
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
" <td>[[Information engineering, Ph.D., Università d...</td>\n",
" <td>[[Research Associate, Istituto di Scienza e Te...</td>\n",
" <td>37</td>\n",
" <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
" <td>[github.io, twitter.com, linkedin.com]</td>\n",
" <td>0000-0002-5193-7851\n",
"Andrea Mannocci\n",
"\n",
"andrea.ma...</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"8840413 0000-0002-5193-7851 True True True \n",
"\n",
" given names family name biography other names \\\n",
"8840413 Andrea Mannocci [] \n",
"\n",
" researcher urls \\\n",
"8840413 [[Personal website, https://andremann.github.i... \n",
"\n",
" primary email other emails \\\n",
"8840413 andrea.mannocci@isti.cnr.it [] \n",
"\n",
" keywords \\\n",
"8840413 [Data science , science of science, scholarly ... \n",
"\n",
" external identifiers \\\n",
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
"\n",
" education \\\n",
"8840413 [[Information engineering, Ph.D., Università d... \n",
"\n",
" employments number of works \\\n",
"8840413 [[Research Associate, Istituto di Scienza e Te... 37 \n",
"\n",
" works source \\\n",
"8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... \n",
"\n",
" url_domains \\\n",
"8840413 [github.io, twitter.com, linkedin.com] \n",
"\n",
" concat label \n",
"8840413 0000-0002-5193-7851\n",
"Andrea Mannocci\n",
"\n",
"andrea.ma... True "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>claimed</th>\n",
" <th>verifyed email</th>\n",
" <th>verified primary email</th>\n",
" <th>given names</th>\n",
" <th>family name</th>\n",
" <th>biography</th>\n",
" <th>other names</th>\n",
" <th>researcher urls</th>\n",
" <th>primary email</th>\n",
" <th>other emails</th>\n",
" <th>keywords</th>\n",
" <th>external identifiers</th>\n",
" <th>education</th>\n",
" <th>employments</th>\n",
" <th>number of works</th>\n",
" <th>works source</th>\n",
" <th>url_domains</th>\n",
" <th>concat</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>0000-0001-5004-7761</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>scaffolding</td>\n",
" <td>hire</td>\n",
" <td></td>\n",
" <td>[The first feature that you have to check in t...</td>\n",
" <td>[[scaffolding hire Wellington, https://www.tig...</td>\n",
" <td></td>\n",
" <td>[]</td>\n",
" <td>[scaffolding hire Wellington]</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[tigerscaffolds.co.nz]</td>\n",
" <td>0000-0001-5004-7761\n",
"scaffolding hire\n",
"The first...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orcid claimed verifyed email verified primary email \\\n",
"14 0000-0001-5004-7761 True True True \n",
"\n",
" given names family name biography \\\n",
"14 scaffolding hire \n",
"\n",
" other names \\\n",
"14 [The first feature that you have to check in t... \n",
"\n",
" researcher urls primary email \\\n",
"14 [[scaffolding hire Wellington, https://www.tig... \n",
"\n",
" other emails keywords external identifiers education \\\n",
"14 [] [scaffolding hire Wellington] <NA> [] \n",
"\n",
" employments number of works works source url_domains \\\n",
"14 [] 0 <NA> [tigerscaffolds.co.nz] \n",
"\n",
" concat label \n",
"14 0000-0001-5004-7761\n",
"scaffolding hire\n",
"The first... NaN "
]
},
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == SCAFFOLD]"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>concat</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0001-5000-2053\n",
"Jorge Jaramillo Sanchez \n",
"...</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-5000-6548\n",
"Wiseman Bekelesi</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-5000-7962\n",
"ALICE INDIMULI</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0001-5000-8586\n",
"shim ji yun</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0001-5001-0256\n",
"Sandro Caramaschi</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747035</th>\n",
" <td>0000-0003-4998-1551\n",
"Animesh Ghosh</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747036</th>\n",
" <td>0000-0003-4998-4111\n",
"Hawa Liberna</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747037</th>\n",
" <td>0000-0003-4998-6045\n",
"Tongyi Men</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747038</th>\n",
" <td>0000-0003-4998-8868\n",
"Charldon Wilken</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10747039</th>\n",
" <td>0000-0003-4999-7916\n",
"Tapas Bapu B.R.</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10747040 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" concat label\n",
"0 0000-0001-5000-2053\n",
"Jorge Jaramillo Sanchez \n",
"... NaN\n",
"1 0000-0001-5000-6548\n",
"Wiseman Bekelesi\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"2 0000-0001-5000-7962\n",
"ALICE INDIMULI\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"3 0000-0001-5000-8586\n",
"shim ji yun\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"4 0000-0001-5001-0256\n",
"Sandro Caramaschi\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"... ... ...\n",
"10747035 0000-0003-4998-1551\n",
"Animesh Ghosh\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"10747036 0000-0003-4998-4111\n",
"Hawa Liberna\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"10747037 0000-0003-4998-6045\n",
"Tongyi Men\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"10747038 0000-0003-4998-8868\n",
"Charldon Wilken\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"10747039 0000-0003-4999-7916\n",
"Tapas Bapu B.R.\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
" NaN\n",
"\n",
"[10747040 rows x 2 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[['orcid', 'concat', 'label']]\n",
"df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Pre-trained spam filter as-is"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'seaborn'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-83-72f6535bcb9f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m \u001b[0;31m# linear algebra\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m \u001b[0;31m# data processing, CSV file I/O (e.g. pd.read_csv)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'seaborn'"
]
}
],
"source": [
"import string\n",
"import torch\n",
"import transformers\n",
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from nltk.corpus import stopwords\n",
"from sklearn.manifold import TSNE\n",
"from nltk.tokenize import word_tokenize\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# One-Class SVM"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.svm import OneClassSVM\n",
"from sklearn.model_selection import train_test_split\n"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"<ipython-input-44-8393c5b42618>:3: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" samples['features'] = vectorizer.fit_transform(samples['concat'])\n"
]
}
],
"source": [
"samples = df[df['label'] == True]\n",
"vectorizer = TfidfVectorizer()\n",
"samples['features'] = vectorizer.fit_transform(samples['concat'])"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"5 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"13 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"24 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"26 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"29 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
" ... \n",
"10747024 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"10747026 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"10747027 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"10747030 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"10747034 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
"Name: features, Length: 2674451, dtype: object"
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"samples['features']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trainX, testX, trainy, testy = train_test_split(samples['features'], samples['label'], test_size=0.7, random_state=3, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = OneClassSVM(gamma='scale', nu=0.01)\n",
"model.fit(trainX)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"ename": "ValueError",
"evalue": "could not convert string to float: '0000-0001-5001-4994'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-118-78458a59bca9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_classes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, **params)\u001b[0m\n\u001b[1;32m 1374\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1375\u001b[0m \"\"\"\n\u001b[0;32m-> 1376\u001b[0;31m super().fit(X, np.ones(_num_samples(X)),\n\u001b[0m\u001b[1;32m 1377\u001b[0m sample_weight=sample_weight, **params)\n\u001b[1;32m 1378\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moffset_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_intercept_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 169\u001b[0;31m X, y = self._validate_data(X, y, dtype=np.float64,\n\u001b[0m\u001b[1;32m 170\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'C'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 171\u001b[0m accept_large_sparse=False)\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"y cannot be None\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m X = check_array(X, accept_sparse=accept_sparse,\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhas_pd_integer_array\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[0;31m# If there are any pandas integer extension arrays,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 560\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'allow-nan'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 5875\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5876\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5877\u001b[0;31m \u001b[0mnew_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5878\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5879\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 629\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 630\u001b[0m ) -> \"BlockManager\":\n\u001b[0;32m--> 631\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 632\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 633\u001b[0m def convert(\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/blocks.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_extension\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 308\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 309\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: '0000-0001-5001-4994'"
]
}
],
"source": [
"model.fit(df[df['label'] == True])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Evaluation"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BERT"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"# Language model Databunch\n",
"from fast_bert.data_lm import BertLMDataBunch\n",
"# Language model learner\n",
"from fast_bert.learner_lm import BertLMLearner\n",
"\n",
"from pathlib import Path\n",
"from box import Box\n",
"\n",
"import logging\n",
"logger = logging.getLogger()\n"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"# Box is a nice wrapper to create an object from a json dict\n",
"args = Box({\n",
" \"seed\": 42,\n",
" \"task_name\": 'imdb_reviews_lm',\n",
" \"model_name\": 'roberta-base',\n",
" \"model_type\": 'roberta',\n",
" \"train_batch_size\": 16,\n",
" \"learning_rate\": 4e-5,\n",
" \"num_train_epochs\": 20,\n",
" \"fp16\": True,\n",
" \"fp16_opt_level\": \"O2\",\n",
" \"warmup_steps\": 1000,\n",
" \"logging_steps\": 0,\n",
" \"max_seq_length\": 512,\n",
" \"multi_gpu\": False\n",
"})\n",
"\n",
"DATA_PATH = Path('../data/processed')\n",
"LOG_PATH = Path('../logs')\n",
"MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))\n",
"\n",
"DATA_PATH.mkdir(exist_ok=True)\n",
"MODEL_PATH.mkdir(exist_ok=True)\n",
"LOG_PATH.mkdir(exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
" <div>\n",
" <style>\n",
" /* Turns off some styling */\n",
" progress {\n",
" /* gets rid of default border in Firefox and Opera. */\n",
" border: none;\n",
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
" background-size: auto;\n",
" }\n",
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
" background: #F44336;\n",
" }\n",
" </style>\n",
" <progress value='123222' class='' max='9672336' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" 1.27% [123222/9672336 00:03<04:04]\n",
" </div>\n",
" "
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"ename": "UnicodeEncodeError",
"evalue": "'utf-8' codec can't encode characters in position 162-163: surrogates not allowed",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-35-702486b1b84c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m databunch_lm = BertLMDataBunch.from_raw_corpus(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtext_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'concat'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mbatch_size_per_gpu\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_batch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mfrom_raw_corpus\u001b[0;34m(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)\u001b[0m\n\u001b[1;32m 191\u001b[0m )\n\u001b[1;32m 192\u001b[0m \u001b[0;31m# Create train corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 193\u001b[0;31m \u001b[0mcreate_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mtrain_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogger\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 194\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;31m# Create val corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mcreate_corpus\u001b[0;34m(text_list, target_path, logger)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mUnicodeEncodeError\u001b[0m: 'utf-8' codec can't encode characters in position 162-163: surrogates not allowed"
]
}
],
"source": [
"databunch_lm = BertLMDataBunch.from_raw_corpus(\n",
" data_dir=DATA_PATH,\n",
" text_list=df['concat'],\n",
" tokenizer=args.model_name,\n",
" batch_size_per_gpu=args.train_batch_size,\n",
" max_seq_length=args.max_seq_length,\n",
" multi_gpu=args.multi_gpu,\n",
" model_type=args.model_type,\n",
" logger=logger)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}