1097 lines
51 KiB
Plaintext
1097 lines
51 KiB
Plaintext
|
{
|
|||
|
"cells": [
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Dataset preprocessing"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 49,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"import pandas as pd\n",
|
|||
|
"import ast\n",
|
|||
|
"import tldextract\n",
|
|||
|
"import numpy"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 50,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Notable Solid ORCID iDs for debug purposes\n",
|
|||
|
"AM = '0000-0002-5193-7851'\n",
|
|||
|
"PP = '0000-0002-8588-4196'\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 51,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Notable fake ORCID iDs for debug purposes\n",
|
|||
|
"SCAFFOLD = '0000-0001-5004-7761'\n",
|
|||
|
"WHATSAPP = '0000-0001-6997-9470'\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 52,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n",
|
|||
|
" dtype = {'orcid': pd.StringDtype(), \n",
|
|||
|
" 'claimed': bool, \n",
|
|||
|
" 'verifyed email': bool, \n",
|
|||
|
" 'verified primary email': bool,\n",
|
|||
|
" 'given names': pd.StringDtype(),\n",
|
|||
|
" 'family name': pd.StringDtype(),\n",
|
|||
|
" 'biography': pd.StringDtype(),\n",
|
|||
|
" 'other names': pd.StringDtype(),\n",
|
|||
|
" 'researcher urls': pd.StringDtype(),\n",
|
|||
|
" 'primary email': pd.StringDtype(),\n",
|
|||
|
" 'other emails': pd.StringDtype(),\n",
|
|||
|
" 'keywords': pd.StringDtype(),\n",
|
|||
|
" 'external identifiers': pd.StringDtype(),\n",
|
|||
|
" 'education': pd.StringDtype(),\n",
|
|||
|
" 'employments': pd.StringDtype(),\n",
|
|||
|
" 'number of works': pd.Int16Dtype(),\n",
|
|||
|
" 'works source': pd.StringDtype()})"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 53,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['given names'] = df['given names'].fillna('')\n",
|
|||
|
"df['family name'] = df['family name'].fillna('')\n",
|
|||
|
"df['biography'] = df['biography'].fillna('')\n",
|
|||
|
"df['primary email'] = df['primary email'].fillna('')"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 54,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['other names'] = df['other names'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 55,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 59,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 57,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 58,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def extract_url_domains(lst):\n",
|
|||
|
" domains = []\n",
|
|||
|
" for e in lst:\n",
|
|||
|
" # e[0] is a string describing the url\n",
|
|||
|
" # e[1] is the url\n",
|
|||
|
" domain = tldextract.extract(e[1])\n",
|
|||
|
" domains.append(domain.registered_domain)\n",
|
|||
|
" return domains"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 60,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 61,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def extract_education(lst):\n",
|
|||
|
" educations = []\n",
|
|||
|
" for e in lst:\n",
|
|||
|
" # e[0] degree\n",
|
|||
|
" # e[1] role\n",
|
|||
|
" # e[2] university\n",
|
|||
|
" # e[..] city, region, country, id, id_scheme\n",
|
|||
|
" educations.append(' '.join([e[0], e[1], e[2]]))\n",
|
|||
|
" return educations"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 62,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 63,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def extract_employment(lst):\n",
|
|||
|
" res = []\n",
|
|||
|
" for e in lst:\n",
|
|||
|
" # e[0] role\n",
|
|||
|
" # e[1] institute\n",
|
|||
|
" # e[..] city, region, country, id, id_scheme\n",
|
|||
|
" res.append(' '.join([e[0], e[1]]))\n",
|
|||
|
" return res"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 56,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 64,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"def extract_email_domains(lst):\n",
|
|||
|
" res = []\n",
|
|||
|
" for email in lst:\n",
|
|||
|
" res.append(email.replace('@', ' '))\n",
|
|||
|
" return res"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 81,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df['concat'] = df['given names'] + ' ' + df['family name'] + '\\n' + \\\n",
|
|||
|
" df['other names'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n",
|
|||
|
" df['primary email'].values[0].replace('@', ' ') + '\\n' + \\\n",
|
|||
|
" df['other emails'].apply(lambda x: ' '.join(extract_email_domains(x))) + '\\n' + \\\n",
|
|||
|
" df['biography'] + '\\n' + \\\n",
|
|||
|
" df['keywords'].apply(lambda x: ' - '.join(x)) + '\\n' + \\\n",
|
|||
|
" df['url_domains'].apply(lambda x: ' '.join(x)) + '\\n' + \\\n",
|
|||
|
" df['education'].apply(lambda x: '\\n'.join(extract_education(x))) + '\\n' + \\\n",
|
|||
|
" df['employments'].apply(lambda x: '\\n'.join(extract_employment(x)))"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 82,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stdout",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"Andrea Mannocci\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"Data science - science of science - scholarly knowledge mining - open science - research infrastructures\n",
|
|||
|
"github.io twitter.com linkedin.com\n",
|
|||
|
"Information engineering Ph.D. Università degli Studi di Pisa\n",
|
|||
|
"Telematics engineering M.Sc. Universidad Carlos III de Madrid\n",
|
|||
|
"Computer engineering M.Sc. Università degli Studi di Pisa\n",
|
|||
|
"Computer engineering B.Sc. Università degli Studi di Pisa\n",
|
|||
|
"Research Associate Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n",
|
|||
|
"Research Associate The Open University\n",
|
|||
|
"Research assistant Istituto di Scienza e Tecnologie dellInformazione Alessandro Faedo Consiglio Nazionale delle Ricerche\n",
|
|||
|
"Research assistant IMDEA Networks\n",
|
|||
|
"Research assistant Syddansk Universitet\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"print(df[df['orcid'] == AM]['concat'].values[0])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 19,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"df.loc[df['number of works'] > 0, 'label'] = True"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>orcid</th>\n",
|
|||
|
" <th>claimed</th>\n",
|
|||
|
" <th>verifyed email</th>\n",
|
|||
|
" <th>verified primary email</th>\n",
|
|||
|
" <th>given names</th>\n",
|
|||
|
" <th>family name</th>\n",
|
|||
|
" <th>biography</th>\n",
|
|||
|
" <th>other names</th>\n",
|
|||
|
" <th>researcher urls</th>\n",
|
|||
|
" <th>primary email</th>\n",
|
|||
|
" <th>other emails</th>\n",
|
|||
|
" <th>keywords</th>\n",
|
|||
|
" <th>external identifiers</th>\n",
|
|||
|
" <th>education</th>\n",
|
|||
|
" <th>employments</th>\n",
|
|||
|
" <th>number of works</th>\n",
|
|||
|
" <th>works source</th>\n",
|
|||
|
" <th>url_domains</th>\n",
|
|||
|
" <th>concat</th>\n",
|
|||
|
" <th>label</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>8840413</th>\n",
|
|||
|
" <td>0000-0002-5193-7851</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>Andrea</td>\n",
|
|||
|
" <td>Mannocci</td>\n",
|
|||
|
" <td></td>\n",
|
|||
|
" <td>[]</td>\n",
|
|||
|
" <td>[[Personal website, https://andremann.github.i...</td>\n",
|
|||
|
" <td>andrea.mannocci@isti.cnr.it</td>\n",
|
|||
|
" <td>[]</td>\n",
|
|||
|
" <td>[Data science , science of science, scholarly ...</td>\n",
|
|||
|
" <td>[[\"Scopus Author ID\", \"55233589900\"]]</td>\n",
|
|||
|
" <td>[[Information engineering, Ph.D., Università d...</td>\n",
|
|||
|
" <td>[[Research Associate, Istituto di Scienza e Te...</td>\n",
|
|||
|
" <td>37</td>\n",
|
|||
|
" <td>[\"Scopus - Elsevier\", \"Crossref Metadata Searc...</td>\n",
|
|||
|
" <td>[github.io, twitter.com, linkedin.com]</td>\n",
|
|||
|
" <td>0000-0002-5193-7851\n",
|
|||
|
"Andrea Mannocci\n",
|
|||
|
"\n",
|
|||
|
"andrea.ma...</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" orcid claimed verifyed email verified primary email \\\n",
|
|||
|
"8840413 0000-0002-5193-7851 True True True \n",
|
|||
|
"\n",
|
|||
|
" given names family name biography other names \\\n",
|
|||
|
"8840413 Andrea Mannocci [] \n",
|
|||
|
"\n",
|
|||
|
" researcher urls \\\n",
|
|||
|
"8840413 [[Personal website, https://andremann.github.i... \n",
|
|||
|
"\n",
|
|||
|
" primary email other emails \\\n",
|
|||
|
"8840413 andrea.mannocci@isti.cnr.it [] \n",
|
|||
|
"\n",
|
|||
|
" keywords \\\n",
|
|||
|
"8840413 [Data science , science of science, scholarly ... \n",
|
|||
|
"\n",
|
|||
|
" external identifiers \\\n",
|
|||
|
"8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
|
|||
|
"\n",
|
|||
|
" education \\\n",
|
|||
|
"8840413 [[Information engineering, Ph.D., Università d... \n",
|
|||
|
"\n",
|
|||
|
" employments number of works \\\n",
|
|||
|
"8840413 [[Research Associate, Istituto di Scienza e Te... 37 \n",
|
|||
|
"\n",
|
|||
|
" works source \\\n",
|
|||
|
"8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... \n",
|
|||
|
"\n",
|
|||
|
" url_domains \\\n",
|
|||
|
"8840413 [github.io, twitter.com, linkedin.com] \n",
|
|||
|
"\n",
|
|||
|
" concat label \n",
|
|||
|
"8840413 0000-0002-5193-7851\n",
|
|||
|
"Andrea Mannocci\n",
|
|||
|
"\n",
|
|||
|
"andrea.ma... True "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 20,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df[df['orcid'] == AM]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>orcid</th>\n",
|
|||
|
" <th>claimed</th>\n",
|
|||
|
" <th>verifyed email</th>\n",
|
|||
|
" <th>verified primary email</th>\n",
|
|||
|
" <th>given names</th>\n",
|
|||
|
" <th>family name</th>\n",
|
|||
|
" <th>biography</th>\n",
|
|||
|
" <th>other names</th>\n",
|
|||
|
" <th>researcher urls</th>\n",
|
|||
|
" <th>primary email</th>\n",
|
|||
|
" <th>other emails</th>\n",
|
|||
|
" <th>keywords</th>\n",
|
|||
|
" <th>external identifiers</th>\n",
|
|||
|
" <th>education</th>\n",
|
|||
|
" <th>employments</th>\n",
|
|||
|
" <th>number of works</th>\n",
|
|||
|
" <th>works source</th>\n",
|
|||
|
" <th>url_domains</th>\n",
|
|||
|
" <th>concat</th>\n",
|
|||
|
" <th>label</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>14</th>\n",
|
|||
|
" <td>0000-0001-5004-7761</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>True</td>\n",
|
|||
|
" <td>scaffolding</td>\n",
|
|||
|
" <td>hire</td>\n",
|
|||
|
" <td></td>\n",
|
|||
|
" <td>[The first feature that you have to check in t...</td>\n",
|
|||
|
" <td>[[scaffolding hire Wellington, https://www.tig...</td>\n",
|
|||
|
" <td></td>\n",
|
|||
|
" <td>[]</td>\n",
|
|||
|
" <td>[scaffolding hire Wellington]</td>\n",
|
|||
|
" <td><NA></td>\n",
|
|||
|
" <td>[]</td>\n",
|
|||
|
" <td>[]</td>\n",
|
|||
|
" <td>0</td>\n",
|
|||
|
" <td><NA></td>\n",
|
|||
|
" <td>[tigerscaffolds.co.nz]</td>\n",
|
|||
|
" <td>0000-0001-5004-7761\n",
|
|||
|
"scaffolding hire\n",
|
|||
|
"The first...</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" orcid claimed verifyed email verified primary email \\\n",
|
|||
|
"14 0000-0001-5004-7761 True True True \n",
|
|||
|
"\n",
|
|||
|
" given names family name biography \\\n",
|
|||
|
"14 scaffolding hire \n",
|
|||
|
"\n",
|
|||
|
" other names \\\n",
|
|||
|
"14 [The first feature that you have to check in t... \n",
|
|||
|
"\n",
|
|||
|
" researcher urls primary email \\\n",
|
|||
|
"14 [[scaffolding hire Wellington, https://www.tig... \n",
|
|||
|
"\n",
|
|||
|
" other emails keywords external identifiers education \\\n",
|
|||
|
"14 [] [scaffolding hire Wellington] <NA> [] \n",
|
|||
|
"\n",
|
|||
|
" employments number of works works source url_domains \\\n",
|
|||
|
"14 [] 0 <NA> [tigerscaffolds.co.nz] \n",
|
|||
|
"\n",
|
|||
|
" concat label \n",
|
|||
|
"14 0000-0001-5004-7761\n",
|
|||
|
"scaffolding hire\n",
|
|||
|
"The first... NaN "
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 21,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df[df['orcid'] == SCAFFOLD]"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"<div>\n",
|
|||
|
"<style scoped>\n",
|
|||
|
" .dataframe tbody tr th:only-of-type {\n",
|
|||
|
" vertical-align: middle;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe tbody tr th {\n",
|
|||
|
" vertical-align: top;\n",
|
|||
|
" }\n",
|
|||
|
"\n",
|
|||
|
" .dataframe thead th {\n",
|
|||
|
" text-align: right;\n",
|
|||
|
" }\n",
|
|||
|
"</style>\n",
|
|||
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|||
|
" <thead>\n",
|
|||
|
" <tr style=\"text-align: right;\">\n",
|
|||
|
" <th></th>\n",
|
|||
|
" <th>concat</th>\n",
|
|||
|
" <th>label</th>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </thead>\n",
|
|||
|
" <tbody>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>0</th>\n",
|
|||
|
" <td>0000-0001-5000-2053\n",
|
|||
|
"Jorge Jaramillo Sanchez \n",
|
|||
|
"...</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>1</th>\n",
|
|||
|
" <td>0000-0001-5000-6548\n",
|
|||
|
"Wiseman Bekelesi</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>2</th>\n",
|
|||
|
" <td>0000-0001-5000-7962\n",
|
|||
|
"ALICE INDIMULI</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>3</th>\n",
|
|||
|
" <td>0000-0001-5000-8586\n",
|
|||
|
"shim ji yun</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>4</th>\n",
|
|||
|
" <td>0000-0001-5001-0256\n",
|
|||
|
"Sandro Caramaschi</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>...</th>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" <td>...</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10747035</th>\n",
|
|||
|
" <td>0000-0003-4998-1551\n",
|
|||
|
"Animesh Ghosh</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10747036</th>\n",
|
|||
|
" <td>0000-0003-4998-4111\n",
|
|||
|
"Hawa Liberna</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10747037</th>\n",
|
|||
|
" <td>0000-0003-4998-6045\n",
|
|||
|
"Tongyi Men</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10747038</th>\n",
|
|||
|
" <td>0000-0003-4998-8868\n",
|
|||
|
"Charldon Wilken</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" <tr>\n",
|
|||
|
" <th>10747039</th>\n",
|
|||
|
" <td>0000-0003-4999-7916\n",
|
|||
|
"Tapas Bapu B.R.</td>\n",
|
|||
|
" <td>NaN</td>\n",
|
|||
|
" </tr>\n",
|
|||
|
" </tbody>\n",
|
|||
|
"</table>\n",
|
|||
|
"<p>10747040 rows × 2 columns</p>\n",
|
|||
|
"</div>"
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
" concat label\n",
|
|||
|
"0 0000-0001-5000-2053\n",
|
|||
|
"Jorge Jaramillo Sanchez \n",
|
|||
|
"... NaN\n",
|
|||
|
"1 0000-0001-5000-6548\n",
|
|||
|
"Wiseman Bekelesi\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"2 0000-0001-5000-7962\n",
|
|||
|
"ALICE INDIMULI\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"3 0000-0001-5000-8586\n",
|
|||
|
"shim ji yun\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"4 0000-0001-5001-0256\n",
|
|||
|
"Sandro Caramaschi\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"... ... ...\n",
|
|||
|
"10747035 0000-0003-4998-1551\n",
|
|||
|
"Animesh Ghosh\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"10747036 0000-0003-4998-4111\n",
|
|||
|
"Hawa Liberna\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"10747037 0000-0003-4998-6045\n",
|
|||
|
"Tongyi Men\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"10747038 0000-0003-4998-8868\n",
|
|||
|
"Charldon Wilken\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"10747039 0000-0003-4999-7916\n",
|
|||
|
"Tapas Bapu B.R.\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
"\n",
|
|||
|
" NaN\n",
|
|||
|
"\n",
|
|||
|
"[10747040 rows x 2 columns]"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 25,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"df = df[['orcid', 'concat', 'label']]\n",
|
|||
|
"df"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# Pre-trained spam filter as-is"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 83,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "ModuleNotFoundError",
|
|||
|
"evalue": "No module named 'seaborn'",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
|
|||
|
"\u001b[0;32m<ipython-input-83-72f6535bcb9f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m \u001b[0;31m# linear algebra\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m \u001b[0;31m# data processing, CSV file I/O (e.g. pd.read_csv)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mseaborn\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0msns\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmatplotlib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpyplot\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mplt\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mnltk\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorpus\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mstopwords\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'seaborn'"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"import string\n",
|
|||
|
"import torch\n",
|
|||
|
"import transformers\n",
|
|||
|
"import numpy as np # linear algebra\n",
|
|||
|
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
|
|||
|
"import seaborn as sns\n",
|
|||
|
"import matplotlib.pyplot as plt\n",
|
|||
|
"from nltk.corpus import stopwords\n",
|
|||
|
"from sklearn.manifold import TSNE\n",
|
|||
|
"from nltk.tokenize import word_tokenize\n",
|
|||
|
"from sklearn.ensemble import RandomForestClassifier\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n",
|
|||
|
"from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# One-Class SVM"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 41,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
|
|||
|
"from sklearn.svm import OneClassSVM\n",
|
|||
|
"from sklearn.model_selection import train_test_split\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 44,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"name": "stderr",
|
|||
|
"output_type": "stream",
|
|||
|
"text": [
|
|||
|
"<ipython-input-44-8393c5b42618>:3: SettingWithCopyWarning: \n",
|
|||
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|||
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|||
|
"\n",
|
|||
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|||
|
" samples['features'] = vectorizer.fit_transform(samples['concat'])\n"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"samples = df[df['label'] == True]\n",
|
|||
|
"vectorizer = TfidfVectorizer()\n",
|
|||
|
"samples['features'] = vectorizer.fit_transform(samples['concat'])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/plain": [
|
|||
|
"5 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"13 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"24 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"26 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"29 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
" ... \n",
|
|||
|
"10747024 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"10747026 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"10747027 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"10747030 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"10747034 (0, 983769)\\t0.04916990678988556\\n (0, 1177...\n",
|
|||
|
"Name: features, Length: 2674451, dtype: object"
|
|||
|
]
|
|||
|
},
|
|||
|
"execution_count": 46,
|
|||
|
"metadata": {},
|
|||
|
"output_type": "execute_result"
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"samples['features']"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"trainX, testX, trainy, testy = train_test_split(samples['features'], samples['label'], test_size=0.7, random_state=3, stratify=y)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"model = OneClassSVM(gamma='scale', nu=0.01)\n",
|
|||
|
"model.fit(trainX)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Training"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 118,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"ename": "ValueError",
|
|||
|
"evalue": "could not convert string to float: '0000-0001-5001-4994'",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
|||
|
"\u001b[0;32m<ipython-input-118-78458a59bca9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'label'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_classes.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight, **params)\u001b[0m\n\u001b[1;32m 1374\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1375\u001b[0m \"\"\"\n\u001b[0;32m-> 1376\u001b[0;31m super().fit(X, np.ones(_num_samples(X)),\n\u001b[0m\u001b[1;32m 1377\u001b[0m sample_weight=sample_weight, **params)\n\u001b[1;32m 1378\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0moffset_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_intercept_\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/svm/_base.py\u001b[0m in \u001b[0;36mfit\u001b[0;34m(self, X, y, sample_weight)\u001b[0m\n\u001b[1;32m 167\u001b[0m \u001b[0mcheck_consistent_length\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 168\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 169\u001b[0;31m X, y = self._validate_data(X, y, dtype=np.float64,\n\u001b[0m\u001b[1;32m 170\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'C'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maccept_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m'csr'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 171\u001b[0m accept_large_sparse=False)\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/base.py\u001b[0m in \u001b[0;36m_validate_data\u001b[0;34m(self, X, y, reset, validate_separately, **check_params)\u001b[0m\n\u001b[1;32m 431\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_array\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_y_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 432\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 433\u001b[0;31m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcheck_X_y\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mcheck_params\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 434\u001b[0m \u001b[0mout\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 435\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_X_y\u001b[0;34m(X, y, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, multi_output, ensure_min_samples, ensure_min_features, y_numeric, estimator)\u001b[0m\n\u001b[1;32m 812\u001b[0m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"y cannot be None\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 813\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 814\u001b[0;31m X = check_array(X, accept_sparse=accept_sparse,\n\u001b[0m\u001b[1;32m 815\u001b[0m \u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0maccept_large_sparse\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 816\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0morder\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0morder\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36minner_f\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 61\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m-\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mall_args\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 62\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mextra_args\u001b[0m \u001b[0;34m<=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 63\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 64\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 65\u001b[0m \u001b[0;31m# extra_args > 0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/sklearn/utils/validation.py\u001b[0m in \u001b[0;36mcheck_array\u001b[0;34m(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)\u001b[0m\n\u001b[1;32m 558\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mhas_pd_integer_array\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 559\u001b[0m \u001b[0;31m# If there are any pandas integer extension arrays,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 560\u001b[0;31m \u001b[0marray\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marray\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 561\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 562\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mforce_all_finite\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32min\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'allow-nan'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 5875\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5876\u001b[0m \u001b[0;31m# else, only a single dtype is given\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 5877\u001b[0;31m \u001b[0mnew_data\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_mgr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5878\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_data\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__finalize__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5879\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 629\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mstr\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m\"raise\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 630\u001b[0m ) -> \"BlockManager\":\n\u001b[0;32m--> 631\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"astype\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdtype\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 632\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 633\u001b[0m def convert(\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/blocks.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy, errors)\u001b[0m\n\u001b[1;32m 646\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mis_extension\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 647\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 648\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 649\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mValueError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mTypeError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 650\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0merrors\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"ignore\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py\u001b[0m in \u001b[0;36mastype\u001b[0;34m(self, dtype, copy)\u001b[0m\n\u001b[1;32m 306\u001b[0m \u001b[0mmask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0misna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 307\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 308\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marr\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdtype\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 309\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnan\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 310\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;31mValueError\u001b[0m: could not convert string to float: '0000-0001-5001-4994'"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"model.fit(df[df['label'] == True])"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"## Evaluation"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "markdown",
|
|||
|
"metadata": {},
|
|||
|
"source": [
|
|||
|
"# BERT"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 30,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Language model Databunch\n",
|
|||
|
"from fast_bert.data_lm import BertLMDataBunch\n",
|
|||
|
"# Language model learner\n",
|
|||
|
"from fast_bert.learner_lm import BertLMLearner\n",
|
|||
|
"\n",
|
|||
|
"from pathlib import Path\n",
|
|||
|
"from box import Box\n",
|
|||
|
"\n",
|
|||
|
"import logging\n",
|
|||
|
"logger = logging.getLogger()\n"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 34,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": [
|
|||
|
"# Box is a nice wrapper to create an object from a json dict\n",
|
|||
|
"args = Box({\n",
|
|||
|
" \"seed\": 42,\n",
|
|||
|
" \"task_name\": 'imdb_reviews_lm',\n",
|
|||
|
" \"model_name\": 'roberta-base',\n",
|
|||
|
" \"model_type\": 'roberta',\n",
|
|||
|
" \"train_batch_size\": 16,\n",
|
|||
|
" \"learning_rate\": 4e-5,\n",
|
|||
|
" \"num_train_epochs\": 20,\n",
|
|||
|
" \"fp16\": True,\n",
|
|||
|
" \"fp16_opt_level\": \"O2\",\n",
|
|||
|
" \"warmup_steps\": 1000,\n",
|
|||
|
" \"logging_steps\": 0,\n",
|
|||
|
" \"max_seq_length\": 512,\n",
|
|||
|
" \"multi_gpu\": False\n",
|
|||
|
"})\n",
|
|||
|
"\n",
|
|||
|
"DATA_PATH = Path('../data/processed')\n",
|
|||
|
"LOG_PATH = Path('../logs')\n",
|
|||
|
"MODEL_PATH = Path('../models/lm_model_{}/'.format(args.model_type))\n",
|
|||
|
"\n",
|
|||
|
"DATA_PATH.mkdir(exist_ok=True)\n",
|
|||
|
"MODEL_PATH.mkdir(exist_ok=True)\n",
|
|||
|
"LOG_PATH.mkdir(exist_ok=True)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": 35,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [
|
|||
|
{
|
|||
|
"data": {
|
|||
|
"text/html": [
|
|||
|
"\n",
|
|||
|
" <div>\n",
|
|||
|
" <style>\n",
|
|||
|
" /* Turns off some styling */\n",
|
|||
|
" progress {\n",
|
|||
|
" /* gets rid of default border in Firefox and Opera. */\n",
|
|||
|
" border: none;\n",
|
|||
|
" /* Needs to be in here for Safari polyfill so background images work as expected. */\n",
|
|||
|
" background-size: auto;\n",
|
|||
|
" }\n",
|
|||
|
" .progress-bar-interrupted, .progress-bar-interrupted::-webkit-progress-bar {\n",
|
|||
|
" background: #F44336;\n",
|
|||
|
" }\n",
|
|||
|
" </style>\n",
|
|||
|
" <progress value='123222' class='' max='9672336' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
|
|||
|
" 1.27% [123222/9672336 00:03<04:04]\n",
|
|||
|
" </div>\n",
|
|||
|
" "
|
|||
|
],
|
|||
|
"text/plain": [
|
|||
|
"<IPython.core.display.HTML object>"
|
|||
|
]
|
|||
|
},
|
|||
|
"metadata": {},
|
|||
|
"output_type": "display_data"
|
|||
|
},
|
|||
|
{
|
|||
|
"ename": "UnicodeEncodeError",
|
|||
|
"evalue": "'utf-8' codec can't encode characters in position 162-163: surrogates not allowed",
|
|||
|
"output_type": "error",
|
|||
|
"traceback": [
|
|||
|
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
|||
|
"\u001b[0;31mUnicodeEncodeError\u001b[0m Traceback (most recent call last)",
|
|||
|
"\u001b[0;32m<ipython-input-35-702486b1b84c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m databunch_lm = BertLMDataBunch.from_raw_corpus(\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mdata_dir\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mDATA_PATH\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mtext_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'concat'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmodel_name\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mbatch_size_per_gpu\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0margs\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtrain_batch_size\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mfrom_raw_corpus\u001b[0;34m(data_dir, text_list, tokenizer, batch_size_per_gpu, max_seq_length, multi_gpu, test_size, model_type, logger, clear_cache, no_cache)\u001b[0m\n\u001b[1;32m 191\u001b[0m )\n\u001b[1;32m 192\u001b[0m \u001b[0;31m# Create train corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 193\u001b[0;31m \u001b[0mcreate_corpus\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain_list\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_dir\u001b[0m \u001b[0;34m/\u001b[0m \u001b[0mtrain_file\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlogger\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 194\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 195\u001b[0m \u001b[0;31m# Create val corpus\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/fast_bert/data_lm.py\u001b[0m in \u001b[0;36mcreate_corpus\u001b[0;34m(text_list, target_path, logger)\u001b[0m\n\u001b[1;32m 87\u001b[0m \u001b[0mtext\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtext\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mstrip\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 88\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 89\u001b[0;31m \u001b[0mf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwrite\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m\"\\n\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 90\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 91\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
|
|||
|
"\u001b[0;31mUnicodeEncodeError\u001b[0m: 'utf-8' codec can't encode characters in position 162-163: surrogates not allowed"
|
|||
|
]
|
|||
|
}
|
|||
|
],
|
|||
|
"source": [
|
|||
|
"databunch_lm = BertLMDataBunch.from_raw_corpus(\n",
|
|||
|
" data_dir=DATA_PATH,\n",
|
|||
|
" text_list=df['concat'],\n",
|
|||
|
" tokenizer=args.model_name,\n",
|
|||
|
" batch_size_per_gpu=args.train_batch_size,\n",
|
|||
|
" max_seq_length=args.max_seq_length,\n",
|
|||
|
" multi_gpu=args.multi_gpu,\n",
|
|||
|
" model_type=args.model_type,\n",
|
|||
|
" logger=logger)"
|
|||
|
]
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
},
|
|||
|
{
|
|||
|
"cell_type": "code",
|
|||
|
"execution_count": null,
|
|||
|
"metadata": {},
|
|||
|
"outputs": [],
|
|||
|
"source": []
|
|||
|
}
|
|||
|
],
|
|||
|
"metadata": {
|
|||
|
"kernelspec": {
|
|||
|
"display_name": "Python 3",
|
|||
|
"language": "python",
|
|||
|
"name": "python3"
|
|||
|
},
|
|||
|
"language_info": {
|
|||
|
"codemirror_mode": {
|
|||
|
"name": "ipython",
|
|||
|
"version": 3
|
|||
|
},
|
|||
|
"file_extension": ".py",
|
|||
|
"mimetype": "text/x-python",
|
|||
|
"name": "python",
|
|||
|
"nbconvert_exporter": "python",
|
|||
|
"pygments_lexer": "ipython3",
|
|||
|
"version": "3.8.3"
|
|||
|
}
|
|||
|
},
|
|||
|
"nbformat": 4,
|
|||
|
"nbformat_minor": 4
|
|||
|
}
|