First attempts with rudimentary ML

This commit is contained in:
Andrea Mannocci 2021-03-26 09:16:11 +01:00
parent 8e159607ea
commit 8288d877fa
3 changed files with 501 additions and 2424 deletions

View File

@ -764,7 +764,7 @@
}
],
"source": [
"df.count() #10916574"
"df.count()"
]
},
{
@ -16260,6 +16260,28 @@
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [],
"source": [
"df[['verified_email', \n",
" 'verified_primary_email', \n",
" 'n_works', \n",
" 'n_doi',\n",
" 'n_arxiv', \n",
" 'n_pmc', \n",
" 'n_other_pids', \n",
" 'n_emails', \n",
" 'n_urls', \n",
" 'n_ids', \n",
" 'n_keywords', \n",
" 'n_employment', \n",
" 'n_education', \n",
" 'label']].to_pickle('../data/processed/features.pkl')"
]
},
{
"cell_type": "markdown",
"metadata": {},
@ -16884,12 +16906,21 @@
"# (df.n_works > 0) & (df.n_ids > 1)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Serialise "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
"source": [
" "
]
}
],
"metadata": {

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,468 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Todo in data\n",
"- Column names -> no space\n",
"- If a list is empty, serialise [] in the csv\n",
"- If a string is empty, serialise '' in the csv"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from urllib.parse import urlparse\n",
"\n",
"import pandas as pd\n",
"import tldextract\n",
"from sklearn.metrics import f1_score\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import OneClassSVM"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_pickle('../data/processed/features.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>n_keywords</th>\n",
" <th>n_employment</th>\n",
" <th>n_education</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 0 0 0 0 0 \n",
"1 1 1 0 0 0 \n",
"2 1 1 0 0 0 \n",
"3 1 1 0 0 0 \n",
"4 1 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 1 1 0 0 0 \n",
"10989645 1 1 7 7 0 \n",
"10989646 1 1 0 0 0 \n",
"10989647 1 1 0 0 0 \n",
"10989648 1 1 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
"0 0 0 NaN NaN NaN NaN \n",
"1 0 0 NaN NaN NaN NaN \n",
"2 0 0 NaN NaN NaN NaN \n",
"3 0 0 NaN NaN NaN NaN \n",
"4 0 0 NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... \n",
"10989644 0 0 NaN NaN NaN NaN \n",
"10989645 1 0 NaN NaN NaN NaN \n",
"10989646 0 0 NaN NaN NaN NaN \n",
"10989647 0 0 NaN NaN NaN NaN \n",
"10989648 0 0 NaN NaN NaN NaN \n",
"\n",
" n_employment n_education label \n",
"0 NaN NaN 0 \n",
"1 1.0 NaN 0 \n",
"2 NaN NaN 0 \n",
"3 1.0 NaN 0 \n",
"4 2.0 NaN 0 \n",
"... ... ... ... \n",
"10989644 1.0 2.0 0 \n",
"10989645 2.0 2.0 1 \n",
"10989646 NaN NaN 0 \n",
"10989647 1.0 2.0 0 \n",
"10989648 NaN NaN 0 \n",
"\n",
"[10989649 rows x 14 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"df = df.fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"verified_email 2664886\n",
"verified_primary_email 2664886\n",
"n_works 2664886\n",
"n_doi 2664886\n",
"n_arxiv 2664886\n",
"n_pmc 2664886\n",
"n_other_pids 2664886\n",
"n_emails 2664886\n",
"n_urls 2664886\n",
"n_ids 2664886\n",
"n_keywords 2664886\n",
"n_employment 2664886\n",
"n_education 2664886\n",
"label 2664886\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 1].count()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"verified_email 8324763\n",
"verified_primary_email 8324763\n",
"n_works 8324763\n",
"n_doi 8324763\n",
"n_arxiv 8324763\n",
"n_pmc 8324763\n",
"n_other_pids 8324763\n",
"n_emails 8324763\n",
"n_urls 8324763\n",
"n_ids 8324763\n",
"n_keywords 8324763\n",
"n_employment 8324763\n",
"n_education 8324763\n",
"label 8324763\n",
"dtype: int64"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 0].count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split features and labels into two stratified halves (fixed seed for reproducibility).\n",
"X = df.loc[:, 'verified_email':'n_education']\n",
"y = df['label']\n",
"trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n",
"\n",
"# One-class SVM used as an outlier detector; nu upper-bounds the fraction of\n",
"# training points that may be treated as outliers.\n",
"model = OneClassSVM(gamma='scale', nu=0.01)\n",
"\n",
"# Fit only on the rows labelled 1, so the model learns that class as the 'inliers'.\n",
"# NOTE: label == 1 is the *minority* class in this dataset (see the per-label counts\n",
"# above), not the majority class.\n",
"# The filtered frame gets its own name so that trainX stays aligned with trainy.\n",
"trainX_inliers = trainX[trainy == 1]\n",
"model.fit(trainX_inliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Predict on the held-out half; OneClassSVM.predict returns +1 for inliers and -1 for outliers.\n",
"yhat = model.predict(testX)\n",
"\n",
"# Re-encode the ground truth in the same convention (label 0 -> -1, label 1 stays 1).\n",
"# This works on a copy: testy itself is left untouched, which avoids assigning through a\n",
"# boolean mask on a split result and keeps the cell safe to re-run.\n",
"testy_signed = testy.replace(0, -1)\n",
"\n",
"# F1 computed with the outlier class (-1, i.e. original label 0) as the positive class.\n",
"score = f1_score(testy_signed, yhat, pos_label=-1)\n",
"print('F1 Score: %.3f' % score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}