first tries with rudimental ML

This commit is contained in:
Andrea Mannocci 2021-03-26 16:01:46 +01:00
parent 31209807a8
commit 83e2005c0e
2 changed files with 610 additions and 66 deletions

File diff suppressed because one or more lines are too long

View File

@ -1,18 +1,8 @@
{ {
"cells": [ "cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Todo in data\n",
"- Column names -> no space\n",
"- If a list is empty, serialise [] in the csv\n",
"- If a string is empty, serialise '' in the csv"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 2,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
@ -23,7 +13,8 @@
"import pandas as pd\n", "import pandas as pd\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import OneClassSVM \n", "from sklearn.svm import OneClassSVM \n",
"from sklearn.model_selection import train_test_split" "from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import f1_score\n"
] ]
}, },
{ {
@ -324,16 +315,303 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 10, "execution_count": 5,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"df = df.fillna(0)" "df.fillna(0, inplace=True)"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 11, "execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>n_keywords</th>\n",
" <th>n_employment</th>\n",
" <th>n_education</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 0 0 0 0 0 \n",
"1 1 1 0 0 0 \n",
"2 1 1 0 0 0 \n",
"3 1 1 0 0 0 \n",
"4 1 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 1 1 0 0 0 \n",
"10989645 1 1 7 7 0 \n",
"10989646 1 1 0 0 0 \n",
"10989647 1 1 0 0 0 \n",
"10989648 1 1 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
"0 0 0 0.0 0.0 0.0 0.0 \n",
"1 0 0 0.0 0.0 0.0 0.0 \n",
"2 0 0 0.0 0.0 0.0 0.0 \n",
"3 0 0 0.0 0.0 0.0 0.0 \n",
"4 0 0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... \n",
"10989644 0 0 0.0 0.0 0.0 0.0 \n",
"10989645 1 0 0.0 0.0 0.0 0.0 \n",
"10989646 0 0 0.0 0.0 0.0 0.0 \n",
"10989647 0 0 0.0 0.0 0.0 0.0 \n",
"10989648 0 0 0.0 0.0 0.0 0.0 \n",
"\n",
" n_employment n_education label \n",
"0 0.0 0.0 0 \n",
"1 1.0 0.0 0 \n",
"2 0.0 0.0 0 \n",
"3 1.0 0.0 0 \n",
"4 2.0 0.0 0 \n",
"... ... ... ... \n",
"10989644 1.0 2.0 0 \n",
"10989645 2.0 2.0 1 \n",
"10989646 0.0 0.0 0 \n",
"10989647 1.0 2.0 0 \n",
"10989648 0.0 0.0 0 \n",
"\n",
"[10989649 rows x 14 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -356,7 +634,7 @@
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 11, "execution_count": 7,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -367,7 +645,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"outputs": [ "outputs": [
{ {
@ -390,7 +668,7 @@
"dtype: int64" "dtype: int64"
] ]
}, },
"execution_count": 12, "execution_count": 8,
"metadata": {}, "metadata": {},
"output_type": "execute_result" "output_type": "execute_result"
} }
@ -401,17 +679,299 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": 10,
"metadata": {}, "metadata": {},
"outputs": [], "outputs": [],
"source": [ "source": [
"# split into train/test sets\n", "# split into train/test sets\n",
"X = df.loc[:,'verified_email':'n_education']\n", "X = df.loc[:,'verified_email':'n_education']\n",
"y = df['label']\n", "y = df['label']\n",
"trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n", "trainX, testX, trainy, testy = train_test_split(X, y, train_size=0.5, random_state=2, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n", "\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>n_keywords</th>\n",
" <th>n_employment</th>\n",
" <th>n_education</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6325067</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6140551</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3258315</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10948983</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10089158</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2398808</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3622839</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1389679</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6594722</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4589084</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5494824 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"6325067 1 1 0 0 0 \n",
"6140551 1 1 0 0 0 \n",
"3258315 1 1 0 0 0 \n",
"10948983 1 1 0 0 0 \n",
"10089158 1 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"2398808 0 0 1 1 0 \n",
"3622839 1 1 0 0 0 \n",
"1389679 1 1 0 0 0 \n",
"6594722 0 0 0 0 0 \n",
"4589084 1 1 11 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
"6325067 0 0 0.0 0.0 1.0 0.0 \n",
"6140551 0 0 0.0 0.0 0.0 0.0 \n",
"3258315 0 0 0.0 1.0 0.0 2.0 \n",
"10948983 0 0 0.0 0.0 0.0 0.0 \n",
"10089158 0 0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... \n",
"2398808 0 0 0.0 0.0 0.0 0.0 \n",
"3622839 0 0 0.0 0.0 0.0 0.0 \n",
"1389679 0 0 0.0 0.0 0.0 0.0 \n",
"6594722 0 0 0.0 0.0 0.0 0.0 \n",
"4589084 0 0 0.0 0.0 0.0 0.0 \n",
"\n",
" n_employment n_education \n",
"6325067 1.0 1.0 \n",
"6140551 0.0 0.0 \n",
"3258315 0.0 0.0 \n",
"10948983 1.0 0.0 \n",
"10089158 0.0 0.0 \n",
"... ... ... \n",
"2398808 0.0 0.0 \n",
"3622839 1.0 1.0 \n",
"1389679 0.0 0.0 \n",
"6594722 0.0 0.0 \n",
"4589084 1.0 0.0 \n",
"\n",
"[5494824 rows x 13 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"trainX"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define outlier detection model\n", "# define outlier detection model\n",
"model = OneClassSVM(gamma='scale', nu=0.01)\n", "model = OneClassSVM(gamma='scale', nu=0.5)\n",
"\n", "\n",
"# fit on majority class\n", "# fit on majority class\n",
"trainX = trainX[trainy==1]\n", "trainX = trainX[trainy==1]\n",