first tries with rudimental ML

2021-03-26 09:16:11 +01:00 · 2021-03-26 09:16:11 +01:00 · 8288d877fa
parent 8e159607ea
commit 8288d877fa
3 changed files with 501 additions and 2424 deletions
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@ -764,7 +764,7 @@
    }
   ],
   "source": [
-    "df.count() #10916574"
+    "df.count()"
   ]
  },
  {
@ -16260,6 +16260,28 @@
    "fig.show()"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Persist the model inputs plus the target column 'label' for the modelling notebook.\n",
+    "# Column order matters: notebook 03 selects its features with the label slice\n",
+    "# 'verified_email':'n_education', so 'label' has to stay outside that range.\n",
+    "(\n",
+    "    df[[\n",
+    "        'verified_email', 'verified_primary_email',\n",
+    "        'n_works', 'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids',\n",
+    "        'n_emails', 'n_urls', 'n_ids', 'n_keywords',\n",
+    "        'n_employment', 'n_education',\n",
+    "        'label',\n",
+    "    ]]\n",
+    "    .to_pickle('../data/processed/features.pkl')\n",
+    ")"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@ -16884,12 +16906,21 @@
    "# (df.n_works > 0) & (df.n_ids > 1)"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Serialise "
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
-   "source": []
+   "source": [
+    " "
+   ]
  }
 ],
 "metadata": {
--- a/notebooks/03-Feature
+++ b/notebooks/03-Feature
--- a/notebooks/03-Machine
+++ b/notebooks/03-Machine
@ -0,0 +1,468 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "TODO (data preparation):\n",
+    "- Column names: remove spaces\n",
+    "- Empty lists: serialise as `[]` in the CSV\n",
+    "- Empty strings: serialise as `''` in the CSV"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import ast\n",
+    "from urllib.parse import urlparse\n",
+    "\n",
+    "import pandas as pd\n",
+    "import tldextract\n",
+    "from sklearn.metrics import f1_score\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.preprocessing import MultiLabelBinarizer\n",
+    "from sklearn.svm import OneClassSVM"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = pd.read_pickle('../data/processed/features.pkl')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>verified_email</th>\n",
+       "      <th>verified_primary_email</th>\n",
+       "      <th>n_works</th>\n",
+       "      <th>n_doi</th>\n",
+       "      <th>n_arxiv</th>\n",
+       "      <th>n_pmc</th>\n",
+       "      <th>n_other_pids</th>\n",
+       "      <th>n_emails</th>\n",
+       "      <th>n_urls</th>\n",
+       "      <th>n_ids</th>\n",
+       "      <th>n_keywords</th>\n",
+       "      <th>n_employment</th>\n",
+       "      <th>n_education</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989644</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989645</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>7</td>\n",
+       "      <td>7</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989646</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989647</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1.0</td>\n",
+       "      <td>2.0</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>10989648</th>\n",
+       "      <td>1</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>10989649 rows × 14 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          verified_email  verified_primary_email  n_works  n_doi  n_arxiv  \\\n",
+       "0                      0                       0        0      0        0   \n",
+       "1                      1                       1        0      0        0   \n",
+       "2                      1                       1        0      0        0   \n",
+       "3                      1                       1        0      0        0   \n",
+       "4                      1                       1        0      0        0   \n",
+       "...                  ...                     ...      ...    ...      ...   \n",
+       "10989644               1                       1        0      0        0   \n",
+       "10989645               1                       1        7      7        0   \n",
+       "10989646               1                       1        0      0        0   \n",
+       "10989647               1                       1        0      0        0   \n",
+       "10989648               1                       1        0      0        0   \n",
+       "\n",
+       "          n_pmc  n_other_pids  n_emails  n_urls  n_ids  n_keywords  \\\n",
+       "0             0             0       NaN     NaN    NaN         NaN   \n",
+       "1             0             0       NaN     NaN    NaN         NaN   \n",
+       "2             0             0       NaN     NaN    NaN         NaN   \n",
+       "3             0             0       NaN     NaN    NaN         NaN   \n",
+       "4             0             0       NaN     NaN    NaN         NaN   \n",
+       "...         ...           ...       ...     ...    ...         ...   \n",
+       "10989644      0             0       NaN     NaN    NaN         NaN   \n",
+       "10989645      1             0       NaN     NaN    NaN         NaN   \n",
+       "10989646      0             0       NaN     NaN    NaN         NaN   \n",
+       "10989647      0             0       NaN     NaN    NaN         NaN   \n",
+       "10989648      0             0       NaN     NaN    NaN         NaN   \n",
+       "\n",
+       "          n_employment  n_education  label  \n",
+       "0                  NaN          NaN      0  \n",
+       "1                  1.0          NaN      0  \n",
+       "2                  NaN          NaN      0  \n",
+       "3                  1.0          NaN      0  \n",
+       "4                  2.0          NaN      0  \n",
+       "...                ...          ...    ...  \n",
+       "10989644           1.0          2.0      0  \n",
+       "10989645           2.0          2.0      1  \n",
+       "10989646           NaN          NaN      0  \n",
+       "10989647           1.0          2.0      0  \n",
+       "10989648           NaN          NaN      0  \n",
+       "\n",
+       "[10989649 rows x 14 columns]"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Replace every NaN with 0. The NaNs displayed above sit in the n_* columns;\n",
+    "# presumably a missing count means 'no entries' -- TODO confirm against the export.\n",
+    "df = df.fillna(value=0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "verified_email            2664886\n",
+       "verified_primary_email    2664886\n",
+       "n_works                   2664886\n",
+       "n_doi                     2664886\n",
+       "n_arxiv                   2664886\n",
+       "n_pmc                     2664886\n",
+       "n_other_pids              2664886\n",
+       "n_emails                  2664886\n",
+       "n_urls                    2664886\n",
+       "n_ids                     2664886\n",
+       "n_keywords                2664886\n",
+       "n_employment              2664886\n",
+       "n_education               2664886\n",
+       "label                     2664886\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Non-null row count per column for label == 1 (the class the one-class model is fitted on below).\n",
+    "df.loc[df['label'] == 1].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "verified_email            8324763\n",
+       "verified_primary_email    8324763\n",
+       "n_works                   8324763\n",
+       "n_doi                     8324763\n",
+       "n_arxiv                   8324763\n",
+       "n_pmc                     8324763\n",
+       "n_other_pids              8324763\n",
+       "n_emails                  8324763\n",
+       "n_urls                    8324763\n",
+       "n_ids                     8324763\n",
+       "n_keywords                8324763\n",
+       "n_employment              8324763\n",
+       "n_education               8324763\n",
+       "label                     8324763\n",
+       "dtype: int64"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# Non-null row count per column for label == 0 (scored as the outlier class, -1, in the evaluation below).\n",
+    "df.loc[df['label'] == 0].count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# split into train/test sets\n",
+    "# split into train/test sets (50/50, stratified on the label, fixed seed for reproducibility)\n",
+    "X = df.loc[:, 'verified_email':'n_education']\n",
+    "y = df['label']\n",
+    "trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n",
+    "\n",
+    "# define outlier detection model\n",
+    "# nu is an upper bound on the fraction of training errors and a lower bound\n",
+    "# on the fraction of support vectors\n",
+    "model = OneClassSVM(gamma='scale', nu=0.01)\n",
+    "\n",
+    "# fit on the inlier class only (label == 1). Per the counts above this is the\n",
+    "# minority class (2,664,886 rows vs 8,324,763 with label == 0), not the majority.\n",
+    "# After this line trainX no longer lines up with trainy.\n",
+    "trainX = trainX[trainy == 1]\n",
+    "model.fit(trainX)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# detect outliers in the test set; predict() returns +1 for inliers and -1 for outliers\n",
+    "yhat = model.predict(testX)\n",
+    "\n",
+    "# recode the ground truth to the same convention: label 0 -> -1 (outlier), label 1 stays 1.\n",
+    "# Series.where returns a new Series, so nothing is assigned into testy in place.\n",
+    "testy = testy.where(testy != 0, -1)\n",
+    "\n",
+    "# F1 of the outlier class (pos_label=-1); f1_score comes from sklearn.metrics\n",
+    "score = f1_score(testy, yhat, pos_label=-1)\n",
+    "print('F1 Score: %.3f' % score)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}