{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "Todo in data\n", "- Column names -> no space\n", "- If a list is empty, serialise [] in the csv\n", "- If a string is empty, serialise '' in the csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import ast\n", "from urllib.parse import urlparse\n", "import tldextract\n", "\n", "import pandas as pd\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import OneClassSVM \n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.read_pickle('../data/processed/features.pkl')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_idsn_keywordsn_employmentn_educationlabel
00000000NaNNaNNaNNaNNaNNaN0
11100000NaNNaNNaNNaN1.0NaN0
21100000NaNNaNNaNNaNNaNNaN0
31100000NaNNaNNaNNaN1.0NaN0
41100000NaNNaNNaNNaN2.0NaN0
.............................................
109896441100000NaNNaNNaNNaN1.02.00
109896451177010NaNNaNNaNNaN2.02.01
109896461100000NaNNaNNaNNaNNaNNaN0
109896471100000NaNNaNNaNNaN1.02.00
109896481100000NaNNaNNaNNaNNaNNaN0
\n", "

10989649 rows × 14 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 0 0 0 0 0 \n", "1 1 1 0 0 0 \n", "2 1 1 0 0 0 \n", "3 1 1 0 0 0 \n", "4 1 1 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 1 1 0 0 0 \n", "10989645 1 1 7 7 0 \n", "10989646 1 1 0 0 0 \n", "10989647 1 1 0 0 0 \n", "10989648 1 1 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n", "0 0 0 NaN NaN NaN NaN \n", "1 0 0 NaN NaN NaN NaN \n", "2 0 0 NaN NaN NaN NaN \n", "3 0 0 NaN NaN NaN NaN \n", "4 0 0 NaN NaN NaN NaN \n", "... ... ... ... ... ... ... \n", "10989644 0 0 NaN NaN NaN NaN \n", "10989645 1 0 NaN NaN NaN NaN \n", "10989646 0 0 NaN NaN NaN NaN \n", "10989647 0 0 NaN NaN NaN NaN \n", "10989648 0 0 NaN NaN NaN NaN \n", "\n", " n_employment n_education label \n", "0 NaN NaN 0 \n", "1 1.0 NaN 0 \n", "2 NaN NaN 0 \n", "3 1.0 NaN 0 \n", "4 2.0 NaN 0 \n", "... ... ... ... \n", "10989644 1.0 2.0 0 \n", "10989645 2.0 2.0 1 \n", "10989646 NaN NaN 0 \n", "10989647 1.0 2.0 0 \n", "10989648 NaN NaN 0 \n", "\n", "[10989649 rows x 14 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df = df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "verified_email 2664886\n", "verified_primary_email 2664886\n", "n_works 2664886\n", "n_doi 2664886\n", "n_arxiv 2664886\n", "n_pmc 2664886\n", "n_other_pids 2664886\n", "n_emails 2664886\n", "n_urls 2664886\n", "n_ids 2664886\n", "n_keywords 2664886\n", "n_employment 2664886\n", "n_education 2664886\n", "label 2664886\n", "dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 1].count()" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "verified_email 8324763\n", "verified_primary_email 8324763\n", "n_works 8324763\n", "n_doi 8324763\n", "n_arxiv 8324763\n", "n_pmc 8324763\n", "n_other_pids 8324763\n", "n_emails 8324763\n", "n_urls 8324763\n", "n_ids 8324763\n", "n_keywords 8324763\n", "n_employment 8324763\n", "n_education 8324763\n", "label 8324763\n", "dtype: int64" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 0].count()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# split into train/test sets\n", "X = df.loc[:,'verified_email':'n_education']\n", "y = df['label']\n", "trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n", "\n", "# define outlier detection model\n", "model = OneClassSVM(gamma='scale', nu=0.01)\n", "\n", "# fit on majority class\n", "trainX = trainX[trainy==1]\n", "model.fit(trainX)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# detect outliers in the test set\n", "yhat = model.predict(testX)\n", "\n", "# mark inliers 1, outliers -1\n", "testy[testy == 0] = -1\n", "testy[testy == 1] = 1\n", "\n", "# calculate score\n", "score = f1_score(testy, yhat, pos_label=-1)\n", "print('F1 Score: %.3f' % score)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }