{ "cells": [ { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import ast\n", "from urllib.parse import urlparse\n", "import tldextract\n", "\n", "import pandas as pd\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import OneClassSVM \n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import f1_score\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "df = pd.read_pickle('../data/processed/features.pkl')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_idsn_keywordsn_employmentn_educationlabel
00000000NaNNaNNaNNaNNaNNaN0
11100000NaNNaNNaNNaN1.0NaN0
21100000NaNNaNNaNNaNNaNNaN0
31100000NaNNaNNaNNaN1.0NaN0
41100000NaNNaNNaNNaN2.0NaN0
.............................................
109896441100000NaNNaNNaNNaN1.02.00
109896451177010NaNNaNNaNNaN2.02.01
109896461100000NaNNaNNaNNaNNaNNaN0
109896471100000NaNNaNNaNNaN1.02.00
109896481100000NaNNaNNaNNaNNaNNaN0
\n", "

10989649 rows × 14 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 0 0 0 0 0 \n", "1 1 1 0 0 0 \n", "2 1 1 0 0 0 \n", "3 1 1 0 0 0 \n", "4 1 1 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 1 1 0 0 0 \n", "10989645 1 1 7 7 0 \n", "10989646 1 1 0 0 0 \n", "10989647 1 1 0 0 0 \n", "10989648 1 1 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n", "0 0 0 NaN NaN NaN NaN \n", "1 0 0 NaN NaN NaN NaN \n", "2 0 0 NaN NaN NaN NaN \n", "3 0 0 NaN NaN NaN NaN \n", "4 0 0 NaN NaN NaN NaN \n", "... ... ... ... ... ... ... \n", "10989644 0 0 NaN NaN NaN NaN \n", "10989645 1 0 NaN NaN NaN NaN \n", "10989646 0 0 NaN NaN NaN NaN \n", "10989647 0 0 NaN NaN NaN NaN \n", "10989648 0 0 NaN NaN NaN NaN \n", "\n", " n_employment n_education label \n", "0 NaN NaN 0 \n", "1 1.0 NaN 0 \n", "2 NaN NaN 0 \n", "3 1.0 NaN 0 \n", "4 2.0 NaN 0 \n", "... ... ... ... \n", "10989644 1.0 2.0 0 \n", "10989645 2.0 2.0 1 \n", "10989646 NaN NaN 0 \n", "10989647 1.0 2.0 0 \n", "10989648 NaN NaN 0 \n", "\n", "[10989649 rows x 14 columns]" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "df.fillna(0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_idsn_keywordsn_employmentn_educationlabel
000000000.00.00.00.00.00.00
111000000.00.00.00.01.00.00
211000000.00.00.00.00.00.00
311000000.00.00.00.01.00.00
411000000.00.00.00.02.00.00
.............................................
1098964411000000.00.00.00.01.02.00
1098964511770100.00.00.00.02.02.01
1098964611000000.00.00.00.00.00.00
1098964711000000.00.00.00.01.02.00
1098964811000000.00.00.00.00.00.00
\n", "

10989649 rows × 14 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 0 0 0 0 0 \n", "1 1 1 0 0 0 \n", "2 1 1 0 0 0 \n", "3 1 1 0 0 0 \n", "4 1 1 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 1 1 0 0 0 \n", "10989645 1 1 7 7 0 \n", "10989646 1 1 0 0 0 \n", "10989647 1 1 0 0 0 \n", "10989648 1 1 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n", "0 0 0 0.0 0.0 0.0 0.0 \n", "1 0 0 0.0 0.0 0.0 0.0 \n", "2 0 0 0.0 0.0 0.0 0.0 \n", "3 0 0 0.0 0.0 0.0 0.0 \n", "4 0 0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... \n", "10989644 0 0 0.0 0.0 0.0 0.0 \n", "10989645 1 0 0.0 0.0 0.0 0.0 \n", "10989646 0 0 0.0 0.0 0.0 0.0 \n", "10989647 0 0 0.0 0.0 0.0 0.0 \n", "10989648 0 0 0.0 0.0 0.0 0.0 \n", "\n", " n_employment n_education label \n", "0 0.0 0.0 0 \n", "1 1.0 0.0 0 \n", "2 0.0 0.0 0 \n", "3 1.0 0.0 0 \n", "4 2.0 0.0 0 \n", "... ... ... ... \n", "10989644 1.0 2.0 0 \n", "10989645 2.0 2.0 1 \n", "10989646 0.0 0.0 0 \n", "10989647 1.0 2.0 0 \n", "10989648 0.0 0.0 0 \n", "\n", "[10989649 rows x 14 columns]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "verified_email 2664886\n", "verified_primary_email 2664886\n", "n_works 2664886\n", "n_doi 2664886\n", "n_arxiv 2664886\n", "n_pmc 2664886\n", "n_other_pids 2664886\n", "n_emails 2664886\n", "n_urls 2664886\n", "n_ids 2664886\n", "n_keywords 2664886\n", "n_employment 2664886\n", "n_education 2664886\n", "label 2664886\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 1].count()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "verified_email 8324763\n", "verified_primary_email 8324763\n", "n_works 8324763\n", "n_doi 8324763\n", "n_arxiv 8324763\n", "n_pmc 8324763\n", "n_other_pids 
8324763\n", "n_emails 8324763\n", "n_urls 8324763\n", "n_ids 8324763\n", "n_keywords 8324763\n", "n_employment 8324763\n", "n_education 8324763\n", "label 8324763\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 0].count()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "# Build the feature matrix and the target, then split them into train/test halves.\n", "# Features: every column from 'verified_email' through 'n_education' (label-based, inclusive slice).\n", "X = df.loc[:, 'verified_email':'n_education']\n", "y = df.loc[:, 'label']\n", "\n", "# Stratify on the label so both halves keep the same class proportions;\n", "# random_state pins the shuffle so the split is reproducible.\n", "trainX, testX, trainy, testy = train_test_split(\n", "    X,\n", "    y,\n", "    train_size=0.5,\n", "    random_state=2,\n", "    stratify=y,\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_idsn_keywordsn_employmentn_education
632506711000000.00.01.00.01.01.0
614055111000000.00.00.00.00.00.0
325831511000000.01.00.02.00.00.0
1094898311000000.00.00.00.01.00.0
1008915811000000.00.00.00.00.00.0
..........................................
239880800110000.00.00.00.00.00.0
362283911000000.00.00.00.01.01.0
138967911000000.00.00.00.00.00.0
659472200000000.00.00.00.00.00.0
4589084111100000.00.00.00.01.00.0
\n", "

5494824 rows × 13 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "6325067 1 1 0 0 0 \n", "6140551 1 1 0 0 0 \n", "3258315 1 1 0 0 0 \n", "10948983 1 1 0 0 0 \n", "10089158 1 1 0 0 0 \n", "... ... ... ... ... ... \n", "2398808 0 0 1 1 0 \n", "3622839 1 1 0 0 0 \n", "1389679 1 1 0 0 0 \n", "6594722 0 0 0 0 0 \n", "4589084 1 1 11 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n", "6325067 0 0 0.0 0.0 1.0 0.0 \n", "6140551 0 0 0.0 0.0 0.0 0.0 \n", "3258315 0 0 0.0 1.0 0.0 2.0 \n", "10948983 0 0 0.0 0.0 0.0 0.0 \n", "10089158 0 0 0.0 0.0 0.0 0.0 \n", "... ... ... ... ... ... ... \n", "2398808 0 0 0.0 0.0 0.0 0.0 \n", "3622839 0 0 0.0 0.0 0.0 0.0 \n", "1389679 0 0 0.0 0.0 0.0 0.0 \n", "6594722 0 0 0.0 0.0 0.0 0.0 \n", "4589084 0 0 0.0 0.0 0.0 0.0 \n", "\n", " n_employment n_education \n", "6325067 1.0 1.0 \n", "6140551 0.0 0.0 \n", "3258315 0.0 0.0 \n", "10948983 1.0 0.0 \n", "10089158 0.0 0.0 \n", "... ... ... \n", "2398808 0.0 0.0 \n", "3622839 1.0 1.0 \n", "1389679 0.0 0.0 \n", "6594722 0.0 0.0 \n", "4589084 1.0 0.0 \n", "\n", "[5494824 rows x 13 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainX" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# define outlier detection model\n", "# nu is an upper bound on the fraction of training errors and a lower bound on the\n", "# fraction of support vectors.\n", "model = OneClassSVM(gamma='scale', nu=0.5)\n", "\n", "# Fit on the label == 1 rows only, so the model treats that class as the inliers.\n", "# In this dataset label == 1 is the smaller class (2,664,886 rows vs 8,324,763 with label == 0).\n", "# Keep trainX intact and bind the subset to a new name: re-filtering an already\n", "# filtered trainX with the full-length trainy mask would break a re-run of this cell.\n", "trainX_inliers = trainX[trainy == 1]\n", "\n", "# Kernel one-class SVM training is at best quadratic in the number of samples, so fit on a\n", "# reproducible random subset. Set MAX_FIT_ROWS = None to fit on every label == 1 training row.\n", "MAX_FIT_ROWS = 50000\n", "if MAX_FIT_ROWS is not None and len(trainX_inliers) > MAX_FIT_ROWS:\n", "    trainX_inliers = trainX_inliers.sample(n=MAX_FIT_ROWS, random_state=2)\n", "\n", "model.fit(trainX_inliers)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Score a reproducible, label-stratified subset of the test set: kernel SVM prediction costs\n", "# one kernel evaluation per (row, support vector) pair.\n", "# Set MAX_EVAL_ROWS = None to score every test row.\n", "MAX_EVAL_ROWS = 500000\n", "if MAX_EVAL_ROWS is not None and len(testX) > MAX_EVAL_ROWS:\n", "    evalX, _, evaly, _ = train_test_split(\n", "        testX, testy, train_size=MAX_EVAL_ROWS, random_state=2, stratify=testy\n", "    )\n", "else:\n", "    evalX, evaly = testX, testy\n", "\n", "# detect outliers in the test set (predict returns 1 for inliers, -1 for outliers)\n", "yhat = model.predict(evalX)\n", "\n", "# mark inliers 1, outliers -1: label 1 (the class the model was fit on) -> 1, label 0 -> -1.\n", "# Build a new Series instead of assigning into testy, so testy keeps its original 0/1 labels.\n", "evaly_signed = evaly.map({1: 1, 0: -1})\n", "\n", "# calculate score with the outliers (-1, i.e. label 0) as the positive class\n", "score = f1_score(evaly_signed, yhat, pos_label=-1)\n", "print('F1 Score: %.3f' % score)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": {
"display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }