33 KiB
33 KiB
In [2]:
import ast
from urllib.parse import urlparse
import tldextract
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
In [3]:
# Load the feature table from the processed-data directory.
# NOTE(review): unpickling can execute arbitrary code, so only read this file
# when it comes from a trusted source.
FEATURES_PATH = '../data/processed/features.pkl'
df = pd.read_pickle(FEATURES_PATH)
In [4]:
# Preview the feature table exactly as it was loaded.
df
Out[4]:
In [5]:
# Replace every missing value in the table with 0. The result is rebound
# rather than written with `inplace=True`: the outcome for later cells is the
# same, but the cell stays safe to re-run and avoids the in-place pattern that
# pandas discourages.
# NOTE(review): this fills *all* columns, so any missing `label` values would
# become class 0 — confirm that is intended.
df = df.fillna(0)
In [6]:
# Display the table again to check it after the missing-value fill.
df
Out[6]:
In [7]:
# Per-column non-null counts for the rows whose label is 1.
df.loc[df['label'] == 1].count()
Out[7]:
In [8]:
# Per-column non-null counts for the rows whose label is 0.
df.loc[df['label'] == 0].count()
Out[8]:
In [10]:
# Build the feature matrix and the target vector, then split them 50/50.
# The features are the contiguous column range 'verified_email' through
# 'n_education' (label-based slices include both end columns).
X = df.loc[:, 'verified_email':'n_education']
y = df.loc[:, 'label']
# Stratifying on the label keeps the class proportions equal in both halves;
# the fixed random_state makes the split reproducible.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    train_size=0.5,
    stratify=y,
    random_state=2,
)
In [11]:
# Inspect the training half of the feature matrix.
trainX
Out[11]:
In [ ]:
# Define the outlier-detection model. For a one-class SVM, `nu` is an upper
# bound on the fraction of training errors and a lower bound on the fraction
# of support vectors; gamma='scale' derives the kernel coefficient from the
# training data.
model = OneClassSVM(gamma='scale', nu=0.5)
# Fit only on the rows labelled 1 (the class the model should learn as
# "normal"). The subset gets its own name so the original `trainX` split is
# not overwritten: the cell can be re-run without relying on implicit index
# realignment, and earlier displays of `trainX` stay accurate.
trainX_inliers = trainX.loc[trainy == 1]
model.fit(trainX_inliers)
In [ ]:
# Detect outliers in the test set. OneClassSVM.predict returns +1 for inliers
# and -1 for outliers.
yhat = model.predict(testX)
# Re-encode the ground truth to the same convention (label 0 -> -1, label 1
# stays 1). The mapping is written to a new Series so the original `testy`
# labels are not mutated, and the former no-op `testy[testy == 1] = 1` is
# dropped.
testy_signed = testy.replace(0, -1)
# F1 score with the outlier class (-1) treated as the positive class.
score = f1_score(testy_signed, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)