14 KiB
14 KiB
TODO (data):
- Column names: remove spaces
- If a list is empty, serialise it as [] in the CSV
- If a string is empty, serialise it as '' in the CSV
In [ ]:
import ast
from urllib.parse import urlparse

import pandas as pd
import tldextract
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import OneClassSVM
In [3]:
# Load the pre-computed feature table.
# NOTE: unpickling can execute arbitrary code -- only load pickle files from a trusted source.
features_path = '../data/processed/features.pkl'
df = pd.read_pickle(features_path)
In [4]:
# Display the loaded frame as the cell output (pandas truncates long/wide frames when rendering).
df
Out[4]:
In [10]:
# Replace every missing value with 0, in all columns.
# NOTE(review): this also applies to `label`, so a row with a missing label would be
# counted as label 0 -- confirm that the label column has no missing values.
df = df.fillna(value=0)
In [11]:
# Per-column count of non-null values among the rows whose label is 1.
df.loc[df['label'] == 1].count()
Out[11]:
In [12]:
# Per-column count of non-null values among the rows whose label is 0.
df.loc[df['label'] == 0].count()
Out[12]:
In [ ]:
# Target vector and feature matrix. The features are every column from
# 'verified_email' through 'n_education' (label-based slice, both ends inclusive).
y = df['label']
X = df.loc[:, 'verified_email':'n_education']

# Stratified 50/50 train/test split with a fixed seed so the split is reproducible.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=2,
    stratify=y,
)

# Outlier detector: one-class SVM.
model = OneClassSVM(nu=0.01, gamma='scale')

# The detector is fitted only on the training rows labelled 1 (treated as the inlier class).
trainX = trainX.loc[trainy == 1]
model.fit(trainX)
In [ ]:
# Detect outliers in the test set. OneClassSVM.predict returns +1 for inliers
# and -1 for outliers.
yhat = model.predict(testX)

# Re-encode the ground truth to the same convention: label 0 becomes -1, every
# other value (i.e. label 1) is kept as-is. `Series.where` builds a new Series,
# so the object returned by train_test_split is not modified in place.
testy = testy.where(testy != 0, -1)

# F1 score with the outlier class (-1) as the positive class.
# Requires `f1_score` from sklearn.metrics.
score = f1_score(testy, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)