fake-orcid-analysis/03-Machine Learning.ipynb at 8288d877fa7113f8f5efe25e279fa79f1d581381

Todo in data

Column names should not contain spaces
If a list is empty, serialise it as [] in the CSV
If a string is empty, serialise it as '' in the CSV

In [ ]:

import ast
from urllib.parse import urlparse

import pandas as pd
import tldextract
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import OneClassSVM

In [3]:

# Load the processed feature table from its pickle file.
features_path = '../data/processed/features.pkl'
df = pd.read_pickle(features_path)

In [4]:

# Preview the loaded frame; the recorded output below is a truncated view
# (first and last rows) of roughly 11 million rows by 14 columns.
df

Out[4]:

	verified_email	verified_primary_email	n_works	n_doi	n_arxiv	n_pmc	n_other_pids	n_emails	n_urls	n_ids	n_keywords	n_employment	n_education	label
0	0	0	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0
1	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	NaN	0
2	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0
3	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	NaN	0
4	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	2.0	NaN	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10989644	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	2.0	0
10989645	1	1	7	7	0	1	0	NaN	NaN	NaN	NaN	2.0	2.0	1
10989646	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0
10989647	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	2.0	0
10989648	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0

10989649 rows × 14 columns

In [10]:

# Replace every missing value with 0 (the preview shows NaN in several of the
# n_* count columns), so the frame contains no NaN afterwards.
df = df.fillna(value=0)

In [11]:

# Per-column non-null counts for the rows whose label is 1.
df.loc[df['label'] == 1].count()

Out[11]:

verified_email            2664886
verified_primary_email    2664886
n_works                   2664886
n_doi                     2664886
n_arxiv                   2664886
n_pmc                     2664886
n_other_pids              2664886
n_emails                  2664886
n_urls                    2664886
n_ids                     2664886
n_keywords                2664886
n_employment              2664886
n_education               2664886
label                     2664886
dtype: int64

In [12]:

# Per-column non-null counts for the rows whose label is 0.
df.loc[df['label'] == 0].count()

Out[12]:

verified_email            8324763
verified_primary_email    8324763
n_works                   8324763
n_doi                     8324763
n_arxiv                   8324763
n_pmc                     8324763
n_other_pids              8324763
n_emails                  8324763
n_urls                    8324763
n_ids                     8324763
n_keywords                8324763
n_employment              8324763
n_education               8324763
label                     8324763
dtype: int64

In [ ]:

# Separate the feature columns (verified_email through n_education) from the target.
X = df.loc[:, 'verified_email':'n_education']
y = df.loc[:, 'label']

# Hold out half of the rows; stratifying on y keeps the label proportions
# the same in both halves, and the fixed random_state makes the split repeatable.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.5,
    random_state=2,
)

# One-class SVM used as the outlier detector.
model = OneClassSVM(nu=0.01, gamma='scale')

# Train only on the rows labelled 1, i.e. the class the model should treat as inliers.
# NOTE(review): the counts printed earlier show label == 1 is the smaller class
# (about 2.66M vs 8.32M rows), so this is not the majority class -- confirm
# which label is meant to be the inlier.
trainX = trainX.loc[trainy == 1]
model.fit(trainX)

In [ ]:

# Predict on the held-out rows; OneClassSVM.predict returns +1 for inliers
# and -1 for outliers.
yhat = model.predict(testX)

# Re-encode the ground truth to the same convention: label 0 becomes -1
# (outlier) and label 1 stays 1 (inlier). A new Series is built and rebound to
# `testy` instead of assigning through a boolean mask, so the object returned
# by train_test_split is not modified in place and re-running the cell gives
# the same result.
testy = testy.where(testy != 0, -1)

# F1 score computed with the outlier class (-1) as the positive label.
score = f1_score(testy, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)

14 KiB Raw Blame History Unescape Escape

14 KiB

Raw Blame History