fake-orcid-analysis/03-Machine Learning.ipynb at 42ff175d05d56c8226447c13cda8df3ef2585259

In [2]:

import ast
from urllib.parse import urlparse
import tldextract

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import OneClassSVM 
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [3]:

# Load the pre-computed feature table from disk.
# NOTE(review): unpickling can execute arbitrary code, so only read files
# produced by a trusted step of this project (presumably an earlier notebook
# in this series; confirm).
features_path = '../data/processed/features.pkl'
df = pd.read_pickle(features_path)

In [4]:

# Show the table as loaded, before any cleaning (pandas truncates the display
# to the first and last rows).
df

Out[4]:

	verified_email	verified_primary_email	n_works	n_doi	n_arxiv	n_pmc	n_other_pids	n_emails	n_urls	n_ids	n_keywords	n_employment	n_education	label
0	0	0	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0
1	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	NaN	0
2	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0
3	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	NaN	0
4	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	2.0	NaN	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10989644	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	2.0	0
10989645	1	1	7	7	0	1	0	NaN	NaN	NaN	NaN	2.0	2.0	1
10989646	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0
10989647	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	1.0	2.0	0
10989648	1	1	0	0	0	0	0	NaN	NaN	NaN	NaN	NaN	NaN	0

10989649 rows × 14 columns

In [5]:

# Replace every missing value with 0, rebinding the name instead of mutating
# the frame in place.
df = df.fillna(value=0)

In [6]:

# Display the table again to check the result of the NaN fill.
df

Out[6]:

	verified_email	verified_primary_email	n_works	n_doi	n_arxiv	n_pmc	n_other_pids	n_emails	n_urls	n_ids	n_keywords	n_employment	n_education	label
0	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0
1	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	1.0	0.0	0
2	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0
3	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	1.0	0.0	0
4	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	2.0	0.0	0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
10989644	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	1.0	2.0	0
10989645	1	1	7	7	0	1	0	0.0	0.0	0.0	0.0	2.0	2.0	1
10989646	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0
10989647	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	1.0	2.0	0
10989648	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0	0

10989649 rows × 14 columns

In [7]:

# Per-column non-null counts over the rows whose label is 1.
df.loc[df['label'] == 1].count()

Out[7]:

verified_email            2664886
verified_primary_email    2664886
n_works                   2664886
n_doi                     2664886
n_arxiv                   2664886
n_pmc                     2664886
n_other_pids              2664886
n_emails                  2664886
n_urls                    2664886
n_ids                     2664886
n_keywords                2664886
n_employment              2664886
n_education               2664886
label                     2664886
dtype: int64

In [8]:

# Per-column non-null counts over the rows whose label is 0.
df.loc[df['label'] == 0].count()

Out[8]:

verified_email            8324763
verified_primary_email    8324763
n_works                   8324763
n_doi                     8324763
n_arxiv                   8324763
n_pmc                     8324763
n_other_pids              8324763
n_emails                  8324763
n_urls                    8324763
n_ids                     8324763
n_keywords                8324763
n_employment              8324763
n_education               8324763
label                     8324763
dtype: int64

In [10]:

# Separate the target from the feature columns (verified_email through
# n_education), then make a reproducible 50/50 split that keeps the label
# proportions the same in both halves.
y = df['label']
X = df.loc[:, 'verified_email':'n_education']
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    train_size=0.5,
    stratify=y,
    random_state=2,
)

In [11]:

# Inspect the training feature matrix produced by the split.
trainX

Out[11]:

	verified_email	verified_primary_email	n_works	n_doi	n_arxiv	n_pmc	n_other_pids	n_emails	n_urls	n_ids	n_keywords	n_employment	n_education
6325067	1	1	0	0	0	0	0	0.0	0.0	1.0	0.0	1.0	1.0
6140551	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
3258315	1	1	0	0	0	0	0	0.0	1.0	0.0	2.0	0.0	0.0
10948983	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	1.0	0.0
10089158	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...
2398808	0	0	1	1	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
3622839	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	1.0	1.0
1389679	1	1	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
6594722	0	0	0	0	0	0	0	0.0	0.0	0.0	0.0	0.0	0.0
4589084	1	1	11	0	0	0	0	0.0	0.0	0.0	0.0	1.0	0.0

5494824 rows × 13 columns

In [ ]:

# Define the outlier-detection model: a one-class SVM (RBF kernel by default).
# nu=0.5 lets up to half of the training points fall outside the learned
# boundary.
model = OneClassSVM(gamma='scale', nu=0.5)

# Fit only on the training rows labelled 1, which the model treats as the
# "normal" class. Per the class counts shown earlier in this notebook, label 1
# is the smaller class (2,664,886 rows vs 8,324,763 for label 0), not the
# majority.
# The subset gets its own name so that trainX keeps matching trainy and this
# cell can be re-run without filtering an already-filtered frame.
trainX_label1 = trainX.loc[trainy == 1]

# NOTE(review): this subset is roughly 1.3 million rows and kernel-SVM training
# cost grows at least quadratically with the number of samples. Consider
# fitting on a random sample, or using a linear-time alternative such as
# SGDOneClassSVM, if this cell does not finish.
model.fit(trainX_label1)

In [ ]:

# Detect outliers in the held-out set. OneClassSVM.predict returns +1 for
# inliers and -1 for outliers.
yhat = model.predict(testX)

# Recode the ground truth to the same convention: label 0 becomes -1 (outlier)
# and label 1 stays 1 (inlier). A new Series is built instead of assigning into
# testy, so the split output is left untouched and the cell has no hidden side
# effects when re-run.
testy_signed = testy.replace(0, -1)

# F1 score with the outlier class (-1) as the positive class.
score = f1_score(testy_signed, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)

33 KiB Raw Blame History Unescape Escape

33 KiB

Raw Blame History