fake-orcid-analysis/notebooks/03-Machine Learning.ipynb

In [2]:
# parsing helpers (not used in the cells below)
import ast
from urllib.parse import urlparse
import tldextract

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer   # not used below
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
In [3]:
df = pd.read_pickle('../data/processed/features.pkl')
In [4]:
df
Out[4]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids n_keywords n_employment n_education label
0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0
1 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 NaN 0
2 1 1 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0
3 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 NaN 0
4 1 1 0 0 0 0 0 NaN NaN NaN NaN 2.0 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10989644 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 2.0 0
10989645 1 1 7 7 0 1 0 NaN NaN NaN NaN 2.0 2.0 1
10989646 1 1 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0
10989647 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 2.0 0
10989648 1 1 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0

10989649 rows × 14 columns

In [5]:
df.fillna(0, inplace=True)
In [6]:
df
Out[6]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids n_keywords n_employment n_education label
0 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0
1 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 0.0 0
2 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0
3 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 0.0 0
4 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 2.0 0.0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10989644 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 2.0 0
10989645 1 1 7 7 0 1 0 0.0 0.0 0.0 0.0 2.0 2.0 1
10989646 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0
10989647 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 2.0 0
10989648 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0 0

10989649 rows × 14 columns
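
A quick check, not in the original notebook, that the fill left no missing values behind; a minimal sketch:

In [ ]:
# assumption: added for illustration - fillna should have removed every NaN, so this should print 0
df.isna().sum().sum()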

In [7]:
df[df.label == 1].count()
Out[7]:
verified_email            2664886
verified_primary_email    2664886
n_works                   2664886
n_doi                     2664886
n_arxiv                   2664886
n_pmc                     2664886
n_other_pids              2664886
n_emails                  2664886
n_urls                    2664886
n_ids                     2664886
n_keywords                2664886
n_employment              2664886
n_education               2664886
label                     2664886
dtype: int64
In [8]:
df[df.label == 0].count()
Out[8]:
verified_email            8324763
verified_primary_email    8324763
n_works                   8324763
n_doi                     8324763
n_arxiv                   8324763
n_pmc                     8324763
n_other_pids              8324763
n_emails                  8324763
n_urls                    8324763
n_ids                     8324763
n_keywords                8324763
n_employment              8324763
n_education               8324763
label                     8324763
dtype: int64
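The two counts above give the class balance in absolute terms (about 2.66M records with label 1 versus 8.32M with label 0). As a convenience, not in the original notebook, the same information as proportions:

In [ ]:
# assumption: added for illustration - relative frequency of each label
df.label.value_counts(normalize=True)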
In [10]:
# stratified 50/50 train/test split over the feature columns and the label
X = df.loc[:, 'verified_email':'n_education']
y = df['label']
trainX, testX, trainy, testy = train_test_split(X, y, train_size=0.5, random_state=2, stratify=y)
In [11]:
trainX
Out[11]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids n_keywords n_employment n_education
6325067 1 1 0 0 0 0 0 0.0 0.0 1.0 0.0 1.0 1.0
6140551 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3258315 1 1 0 0 0 0 0 0.0 1.0 0.0 2.0 0.0 0.0
10948983 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 0.0
10089158 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2398808 0 0 1 1 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
3622839 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 1.0 1.0
1389679 1 1 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
6594722 0 0 0 0 0 0 0 0.0 0.0 0.0 0.0 0.0 0.0
4589084 1 1 11 0 0 0 0 0.0 0.0 0.0 0.0 1.0 0.0

5494824 rows × 13 columns
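
SVMs are sensitive to feature scale, and the columns above mix binary flags with unbounded counts such as n_works. The original notebook fits on the raw features; a hedged sketch of how scaling could be added with scikit-learn's StandardScaler, should that be wanted (the downstream cells would then use the scaled frames instead of trainX / testX):

In [ ]:
# assumption: optional preprocessing, not part of the original pipeline
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
trainX_scaled = pd.DataFrame(scaler.fit_transform(trainX), index=trainX.index, columns=trainX.columns)
testX_scaled = pd.DataFrame(scaler.transform(testX), index=testX.index, columns=testX.columns)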

In [ ]:
# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.5)

# fit only on records with label == 1, treated as the inlier class
# (note: this is the smaller of the two classes, roughly 2.66M of 10.99M rows)
trainX = trainX[trainy == 1]
model.fit(trainX)
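The nu parameter is an upper bound on the fraction of training points allowed outside the learned boundary and a lower bound on the fraction of support vectors, so nu=0.5 is a fairly loose setting. A hedged sketch, not in the original notebook, of comparing a few values on a small sample before committing to a full fit (sample size and candidate values are illustrative assumptions):

In [ ]:
# assumption: illustrative tuning loop, not part of the original notebook
# with nu = v, roughly a fraction v of the fitted points fall outside the boundary,
# so the inlier rate on the training sample should sit near 1 - v
sample = trainX.sample(50_000, random_state=2)
for v in (0.01, 0.05, 0.1, 0.5):
    candidate = OneClassSVM(gamma='scale', nu=v)
    candidate.fit(sample)
    print(v, (candidate.predict(sample) == 1).mean())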
In [ ]:
# detect outliers in the test set: predict() returns +1 for inliers, -1 for outliers
yhat = model.predict(testX)

# recode the test labels to the same convention: 0 -> -1 (outlier), 1 stays +1 (inlier)
testy = testy.replace({0: -1})

# F1 score with the outlier class (-1) treated as the positive label
score = f1_score(testy, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)
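The F1 score on the outlier class is a single summary number; a hedged sketch, added here rather than taken from the original notebook, of a slightly fuller picture using scikit-learn's confusion matrix and classification report:

In [ ]:
# assumption: supplementary evaluation, not in the original notebook
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(testy, yhat, labels=[-1, 1]))
print(classification_report(testy, yhat, labels=[-1, 1], target_names=['outlier', 'inlier']))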