fake-orcid-analysis/notebooks/03-Machine Learning.ipynb

14 KiB
Raw Blame History

TODO (data preparation)

  • Column names must not contain spaces
  • If a list is empty, serialise it as [] in the CSV
  • If a string is empty, serialise it as '' in the CSV
In [ ]:
import ast
from urllib.parse import urlparse

import pandas as pd
import tldextract
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import OneClassSVM
In [3]:
# Load the feature table from a pickle file (path is relative to the notebook directory).
# NOTE(review): unpickling can execute arbitrary code, so only load pickles from a trusted source.
df = pd.read_pickle('../data/processed/features.pkl')
In [4]:
# Inspect the loaded frame; the notebook display shows only the first and last rows.
df
Out[4]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids n_keywords n_employment n_education label
0 0 0 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0
1 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 NaN 0
2 1 1 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0
3 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 NaN 0
4 1 1 0 0 0 0 0 NaN NaN NaN NaN 2.0 NaN 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10989644 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 2.0 0
10989645 1 1 7 7 0 1 0 NaN NaN NaN NaN 2.0 2.0 1
10989646 1 1 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0
10989647 1 1 0 0 0 0 0 NaN NaN NaN NaN 1.0 2.0 0
10989648 1 1 0 0 0 0 0 NaN NaN NaN NaN NaN NaN 0

10989649 rows × 14 columns

In [10]:
# Replace every NaN with 0 (several count columns, e.g. n_emails and n_employment, show NaN in the preview above).
# NOTE(review): presumably NaN means "nothing recorded" for that count — confirm against the feature-extraction step.
df = df.fillna(0)
In [11]:
# Non-null count per column, restricted to rows whose label is 1.
df.loc[df['label'].eq(1)].count()
Out[11]:
verified_email            2664886
verified_primary_email    2664886
n_works                   2664886
n_doi                     2664886
n_arxiv                   2664886
n_pmc                     2664886
n_other_pids              2664886
n_emails                  2664886
n_urls                    2664886
n_ids                     2664886
n_keywords                2664886
n_employment              2664886
n_education               2664886
label                     2664886
dtype: int64
In [12]:
# Non-null count per column, restricted to rows whose label is 0.
df.loc[df['label'].eq(0)].count()
Out[12]:
verified_email            8324763
verified_primary_email    8324763
n_works                   8324763
n_doi                     8324763
n_arxiv                   8324763
n_pmc                     8324763
n_other_pids              8324763
n_emails                  8324763
n_urls                    8324763
n_ids                     8324763
n_keywords                8324763
n_employment              8324763
n_education               8324763
label                     8324763
dtype: int64
In [ ]:
# Target is the `label` column; features are every column from
# `verified_email` through `n_education` (label-based slice, both ends inclusive).
y = df['label']
X = df.loc[:, 'verified_email':'n_education']

# 50/50 train/test split, stratified on the label so both halves keep the
# same class ratio; the fixed random_state makes the split reproducible.
trainX, testX, trainy, testy = train_test_split(
    X,
    y,
    test_size=0.5,
    stratify=y,
    random_state=2,
)

# One-class SVM used as the outlier detector.
model = OneClassSVM(nu=0.01, gamma='scale')

# Train only on rows labelled 1, which the scoring cell treats as inliers.
# Note: label 1 is the smaller class in this dataset (2,664,886 vs 8,324,763
# rows in the counts above), not the majority.
trainX = trainX.loc[trainy.eq(1)]
model.fit(trainX)
In [ ]:
# Detect outliers in the test set. OneClassSVM.predict returns +1 for
# inliers and -1 for outliers.
yhat = model.predict(testX)

# Re-encode the ground truth to the same convention: label 0 -> -1 (outlier),
# label 1 stays 1 (inlier). `replace` returns a new Series, so the object
# produced by train_test_split is not mutated in place (avoids
# SettingWithCopyWarning) and re-running this cell gives the same result.
testy = testy.replace({0: -1})

# F1 score with the outlier class (-1) as the positive class.
# `f1_score` comes from sklearn.metrics (imported in the notebook's import cell).
score = f1_score(testy, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)