In [1]:
import pandas as pd

from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
In [2]:
df = pd.read_pickle('../data/processed/features.pkl')
In [3]:
df
Out[3]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids label n_emails n_urls ... n_employment n_ext_work_source n_valid_education n_valid_employment biography_length biography_n_sentences biography_n_words date_diff ref_year date_stale
0 False False 0 0 0 0 0 False <NA> <NA> ... <NA> <NA> NaN NaN <NA> NaN NaN 0.009618 2018 1153.980551
1 True True 0 0 0 0 0 False <NA> <NA> ... 1 <NA> NaN 1.0 <NA> NaN NaN 715.078025 2018 406.980815
2 True True 0 0 0 0 0 False <NA> <NA> ... <NA> <NA> NaN NaN <NA> NaN NaN 48.001631 2019 456.736688
3 True True 0 0 0 0 0 False <NA> <NA> ... 1 <NA> NaN 0.0 <NA> NaN NaN 1863.042464 2015 217.817512
4 True True 0 0 0 0 0 False <NA> <NA> ... 2 <NA> NaN 1.0 <NA> NaN NaN 827.372135 2014 1779.456397
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10989644 True True 0 0 0 0 0 False <NA> <NA> ... 1 <NA> 1.0 1.0 118 2.0 23.0 0.051116 2020 139.242812
10989645 True True 7 7 0 1 0 True <NA> <NA> ... 2 2 2.0 0.0 <NA> NaN NaN 2131.978000 2015 158.560081
10989646 True True 0 0 0 0 0 False <NA> <NA> ... <NA> <NA> NaN NaN <NA> NaN NaN 0.000597 2020 139.226289
10989647 True True 0 0 0 0 0 False <NA> <NA> ... 1 <NA> 2.0 1.0 <NA> NaN NaN 1433.222830 2016 150.839463
10989648 True True 0 0 0 0 0 False <NA> <NA> ... <NA> <NA> NaN NaN <NA> NaN NaN 1591.542558 2016 152.263413

10989649 rows × 23 columns
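
The mix of <NA> and NaN in the preview above comes from pandas nullable (Int64/boolean) columns sitting next to plain float columns. Before imputing, a quick per-column missingness check is useful; a minimal sketch, assuming the same df:

df.isna().sum().sort_values(ascending=False)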

In [4]:
df.fillna(0, inplace=True)
In [5]:
df
Out[5]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids label n_emails n_urls ... n_employment n_ext_work_source n_valid_education n_valid_employment biography_length biography_n_sentences biography_n_words date_diff ref_year date_stale
0 False False 0 0 0 0 0 False 0 0 ... 0 0 0.0 0.0 0 0.0 0.0 0.009618 2018 1153.980551
1 True True 0 0 0 0 0 False 0 0 ... 1 0 0.0 1.0 0 0.0 0.0 715.078025 2018 406.980815
2 True True 0 0 0 0 0 False 0 0 ... 0 0 0.0 0.0 0 0.0 0.0 48.001631 2019 456.736688
3 True True 0 0 0 0 0 False 0 0 ... 1 0 0.0 0.0 0 0.0 0.0 1863.042464 2015 217.817512
4 True True 0 0 0 0 0 False 0 0 ... 2 0 0.0 1.0 0 0.0 0.0 827.372135 2014 1779.456397
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10989644 True True 0 0 0 0 0 False 0 0 ... 1 0 1.0 1.0 118 2.0 23.0 0.051116 2020 139.242812
10989645 True True 7 7 0 1 0 True 0 0 ... 2 2 2.0 0.0 0 0.0 0.0 2131.978000 2015 158.560081
10989646 True True 0 0 0 0 0 False 0 0 ... 0 0 0.0 0.0 0 0.0 0.0 0.000597 2020 139.226289
10989647 True True 0 0 0 0 0 False 0 0 ... 1 0 2.0 1.0 0 0.0 0.0 1433.222830 2016 150.839463
10989648 True True 0 0 0 0 0 False 0 0 ... 0 0 0.0 0.0 0 0.0 0.0 1591.542558 2016 152.263413

10989649 rows × 23 columns

In [6]:
df[df.label == 1].shape
Out[6]:
(2075872, 23)
In [7]:
df[df.label == 0].shape
Out[7]:
(8913777, 23)
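
So roughly 19% of the ~11M profiles carry a positive label. The same ratio in one line (sketch):

df.label.value_counts(normalize=True)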
In [8]:
# split into train/test sets
x = df.loc[:, df.columns != 'label']
y = df['label']
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=200000, test_size=1000000, random_state=2, stratify=y)
In [9]:
train_x[train_y==1].shape
Out[9]:
(37779, 22)
In [10]:
test_x[test_y==1].shape
Out[10]:
(188893, 22)
In [11]:
# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.5)

# fit the one-class SVM on the positive (inlier) class only
train_x = train_x[train_y==1]
model.fit(train_x)
Out[11]:
OneClassSVM()
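
One thing not tried above: OneClassSVM uses an RBF kernel, which is sensitive to feature scale, and columns such as date_diff and date_stale span thousands while the count features stay near zero. A hedged sketch of standardizing inside a pipeline (same hyperparameters, scaled inputs):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# standardize each feature before the one-class SVM; fit on the same
# positive-only train_x used above
scaled_model = make_pipeline(StandardScaler(), OneClassSVM(gamma='scale', nu=0.5))
scaled_model.fit(train_x)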
In [12]:
# detect outliers in the test set
y_hat = model.predict(test_x)

# relabel ground truth to the one-class convention: inliers stay 1, outliers (label 0) become -1
test_y[test_y == 0] = -1
test_y[test_y == 1] = 1

# calculate F1 on the outlier class (-1)
score = f1_score(test_y, y_hat, pos_label=-1)
print('F1 Score: %.3f' % score)
F1 Score: 0.872
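
A single F1 on the outlier label hides the per-class behaviour; a fuller breakdown (sketch, reusing test_y and y_hat from above):

from sklearn.metrics import classification_report

# precision/recall/F1 for both the outlier (-1) and inlier (1) classes
print(classification_report(test_y, y_hat, labels=[-1, 1], target_names=['outlier', 'inlier']))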
In [13]:
test_x.loc[:, 'label'] = test_y.values
/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1597: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1676: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
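
The two SettingWithCopyWarning messages appear because test_x is a slice of the original DataFrame. An alternative that avoids them (a sketch, not what the notebook does; results is a hypothetical name) is to assign onto an explicit copy:

results = test_x.copy()           # detach from the original frame
results['label'] = test_y.values  # ground truth in the one-class convention
results['prediction'] = y_hat     # model output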
In [14]:
test_x.loc[:, 'prediction'] = y_hat
In [15]:
test_x[test_x.label != test_x.prediction]
Out[15]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids ... n_valid_education n_valid_employment biography_length biography_n_sentences biography_n_words date_diff ref_year date_stale label prediction
4867967 True True 189 155 0 12 177 0 0 1 ... 1.0 2.0 1099 4.0 160.0 2071.017713 2015 136.067404 1.0 -1
8751870 True True 0 0 0 0 0 0 0 0 ... 1.0 0.0 0 0.0 0.0 722.965509 2018 296.556650 -1.0 1
10041539 True True 0 0 0 0 0 0 0 0 ... 0.0 0.0 0 0.0 0.0 2376.608882 2014 59.669744 1.0 -1
184408 True True 21 0 0 0 0 0 1 0 ... 4.0 1.0 0 0.0 0.0 461.839456 2017 1072.829123 1.0 -1
8166189 True True 4 4 0 0 0 0 0 0 ... 1.0 1.0 0 0.0 0.0 395.328151 2019 92.050538 1.0 -1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7680437 True True 34 25 0 0 35 0 0 1 ... 1.0 1.0 0 0.0 0.0 1854.980732 2016 58.626353 -1.0 1
3679223 True True 2 1 0 0 0 0 0 0 ... 1.0 0.0 0 0.0 0.0 1147.468938 2017 210.395635 -1.0 1
7996977 True True 0 0 0 0 0 0 0 0 ... 0.0 0.0 0 0.0 0.0 1525.385941 2016 374.678959 -1.0 1
638259 True True 0 0 0 0 0 0 0 0 ... 0.0 0.0 0 0.0 0.0 605.317103 2019 87.166078 1.0 -1
1485855 True True 0 0 0 0 0 0 0 0 ... 0.0 0.0 0 0.0 0.0 1264.909069 2016 512.234662 -1.0 1

205134 rows × 24 columns
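
The ~205k mismatched rows mix both error directions (true inliers flagged as outliers and true outliers accepted as inliers). A quick split by true label (sketch):

errors = test_x[test_x.label != test_x.prediction]
errors.label.value_counts()  # 1.0 = inliers predicted -1, -1.0 = outliers predicted 1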

In [16]:
fake_df = pd.read_csv('../data/processed/fake_heap_index.csv', index_col='index')
In [17]:
test_x[(test_x.label != test_x.prediction) & (test_x.index.isin(fake_df.index))]
Out[17]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids ... n_valid_education n_valid_employment biography_length biography_n_sentences biography_n_words date_diff ref_year date_stale label prediction

0 rows × 24 columns

In [18]:
test_x[(test_x.index.isin(fake_df.index))]
Out[18]:
verified_email verified_primary_email n_works n_doi n_arxiv n_pmc n_other_pids n_emails n_urls n_ids ... n_valid_education n_valid_employment biography_length biography_n_sentences biography_n_words date_diff ref_year date_stale label prediction
7579770 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.002694 2020 184.261009 -1.0 -1
4173344 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.005405 2020 191.303842 -1.0 -1
6370669 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.006984 2020 178.167846 -1.0 -1
6099073 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.025714 2020 142.184268 -1.0 -1
8341750 True True 0 0 0 0 0 0 10 0 ... 0.0 0.0 392 3.0 74.0 0.015018 2021 46.549118 -1.0 -1
10546308 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.002717 2020 203.149999 -1.0 -1
5109458 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.004633 2020 140.166676 -1.0 -1
989919 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 561 5.0 110.0 0.002649 2020 146.098129 -1.0 -1
9173011 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.005271 2020 169.293431 -1.0 -1
801178 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.008069 2020 131.828965 -1.0 -1
10581997 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.020527 2020 129.861984 -1.0 -1
9831120 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.004996 2020 167.149854 -1.0 -1
7689620 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.003671 2020 203.189280 -1.0 -1
10215555 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.000964 2020 192.363398 -1.0 -1
10985986 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.003662 2020 195.359312 -1.0 -1
8567972 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.005248 2020 196.358010 -1.0 -1
1041978 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.006524 2020 167.270642 -1.0 -1
2749172 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 540 4.0 103.0 0.001172 2020 142.989201 -1.0 -1
3274872 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.017478 2020 126.967926 -1.0 -1
7963350 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.003572 2020 190.312686 -1.0 -1
4392500 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.004414 2020 206.150937 -1.0 -1
3243302 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.003391 2020 140.230023 -1.0 -1
6263478 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 552 4.0 107.0 0.005174 2020 140.365511 -1.0 -1
4230883 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.001681 2020 153.354072 -1.0 -1
3659063 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.010618 2020 131.658167 -1.0 -1
5659388 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.011685 2020 140.250630 -1.0 -1
8567973 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.003278 2020 184.270401 -1.0 -1
9724190 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 543 4.0 107.0 6.993352 2020 135.124112 -1.0 -1
880090 True True 0 0 0 0 0 0 1 0 ... 0.0 0.0 445 4.0 85.0 0.001572 2020 143.298327 -1.0 -1

29 rows × 24 columns
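
All 29 known fake profiles that ended up in the test set were predicted as outliers (-1), consistent with the empty intersection with the misclassified rows above. A compact confirmation (sketch):

test_x[test_x.index.isin(fake_df.index)].prediction.value_counts()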