You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
master
${ noResults }
85 KiB
85 KiB
In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.svm import OneClassSVM
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
In [2]:
df = pd.read_pickle('../data/processed/features.pkl')
In [3]:
df
Out[3]:
verified_email | verified_primary_email | n_works | n_doi | n_arxiv | n_pmc | n_other_pids | label | n_emails | n_urls | ... | n_employment | n_ext_work_source | n_valid_education | n_valid_employment | biography_length | biography_n_sentences | biography_n_words | date_diff | ref_year | date_stale | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | <NA> | <NA> | NaN | NaN | <NA> | NaN | NaN | 0.009618 | 2018 | 1153.980551 |
1 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | 1 | <NA> | NaN | 1.0 | <NA> | NaN | NaN | 715.078025 | 2018 | 406.980815 |
2 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | <NA> | <NA> | NaN | NaN | <NA> | NaN | NaN | 48.001631 | 2019 | 456.736688 |
3 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | 1 | <NA> | NaN | 0.0 | <NA> | NaN | NaN | 1863.042464 | 2015 | 217.817512 |
4 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | 2 | <NA> | NaN | 1.0 | <NA> | NaN | NaN | 827.372135 | 2014 | 1779.456397 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10989644 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | 1 | <NA> | 1.0 | 1.0 | 118 | 2.0 | 23.0 | 0.051116 | 2020 | 139.242812 |
10989645 | True | True | 7 | 7 | 0 | 1 | 0 | True | <NA> | <NA> | ... | 2 | 2 | 2.0 | 0.0 | <NA> | NaN | NaN | 2131.978000 | 2015 | 158.560081 |
10989646 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | <NA> | <NA> | NaN | NaN | <NA> | NaN | NaN | 0.000597 | 2020 | 139.226289 |
10989647 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | 1 | <NA> | 2.0 | 1.0 | <NA> | NaN | NaN | 1433.222830 | 2016 | 150.839463 |
10989648 | True | True | 0 | 0 | 0 | 0 | 0 | False | <NA> | <NA> | ... | <NA> | <NA> | NaN | NaN | <NA> | NaN | NaN | 1591.542558 | 2016 | 152.263413 |
10989649 rows × 23 columns
In [4]:
df.fillna(0, inplace=True)
In [5]:
df
Out[5]:
verified_email | verified_primary_email | n_works | n_doi | n_arxiv | n_pmc | n_other_pids | label | n_emails | n_urls | ... | n_employment | n_ext_work_source | n_valid_education | n_valid_employment | biography_length | biography_n_sentences | biography_n_words | date_diff | ref_year | date_stale | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.009618 | 2018 | 1153.980551 |
1 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 1 | 0 | 0.0 | 1.0 | 0 | 0.0 | 0.0 | 715.078025 | 2018 | 406.980815 |
2 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 48.001631 | 2019 | 456.736688 |
3 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 1 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 1863.042464 | 2015 | 217.817512 |
4 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 2 | 0 | 0.0 | 1.0 | 0 | 0.0 | 0.0 | 827.372135 | 2014 | 1779.456397 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
10989644 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 1 | 0 | 1.0 | 1.0 | 118 | 2.0 | 23.0 | 0.051116 | 2020 | 139.242812 |
10989645 | True | True | 7 | 7 | 0 | 1 | 0 | True | 0 | 0 | ... | 2 | 2 | 2.0 | 0.0 | 0 | 0.0 | 0.0 | 2131.978000 | 2015 | 158.560081 |
10989646 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 0.000597 | 2020 | 139.226289 |
10989647 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 1 | 0 | 2.0 | 1.0 | 0 | 0.0 | 0.0 | 1433.222830 | 2016 | 150.839463 |
10989648 | True | True | 0 | 0 | 0 | 0 | 0 | False | 0 | 0 | ... | 0 | 0 | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 1591.542558 | 2016 | 152.263413 |
10989649 rows × 23 columns
In [6]:
df[df.label == 1].shape
Out[6]:
(2075872, 23)
In [7]:
df[df.label == 0].shape
Out[7]:
(8913777, 23)
In [8]:
# split into train/test sets
x = df.loc[:, df.columns != 'label']
y = df['label']
train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=200000, test_size=1000000, random_state=2, stratify=y)
In [9]:
train_x[train_y==1].shape
Out[9]:
(37779, 22)
In [10]:
test_x[test_y==1].shape
Out[10]:
(188893, 22)
In [11]:
# define outlier detection model
model = OneClassSVM(gamma='scale', nu=0.5)
# fit on majority class
train_x = train_x[train_y==1]
model.fit(train_x)
Out[11]:
OneClassSVM()
In [12]:
# detect outliers in the test set
y_hat = model.predict(test_x)
# mark inliers 1, outliers -1
test_y[test_y == 0] = -1
test_y[test_y == 1] = 1
# calculate score
score = f1_score(test_y, y_hat, pos_label=-1)
print('F1 Score: %.3f' % score)
F1 Score: 0.872
In [13]:
test_x.loc[:, 'label'] = test_y.values
/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1597: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self.obj[key] = value /Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1676: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self._setitem_single_column(ilocs[0], value, pi)
In [14]:
test_x.loc[:, 'prediction'] = y_hat
In [15]:
test_x[test_x.label != test_x.prediction]
Out[15]:
verified_email | verified_primary_email | n_works | n_doi | n_arxiv | n_pmc | n_other_pids | n_emails | n_urls | n_ids | ... | n_valid_education | n_valid_employment | biography_length | biography_n_sentences | biography_n_words | date_diff | ref_year | date_stale | label | prediction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
4867967 | True | True | 189 | 155 | 0 | 12 | 177 | 0 | 0 | 1 | ... | 1.0 | 2.0 | 1099 | 4.0 | 160.0 | 2071.017713 | 2015 | 136.067404 | 1.0 | -1 |
8751870 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1.0 | 0.0 | 0 | 0.0 | 0.0 | 722.965509 | 2018 | 296.556650 | -1.0 | 1 |
10041539 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 2376.608882 | 2014 | 59.669744 | 1.0 | -1 |
184408 | True | True | 21 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 4.0 | 1.0 | 0 | 0.0 | 0.0 | 461.839456 | 2017 | 1072.829123 | 1.0 | -1 |
8166189 | True | True | 4 | 4 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1.0 | 1.0 | 0 | 0.0 | 0.0 | 395.328151 | 2019 | 92.050538 | 1.0 | -1 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
7680437 | True | True | 34 | 25 | 0 | 0 | 35 | 0 | 0 | 1 | ... | 1.0 | 1.0 | 0 | 0.0 | 0.0 | 1854.980732 | 2016 | 58.626353 | -1.0 | 1 |
3679223 | True | True | 2 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 1.0 | 0.0 | 0 | 0.0 | 0.0 | 1147.468938 | 2017 | 210.395635 | -1.0 | 1 |
7996977 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 1525.385941 | 2016 | 374.678959 | -1.0 | 1 |
638259 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 605.317103 | 2019 | 87.166078 | 1.0 | -1 |
1485855 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0 | 0.0 | 0.0 | 1264.909069 | 2016 | 512.234662 | -1.0 | 1 |
205134 rows × 24 columns
In [16]:
fake_df = pd.read_csv('../data/processed/fake_heap_index.csv', index_col='index')
In [17]:
test_x[(test_x.label != test_x.prediction) & (test_x.index.isin(fake_df.index))]
Out[17]:
verified_email | verified_primary_email | n_works | n_doi | n_arxiv | n_pmc | n_other_pids | n_emails | n_urls | n_ids | ... | n_valid_education | n_valid_employment | biography_length | biography_n_sentences | biography_n_words | date_diff | ref_year | date_stale | label | prediction |
---|
0 rows × 24 columns
In [18]:
test_x[(test_x.index.isin(fake_df.index))]
Out[18]:
verified_email | verified_primary_email | n_works | n_doi | n_arxiv | n_pmc | n_other_pids | n_emails | n_urls | n_ids | ... | n_valid_education | n_valid_employment | biography_length | biography_n_sentences | biography_n_words | date_diff | ref_year | date_stale | label | prediction | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7579770 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.002694 | 2020 | 184.261009 | -1.0 | -1 |
4173344 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.005405 | 2020 | 191.303842 | -1.0 | -1 |
6370669 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.006984 | 2020 | 178.167846 | -1.0 | -1 |
6099073 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.025714 | 2020 | 142.184268 | -1.0 | -1 |
8341750 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 10 | 0 | ... | 0.0 | 0.0 | 392 | 3.0 | 74.0 | 0.015018 | 2021 | 46.549118 | -1.0 | -1 |
10546308 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.002717 | 2020 | 203.149999 | -1.0 | -1 |
5109458 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.004633 | 2020 | 140.166676 | -1.0 | -1 |
989919 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 561 | 5.0 | 110.0 | 0.002649 | 2020 | 146.098129 | -1.0 | -1 |
9173011 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.005271 | 2020 | 169.293431 | -1.0 | -1 |
801178 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.008069 | 2020 | 131.828965 | -1.0 | -1 |
10581997 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.020527 | 2020 | 129.861984 | -1.0 | -1 |
9831120 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.004996 | 2020 | 167.149854 | -1.0 | -1 |
7689620 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.003671 | 2020 | 203.189280 | -1.0 | -1 |
10215555 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.000964 | 2020 | 192.363398 | -1.0 | -1 |
10985986 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.003662 | 2020 | 195.359312 | -1.0 | -1 |
8567972 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.005248 | 2020 | 196.358010 | -1.0 | -1 |
1041978 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.006524 | 2020 | 167.270642 | -1.0 | -1 |
2749172 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 540 | 4.0 | 103.0 | 0.001172 | 2020 | 142.989201 | -1.0 | -1 |
3274872 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.017478 | 2020 | 126.967926 | -1.0 | -1 |
7963350 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.003572 | 2020 | 190.312686 | -1.0 | -1 |
4392500 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.004414 | 2020 | 206.150937 | -1.0 | -1 |
3243302 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.003391 | 2020 | 140.230023 | -1.0 | -1 |
6263478 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 552 | 4.0 | 107.0 | 0.005174 | 2020 | 140.365511 | -1.0 | -1 |
4230883 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.001681 | 2020 | 153.354072 | -1.0 | -1 |
3659063 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.010618 | 2020 | 131.658167 | -1.0 | -1 |
5659388 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.011685 | 2020 | 140.250630 | -1.0 | -1 |
8567973 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.003278 | 2020 | 184.270401 | -1.0 | -1 |
9724190 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 543 | 4.0 | 107.0 | 6.993352 | 2020 | 135.124112 | -1.0 | -1 |
880090 | True | True | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | ... | 0.0 | 0.0 | 445 | 4.0 | 85.0 | 0.001572 | 2020 | 143.298327 | -1.0 | -1 |
29 rows × 24 columns