fake-orcid-analysis/notebooks/03-Supervised.ipynb

2436 lines
85 KiB
Plaintext
Raw Permalink Normal View History

2021-04-29 18:50:02 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import OneClassSVM \n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import f1_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_pickle('../data/processed/features.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>...</th>\n",
" <th>n_employment</th>\n",
" <th>n_ext_work_source</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.009618</td>\n",
" <td>2018</td>\n",
" <td>1153.980551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>715.078025</td>\n",
" <td>2018</td>\n",
" <td>406.980815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48.001631</td>\n",
" <td>2019</td>\n",
" <td>456.736688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1863.042464</td>\n",
" <td>2015</td>\n",
" <td>217.817512</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>827.372135</td>\n",
" <td>2014</td>\n",
" <td>1779.456397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>118</td>\n",
" <td>2.0</td>\n",
" <td>23.0</td>\n",
" <td>0.051116</td>\n",
" <td>2020</td>\n",
" <td>139.242812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2131.978000</td>\n",
" <td>2015</td>\n",
" <td>158.560081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000597</td>\n",
" <td>2020</td>\n",
" <td>139.226289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1433.222830</td>\n",
" <td>2016</td>\n",
" <td>150.839463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1591.542558</td>\n",
" <td>2016</td>\n",
" <td>152.263413</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 False False 0 0 0 \n",
"1 True True 0 0 0 \n",
"2 True True 0 0 0 \n",
"3 True True 0 0 0 \n",
"4 True True 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 True True 0 0 0 \n",
"10989645 True True 7 7 0 \n",
"10989646 True True 0 0 0 \n",
"10989647 True True 0 0 0 \n",
"10989648 True True 0 0 0 \n",
"\n",
" n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n",
"0 0 0 False <NA> <NA> ... <NA> \n",
"1 0 0 False <NA> <NA> ... 1 \n",
"2 0 0 False <NA> <NA> ... <NA> \n",
"3 0 0 False <NA> <NA> ... 1 \n",
"4 0 0 False <NA> <NA> ... 2 \n",
"... ... ... ... ... ... ... ... \n",
"10989644 0 0 False <NA> <NA> ... 1 \n",
"10989645 1 0 True <NA> <NA> ... 2 \n",
"10989646 0 0 False <NA> <NA> ... <NA> \n",
"10989647 0 0 False <NA> <NA> ... 1 \n",
"10989648 0 0 False <NA> <NA> ... <NA> \n",
"\n",
" n_ext_work_source n_valid_education n_valid_employment \\\n",
"0 <NA> NaN NaN \n",
"1 <NA> NaN 1.0 \n",
"2 <NA> NaN NaN \n",
"3 <NA> NaN 0.0 \n",
"4 <NA> NaN 1.0 \n",
"... ... ... ... \n",
"10989644 <NA> 1.0 1.0 \n",
"10989645 2 2.0 0.0 \n",
"10989646 <NA> NaN NaN \n",
"10989647 <NA> 2.0 1.0 \n",
"10989648 <NA> NaN NaN \n",
"\n",
" biography_length biography_n_sentences biography_n_words \\\n",
"0 <NA> NaN NaN \n",
"1 <NA> NaN NaN \n",
"2 <NA> NaN NaN \n",
"3 <NA> NaN NaN \n",
"4 <NA> NaN NaN \n",
"... ... ... ... \n",
"10989644 118 2.0 23.0 \n",
"10989645 <NA> NaN NaN \n",
"10989646 <NA> NaN NaN \n",
"10989647 <NA> NaN NaN \n",
"10989648 <NA> NaN NaN \n",
"\n",
" date_diff ref_year date_stale \n",
"0 0.009618 2018 1153.980551 \n",
"1 715.078025 2018 406.980815 \n",
"2 48.001631 2019 456.736688 \n",
"3 1863.042464 2015 217.817512 \n",
"4 827.372135 2014 1779.456397 \n",
"... ... ... ... \n",
"10989644 0.051116 2020 139.242812 \n",
"10989645 2131.978000 2015 158.560081 \n",
"10989646 0.000597 2020 139.226289 \n",
"10989647 1433.222830 2016 150.839463 \n",
"10989648 1591.542558 2016 152.263413 \n",
"\n",
"[10989649 rows x 23 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# replace every missing value with 0 (rebinding rather than mutating in place)\n",
"df = df.fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>...</th>\n",
" <th>n_employment</th>\n",
" <th>n_ext_work_source</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.009618</td>\n",
" <td>2018</td>\n",
" <td>1153.980551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>715.078025</td>\n",
" <td>2018</td>\n",
" <td>406.980815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>48.001631</td>\n",
" <td>2019</td>\n",
" <td>456.736688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1863.042464</td>\n",
" <td>2015</td>\n",
" <td>217.817512</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>827.372135</td>\n",
" <td>2014</td>\n",
" <td>1779.456397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>118</td>\n",
" <td>2.0</td>\n",
" <td>23.0</td>\n",
" <td>0.051116</td>\n",
" <td>2020</td>\n",
" <td>139.242812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2131.978000</td>\n",
" <td>2015</td>\n",
" <td>158.560081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000597</td>\n",
" <td>2020</td>\n",
" <td>139.226289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1433.222830</td>\n",
" <td>2016</td>\n",
" <td>150.839463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1591.542558</td>\n",
" <td>2016</td>\n",
" <td>152.263413</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 False False 0 0 0 \n",
"1 True True 0 0 0 \n",
"2 True True 0 0 0 \n",
"3 True True 0 0 0 \n",
"4 True True 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 True True 0 0 0 \n",
"10989645 True True 7 7 0 \n",
"10989646 True True 0 0 0 \n",
"10989647 True True 0 0 0 \n",
"10989648 True True 0 0 0 \n",
"\n",
" n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n",
"0 0 0 False 0 0 ... 0 \n",
"1 0 0 False 0 0 ... 1 \n",
"2 0 0 False 0 0 ... 0 \n",
"3 0 0 False 0 0 ... 1 \n",
"4 0 0 False 0 0 ... 2 \n",
"... ... ... ... ... ... ... ... \n",
"10989644 0 0 False 0 0 ... 1 \n",
"10989645 1 0 True 0 0 ... 2 \n",
"10989646 0 0 False 0 0 ... 0 \n",
"10989647 0 0 False 0 0 ... 1 \n",
"10989648 0 0 False 0 0 ... 0 \n",
"\n",
" n_ext_work_source n_valid_education n_valid_employment \\\n",
"0 0 0.0 0.0 \n",
"1 0 0.0 1.0 \n",
"2 0 0.0 0.0 \n",
"3 0 0.0 0.0 \n",
"4 0 0.0 1.0 \n",
"... ... ... ... \n",
"10989644 0 1.0 1.0 \n",
"10989645 2 2.0 0.0 \n",
"10989646 0 0.0 0.0 \n",
"10989647 0 2.0 1.0 \n",
"10989648 0 0.0 0.0 \n",
"\n",
" biography_length biography_n_sentences biography_n_words \\\n",
"0 0 0.0 0.0 \n",
"1 0 0.0 0.0 \n",
"2 0 0.0 0.0 \n",
"3 0 0.0 0.0 \n",
"4 0 0.0 0.0 \n",
"... ... ... ... \n",
"10989644 118 2.0 23.0 \n",
"10989645 0 0.0 0.0 \n",
"10989646 0 0.0 0.0 \n",
"10989647 0 0.0 0.0 \n",
"10989648 0 0.0 0.0 \n",
"\n",
" date_diff ref_year date_stale \n",
"0 0.009618 2018 1153.980551 \n",
"1 715.078025 2018 406.980815 \n",
"2 48.001631 2019 456.736688 \n",
"3 1863.042464 2015 217.817512 \n",
"4 827.372135 2014 1779.456397 \n",
"... ... ... ... \n",
"10989644 0.051116 2020 139.242812 \n",
"10989645 2131.978000 2015 158.560081 \n",
"10989646 0.000597 2020 139.226289 \n",
"10989647 1433.222830 2016 150.839463 \n",
"10989648 1591.542558 2016 152.263413 \n",
"\n",
"[10989649 rows x 23 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2075872, 23)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 1].shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(8913777, 23)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 0].shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# separate the target column from the features before the train/test split\n",
"y = df.label\n",
"x = df.drop(columns='label')\n",
2021-07-20 12:15:17 +02:00
"# stratified split: 200k training rows and 1M test rows, fixed seed for reproducibility\n",
"train_x, test_x, train_y, test_y = train_test_split(\n",
"    x,\n",
"    y,\n",
"    train_size=200000,\n",
"    test_size=1000000,\n",
"    random_state=2,\n",
"    stratify=y,\n",
")"
2021-04-29 18:50:02 +02:00
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2021-07-20 12:15:17 +02:00
"(37779, 22)"
2021-04-29 18:50:02 +02:00
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_x[train_y==1].shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(188893, 22)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[test_y==1].shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OneClassSVM()"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# define outlier detection model\n",
"model = OneClassSVM(gamma='scale', nu=0.5)\n",
"\n",
"# fit on majority class\n",
"train_x = train_x[train_y==1]\n",
"model.fit(train_x)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"F1 Score: 0.872\n"
]
}
],
"source": [
"# detect outliers in the test set\n",
"y_hat = model.predict(test_x)\n",
"\n",
"# The predictions are compared against +1 (inlier) / -1 (outlier), so recode the\n",
"# ground truth the same way: label == 1 -> 1, everything else -> -1.\n",
"# A fresh integer Series is built instead of assigning -1 into the True/False\n",
"# label Series in place: that relied on an implicit dtype upcast and left float\n",
"# labels next to integer predictions. Re-running this cell is a no-op because\n",
"# already-recoded values (1 / -1) map to themselves.\n",
"test_y = (test_y == 1).map({True: 1, False: -1})\n",
"\n",
"# calculate score, treating the outlier class (-1) as the positive class\n",
"score = f1_score(test_y, y_hat, pos_label=-1)\n",
"print('F1 Score: %.3f' % score)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1597: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[key] = value\n",
"/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1676: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self._setitem_single_column(ilocs[0], value, pi)\n"
]
}
],
"source": [
"# attach the recoded ground truth as a column; assign() returns a new frame,\n",
"# so no SettingWithCopyWarning is raised on the frame produced by train_test_split\n",
"test_x = test_x.assign(label=test_y.values)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"# attach the model output as a column; assign() returns a new frame instead of\n",
"# writing into the frame produced by train_test_split (no SettingWithCopyWarning)\n",
"test_x = test_x.assign(prediction=y_hat)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>...</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" <th>label</th>\n",
" <th>prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>4867967</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
2021-07-20 12:15:17 +02:00
" <td>189</td>\n",
" <td>155</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>12</td>\n",
" <td>177</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1.0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1099</td>\n",
" <td>4.0</td>\n",
" <td>160.0</td>\n",
" <td>2071.017713</td>\n",
" <td>2015</td>\n",
" <td>136.067404</td>\n",
2021-04-29 18:50:02 +02:00
" <td>1.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>-1</td>\n",
2021-04-29 18:50:02 +02:00
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>8751870</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>722.965509</td>\n",
" <td>2018</td>\n",
" <td>296.556650</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
2021-04-29 18:50:02 +02:00
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>10041539</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>2376.608882</td>\n",
" <td>2014</td>\n",
" <td>59.669744</td>\n",
" <td>1.0</td>\n",
" <td>-1</td>\n",
2021-04-29 18:50:02 +02:00
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>184408</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
2021-07-20 12:15:17 +02:00
" <td>21</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>...</td>\n",
2021-07-20 12:15:17 +02:00
" <td>4.0</td>\n",
" <td>1.0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>461.839456</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2017</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1072.829123</td>\n",
" <td>1.0</td>\n",
" <td>-1</td>\n",
2021-04-29 18:50:02 +02:00
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>8166189</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>4</td>\n",
" <td>4</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>395.328151</td>\n",
" <td>2019</td>\n",
" <td>92.050538</td>\n",
2021-04-29 18:50:02 +02:00
" <td>1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>7680437</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
2021-07-20 12:15:17 +02:00
" <td>34</td>\n",
" <td>25</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>35</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1</td>\n",
2021-04-29 18:50:02 +02:00
" <td>...</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1.0</td>\n",
" <td>1.0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1854.980732</td>\n",
" <td>2016</td>\n",
" <td>58.626353</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>3679223</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
2021-07-20 12:15:17 +02:00
" <td>2</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1.0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1147.468938</td>\n",
" <td>2017</td>\n",
" <td>210.395635</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>7996977</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1525.385941</td>\n",
" <td>2016</td>\n",
" <td>374.678959</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
2021-04-29 18:50:02 +02:00
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>638259</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0</td>\n",
2021-04-29 18:50:02 +02:00
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>605.317103</td>\n",
" <td>2019</td>\n",
" <td>87.166078</td>\n",
" <td>1.0</td>\n",
" <td>-1</td>\n",
2021-04-29 18:50:02 +02:00
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>1485855</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1264.909069</td>\n",
" <td>2016</td>\n",
" <td>512.234662</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2021-07-20 12:15:17 +02:00
"<p>205134 rows × 24 columns</p>\n",
2021-04-29 18:50:02 +02:00
"</div>"
],
"text/plain": [
2021-07-20 12:15:17 +02:00
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"4867967 True True 189 155 0 \n",
"8751870 True True 0 0 0 \n",
"10041539 True True 0 0 0 \n",
"184408 True True 21 0 0 \n",
"8166189 True True 4 4 0 \n",
"... ... ... ... ... ... \n",
"7680437 True True 34 25 0 \n",
"3679223 True True 2 1 0 \n",
"7996977 True True 0 0 0 \n",
"638259 True True 0 0 0 \n",
"1485855 True True 0 0 0 \n",
2021-04-29 18:50:02 +02:00
"\n",
2021-07-20 12:15:17 +02:00
" n_pmc n_other_pids n_emails n_urls n_ids ... \\\n",
"4867967 12 177 0 0 1 ... \n",
"8751870 0 0 0 0 0 ... \n",
"10041539 0 0 0 0 0 ... \n",
"184408 0 0 0 1 0 ... \n",
"8166189 0 0 0 0 0 ... \n",
"... ... ... ... ... ... ... \n",
"7680437 0 35 0 0 1 ... \n",
"3679223 0 0 0 0 0 ... \n",
"7996977 0 0 0 0 0 ... \n",
"638259 0 0 0 0 0 ... \n",
"1485855 0 0 0 0 0 ... \n",
2021-04-29 18:50:02 +02:00
"\n",
2021-07-20 12:15:17 +02:00
" n_valid_education n_valid_employment biography_length \\\n",
"4867967 1.0 2.0 1099 \n",
"8751870 1.0 0.0 0 \n",
"10041539 0.0 0.0 0 \n",
"184408 4.0 1.0 0 \n",
"8166189 1.0 1.0 0 \n",
"... ... ... ... \n",
"7680437 1.0 1.0 0 \n",
"3679223 1.0 0.0 0 \n",
"7996977 0.0 0.0 0 \n",
"638259 0.0 0.0 0 \n",
"1485855 0.0 0.0 0 \n",
2021-04-29 18:50:02 +02:00
"\n",
2021-07-20 12:15:17 +02:00
" biography_n_sentences biography_n_words date_diff ref_year \\\n",
"4867967 4.0 160.0 2071.017713 2015 \n",
"8751870 0.0 0.0 722.965509 2018 \n",
"10041539 0.0 0.0 2376.608882 2014 \n",
"184408 0.0 0.0 461.839456 2017 \n",
"8166189 0.0 0.0 395.328151 2019 \n",
"... ... ... ... ... \n",
"7680437 0.0 0.0 1854.980732 2016 \n",
"3679223 0.0 0.0 1147.468938 2017 \n",
"7996977 0.0 0.0 1525.385941 2016 \n",
"638259 0.0 0.0 605.317103 2019 \n",
"1485855 0.0 0.0 1264.909069 2016 \n",
2021-04-29 18:50:02 +02:00
"\n",
2021-07-20 12:15:17 +02:00
" date_stale label prediction \n",
"4867967 136.067404 1.0 -1 \n",
"8751870 296.556650 -1.0 1 \n",
"10041539 59.669744 1.0 -1 \n",
"184408 1072.829123 1.0 -1 \n",
"8166189 92.050538 1.0 -1 \n",
"... ... ... ... \n",
"7680437 58.626353 -1.0 1 \n",
"3679223 210.395635 -1.0 1 \n",
"7996977 374.678959 -1.0 1 \n",
"638259 87.166078 1.0 -1 \n",
"1485855 512.234662 -1.0 1 \n",
2021-04-29 18:50:02 +02:00
"\n",
2021-07-20 12:15:17 +02:00
"[205134 rows x 24 columns]"
2021-04-29 18:50:02 +02:00
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# rows where the recoded ground truth and the model prediction disagree\n",
"test_x.query('label != prediction')"
]
},
{
"cell_type": "code",
2021-07-20 12:15:17 +02:00
"execution_count": 16,
2021-04-29 18:50:02 +02:00
"metadata": {},
"outputs": [],
"source": [
"fake_df = pd.read_csv('../data/processed/fake_heap_index.csv', index_col='index')"
]
},
{
"cell_type": "code",
2021-07-20 12:15:17 +02:00
"execution_count": 17,
2021-04-29 18:50:02 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>...</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" <th>label</th>\n",
" <th>prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [verified_email, verified_primary_email, n_works, n_doi, n_arxiv, n_pmc, n_other_pids, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment, n_ext_work_source, n_valid_education, n_valid_employment, biography_length, biography_n_sentences, biography_n_words, date_diff, ref_year, date_stale, label, prediction]\n",
"Index: []\n",
"\n",
"[0 rows x 24 columns]"
]
},
2021-07-20 12:15:17 +02:00
"execution_count": 17,
2021-04-29 18:50:02 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[(test_x.label != test_x.prediction) & (test_x.index.isin(fake_df.index))]"
]
},
{
"cell_type": "code",
2021-07-20 12:15:17 +02:00
"execution_count": 18,
2021-04-29 18:50:02 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>...</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" <th>label</th>\n",
" <th>prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>7579770</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.002694</td>\n",
" <td>2020</td>\n",
" <td>184.261009</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>4173344</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.005405</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>191.303842</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>6370669</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.006984</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>178.167846</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>6099073</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.025714</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>142.184268</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>8341750</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>10</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>392</td>\n",
" <td>3.0</td>\n",
" <td>74.0</td>\n",
" <td>0.015018</td>\n",
" <td>2021</td>\n",
" <td>46.549118</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>10546308</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.002717</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>203.149999</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>5109458</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.004633</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>140.166676</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>989919</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>561</td>\n",
" <td>5.0</td>\n",
" <td>110.0</td>\n",
" <td>0.002649</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>146.098129</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>9173011</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.005271</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>169.293431</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>801178</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.008069</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>131.828965</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>10581997</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>445</td>\n",
2021-04-29 18:50:02 +02:00
" <td>4.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>85.0</td>\n",
" <td>0.020527</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>129.861984</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>9831120</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.004996</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>167.149854</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>7689620</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.003671</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>203.189280</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>10215555</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.000964</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>192.363398</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>10985986</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.003662</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>195.359312</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>8567972</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.005248</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>196.358010</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>1041978</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>445</td>\n",
2021-04-29 18:50:02 +02:00
" <td>4.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>85.0</td>\n",
" <td>0.006524</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>167.270642</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>2749172</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>540</td>\n",
2021-04-29 18:50:02 +02:00
" <td>4.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>103.0</td>\n",
" <td>0.001172</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>142.989201</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>3274872</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.017478</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>126.967926</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>7963350</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.003572</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>190.312686</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>4392500</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.004414</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>206.150937</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>3243302</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>1</td>\n",
2021-04-29 18:50:02 +02:00
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.003391</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>140.230023</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>6263478</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>552</td>\n",
2021-04-29 18:50:02 +02:00
" <td>4.0</td>\n",
" <td>107.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.005174</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>140.365511</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>4230883</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.001681</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>153.354072</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>3659063</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.010618</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>131.658167</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>5659388</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.011685</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>140.250630</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>8567973</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>0.003278</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>184.270401</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>9724190</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>543</td>\n",
2021-04-29 18:50:02 +02:00
" <td>4.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>107.0</td>\n",
" <td>6.993352</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>135.124112</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-20 12:15:17 +02:00
" <th>880090</th>\n",
2021-04-29 18:50:02 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
2021-07-20 12:15:17 +02:00
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.001572</td>\n",
2021-04-29 18:50:02 +02:00
" <td>2020</td>\n",
2021-07-20 12:15:17 +02:00
" <td>143.298327</td>\n",
2021-04-29 18:50:02 +02:00
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
2021-07-20 12:15:17 +02:00
"<p>29 rows × 24 columns</p>\n",
2021-04-29 18:50:02 +02:00
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"7579770 True True 0 0 0 \n",
"4173344 True True 0 0 0 \n",
"6370669 True True 0 0 0 \n",
2021-07-20 12:15:17 +02:00
"6099073 True True 0 0 0 \n",
"8341750 True True 0 0 0 \n",
"10546308 True True 0 0 0 \n",
2021-04-29 18:50:02 +02:00
"5109458 True True 0 0 0 \n",
2021-07-20 12:15:17 +02:00
"989919 True True 0 0 0 \n",
"9173011 True True 0 0 0 \n",
"801178 True True 0 0 0 \n",
2021-04-29 18:50:02 +02:00
"10581997 True True 0 0 0 \n",
2021-07-20 12:15:17 +02:00
"9831120 True True 0 0 0 \n",
"7689620 True True 0 0 0 \n",
"10215555 True True 0 0 0 \n",
"10985986 True True 0 0 0 \n",
"8567972 True True 0 0 0 \n",
"1041978 True True 0 0 0 \n",
"2749172 True True 0 0 0 \n",
"3274872 True True 0 0 0 \n",
"7963350 True True 0 0 0 \n",
"4392500 True True 0 0 0 \n",
2021-04-29 18:50:02 +02:00
"3243302 True True 0 0 0 \n",
2021-07-20 12:15:17 +02:00
"6263478 True True 0 0 0 \n",
"4230883 True True 0 0 0 \n",
2021-04-29 18:50:02 +02:00
"3659063 True True 0 0 0 \n",
2021-07-20 12:15:17 +02:00
"5659388 True True 0 0 0 \n",
"8567973 True True 0 0 0 \n",
2021-04-29 18:50:02 +02:00
"9724190 True True 0 0 0 \n",
"880090 True True 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids ... \\\n",
"7579770 0 0 0 1 0 ... \n",
"4173344 0 0 0 1 0 ... \n",
"6370669 0 0 0 1 0 ... \n",
2021-07-20 12:15:17 +02:00
"6099073 0 0 0 1 0 ... \n",
"8341750 0 0 0 10 0 ... \n",
"10546308 0 0 0 1 0 ... \n",
2021-04-29 18:50:02 +02:00
"5109458 0 0 0 1 0 ... \n",
2021-07-20 12:15:17 +02:00
"989919 0 0 0 1 0 ... \n",
"9173011 0 0 0 1 0 ... \n",
"801178 0 0 0 1 0 ... \n",
2021-04-29 18:50:02 +02:00
"10581997 0 0 0 1 0 ... \n",
2021-07-20 12:15:17 +02:00
"9831120 0 0 0 1 0 ... \n",
"7689620 0 0 0 1 0 ... \n",
"10215555 0 0 0 1 0 ... \n",
"10985986 0 0 0 1 0 ... \n",
"8567972 0 0 0 1 0 ... \n",
"1041978 0 0 0 1 0 ... \n",
"2749172 0 0 0 1 0 ... \n",
"3274872 0 0 0 1 0 ... \n",
"7963350 0 0 0 1 0 ... \n",
"4392500 0 0 0 1 0 ... \n",
2021-04-29 18:50:02 +02:00
"3243302 0 0 0 1 0 ... \n",
2021-07-20 12:15:17 +02:00
"6263478 0 0 0 1 0 ... \n",
"4230883 0 0 0 1 0 ... \n",
2021-04-29 18:50:02 +02:00
"3659063 0 0 0 1 0 ... \n",
2021-07-20 12:15:17 +02:00
"5659388 0 0 0 1 0 ... \n",
"8567973 0 0 0 1 0 ... \n",
2021-04-29 18:50:02 +02:00
"9724190 0 0 0 1 0 ... \n",
"880090 0 0 0 1 0 ... \n",
"\n",
" n_valid_education n_valid_employment biography_length \\\n",
"7579770 0.0 0.0 445 \n",
"4173344 0.0 0.0 445 \n",
"6370669 0.0 0.0 445 \n",
2021-07-20 12:15:17 +02:00
"6099073 0.0 0.0 445 \n",
"8341750 0.0 0.0 392 \n",
"10546308 0.0 0.0 445 \n",
2021-04-29 18:50:02 +02:00
"5109458 0.0 0.0 445 \n",
2021-07-20 12:15:17 +02:00
"989919 0.0 0.0 561 \n",
"9173011 0.0 0.0 445 \n",
"801178 0.0 0.0 445 \n",
2021-04-29 18:50:02 +02:00
"10581997 0.0 0.0 445 \n",
2021-07-20 12:15:17 +02:00
"9831120 0.0 0.0 445 \n",
"7689620 0.0 0.0 445 \n",
"10215555 0.0 0.0 445 \n",
"10985986 0.0 0.0 445 \n",
"8567972 0.0 0.0 445 \n",
"1041978 0.0 0.0 445 \n",
"2749172 0.0 0.0 540 \n",
"3274872 0.0 0.0 445 \n",
"7963350 0.0 0.0 445 \n",
"4392500 0.0 0.0 445 \n",
2021-04-29 18:50:02 +02:00
"3243302 0.0 0.0 445 \n",
2021-07-20 12:15:17 +02:00
"6263478 0.0 0.0 552 \n",
"4230883 0.0 0.0 445 \n",
2021-04-29 18:50:02 +02:00
"3659063 0.0 0.0 445 \n",
2021-07-20 12:15:17 +02:00
"5659388 0.0 0.0 445 \n",
"8567973 0.0 0.0 445 \n",
2021-04-29 18:50:02 +02:00
"9724190 0.0 0.0 543 \n",
"880090 0.0 0.0 445 \n",
"\n",
" biography_n_sentences biography_n_words date_diff ref_year \\\n",
"7579770 4.0 85.0 0.002694 2020 \n",
"4173344 4.0 85.0 0.005405 2020 \n",
"6370669 4.0 85.0 0.006984 2020 \n",
2021-07-20 12:15:17 +02:00
"6099073 4.0 85.0 0.025714 2020 \n",
"8341750 3.0 74.0 0.015018 2021 \n",
"10546308 4.0 85.0 0.002717 2020 \n",
2021-04-29 18:50:02 +02:00
"5109458 4.0 85.0 0.004633 2020 \n",
2021-07-20 12:15:17 +02:00
"989919 5.0 110.0 0.002649 2020 \n",
"9173011 4.0 85.0 0.005271 2020 \n",
"801178 4.0 85.0 0.008069 2020 \n",
2021-04-29 18:50:02 +02:00
"10581997 4.0 85.0 0.020527 2020 \n",
2021-07-20 12:15:17 +02:00
"9831120 4.0 85.0 0.004996 2020 \n",
"7689620 4.0 85.0 0.003671 2020 \n",
"10215555 4.0 85.0 0.000964 2020 \n",
"10985986 4.0 85.0 0.003662 2020 \n",
"8567972 4.0 85.0 0.005248 2020 \n",
"1041978 4.0 85.0 0.006524 2020 \n",
"2749172 4.0 103.0 0.001172 2020 \n",
"3274872 4.0 85.0 0.017478 2020 \n",
"7963350 4.0 85.0 0.003572 2020 \n",
"4392500 4.0 85.0 0.004414 2020 \n",
2021-04-29 18:50:02 +02:00
"3243302 4.0 85.0 0.003391 2020 \n",
2021-07-20 12:15:17 +02:00
"6263478 4.0 107.0 0.005174 2020 \n",
"4230883 4.0 85.0 0.001681 2020 \n",
2021-04-29 18:50:02 +02:00
"3659063 4.0 85.0 0.010618 2020 \n",
2021-07-20 12:15:17 +02:00
"5659388 4.0 85.0 0.011685 2020 \n",
"8567973 4.0 85.0 0.003278 2020 \n",
2021-04-29 18:50:02 +02:00
"9724190 4.0 107.0 6.993352 2020 \n",
"880090 4.0 85.0 0.001572 2020 \n",
"\n",
" date_stale label prediction \n",
"7579770 184.261009 -1.0 -1 \n",
"4173344 191.303842 -1.0 -1 \n",
"6370669 178.167846 -1.0 -1 \n",
2021-07-20 12:15:17 +02:00
"6099073 142.184268 -1.0 -1 \n",
"8341750 46.549118 -1.0 -1 \n",
"10546308 203.149999 -1.0 -1 \n",
2021-04-29 18:50:02 +02:00
"5109458 140.166676 -1.0 -1 \n",
2021-07-20 12:15:17 +02:00
"989919 146.098129 -1.0 -1 \n",
"9173011 169.293431 -1.0 -1 \n",
"801178 131.828965 -1.0 -1 \n",
2021-04-29 18:50:02 +02:00
"10581997 129.861984 -1.0 -1 \n",
2021-07-20 12:15:17 +02:00
"9831120 167.149854 -1.0 -1 \n",
"7689620 203.189280 -1.0 -1 \n",
"10215555 192.363398 -1.0 -1 \n",
"10985986 195.359312 -1.0 -1 \n",
"8567972 196.358010 -1.0 -1 \n",
"1041978 167.270642 -1.0 -1 \n",
"2749172 142.989201 -1.0 -1 \n",
"3274872 126.967926 -1.0 -1 \n",
"7963350 190.312686 -1.0 -1 \n",
"4392500 206.150937 -1.0 -1 \n",
2021-04-29 18:50:02 +02:00
"3243302 140.230023 -1.0 -1 \n",
2021-07-20 12:15:17 +02:00
"6263478 140.365511 -1.0 -1 \n",
"4230883 153.354072 -1.0 -1 \n",
2021-04-29 18:50:02 +02:00
"3659063 131.658167 -1.0 -1 \n",
2021-07-20 12:15:17 +02:00
"5659388 140.250630 -1.0 -1 \n",
"8567973 184.270401 -1.0 -1 \n",
2021-04-29 18:50:02 +02:00
"9724190 135.124112 -1.0 -1 \n",
"880090 143.298327 -1.0 -1 \n",
"\n",
2021-07-20 12:15:17 +02:00
"[29 rows x 24 columns]"
2021-04-29 18:50:02 +02:00
]
},
2021-07-20 12:15:17 +02:00
"execution_count": 18,
2021-04-29 18:50:02 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[(test_x.index.isin(fake_df.index))]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}