fake-orcid-analysis/notebooks/03-Supervised.ipynb

2465 lines
86 KiB
Plaintext
Raw Normal View History

2021-04-29 18:50:02 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import OneClassSVM \n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import f1_score"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_pickle('../data/processed/features.pkl')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>...</th>\n",
" <th>n_employment</th>\n",
" <th>n_ext_work_source</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.009618</td>\n",
" <td>2018</td>\n",
" <td>1153.980551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>715.078025</td>\n",
" <td>2018</td>\n",
" <td>406.980815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>48.001631</td>\n",
" <td>2019</td>\n",
" <td>456.736688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1863.042464</td>\n",
" <td>2015</td>\n",
" <td>217.817512</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>827.372135</td>\n",
" <td>2014</td>\n",
" <td>1779.456397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>118</td>\n",
" <td>2.0</td>\n",
" <td>23.0</td>\n",
" <td>0.051116</td>\n",
" <td>2020</td>\n",
" <td>139.242812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2131.978000</td>\n",
" <td>2015</td>\n",
" <td>158.560081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000597</td>\n",
" <td>2020</td>\n",
" <td>139.226289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1433.222830</td>\n",
" <td>2016</td>\n",
" <td>150.839463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>...</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1591.542558</td>\n",
" <td>2016</td>\n",
" <td>152.263413</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 False False 0 0 0 \n",
"1 True True 0 0 0 \n",
"2 True True 0 0 0 \n",
"3 True True 0 0 0 \n",
"4 True True 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 True True 0 0 0 \n",
"10989645 True True 7 7 0 \n",
"10989646 True True 0 0 0 \n",
"10989647 True True 0 0 0 \n",
"10989648 True True 0 0 0 \n",
"\n",
" n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n",
"0 0 0 False <NA> <NA> ... <NA> \n",
"1 0 0 False <NA> <NA> ... 1 \n",
"2 0 0 False <NA> <NA> ... <NA> \n",
"3 0 0 False <NA> <NA> ... 1 \n",
"4 0 0 False <NA> <NA> ... 2 \n",
"... ... ... ... ... ... ... ... \n",
"10989644 0 0 False <NA> <NA> ... 1 \n",
"10989645 1 0 True <NA> <NA> ... 2 \n",
"10989646 0 0 False <NA> <NA> ... <NA> \n",
"10989647 0 0 False <NA> <NA> ... 1 \n",
"10989648 0 0 False <NA> <NA> ... <NA> \n",
"\n",
" n_ext_work_source n_valid_education n_valid_employment \\\n",
"0 <NA> NaN NaN \n",
"1 <NA> NaN 1.0 \n",
"2 <NA> NaN NaN \n",
"3 <NA> NaN 0.0 \n",
"4 <NA> NaN 1.0 \n",
"... ... ... ... \n",
"10989644 <NA> 1.0 1.0 \n",
"10989645 2 2.0 0.0 \n",
"10989646 <NA> NaN NaN \n",
"10989647 <NA> 2.0 1.0 \n",
"10989648 <NA> NaN NaN \n",
"\n",
" biography_length biography_n_sentences biography_n_words \\\n",
"0 <NA> NaN NaN \n",
"1 <NA> NaN NaN \n",
"2 <NA> NaN NaN \n",
"3 <NA> NaN NaN \n",
"4 <NA> NaN NaN \n",
"... ... ... ... \n",
"10989644 118 2.0 23.0 \n",
"10989645 <NA> NaN NaN \n",
"10989646 <NA> NaN NaN \n",
"10989647 <NA> NaN NaN \n",
"10989648 <NA> NaN NaN \n",
"\n",
" date_diff ref_year date_stale \n",
"0 0.009618 2018 1153.980551 \n",
"1 715.078025 2018 406.980815 \n",
"2 48.001631 2019 456.736688 \n",
"3 1863.042464 2015 217.817512 \n",
"4 827.372135 2014 1779.456397 \n",
"... ... ... ... \n",
"10989644 0.051116 2020 139.242812 \n",
"10989645 2131.978000 2015 158.560081 \n",
"10989646 0.000597 2020 139.226289 \n",
"10989647 1433.222830 2016 150.839463 \n",
"10989648 1591.542558 2016 152.263413 \n",
"\n",
"[10989649 rows x 23 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df.fillna(0, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>...</th>\n",
" <th>n_employment</th>\n",
" <th>n_ext_work_source</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.009618</td>\n",
" <td>2018</td>\n",
" <td>1153.980551</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>715.078025</td>\n",
" <td>2018</td>\n",
" <td>406.980815</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>48.001631</td>\n",
" <td>2019</td>\n",
" <td>456.736688</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1863.042464</td>\n",
" <td>2015</td>\n",
" <td>217.817512</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>827.372135</td>\n",
" <td>2014</td>\n",
" <td>1779.456397</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>118</td>\n",
" <td>2.0</td>\n",
" <td>23.0</td>\n",
" <td>0.051116</td>\n",
" <td>2020</td>\n",
" <td>139.242812</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2131.978000</td>\n",
" <td>2015</td>\n",
" <td>158.560081</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000597</td>\n",
" <td>2020</td>\n",
" <td>139.226289</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1433.222830</td>\n",
" <td>2016</td>\n",
" <td>150.839463</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1591.542558</td>\n",
" <td>2016</td>\n",
" <td>152.263413</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 False False 0 0 0 \n",
"1 True True 0 0 0 \n",
"2 True True 0 0 0 \n",
"3 True True 0 0 0 \n",
"4 True True 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 True True 0 0 0 \n",
"10989645 True True 7 7 0 \n",
"10989646 True True 0 0 0 \n",
"10989647 True True 0 0 0 \n",
"10989648 True True 0 0 0 \n",
"\n",
" n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n",
"0 0 0 False 0 0 ... 0 \n",
"1 0 0 False 0 0 ... 1 \n",
"2 0 0 False 0 0 ... 0 \n",
"3 0 0 False 0 0 ... 1 \n",
"4 0 0 False 0 0 ... 2 \n",
"... ... ... ... ... ... ... ... \n",
"10989644 0 0 False 0 0 ... 1 \n",
"10989645 1 0 True 0 0 ... 2 \n",
"10989646 0 0 False 0 0 ... 0 \n",
"10989647 0 0 False 0 0 ... 1 \n",
"10989648 0 0 False 0 0 ... 0 \n",
"\n",
" n_ext_work_source n_valid_education n_valid_employment \\\n",
"0 0 0.0 0.0 \n",
"1 0 0.0 1.0 \n",
"2 0 0.0 0.0 \n",
"3 0 0.0 0.0 \n",
"4 0 0.0 1.0 \n",
"... ... ... ... \n",
"10989644 0 1.0 1.0 \n",
"10989645 2 2.0 0.0 \n",
"10989646 0 0.0 0.0 \n",
"10989647 0 2.0 1.0 \n",
"10989648 0 0.0 0.0 \n",
"\n",
" biography_length biography_n_sentences biography_n_words \\\n",
"0 0 0.0 0.0 \n",
"1 0 0.0 0.0 \n",
"2 0 0.0 0.0 \n",
"3 0 0.0 0.0 \n",
"4 0 0.0 0.0 \n",
"... ... ... ... \n",
"10989644 118 2.0 23.0 \n",
"10989645 0 0.0 0.0 \n",
"10989646 0 0.0 0.0 \n",
"10989647 0 0.0 0.0 \n",
"10989648 0 0.0 0.0 \n",
"\n",
" date_diff ref_year date_stale \n",
"0 0.009618 2018 1153.980551 \n",
"1 715.078025 2018 406.980815 \n",
"2 48.001631 2019 456.736688 \n",
"3 1863.042464 2015 217.817512 \n",
"4 827.372135 2014 1779.456397 \n",
"... ... ... ... \n",
"10989644 0.051116 2020 139.242812 \n",
"10989645 2131.978000 2015 158.560081 \n",
"10989646 0.000597 2020 139.226289 \n",
"10989647 1433.222830 2016 150.839463 \n",
"10989648 1591.542558 2016 152.263413 \n",
"\n",
"[10989649 rows x 23 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(2075872, 23)"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 1].shape"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(8913777, 23)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 0].shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"# split into train/test sets\n",
"x = df.loc[:, df.columns != 'label']\n",
"y = df['label']\n",
"train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=100000, test_size=1000000, random_state=2, stratify=y)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(18889, 22)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_x[train_y==1].shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(188893, 22)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[test_y==1].shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"OneClassSVM()"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# define outlier detection model\n",
"model = OneClassSVM(gamma='scale', nu=0.5)\n",
"\n",
"# fit on majority class\n",
"train_x = train_x[train_y==1]\n",
"model.fit(train_x)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"F1 Score: 0.872\n"
]
}
],
"source": [
"# detect outliers in the test set\n",
"y_hat = model.predict(test_x)\n",
"\n",
"# mark inliers 1, outliers -1\n",
"test_y[test_y == 0] = -1\n",
"test_y[test_y == 1] = 1\n",
"\n",
"# calculate score\n",
"score = f1_score(test_y, y_hat, pos_label=-1)\n",
"print('F1 Score: %.3f' % score)"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1597: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self.obj[key] = value\n",
"/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1676: SettingWithCopyWarning: \n",
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
"Try using .loc[row_indexer,col_indexer] = value instead\n",
"\n",
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
" self._setitem_single_column(ilocs[0], value, pi)\n"
]
}
],
"source": [
"test_x.loc[:, 'label'] = test_y.values"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"test_x.loc[:, 'prediction'] = y_hat"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>...</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" <th>label</th>\n",
" <th>prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>8422958</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>7</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>2.0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1149.760128</td>\n",
" <td>2016</td>\n",
" <td>498.053074</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>30426</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>5</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>1.0</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2446.815456</td>\n",
" <td>2014</td>\n",
" <td>112.064679</td>\n",
" <td>1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1941223</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>811.573040</td>\n",
" <td>2017</td>\n",
" <td>486.567693</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9232681</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>888.520157</td>\n",
" <td>2017</td>\n",
" <td>359.397781</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7646644</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.000030</td>\n",
" <td>2017</td>\n",
" <td>1443.731837</td>\n",
" <td>1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4769520</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>781.732331</td>\n",
" <td>2019</td>\n",
" <td>46.115046</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2817268</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>9</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>3.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1782.543747</td>\n",
" <td>2014</td>\n",
" <td>544.549425</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4840353</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>659.864595</td>\n",
" <td>2017</td>\n",
" <td>634.846007</td>\n",
" <td>1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9717615</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1971.810335</td>\n",
" <td>2015</td>\n",
" <td>38.063916</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2290714</th>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>947.216224</td>\n",
" <td>2018</td>\n",
" <td>104.732935</td>\n",
" <td>-1.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>205377 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"8422958 True True 7 3 0 \n",
"30426 True True 5 5 0 \n",
"1941223 True True 0 0 0 \n",
"9232681 True True 1 1 0 \n",
"7646644 False False 0 0 0 \n",
"... ... ... ... ... ... \n",
"4769520 True True 0 0 0 \n",
"2817268 True True 9 7 0 \n",
"4840353 True True 0 0 0 \n",
"9717615 True True 0 0 0 \n",
"2290714 False False 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids ... n_valid_education \\\n",
"8422958 0 6 0 0 1 ... 2.0 \n",
"30426 0 0 0 0 0 ... 1.0 \n",
"1941223 0 0 0 0 0 ... 0.0 \n",
"9232681 0 0 0 0 0 ... 0.0 \n",
"7646644 0 0 0 0 0 ... 0.0 \n",
"... ... ... ... ... ... ... ... \n",
"4769520 0 0 0 0 0 ... 0.0 \n",
"2817268 0 5 0 0 1 ... 3.0 \n",
"4840353 0 0 0 0 0 ... 0.0 \n",
"9717615 0 0 0 0 1 ... 0.0 \n",
"2290714 0 0 0 0 0 ... 0.0 \n",
"\n",
" n_valid_employment biography_length biography_n_sentences \\\n",
"8422958 1.0 0 0.0 \n",
"30426 3.0 0 0.0 \n",
"1941223 0.0 0 0.0 \n",
"9232681 0.0 0 0.0 \n",
"7646644 0.0 0 0.0 \n",
"... ... ... ... \n",
"4769520 0.0 0 0.0 \n",
"2817268 0.0 0 0.0 \n",
"4840353 0.0 0 0.0 \n",
"9717615 0.0 0 0.0 \n",
"2290714 0.0 0 0.0 \n",
"\n",
" biography_n_words date_diff ref_year date_stale label \\\n",
"8422958 0.0 1149.760128 2016 498.053074 -1.0 \n",
"30426 0.0 2446.815456 2014 112.064679 1.0 \n",
"1941223 0.0 811.573040 2017 486.567693 -1.0 \n",
"9232681 0.0 888.520157 2017 359.397781 -1.0 \n",
"7646644 0.0 0.000030 2017 1443.731837 1.0 \n",
"... ... ... ... ... ... \n",
"4769520 0.0 781.732331 2019 46.115046 -1.0 \n",
"2817268 0.0 1782.543747 2014 544.549425 -1.0 \n",
"4840353 0.0 659.864595 2017 634.846007 1.0 \n",
"9717615 0.0 1971.810335 2015 38.063916 -1.0 \n",
"2290714 0.0 947.216224 2018 104.732935 -1.0 \n",
"\n",
" prediction \n",
"8422958 1 \n",
"30426 -1 \n",
"1941223 1 \n",
"9232681 1 \n",
"7646644 -1 \n",
"... ... \n",
"4769520 1 \n",
"2817268 1 \n",
"4840353 -1 \n",
"9717615 1 \n",
"2290714 1 \n",
"\n",
"[205377 rows x 24 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[test_x.label != test_x.prediction]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"fake_df = pd.read_csv('../data/processed/fake_heap_index.csv', index_col='index')"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>...</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" <th>label</th>\n",
" <th>prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" </tbody>\n",
"</table>\n",
"<p>0 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [verified_email, verified_primary_email, n_works, n_doi, n_arxiv, n_pmc, n_other_pids, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment, n_ext_work_source, n_valid_education, n_valid_employment, biography_length, biography_n_sentences, biography_n_words, date_diff, ref_year, date_stale, label, prediction]\n",
"Index: []\n",
"\n",
"[0 rows x 24 columns]"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[(test_x.label != test_x.prediction) & (test_x.index.isin(fake_df.index))]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>...</th>\n",
" <th>n_valid_education</th>\n",
" <th>n_valid_employment</th>\n",
" <th>biography_length</th>\n",
" <th>biography_n_sentences</th>\n",
" <th>biography_n_words</th>\n",
" <th>date_diff</th>\n",
" <th>ref_year</th>\n",
" <th>date_stale</th>\n",
" <th>label</th>\n",
" <th>prediction</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>1319584</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>637</td>\n",
" <td>4.0</td>\n",
" <td>113.0</td>\n",
" <td>0.000852</td>\n",
" <td>2020</td>\n",
" <td>143.341818</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7579770</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.002694</td>\n",
" <td>2020</td>\n",
" <td>184.261009</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9173011</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.005271</td>\n",
" <td>2020</td>\n",
" <td>169.293431</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1209389</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.006071</td>\n",
" <td>2020</td>\n",
" <td>146.328318</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4173344</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.005405</td>\n",
" <td>2020</td>\n",
" <td>191.303842</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10985986</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.003662</td>\n",
" <td>2020</td>\n",
" <td>195.359312</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6208696</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.001287</td>\n",
" <td>2020</td>\n",
" <td>142.360796</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>801178</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.008069</td>\n",
" <td>2020</td>\n",
" <td>131.828965</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4392500</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.004414</td>\n",
" <td>2020</td>\n",
" <td>206.150937</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7963350</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.003572</td>\n",
" <td>2020</td>\n",
" <td>190.312686</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5659388</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.011685</td>\n",
" <td>2020</td>\n",
" <td>140.250630</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2749172</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>540</td>\n",
" <td>4.0</td>\n",
" <td>103.0</td>\n",
" <td>0.001172</td>\n",
" <td>2020</td>\n",
" <td>142.989201</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4230883</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.001681</td>\n",
" <td>2020</td>\n",
" <td>153.354072</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6370669</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.006984</td>\n",
" <td>2020</td>\n",
" <td>178.167846</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5109458</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.004633</td>\n",
" <td>2020</td>\n",
" <td>140.166676</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7689620</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.003671</td>\n",
" <td>2020</td>\n",
" <td>203.189280</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9831120</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.004996</td>\n",
" <td>2020</td>\n",
" <td>167.149854</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6263478</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>552</td>\n",
" <td>4.0</td>\n",
" <td>107.0</td>\n",
" <td>0.005174</td>\n",
" <td>2020</td>\n",
" <td>140.365511</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10581997</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.020527</td>\n",
" <td>2020</td>\n",
" <td>129.861984</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3243302</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.003391</td>\n",
" <td>2020</td>\n",
" <td>140.230023</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3659063</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.010618</td>\n",
" <td>2020</td>\n",
" <td>131.658167</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6099073</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.025714</td>\n",
" <td>2020</td>\n",
" <td>142.184268</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3953358</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.005145</td>\n",
" <td>2020</td>\n",
" <td>185.115723</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9724190</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>543</td>\n",
" <td>4.0</td>\n",
" <td>107.0</td>\n",
" <td>6.993352</td>\n",
" <td>2020</td>\n",
" <td>135.124112</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1041978</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.006524</td>\n",
" <td>2020</td>\n",
" <td>167.270642</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>880090</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.001572</td>\n",
" <td>2020</td>\n",
" <td>143.298327</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8492341</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.006172</td>\n",
" <td>2020</td>\n",
" <td>123.742413</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8952735</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.004785</td>\n",
" <td>2020</td>\n",
" <td>188.359673</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3274872</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>445</td>\n",
" <td>4.0</td>\n",
" <td>85.0</td>\n",
" <td>0.017478</td>\n",
" <td>2020</td>\n",
" <td>126.967926</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>989919</th>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>...</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>561</td>\n",
" <td>5.0</td>\n",
" <td>110.0</td>\n",
" <td>0.002649</td>\n",
" <td>2020</td>\n",
" <td>146.098129</td>\n",
" <td>-1.0</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>30 rows × 24 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"1319584 True True 0 0 0 \n",
"7579770 True True 0 0 0 \n",
"9173011 True True 0 0 0 \n",
"1209389 True True 0 0 0 \n",
"4173344 True True 0 0 0 \n",
"10985986 True True 0 0 0 \n",
"6208696 True True 0 0 0 \n",
"801178 True True 0 0 0 \n",
"4392500 True True 0 0 0 \n",
"7963350 True True 0 0 0 \n",
"5659388 True True 0 0 0 \n",
"2749172 True True 0 0 0 \n",
"4230883 True True 0 0 0 \n",
"6370669 True True 0 0 0 \n",
"5109458 True True 0 0 0 \n",
"7689620 True True 0 0 0 \n",
"9831120 True True 0 0 0 \n",
"6263478 True True 0 0 0 \n",
"10581997 True True 0 0 0 \n",
"3243302 True True 0 0 0 \n",
"3659063 True True 0 0 0 \n",
"6099073 True True 0 0 0 \n",
"3953358 True True 0 0 0 \n",
"9724190 True True 0 0 0 \n",
"1041978 True True 0 0 0 \n",
"880090 True True 0 0 0 \n",
"8492341 True True 0 0 0 \n",
"8952735 True True 0 0 0 \n",
"3274872 True True 0 0 0 \n",
"989919 True True 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids ... \\\n",
"1319584 0 0 0 1 0 ... \n",
"7579770 0 0 0 1 0 ... \n",
"9173011 0 0 0 1 0 ... \n",
"1209389 0 0 0 1 0 ... \n",
"4173344 0 0 0 1 0 ... \n",
"10985986 0 0 0 1 0 ... \n",
"6208696 0 0 0 1 0 ... \n",
"801178 0 0 0 1 0 ... \n",
"4392500 0 0 0 1 0 ... \n",
"7963350 0 0 0 1 0 ... \n",
"5659388 0 0 0 1 0 ... \n",
"2749172 0 0 0 1 0 ... \n",
"4230883 0 0 0 1 0 ... \n",
"6370669 0 0 0 1 0 ... \n",
"5109458 0 0 0 1 0 ... \n",
"7689620 0 0 0 1 0 ... \n",
"9831120 0 0 0 1 0 ... \n",
"6263478 0 0 0 1 0 ... \n",
"10581997 0 0 0 1 0 ... \n",
"3243302 0 0 0 1 0 ... \n",
"3659063 0 0 0 1 0 ... \n",
"6099073 0 0 0 1 0 ... \n",
"3953358 0 0 0 0 0 ... \n",
"9724190 0 0 0 1 0 ... \n",
"1041978 0 0 0 1 0 ... \n",
"880090 0 0 0 1 0 ... \n",
"8492341 0 0 0 1 0 ... \n",
"8952735 0 0 0 1 0 ... \n",
"3274872 0 0 0 1 0 ... \n",
"989919 0 0 0 1 0 ... \n",
"\n",
" n_valid_education n_valid_employment biography_length \\\n",
"1319584 0.0 0.0 637 \n",
"7579770 0.0 0.0 445 \n",
"9173011 0.0 0.0 445 \n",
"1209389 0.0 0.0 445 \n",
"4173344 0.0 0.0 445 \n",
"10985986 0.0 0.0 445 \n",
"6208696 0.0 0.0 445 \n",
"801178 0.0 0.0 445 \n",
"4392500 0.0 0.0 445 \n",
"7963350 0.0 0.0 445 \n",
"5659388 0.0 0.0 445 \n",
"2749172 0.0 0.0 540 \n",
"4230883 0.0 0.0 445 \n",
"6370669 0.0 0.0 445 \n",
"5109458 0.0 0.0 445 \n",
"7689620 0.0 0.0 445 \n",
"9831120 0.0 0.0 445 \n",
"6263478 0.0 0.0 552 \n",
"10581997 0.0 0.0 445 \n",
"3243302 0.0 0.0 445 \n",
"3659063 0.0 0.0 445 \n",
"6099073 0.0 0.0 445 \n",
"3953358 0.0 0.0 445 \n",
"9724190 0.0 0.0 543 \n",
"1041978 0.0 0.0 445 \n",
"880090 0.0 0.0 445 \n",
"8492341 0.0 0.0 445 \n",
"8952735 0.0 0.0 445 \n",
"3274872 0.0 0.0 445 \n",
"989919 0.0 0.0 561 \n",
"\n",
" biography_n_sentences biography_n_words date_diff ref_year \\\n",
"1319584 4.0 113.0 0.000852 2020 \n",
"7579770 4.0 85.0 0.002694 2020 \n",
"9173011 4.0 85.0 0.005271 2020 \n",
"1209389 4.0 85.0 0.006071 2020 \n",
"4173344 4.0 85.0 0.005405 2020 \n",
"10985986 4.0 85.0 0.003662 2020 \n",
"6208696 4.0 85.0 0.001287 2020 \n",
"801178 4.0 85.0 0.008069 2020 \n",
"4392500 4.0 85.0 0.004414 2020 \n",
"7963350 4.0 85.0 0.003572 2020 \n",
"5659388 4.0 85.0 0.011685 2020 \n",
"2749172 4.0 103.0 0.001172 2020 \n",
"4230883 4.0 85.0 0.001681 2020 \n",
"6370669 4.0 85.0 0.006984 2020 \n",
"5109458 4.0 85.0 0.004633 2020 \n",
"7689620 4.0 85.0 0.003671 2020 \n",
"9831120 4.0 85.0 0.004996 2020 \n",
"6263478 4.0 107.0 0.005174 2020 \n",
"10581997 4.0 85.0 0.020527 2020 \n",
"3243302 4.0 85.0 0.003391 2020 \n",
"3659063 4.0 85.0 0.010618 2020 \n",
"6099073 4.0 85.0 0.025714 2020 \n",
"3953358 4.0 85.0 0.005145 2020 \n",
"9724190 4.0 107.0 6.993352 2020 \n",
"1041978 4.0 85.0 0.006524 2020 \n",
"880090 4.0 85.0 0.001572 2020 \n",
"8492341 4.0 85.0 0.006172 2020 \n",
"8952735 4.0 85.0 0.004785 2020 \n",
"3274872 4.0 85.0 0.017478 2020 \n",
"989919 5.0 110.0 0.002649 2020 \n",
"\n",
" date_stale label prediction \n",
"1319584 143.341818 -1.0 -1 \n",
"7579770 184.261009 -1.0 -1 \n",
"9173011 169.293431 -1.0 -1 \n",
"1209389 146.328318 -1.0 -1 \n",
"4173344 191.303842 -1.0 -1 \n",
"10985986 195.359312 -1.0 -1 \n",
"6208696 142.360796 -1.0 -1 \n",
"801178 131.828965 -1.0 -1 \n",
"4392500 206.150937 -1.0 -1 \n",
"7963350 190.312686 -1.0 -1 \n",
"5659388 140.250630 -1.0 -1 \n",
"2749172 142.989201 -1.0 -1 \n",
"4230883 153.354072 -1.0 -1 \n",
"6370669 178.167846 -1.0 -1 \n",
"5109458 140.166676 -1.0 -1 \n",
"7689620 203.189280 -1.0 -1 \n",
"9831120 167.149854 -1.0 -1 \n",
"6263478 140.365511 -1.0 -1 \n",
"10581997 129.861984 -1.0 -1 \n",
"3243302 140.230023 -1.0 -1 \n",
"3659063 131.658167 -1.0 -1 \n",
"6099073 142.184268 -1.0 -1 \n",
"3953358 185.115723 -1.0 -1 \n",
"9724190 135.124112 -1.0 -1 \n",
"1041978 167.270642 -1.0 -1 \n",
"880090 143.298327 -1.0 -1 \n",
"8492341 123.742413 -1.0 -1 \n",
"8952735 188.359673 -1.0 -1 \n",
"3274872 126.967926 -1.0 -1 \n",
"989919 146.098129 -1.0 -1 \n",
"\n",
"[30 rows x 24 columns]"
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"test_x[(test_x.index.isin(fake_df.index))]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}