2021-04-29 18:50:02 +02:00
|
|
|
|
{
|
|
|
|
|
"cells": [
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 1,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"import pandas as pd\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"from sklearn.preprocessing import MultiLabelBinarizer\n",
|
|
|
|
|
"from sklearn.svm import OneClassSVM \n",
|
|
|
|
|
"from sklearn.model_selection import train_test_split\n",
|
|
|
|
|
"from sklearn.metrics import f1_score"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 2,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df = pd.read_pickle('../data/processed/features.pkl')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>verified_email</th>\n",
|
|
|
|
|
" <th>verified_primary_email</th>\n",
|
|
|
|
|
" <th>n_works</th>\n",
|
|
|
|
|
" <th>n_doi</th>\n",
|
|
|
|
|
" <th>n_arxiv</th>\n",
|
|
|
|
|
" <th>n_pmc</th>\n",
|
|
|
|
|
" <th>n_other_pids</th>\n",
|
|
|
|
|
" <th>label</th>\n",
|
|
|
|
|
" <th>n_emails</th>\n",
|
|
|
|
|
" <th>n_urls</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <th>n_employment</th>\n",
|
|
|
|
|
" <th>n_ext_work_source</th>\n",
|
|
|
|
|
" <th>n_valid_education</th>\n",
|
|
|
|
|
" <th>n_valid_employment</th>\n",
|
|
|
|
|
" <th>biography_length</th>\n",
|
|
|
|
|
" <th>biography_n_sentences</th>\n",
|
|
|
|
|
" <th>biography_n_words</th>\n",
|
|
|
|
|
" <th>date_diff</th>\n",
|
|
|
|
|
" <th>ref_year</th>\n",
|
|
|
|
|
" <th>date_stale</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0.009618</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>1153.980551</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>715.078025</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>406.980815</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>48.001631</td>\n",
|
|
|
|
|
" <td>2019</td>\n",
|
|
|
|
|
" <td>456.736688</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1863.042464</td>\n",
|
|
|
|
|
" <td>2015</td>\n",
|
|
|
|
|
" <td>217.817512</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>827.372135</td>\n",
|
|
|
|
|
" <td>2014</td>\n",
|
|
|
|
|
" <td>1779.456397</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989644</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>118</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>23.0</td>\n",
|
|
|
|
|
" <td>0.051116</td>\n",
|
|
|
|
|
" <td>2020</td>\n",
|
|
|
|
|
" <td>139.242812</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989645</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>2131.978000</td>\n",
|
|
|
|
|
" <td>2015</td>\n",
|
|
|
|
|
" <td>158.560081</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989646</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>0.000597</td>\n",
|
|
|
|
|
" <td>2020</td>\n",
|
|
|
|
|
" <td>139.226289</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989647</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1433.222830</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>150.839463</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989648</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>1591.542558</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>152.263413</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"<p>10989649 rows × 23 columns</p>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
|
|
|
|
|
"0 False False 0 0 0 \n",
|
|
|
|
|
"1 True True 0 0 0 \n",
|
|
|
|
|
"2 True True 0 0 0 \n",
|
|
|
|
|
"3 True True 0 0 0 \n",
|
|
|
|
|
"4 True True 0 0 0 \n",
|
|
|
|
|
"... ... ... ... ... ... \n",
|
|
|
|
|
"10989644 True True 0 0 0 \n",
|
|
|
|
|
"10989645 True True 7 7 0 \n",
|
|
|
|
|
"10989646 True True 0 0 0 \n",
|
|
|
|
|
"10989647 True True 0 0 0 \n",
|
|
|
|
|
"10989648 True True 0 0 0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n",
|
|
|
|
|
"0 0 0 False <NA> <NA> ... <NA> \n",
|
|
|
|
|
"1 0 0 False <NA> <NA> ... 1 \n",
|
|
|
|
|
"2 0 0 False <NA> <NA> ... <NA> \n",
|
|
|
|
|
"3 0 0 False <NA> <NA> ... 1 \n",
|
|
|
|
|
"4 0 0 False <NA> <NA> ... 2 \n",
|
|
|
|
|
"... ... ... ... ... ... ... ... \n",
|
|
|
|
|
"10989644 0 0 False <NA> <NA> ... 1 \n",
|
|
|
|
|
"10989645 1 0 True <NA> <NA> ... 2 \n",
|
|
|
|
|
"10989646 0 0 False <NA> <NA> ... <NA> \n",
|
|
|
|
|
"10989647 0 0 False <NA> <NA> ... 1 \n",
|
|
|
|
|
"10989648 0 0 False <NA> <NA> ... <NA> \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_ext_work_source n_valid_education n_valid_employment \\\n",
|
|
|
|
|
"0 <NA> NaN NaN \n",
|
|
|
|
|
"1 <NA> NaN 1.0 \n",
|
|
|
|
|
"2 <NA> NaN NaN \n",
|
|
|
|
|
"3 <NA> NaN 0.0 \n",
|
|
|
|
|
"4 <NA> NaN 1.0 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"10989644 <NA> 1.0 1.0 \n",
|
|
|
|
|
"10989645 2 2.0 0.0 \n",
|
|
|
|
|
"10989646 <NA> NaN NaN \n",
|
|
|
|
|
"10989647 <NA> 2.0 1.0 \n",
|
|
|
|
|
"10989648 <NA> NaN NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" biography_length biography_n_sentences biography_n_words \\\n",
|
|
|
|
|
"0 <NA> NaN NaN \n",
|
|
|
|
|
"1 <NA> NaN NaN \n",
|
|
|
|
|
"2 <NA> NaN NaN \n",
|
|
|
|
|
"3 <NA> NaN NaN \n",
|
|
|
|
|
"4 <NA> NaN NaN \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"10989644 118 2.0 23.0 \n",
|
|
|
|
|
"10989645 <NA> NaN NaN \n",
|
|
|
|
|
"10989646 <NA> NaN NaN \n",
|
|
|
|
|
"10989647 <NA> NaN NaN \n",
|
|
|
|
|
"10989648 <NA> NaN NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" date_diff ref_year date_stale \n",
|
|
|
|
|
"0 0.009618 2018 1153.980551 \n",
|
|
|
|
|
"1 715.078025 2018 406.980815 \n",
|
|
|
|
|
"2 48.001631 2019 456.736688 \n",
|
|
|
|
|
"3 1863.042464 2015 217.817512 \n",
|
|
|
|
|
"4 827.372135 2014 1779.456397 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"10989644 0.051116 2020 139.242812 \n",
|
|
|
|
|
"10989645 2131.978000 2015 158.560081 \n",
|
|
|
|
|
"10989646 0.000597 2020 139.226289 \n",
|
|
|
|
|
"10989647 1433.222830 2016 150.839463 \n",
|
|
|
|
|
"10989648 1591.542558 2016 152.263413 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[10989649 rows x 23 columns]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 3,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 4,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"df.fillna(0, inplace=True)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>verified_email</th>\n",
|
|
|
|
|
" <th>verified_primary_email</th>\n",
|
|
|
|
|
" <th>n_works</th>\n",
|
|
|
|
|
" <th>n_doi</th>\n",
|
|
|
|
|
" <th>n_arxiv</th>\n",
|
|
|
|
|
" <th>n_pmc</th>\n",
|
|
|
|
|
" <th>n_other_pids</th>\n",
|
|
|
|
|
" <th>label</th>\n",
|
|
|
|
|
" <th>n_emails</th>\n",
|
|
|
|
|
" <th>n_urls</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <th>n_employment</th>\n",
|
|
|
|
|
" <th>n_ext_work_source</th>\n",
|
|
|
|
|
" <th>n_valid_education</th>\n",
|
|
|
|
|
" <th>n_valid_employment</th>\n",
|
|
|
|
|
" <th>biography_length</th>\n",
|
|
|
|
|
" <th>biography_n_sentences</th>\n",
|
|
|
|
|
" <th>biography_n_words</th>\n",
|
|
|
|
|
" <th>date_diff</th>\n",
|
|
|
|
|
" <th>ref_year</th>\n",
|
|
|
|
|
" <th>date_stale</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>0</th>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.009618</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>1153.980551</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>1</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>715.078025</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>406.980815</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>2</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>48.001631</td>\n",
|
|
|
|
|
" <td>2019</td>\n",
|
|
|
|
|
" <td>456.736688</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>3</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1863.042464</td>\n",
|
|
|
|
|
" <td>2015</td>\n",
|
|
|
|
|
" <td>217.817512</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>4</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>827.372135</td>\n",
|
|
|
|
|
" <td>2014</td>\n",
|
|
|
|
|
" <td>1779.456397</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989644</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>118</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>23.0</td>\n",
|
|
|
|
|
" <td>0.051116</td>\n",
|
|
|
|
|
" <td>2020</td>\n",
|
|
|
|
|
" <td>139.242812</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989645</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" <td>7</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>2131.978000</td>\n",
|
|
|
|
|
" <td>2015</td>\n",
|
|
|
|
|
" <td>158.560081</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989646</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.000597</td>\n",
|
|
|
|
|
" <td>2020</td>\n",
|
|
|
|
|
" <td>139.226289</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989647</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>2.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1433.222830</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>150.839463</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>10989648</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>False</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>1591.542558</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>152.263413</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"<p>10989649 rows × 23 columns</p>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
|
|
|
|
|
"0 False False 0 0 0 \n",
|
|
|
|
|
"1 True True 0 0 0 \n",
|
|
|
|
|
"2 True True 0 0 0 \n",
|
|
|
|
|
"3 True True 0 0 0 \n",
|
|
|
|
|
"4 True True 0 0 0 \n",
|
|
|
|
|
"... ... ... ... ... ... \n",
|
|
|
|
|
"10989644 True True 0 0 0 \n",
|
|
|
|
|
"10989645 True True 7 7 0 \n",
|
|
|
|
|
"10989646 True True 0 0 0 \n",
|
|
|
|
|
"10989647 True True 0 0 0 \n",
|
|
|
|
|
"10989648 True True 0 0 0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n",
|
|
|
|
|
"0 0 0 False 0 0 ... 0 \n",
|
|
|
|
|
"1 0 0 False 0 0 ... 1 \n",
|
|
|
|
|
"2 0 0 False 0 0 ... 0 \n",
|
|
|
|
|
"3 0 0 False 0 0 ... 1 \n",
|
|
|
|
|
"4 0 0 False 0 0 ... 2 \n",
|
|
|
|
|
"... ... ... ... ... ... ... ... \n",
|
|
|
|
|
"10989644 0 0 False 0 0 ... 1 \n",
|
|
|
|
|
"10989645 1 0 True 0 0 ... 2 \n",
|
|
|
|
|
"10989646 0 0 False 0 0 ... 0 \n",
|
|
|
|
|
"10989647 0 0 False 0 0 ... 1 \n",
|
|
|
|
|
"10989648 0 0 False 0 0 ... 0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_ext_work_source n_valid_education n_valid_employment \\\n",
|
|
|
|
|
"0 0 0.0 0.0 \n",
|
|
|
|
|
"1 0 0.0 1.0 \n",
|
|
|
|
|
"2 0 0.0 0.0 \n",
|
|
|
|
|
"3 0 0.0 0.0 \n",
|
|
|
|
|
"4 0 0.0 1.0 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"10989644 0 1.0 1.0 \n",
|
|
|
|
|
"10989645 2 2.0 0.0 \n",
|
|
|
|
|
"10989646 0 0.0 0.0 \n",
|
|
|
|
|
"10989647 0 2.0 1.0 \n",
|
|
|
|
|
"10989648 0 0.0 0.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" biography_length biography_n_sentences biography_n_words \\\n",
|
|
|
|
|
"0 0 0.0 0.0 \n",
|
|
|
|
|
"1 0 0.0 0.0 \n",
|
|
|
|
|
"2 0 0.0 0.0 \n",
|
|
|
|
|
"3 0 0.0 0.0 \n",
|
|
|
|
|
"4 0 0.0 0.0 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"10989644 118 2.0 23.0 \n",
|
|
|
|
|
"10989645 0 0.0 0.0 \n",
|
|
|
|
|
"10989646 0 0.0 0.0 \n",
|
|
|
|
|
"10989647 0 0.0 0.0 \n",
|
|
|
|
|
"10989648 0 0.0 0.0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" date_diff ref_year date_stale \n",
|
|
|
|
|
"0 0.009618 2018 1153.980551 \n",
|
|
|
|
|
"1 715.078025 2018 406.980815 \n",
|
|
|
|
|
"2 48.001631 2019 456.736688 \n",
|
|
|
|
|
"3 1863.042464 2015 217.817512 \n",
|
|
|
|
|
"4 827.372135 2014 1779.456397 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"10989644 0.051116 2020 139.242812 \n",
|
|
|
|
|
"10989645 2131.978000 2015 158.560081 \n",
|
|
|
|
|
"10989646 0.000597 2020 139.226289 \n",
|
|
|
|
|
"10989647 1433.222830 2016 150.839463 \n",
|
|
|
|
|
"10989648 1591.542558 2016 152.263413 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[10989649 rows x 23 columns]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"(2075872, 23)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 6,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df[df.label == 1].shape"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"(8913777, 23)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 7,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"df[df.label == 0].shape"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 8,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"# draw a stratified train/test subsample (sizes are absolute row counts, not fractions)\n",
|
|
|
|
|
"x = df.loc[:, df.columns != 'label']\n",
|
|
|
|
|
"y = df['label']\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=200000, test_size=1000000, random_state=2, stratify=y)"
|
2021-04-29 18:50:02 +02:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"(37779, 22)"
|
2021-04-29 18:50:02 +02:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 9,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"train_x[train_y==1].shape"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 10,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"(188893, 22)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 10,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"test_x[test_y==1].shape"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"OneClassSVM()"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 11,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# define outlier detection model\n",
|
|
|
|
|
"model = OneClassSVM(gamma='scale', nu=0.5)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# fit on the inlier class only (label == 1, which is the minority class in this dataset)\n",
|
|
|
|
|
"train_x = train_x[train_y==1]\n",
|
|
|
|
|
"model.fit(train_x)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 12,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stdout",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"F1 Score: 0.872\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"# detect outliers in the test set\n",
|
|
|
|
|
"y_hat = model.predict(test_x)\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# relabel ground truth to match OneClassSVM.predict output: inliers (label 1) -> 1, outliers (label 0) -> -1\n",
|
|
|
|
|
"test_y[test_y == 0] = -1\n",
|
|
|
|
|
"test_y[test_y == 1] = 1\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"# calculate the F1 score, treating outliers (-1) as the positive class\n",
|
|
|
|
|
"score = f1_score(test_y, y_hat, pos_label=-1)\n",
|
|
|
|
|
"print('F1 Score: %.3f' % score)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 13,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"name": "stderr",
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1597: SettingWithCopyWarning: \n",
|
|
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
|
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
|
" self.obj[key] = value\n",
|
|
|
|
|
"/Users/andrea/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/indexing.py:1676: SettingWithCopyWarning: \n",
|
|
|
|
|
"A value is trying to be set on a copy of a slice from a DataFrame.\n",
|
|
|
|
|
"Try using .loc[row_indexer,col_indexer] = value instead\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
|
|
|
|
|
" self._setitem_single_column(ilocs[0], value, pi)\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"test_x.loc[:, 'label'] = test_y.values"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 14,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"test_x.loc[:, 'prediction'] = y_hat"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>verified_email</th>\n",
|
|
|
|
|
" <th>verified_primary_email</th>\n",
|
|
|
|
|
" <th>n_works</th>\n",
|
|
|
|
|
" <th>n_doi</th>\n",
|
|
|
|
|
" <th>n_arxiv</th>\n",
|
|
|
|
|
" <th>n_pmc</th>\n",
|
|
|
|
|
" <th>n_other_pids</th>\n",
|
|
|
|
|
" <th>n_emails</th>\n",
|
|
|
|
|
" <th>n_urls</th>\n",
|
|
|
|
|
" <th>n_ids</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <th>n_valid_education</th>\n",
|
|
|
|
|
" <th>n_valid_employment</th>\n",
|
|
|
|
|
" <th>biography_length</th>\n",
|
|
|
|
|
" <th>biography_n_sentences</th>\n",
|
|
|
|
|
" <th>biography_n_words</th>\n",
|
|
|
|
|
" <th>date_diff</th>\n",
|
|
|
|
|
" <th>ref_year</th>\n",
|
|
|
|
|
" <th>date_stale</th>\n",
|
|
|
|
|
" <th>label</th>\n",
|
|
|
|
|
" <th>prediction</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>4867967</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>189</td>\n",
|
|
|
|
|
" <td>155</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>12</td>\n",
|
|
|
|
|
" <td>177</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1.0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1099</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>160.0</td>\n",
|
|
|
|
|
" <td>2071.017713</td>\n",
|
|
|
|
|
" <td>2015</td>\n",
|
|
|
|
|
" <td>136.067404</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>1.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>-1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>8751870</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>722.965509</td>\n",
|
|
|
|
|
" <td>2018</td>\n",
|
|
|
|
|
" <td>296.556650</td>\n",
|
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>10041539</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>2376.608882</td>\n",
|
|
|
|
|
" <td>2014</td>\n",
|
|
|
|
|
" <td>59.669744</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>184408</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>21</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>461.839456</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2017</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1072.829123</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>8166189</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
|
|
|
|
" <td>4</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>395.328151</td>\n",
|
|
|
|
|
" <td>2019</td>\n",
|
|
|
|
|
" <td>92.050538</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>7680437</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>34</td>\n",
|
|
|
|
|
" <td>25</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>35</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>...</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1854.980732</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>58.626353</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>3679223</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>2</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1.0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1147.468938</td>\n",
|
|
|
|
|
" <td>2017</td>\n",
|
|
|
|
|
" <td>210.395635</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>7996977</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1525.385941</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>374.678959</td>\n",
|
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>638259</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>605.317103</td>\n",
|
|
|
|
|
" <td>2019</td>\n",
|
|
|
|
|
" <td>87.166078</td>\n",
|
|
|
|
|
" <td>1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>1485855</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1264.909069</td>\n",
|
|
|
|
|
" <td>2016</td>\n",
|
|
|
|
|
" <td>512.234662</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"<p>205134 rows × 24 columns</p>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
|
|
|
|
|
"4867967 True True 189 155 0 \n",
|
|
|
|
|
"8751870 True True 0 0 0 \n",
|
|
|
|
|
"10041539 True True 0 0 0 \n",
|
|
|
|
|
"184408 True True 21 0 0 \n",
|
|
|
|
|
"8166189 True True 4 4 0 \n",
|
|
|
|
|
"... ... ... ... ... ... \n",
|
|
|
|
|
"7680437 True True 34 25 0 \n",
|
|
|
|
|
"3679223 True True 2 1 0 \n",
|
|
|
|
|
"7996977 True True 0 0 0 \n",
|
|
|
|
|
"638259 True True 0 0 0 \n",
|
|
|
|
|
"1485855 True True 0 0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" n_pmc n_other_pids n_emails n_urls n_ids ... \\\n",
|
|
|
|
|
"4867967 12 177 0 0 1 ... \n",
|
|
|
|
|
"8751870 0 0 0 0 0 ... \n",
|
|
|
|
|
"10041539 0 0 0 0 0 ... \n",
|
|
|
|
|
"184408 0 0 0 1 0 ... \n",
|
|
|
|
|
"8166189 0 0 0 0 0 ... \n",
|
|
|
|
|
"... ... ... ... ... ... ... \n",
|
|
|
|
|
"7680437 0 35 0 0 1 ... \n",
|
|
|
|
|
"3679223 0 0 0 0 0 ... \n",
|
|
|
|
|
"7996977 0 0 0 0 0 ... \n",
|
|
|
|
|
"638259 0 0 0 0 0 ... \n",
|
|
|
|
|
"1485855 0 0 0 0 0 ... \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" n_valid_education n_valid_employment biography_length \\\n",
|
|
|
|
|
"4867967 1.0 2.0 1099 \n",
|
|
|
|
|
"8751870 1.0 0.0 0 \n",
|
|
|
|
|
"10041539 0.0 0.0 0 \n",
|
|
|
|
|
"184408 4.0 1.0 0 \n",
|
|
|
|
|
"8166189 1.0 1.0 0 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"7680437 1.0 1.0 0 \n",
|
|
|
|
|
"3679223 1.0 0.0 0 \n",
|
|
|
|
|
"7996977 0.0 0.0 0 \n",
|
|
|
|
|
"638259 0.0 0.0 0 \n",
|
|
|
|
|
"1485855 0.0 0.0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" biography_n_sentences biography_n_words date_diff ref_year \\\n",
|
|
|
|
|
"4867967 4.0 160.0 2071.017713 2015 \n",
|
|
|
|
|
"8751870 0.0 0.0 722.965509 2018 \n",
|
|
|
|
|
"10041539 0.0 0.0 2376.608882 2014 \n",
|
|
|
|
|
"184408 0.0 0.0 461.839456 2017 \n",
|
|
|
|
|
"8166189 0.0 0.0 395.328151 2019 \n",
|
|
|
|
|
"... ... ... ... ... \n",
|
|
|
|
|
"7680437 0.0 0.0 1854.980732 2016 \n",
|
|
|
|
|
"3679223 0.0 0.0 1147.468938 2017 \n",
|
|
|
|
|
"7996977 0.0 0.0 1525.385941 2016 \n",
|
|
|
|
|
"638259 0.0 0.0 605.317103 2019 \n",
|
|
|
|
|
"1485855 0.0 0.0 1264.909069 2016 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" date_stale label prediction \n",
|
|
|
|
|
"4867967 136.067404 1.0 -1 \n",
|
|
|
|
|
"8751870 296.556650 -1.0 1 \n",
|
|
|
|
|
"10041539 59.669744 1.0 -1 \n",
|
|
|
|
|
"184408 1072.829123 1.0 -1 \n",
|
|
|
|
|
"8166189 92.050538 1.0 -1 \n",
|
|
|
|
|
"... ... ... ... \n",
|
|
|
|
|
"7680437 58.626353 -1.0 1 \n",
|
|
|
|
|
"3679223 210.395635 -1.0 1 \n",
|
|
|
|
|
"7996977 374.678959 -1.0 1 \n",
|
|
|
|
|
"638259 87.166078 1.0 -1 \n",
|
|
|
|
|
"1485855 512.234662 -1.0 1 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"[205134 rows x 24 columns]"
|
2021-04-29 18:50:02 +02:00
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 15,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"test_x[test_x.label != test_x.prediction]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"execution_count": 16,
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"fake_df = pd.read_csv('../data/processed/fake_heap_index.csv', index_col='index')"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"execution_count": 17,
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>verified_email</th>\n",
|
|
|
|
|
" <th>verified_primary_email</th>\n",
|
|
|
|
|
" <th>n_works</th>\n",
|
|
|
|
|
" <th>n_doi</th>\n",
|
|
|
|
|
" <th>n_arxiv</th>\n",
|
|
|
|
|
" <th>n_pmc</th>\n",
|
|
|
|
|
" <th>n_other_pids</th>\n",
|
|
|
|
|
" <th>n_emails</th>\n",
|
|
|
|
|
" <th>n_urls</th>\n",
|
|
|
|
|
" <th>n_ids</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <th>n_valid_education</th>\n",
|
|
|
|
|
" <th>n_valid_employment</th>\n",
|
|
|
|
|
" <th>biography_length</th>\n",
|
|
|
|
|
" <th>biography_n_sentences</th>\n",
|
|
|
|
|
" <th>biography_n_words</th>\n",
|
|
|
|
|
" <th>date_diff</th>\n",
|
|
|
|
|
" <th>ref_year</th>\n",
|
|
|
|
|
" <th>date_stale</th>\n",
|
|
|
|
|
" <th>label</th>\n",
|
|
|
|
|
" <th>prediction</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
|
|
|
|
"<p>0 rows × 24 columns</p>\n",
|
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
"Empty DataFrame\n",
|
|
|
|
|
"Columns: [verified_email, verified_primary_email, n_works, n_doi, n_arxiv, n_pmc, n_other_pids, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment, n_ext_work_source, n_valid_education, n_valid_employment, biography_length, biography_n_sentences, biography_n_words, date_diff, ref_year, date_stale, label, prediction]\n",
|
|
|
|
|
"Index: []\n",
|
|
|
|
|
"\n",
|
|
|
|
|
"[0 rows x 24 columns]"
|
|
|
|
|
]
|
|
|
|
|
},
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"execution_count": 17,
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"test_x[(test_x.label != test_x.prediction) & (test_x.index.isin(fake_df.index))]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"execution_count": 18,
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
|
"data": {
|
|
|
|
|
"text/html": [
|
|
|
|
|
"<div>\n",
|
|
|
|
|
"<style scoped>\n",
|
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"\n",
|
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
|
" text-align: right;\n",
|
|
|
|
|
" }\n",
|
|
|
|
|
"</style>\n",
|
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
|
" <thead>\n",
|
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
|
" <th></th>\n",
|
|
|
|
|
" <th>verified_email</th>\n",
|
|
|
|
|
" <th>verified_primary_email</th>\n",
|
|
|
|
|
" <th>n_works</th>\n",
|
|
|
|
|
" <th>n_doi</th>\n",
|
|
|
|
|
" <th>n_arxiv</th>\n",
|
|
|
|
|
" <th>n_pmc</th>\n",
|
|
|
|
|
" <th>n_other_pids</th>\n",
|
|
|
|
|
" <th>n_emails</th>\n",
|
|
|
|
|
" <th>n_urls</th>\n",
|
|
|
|
|
" <th>n_ids</th>\n",
|
|
|
|
|
" <th>...</th>\n",
|
|
|
|
|
" <th>n_valid_education</th>\n",
|
|
|
|
|
" <th>n_valid_employment</th>\n",
|
|
|
|
|
" <th>biography_length</th>\n",
|
|
|
|
|
" <th>biography_n_sentences</th>\n",
|
|
|
|
|
" <th>biography_n_words</th>\n",
|
|
|
|
|
" <th>date_diff</th>\n",
|
|
|
|
|
" <th>ref_year</th>\n",
|
|
|
|
|
" <th>date_stale</th>\n",
|
|
|
|
|
" <th>label</th>\n",
|
|
|
|
|
" <th>prediction</th>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </thead>\n",
|
|
|
|
|
" <tbody>\n",
|
|
|
|
|
" <tr>\n",
|
|
|
|
|
" <th>7579770</th>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
|
|
|
|
" <td>0.002694</td>\n",
|
|
|
|
|
" <td>2020</td>\n",
|
|
|
|
|
" <td>184.261009</td>\n",
|
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>4173344</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.005405</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>191.303842</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>6370669</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.006984</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>178.167846</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>6099073</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.025714</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>142.184268</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>8341750</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>10</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>392</td>\n",
|
|
|
|
|
" <td>3.0</td>\n",
|
|
|
|
|
" <td>74.0</td>\n",
|
|
|
|
|
" <td>0.015018</td>\n",
|
|
|
|
|
" <td>2021</td>\n",
|
|
|
|
|
" <td>46.549118</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>10546308</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.002717</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>203.149999</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>5109458</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.004633</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>140.166676</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>989919</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>561</td>\n",
|
|
|
|
|
" <td>5.0</td>\n",
|
|
|
|
|
" <td>110.0</td>\n",
|
|
|
|
|
" <td>0.002649</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>146.098129</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>9173011</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.005271</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>169.293431</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>801178</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.008069</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>131.828965</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>10581997</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>445</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>4.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>85.0</td>\n",
|
|
|
|
|
" <td>0.020527</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>129.861984</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>9831120</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.004996</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>167.149854</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>7689620</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.003671</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>203.189280</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>10215555</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.000964</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>192.363398</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>10985986</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.003662</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>195.359312</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>8567972</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.005248</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>196.358010</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>1041978</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>445</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>4.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>85.0</td>\n",
|
|
|
|
|
" <td>0.006524</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>167.270642</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>2749172</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>540</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>4.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>103.0</td>\n",
|
|
|
|
|
" <td>0.001172</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>142.989201</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>3274872</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.017478</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>126.967926</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>7963350</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.003572</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>190.312686</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>4392500</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.004414</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>206.150937</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>3243302</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>1</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.003391</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>140.230023</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>6263478</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>552</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>107.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.005174</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>140.365511</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>4230883</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.001681</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>153.354072</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>3659063</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.010618</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>131.658167</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>5659388</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.011685</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>140.250630</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>8567973</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>0.003278</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>184.270401</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>9724190</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>543</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>4.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>107.0</td>\n",
|
|
|
|
|
" <td>6.993352</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>135.124112</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" <tr>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <th>880090</th>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>True</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>1</td>\n",
|
|
|
|
|
" <td>0</td>\n",
|
|
|
|
|
" <td>...</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
|
" <td>0.0</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>445</td>\n",
|
|
|
|
|
" <td>4.0</td>\n",
|
|
|
|
|
" <td>85.0</td>\n",
|
|
|
|
|
" <td>0.001572</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>2020</td>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
" <td>143.298327</td>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
" <td>-1.0</td>\n",
|
|
|
|
|
" <td>-1</td>\n",
|
|
|
|
|
" </tr>\n",
|
|
|
|
|
" </tbody>\n",
|
|
|
|
|
"</table>\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"<p>29 rows × 24 columns</p>\n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"</div>"
|
|
|
|
|
],
|
|
|
|
|
"text/plain": [
|
|
|
|
|
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
|
|
|
|
|
"7579770 True True 0 0 0 \n",
|
|
|
|
|
"4173344 True True 0 0 0 \n",
|
|
|
|
|
"6370669 True True 0 0 0 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6099073 True True 0 0 0 \n",
|
|
|
|
|
"8341750 True True 0 0 0 \n",
|
|
|
|
|
"10546308 True True 0 0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"5109458 True True 0 0 0 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"989919 True True 0 0 0 \n",
|
|
|
|
|
"9173011 True True 0 0 0 \n",
|
|
|
|
|
"801178 True True 0 0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"10581997 True True 0 0 0 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"9831120 True True 0 0 0 \n",
|
|
|
|
|
"7689620 True True 0 0 0 \n",
|
|
|
|
|
"10215555 True True 0 0 0 \n",
|
|
|
|
|
"10985986 True True 0 0 0 \n",
|
|
|
|
|
"8567972 True True 0 0 0 \n",
|
|
|
|
|
"1041978 True True 0 0 0 \n",
|
|
|
|
|
"2749172 True True 0 0 0 \n",
|
|
|
|
|
"3274872 True True 0 0 0 \n",
|
|
|
|
|
"7963350 True True 0 0 0 \n",
|
|
|
|
|
"4392500 True True 0 0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3243302 True True 0 0 0 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6263478 True True 0 0 0 \n",
|
|
|
|
|
"4230883 True True 0 0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3659063 True True 0 0 0 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"5659388 True True 0 0 0 \n",
|
|
|
|
|
"8567973 True True 0 0 0 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"9724190 True True 0 0 0 \n",
|
|
|
|
|
"880090 True True 0 0 0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_pmc n_other_pids n_emails n_urls n_ids ... \\\n",
|
|
|
|
|
"7579770 0 0 0 1 0 ... \n",
|
|
|
|
|
"4173344 0 0 0 1 0 ... \n",
|
|
|
|
|
"6370669 0 0 0 1 0 ... \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6099073 0 0 0 1 0 ... \n",
|
|
|
|
|
"8341750 0 0 0 10 0 ... \n",
|
|
|
|
|
"10546308 0 0 0 1 0 ... \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"5109458 0 0 0 1 0 ... \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"989919 0 0 0 1 0 ... \n",
|
|
|
|
|
"9173011 0 0 0 1 0 ... \n",
|
|
|
|
|
"801178 0 0 0 1 0 ... \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"10581997 0 0 0 1 0 ... \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"9831120 0 0 0 1 0 ... \n",
|
|
|
|
|
"7689620 0 0 0 1 0 ... \n",
|
|
|
|
|
"10215555 0 0 0 1 0 ... \n",
|
|
|
|
|
"10985986 0 0 0 1 0 ... \n",
|
|
|
|
|
"8567972 0 0 0 1 0 ... \n",
|
|
|
|
|
"1041978 0 0 0 1 0 ... \n",
|
|
|
|
|
"2749172 0 0 0 1 0 ... \n",
|
|
|
|
|
"3274872 0 0 0 1 0 ... \n",
|
|
|
|
|
"7963350 0 0 0 1 0 ... \n",
|
|
|
|
|
"4392500 0 0 0 1 0 ... \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3243302 0 0 0 1 0 ... \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6263478 0 0 0 1 0 ... \n",
|
|
|
|
|
"4230883 0 0 0 1 0 ... \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3659063 0 0 0 1 0 ... \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"5659388 0 0 0 1 0 ... \n",
|
|
|
|
|
"8567973 0 0 0 1 0 ... \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"9724190 0 0 0 1 0 ... \n",
|
|
|
|
|
"880090 0 0 0 1 0 ... \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_valid_education n_valid_employment biography_length \\\n",
|
|
|
|
|
"7579770 0.0 0.0 445 \n",
|
|
|
|
|
"4173344 0.0 0.0 445 \n",
|
|
|
|
|
"6370669 0.0 0.0 445 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6099073 0.0 0.0 445 \n",
|
|
|
|
|
"8341750 0.0 0.0 392 \n",
|
|
|
|
|
"10546308 0.0 0.0 445 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"5109458 0.0 0.0 445 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"989919 0.0 0.0 561 \n",
|
|
|
|
|
"9173011 0.0 0.0 445 \n",
|
|
|
|
|
"801178 0.0 0.0 445 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"10581997 0.0 0.0 445 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"9831120 0.0 0.0 445 \n",
|
|
|
|
|
"7689620 0.0 0.0 445 \n",
|
|
|
|
|
"10215555 0.0 0.0 445 \n",
|
|
|
|
|
"10985986 0.0 0.0 445 \n",
|
|
|
|
|
"8567972 0.0 0.0 445 \n",
|
|
|
|
|
"1041978 0.0 0.0 445 \n",
|
|
|
|
|
"2749172 0.0 0.0 540 \n",
|
|
|
|
|
"3274872 0.0 0.0 445 \n",
|
|
|
|
|
"7963350 0.0 0.0 445 \n",
|
|
|
|
|
"4392500 0.0 0.0 445 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3243302 0.0 0.0 445 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6263478 0.0 0.0 552 \n",
|
|
|
|
|
"4230883 0.0 0.0 445 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3659063 0.0 0.0 445 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"5659388 0.0 0.0 445 \n",
|
|
|
|
|
"8567973 0.0 0.0 445 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"9724190 0.0 0.0 543 \n",
|
|
|
|
|
"880090 0.0 0.0 445 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" biography_n_sentences biography_n_words date_diff ref_year \\\n",
|
|
|
|
|
"7579770 4.0 85.0 0.002694 2020 \n",
|
|
|
|
|
"4173344 4.0 85.0 0.005405 2020 \n",
|
|
|
|
|
"6370669 4.0 85.0 0.006984 2020 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6099073 4.0 85.0 0.025714 2020 \n",
|
|
|
|
|
"8341750 3.0 74.0 0.015018 2021 \n",
|
|
|
|
|
"10546308 4.0 85.0 0.002717 2020 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"5109458 4.0 85.0 0.004633 2020 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"989919 5.0 110.0 0.002649 2020 \n",
|
|
|
|
|
"9173011 4.0 85.0 0.005271 2020 \n",
|
|
|
|
|
"801178 4.0 85.0 0.008069 2020 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"10581997 4.0 85.0 0.020527 2020 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"9831120 4.0 85.0 0.004996 2020 \n",
|
|
|
|
|
"7689620 4.0 85.0 0.003671 2020 \n",
|
|
|
|
|
"10215555 4.0 85.0 0.000964 2020 \n",
|
|
|
|
|
"10985986 4.0 85.0 0.003662 2020 \n",
|
|
|
|
|
"8567972 4.0 85.0 0.005248 2020 \n",
|
|
|
|
|
"1041978 4.0 85.0 0.006524 2020 \n",
|
|
|
|
|
"2749172 4.0 103.0 0.001172 2020 \n",
|
|
|
|
|
"3274872 4.0 85.0 0.017478 2020 \n",
|
|
|
|
|
"7963350 4.0 85.0 0.003572 2020 \n",
|
|
|
|
|
"4392500 4.0 85.0 0.004414 2020 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3243302 4.0 85.0 0.003391 2020 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6263478 4.0 107.0 0.005174 2020 \n",
|
|
|
|
|
"4230883 4.0 85.0 0.001681 2020 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3659063 4.0 85.0 0.010618 2020 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"5659388 4.0 85.0 0.011685 2020 \n",
|
|
|
|
|
"8567973 4.0 85.0 0.003278 2020 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"9724190 4.0 107.0 6.993352 2020 \n",
|
|
|
|
|
"880090 4.0 85.0 0.001572 2020 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" date_stale label prediction \n",
|
|
|
|
|
"7579770 184.261009 -1.0 -1 \n",
|
|
|
|
|
"4173344 191.303842 -1.0 -1 \n",
|
|
|
|
|
"6370669 178.167846 -1.0 -1 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6099073 142.184268 -1.0 -1 \n",
|
|
|
|
|
"8341750 46.549118 -1.0 -1 \n",
|
|
|
|
|
"10546308 203.149999 -1.0 -1 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"5109458 140.166676 -1.0 -1 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"989919 146.098129 -1.0 -1 \n",
|
|
|
|
|
"9173011 169.293431 -1.0 -1 \n",
|
|
|
|
|
"801178 131.828965 -1.0 -1 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"10581997 129.861984 -1.0 -1 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"9831120 167.149854 -1.0 -1 \n",
|
|
|
|
|
"7689620 203.189280 -1.0 -1 \n",
|
|
|
|
|
"10215555 192.363398 -1.0 -1 \n",
|
|
|
|
|
"10985986 195.359312 -1.0 -1 \n",
|
|
|
|
|
"8567972 196.358010 -1.0 -1 \n",
|
|
|
|
|
"1041978 167.270642 -1.0 -1 \n",
|
|
|
|
|
"2749172 142.989201 -1.0 -1 \n",
|
|
|
|
|
"3274872 126.967926 -1.0 -1 \n",
|
|
|
|
|
"7963350 190.312686 -1.0 -1 \n",
|
|
|
|
|
"4392500 206.150937 -1.0 -1 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3243302 140.230023 -1.0 -1 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"6263478 140.365511 -1.0 -1 \n",
|
|
|
|
|
"4230883 153.354072 -1.0 -1 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"3659063 131.658167 -1.0 -1 \n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"5659388 140.250630 -1.0 -1 \n",
|
|
|
|
|
"8567973 184.270401 -1.0 -1 \n",
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"9724190 135.124112 -1.0 -1 \n",
|
|
|
|
|
"880090 143.298327 -1.0 -1 \n",
|
|
|
|
|
"\n",
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"[29 rows x 24 columns]"
|
2021-04-29 18:50:02 +02:00
|
|
|
|
]
|
|
|
|
|
},
|
2021-07-20 12:15:17 +02:00
|
|
|
|
"execution_count": 18,
|
2021-04-29 18:50:02 +02:00
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"source": [
|
|
|
|
|
"test_x[(test_x.index.isin(fake_df.index))]"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": null,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": []
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|
"metadata": {
|
|
|
|
|
"kernelspec": {
|
|
|
|
|
"display_name": "Python 3",
|
|
|
|
|
"language": "python",
|
|
|
|
|
"name": "python3"
|
|
|
|
|
},
|
|
|
|
|
"language_info": {
|
|
|
|
|
"codemirror_mode": {
|
|
|
|
|
"name": "ipython",
|
|
|
|
|
"version": 3
|
|
|
|
|
},
|
|
|
|
|
"file_extension": ".py",
|
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
|
"name": "python",
|
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
|
"version": "3.8.3"
|
|
|
|
|
}
|
|
|
|
|
},
|
|
|
|
|
"nbformat": 4,
|
|
|
|
|
"nbformat_minor": 4
|
|
|
|
|
}
|