{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# One-class SVM outlier detection\n", "\n", "Loads a precomputed feature table, fits a `OneClassSVM` on the rows with `label == 1`, and scores it on a held-out sample (F1 on the outlier class)." ] },
{ "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import OneClassSVM\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import f1_score" ] },
{ "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Location of the precomputed feature table; edit here to point at another copy.\n", "FEATURES_PATH = '../data/processed/features.pkl'\n", "\n", "# NOTE(review): unpickling can execute arbitrary code, so only load files you produced yourself.\n", "df = pd.read_pickle(FEATURES_PATH)" ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidslabeln_emailsn_urls...n_employmentn_ext_work_sourcen_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stale
0FalseFalse00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN0.00961820181153.980551
1TrueTrue00000False<NA><NA>...1<NA>NaN1.0<NA>NaNNaN715.0780252018406.980815
2TrueTrue00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN48.0016312019456.736688
3TrueTrue00000False<NA><NA>...1<NA>NaN0.0<NA>NaNNaN1863.0424642015217.817512
4TrueTrue00000False<NA><NA>...2<NA>NaN1.0<NA>NaNNaN827.37213520141779.456397
..................................................................
10989644TrueTrue00000False<NA><NA>...1<NA>1.01.01182.023.00.0511162020139.242812
10989645TrueTrue77010True<NA><NA>...222.00.0<NA>NaNNaN2131.9780002015158.560081
10989646TrueTrue00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN0.0005972020139.226289
10989647TrueTrue00000False<NA><NA>...1<NA>2.01.0<NA>NaNNaN1433.2228302016150.839463
10989648TrueTrue00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN1591.5425582016152.263413
\n", "

10989649 rows × 23 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 False False 0 0 0 \n", "1 True True 0 0 0 \n", "2 True True 0 0 0 \n", "3 True True 0 0 0 \n", "4 True True 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 True True 0 0 0 \n", "10989645 True True 7 7 0 \n", "10989646 True True 0 0 0 \n", "10989647 True True 0 0 0 \n", "10989648 True True 0 0 0 \n", "\n", " n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n", "0 0 0 False ... \n", "1 0 0 False ... 1 \n", "2 0 0 False ... \n", "3 0 0 False ... 1 \n", "4 0 0 False ... 2 \n", "... ... ... ... ... ... ... ... \n", "10989644 0 0 False ... 1 \n", "10989645 1 0 True ... 2 \n", "10989646 0 0 False ... \n", "10989647 0 0 False ... 1 \n", "10989648 0 0 False ... \n", "\n", " n_ext_work_source n_valid_education n_valid_employment \\\n", "0 NaN NaN \n", "1 NaN 1.0 \n", "2 NaN NaN \n", "3 NaN 0.0 \n", "4 NaN 1.0 \n", "... ... ... ... \n", "10989644 1.0 1.0 \n", "10989645 2 2.0 0.0 \n", "10989646 NaN NaN \n", "10989647 2.0 1.0 \n", "10989648 NaN NaN \n", "\n", " biography_length biography_n_sentences biography_n_words \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "... ... ... ... \n", "10989644 118 2.0 23.0 \n", "10989645 NaN NaN \n", "10989646 NaN NaN \n", "10989647 NaN NaN \n", "10989648 NaN NaN \n", "\n", " date_diff ref_year date_stale \n", "0 0.009618 2018 1153.980551 \n", "1 715.078025 2018 406.980815 \n", "2 48.001631 2019 456.736688 \n", "3 1863.042464 2015 217.817512 \n", "4 827.372135 2014 1779.456397 \n", "... ... ... ... 
\n", "10989644 0.051116 2020 139.242812 \n", "10989645 2131.978000 2015 158.560081 \n", "10989646 0.000597 2020 139.226289 \n", "10989647 1433.222830 2016 150.839463 \n", "10989648 1591.542558 2016 152.263413 \n", "\n", "[10989649 rows x 23 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] },
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Replace every missing value with 0; the same fill is applied to all columns.\n", "df = df.fillna(0)" ] },
{ "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidslabeln_emailsn_urls...n_employmentn_ext_work_sourcen_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stale
0FalseFalse00000False00...000.00.000.00.00.00961820181153.980551
1TrueTrue00000False00...100.01.000.00.0715.0780252018406.980815
2TrueTrue00000False00...000.00.000.00.048.0016312019456.736688
3TrueTrue00000False00...100.00.000.00.01863.0424642015217.817512
4TrueTrue00000False00...200.01.000.00.0827.37213520141779.456397
..................................................................
10989644TrueTrue00000False00...101.01.01182.023.00.0511162020139.242812
10989645TrueTrue77010True00...222.00.000.00.02131.9780002015158.560081
10989646TrueTrue00000False00...000.00.000.00.00.0005972020139.226289
10989647TrueTrue00000False00...102.01.000.00.01433.2228302016150.839463
10989648TrueTrue00000False00...000.00.000.00.01591.5425582016152.263413
\n", "

10989649 rows × 23 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 False False 0 0 0 \n", "1 True True 0 0 0 \n", "2 True True 0 0 0 \n", "3 True True 0 0 0 \n", "4 True True 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 True True 0 0 0 \n", "10989645 True True 7 7 0 \n", "10989646 True True 0 0 0 \n", "10989647 True True 0 0 0 \n", "10989648 True True 0 0 0 \n", "\n", " n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n", "0 0 0 False 0 0 ... 0 \n", "1 0 0 False 0 0 ... 1 \n", "2 0 0 False 0 0 ... 0 \n", "3 0 0 False 0 0 ... 1 \n", "4 0 0 False 0 0 ... 2 \n", "... ... ... ... ... ... ... ... \n", "10989644 0 0 False 0 0 ... 1 \n", "10989645 1 0 True 0 0 ... 2 \n", "10989646 0 0 False 0 0 ... 0 \n", "10989647 0 0 False 0 0 ... 1 \n", "10989648 0 0 False 0 0 ... 0 \n", "\n", " n_ext_work_source n_valid_education n_valid_employment \\\n", "0 0 0.0 0.0 \n", "1 0 0.0 1.0 \n", "2 0 0.0 0.0 \n", "3 0 0.0 0.0 \n", "4 0 0.0 1.0 \n", "... ... ... ... \n", "10989644 0 1.0 1.0 \n", "10989645 2 2.0 0.0 \n", "10989646 0 0.0 0.0 \n", "10989647 0 2.0 1.0 \n", "10989648 0 0.0 0.0 \n", "\n", " biography_length biography_n_sentences biography_n_words \\\n", "0 0 0.0 0.0 \n", "1 0 0.0 0.0 \n", "2 0 0.0 0.0 \n", "3 0 0.0 0.0 \n", "4 0 0.0 0.0 \n", "... ... ... ... \n", "10989644 118 2.0 23.0 \n", "10989645 0 0.0 0.0 \n", "10989646 0 0.0 0.0 \n", "10989647 0 0.0 0.0 \n", "10989648 0 0.0 0.0 \n", "\n", " date_diff ref_year date_stale \n", "0 0.009618 2018 1153.980551 \n", "1 715.078025 2018 406.980815 \n", "2 48.001631 2019 456.736688 \n", "3 1863.042464 2015 217.817512 \n", "4 827.372135 2014 1779.456397 \n", "... ... ... ... 
\n", "10989644 0.051116 2020 139.242812 \n", "10989645 2131.978000 2015 158.560081 \n", "10989646 0.000597 2020 139.226289 \n", "10989647 1433.222830 2016 150.839463 \n", "10989648 1591.542558 2016 152.263413 \n", "\n", "[10989649 rows x 23 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] },
{ "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2075872, 23)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 1].shape" ] },
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8913777, 23)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 0].shape" ] },
{ "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# split into train/test sets\n", "x = df.loc[:, df.columns != 'label']\n", "y = df['label']\n", "train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=200000, test_size=1000000, random_state=2, stratify=y)" ] },
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(37779, 22)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_x[train_y==1].shape" ] },
{ "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(188893, 22)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_x[test_y==1].shape" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OneClassSVM()" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# define outlier detection model\n", "model = OneClassSVM(gamma='scale', nu=0.5)\n", "\n", "# Fit on the label == 1 rows only: these are the inliers the model learns.\n", "# They are the minority here (37,779 of the 200,000 training rows).\n", "# A separate name keeps train_x aligned with train_y, so this cell can be re-run.\n", "train_x_inliers = train_x[train_y == 1]\n", "model.fit(train_x_inliers)" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "F1 Score: 0.872\n" ] } ], "source": [ "# detect outliers in the test set; select the feature columns explicitly so a\n", "# re-run still works after label/prediction have been attached to test_x\n", "y_hat = model.predict(test_x[x.columns])\n", "\n", "# mark inliers 1, outliers -1 in a new integer Series; test_y itself is left untouched\n", "# (writing -1 into the boolean label Series relied on silent dtype upcasting)\n", "y_true = test_y.astype(bool).map({True: 1, False: -1})\n", "\n", "# calculate score, treating the outlier class (-1) as the positive class\n", "score = f1_score(y_true, y_hat, pos_label=-1)\n", "print('F1 Score: %.3f' % score)" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# assign() returns a new frame, which avoids the SettingWithCopyWarning raised when\n", "# writing a column into test_x (it comes out of train_test_split as a slice of x)\n", "test_x = test_x.assign(label=y_true.values)" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "test_x = test_x.assign(prediction=y_hat)" ] },
{ "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_ids...n_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stalelabelprediction
4867967TrueTrue189155012177001...1.02.010994.0160.02071.0177132015136.0674041.0-1
8751870TrueTrue00000000...1.00.000.00.0722.9655092018296.556650-1.01
10041539TrueTrue00000000...0.00.000.00.02376.608882201459.6697441.0-1
184408TrueTrue210000010...4.01.000.00.0461.83945620171072.8291231.0-1
8166189TrueTrue44000000...1.01.000.00.0395.328151201992.0505381.0-1
..................................................................
7680437TrueTrue34250035001...1.01.000.00.01854.980732201658.626353-1.01
3679223TrueTrue21000000...1.00.000.00.01147.4689382017210.395635-1.01
7996977TrueTrue00000000...0.00.000.00.01525.3859412016374.678959-1.01
638259TrueTrue00000000...0.00.000.00.0605.317103201987.1660781.0-1
1485855TrueTrue00000000...0.00.000.00.01264.9090692016512.234662-1.01
\n", "

205134 rows × 24 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "4867967 True True 189 155 0 \n", "8751870 True True 0 0 0 \n", "10041539 True True 0 0 0 \n", "184408 True True 21 0 0 \n", "8166189 True True 4 4 0 \n", "... ... ... ... ... ... \n", "7680437 True True 34 25 0 \n", "3679223 True True 2 1 0 \n", "7996977 True True 0 0 0 \n", "638259 True True 0 0 0 \n", "1485855 True True 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids ... \\\n", "4867967 12 177 0 0 1 ... \n", "8751870 0 0 0 0 0 ... \n", "10041539 0 0 0 0 0 ... \n", "184408 0 0 0 1 0 ... \n", "8166189 0 0 0 0 0 ... \n", "... ... ... ... ... ... ... \n", "7680437 0 35 0 0 1 ... \n", "3679223 0 0 0 0 0 ... \n", "7996977 0 0 0 0 0 ... \n", "638259 0 0 0 0 0 ... \n", "1485855 0 0 0 0 0 ... \n", "\n", " n_valid_education n_valid_employment biography_length \\\n", "4867967 1.0 2.0 1099 \n", "8751870 1.0 0.0 0 \n", "10041539 0.0 0.0 0 \n", "184408 4.0 1.0 0 \n", "8166189 1.0 1.0 0 \n", "... ... ... ... \n", "7680437 1.0 1.0 0 \n", "3679223 1.0 0.0 0 \n", "7996977 0.0 0.0 0 \n", "638259 0.0 0.0 0 \n", "1485855 0.0 0.0 0 \n", "\n", " biography_n_sentences biography_n_words date_diff ref_year \\\n", "4867967 4.0 160.0 2071.017713 2015 \n", "8751870 0.0 0.0 722.965509 2018 \n", "10041539 0.0 0.0 2376.608882 2014 \n", "184408 0.0 0.0 461.839456 2017 \n", "8166189 0.0 0.0 395.328151 2019 \n", "... ... ... ... ... \n", "7680437 0.0 0.0 1854.980732 2016 \n", "3679223 0.0 0.0 1147.468938 2017 \n", "7996977 0.0 0.0 1525.385941 2016 \n", "638259 0.0 0.0 605.317103 2019 \n", "1485855 0.0 0.0 1264.909069 2016 \n", "\n", " date_stale label prediction \n", "4867967 136.067404 1.0 -1 \n", "8751870 296.556650 -1.0 1 \n", "10041539 59.669744 1.0 -1 \n", "184408 1072.829123 1.0 -1 \n", "8166189 92.050538 1.0 -1 \n", "... ... ... ... 
\n", "7680437 58.626353 -1.0 1 \n", "3679223 210.395635 -1.0 1 \n", "7996977 374.678959 -1.0 1 \n", "638259 87.166078 1.0 -1 \n", "1485855 512.234662 -1.0 1 \n", "\n", "[205134 rows x 24 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# test rows whose true label and predicted label disagree\n", "test_x.loc[test_x['label'].ne(test_x['prediction'])]" ] },
{ "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "# index_col='index' makes the CSV's 'index' column the frame index,\n", "# which the isin() filters in the following cells compare against\n", "fake_df = pd.read_csv(\n", "    '../data/processed/fake_heap_index.csv',\n", "    index_col='index',\n", ")" ] },
{ "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_ids...n_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stalelabelprediction
\n", "

0 rows × 24 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [verified_email, verified_primary_email, n_works, n_doi, n_arxiv, n_pmc, n_other_pids, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment, n_ext_work_source, n_valid_education, n_valid_employment, biography_length, biography_n_sentences, biography_n_words, date_diff, ref_year, date_stale, label, prediction]\n", "Index: []\n", "\n", "[0 rows x 24 columns]" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# misclassified test rows whose index also appears in fake_df\n", "test_x.loc[\n", "    test_x['label'].ne(test_x['prediction'])\n", "    & test_x.index.isin(fake_df.index)\n", "]" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_ids...n_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stalelabelprediction
7579770TrueTrue00000010...0.00.04454.085.00.0026942020184.261009-1.0-1
4173344TrueTrue00000010...0.00.04454.085.00.0054052020191.303842-1.0-1
6370669TrueTrue00000010...0.00.04454.085.00.0069842020178.167846-1.0-1
6099073TrueTrue00000010...0.00.04454.085.00.0257142020142.184268-1.0-1
8341750TrueTrue000000100...0.00.03923.074.00.015018202146.549118-1.0-1
10546308TrueTrue00000010...0.00.04454.085.00.0027172020203.149999-1.0-1
5109458TrueTrue00000010...0.00.04454.085.00.0046332020140.166676-1.0-1
989919TrueTrue00000010...0.00.05615.0110.00.0026492020146.098129-1.0-1
9173011TrueTrue00000010...0.00.04454.085.00.0052712020169.293431-1.0-1
801178TrueTrue00000010...0.00.04454.085.00.0080692020131.828965-1.0-1
10581997TrueTrue00000010...0.00.04454.085.00.0205272020129.861984-1.0-1
9831120TrueTrue00000010...0.00.04454.085.00.0049962020167.149854-1.0-1
7689620TrueTrue00000010...0.00.04454.085.00.0036712020203.189280-1.0-1
10215555TrueTrue00000010...0.00.04454.085.00.0009642020192.363398-1.0-1
10985986TrueTrue00000010...0.00.04454.085.00.0036622020195.359312-1.0-1
8567972TrueTrue00000010...0.00.04454.085.00.0052482020196.358010-1.0-1
1041978TrueTrue00000010...0.00.04454.085.00.0065242020167.270642-1.0-1
2749172TrueTrue00000010...0.00.05404.0103.00.0011722020142.989201-1.0-1
3274872TrueTrue00000010...0.00.04454.085.00.0174782020126.967926-1.0-1
7963350TrueTrue00000010...0.00.04454.085.00.0035722020190.312686-1.0-1
4392500TrueTrue00000010...0.00.04454.085.00.0044142020206.150937-1.0-1
3243302TrueTrue00000010...0.00.04454.085.00.0033912020140.230023-1.0-1
6263478TrueTrue00000010...0.00.05524.0107.00.0051742020140.365511-1.0-1
4230883TrueTrue00000010...0.00.04454.085.00.0016812020153.354072-1.0-1
3659063TrueTrue00000010...0.00.04454.085.00.0106182020131.658167-1.0-1
5659388TrueTrue00000010...0.00.04454.085.00.0116852020140.250630-1.0-1
8567973TrueTrue00000010...0.00.04454.085.00.0032782020184.270401-1.0-1
9724190TrueTrue00000010...0.00.05434.0107.06.9933522020135.124112-1.0-1
880090TrueTrue00000010...0.00.04454.085.00.0015722020143.298327-1.0-1
\n", "

29 rows × 24 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "7579770 True True 0 0 0 \n", "4173344 True True 0 0 0 \n", "6370669 True True 0 0 0 \n", "6099073 True True 0 0 0 \n", "8341750 True True 0 0 0 \n", "10546308 True True 0 0 0 \n", "5109458 True True 0 0 0 \n", "989919 True True 0 0 0 \n", "9173011 True True 0 0 0 \n", "801178 True True 0 0 0 \n", "10581997 True True 0 0 0 \n", "9831120 True True 0 0 0 \n", "7689620 True True 0 0 0 \n", "10215555 True True 0 0 0 \n", "10985986 True True 0 0 0 \n", "8567972 True True 0 0 0 \n", "1041978 True True 0 0 0 \n", "2749172 True True 0 0 0 \n", "3274872 True True 0 0 0 \n", "7963350 True True 0 0 0 \n", "4392500 True True 0 0 0 \n", "3243302 True True 0 0 0 \n", "6263478 True True 0 0 0 \n", "4230883 True True 0 0 0 \n", "3659063 True True 0 0 0 \n", "5659388 True True 0 0 0 \n", "8567973 True True 0 0 0 \n", "9724190 True True 0 0 0 \n", "880090 True True 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids ... \\\n", "7579770 0 0 0 1 0 ... \n", "4173344 0 0 0 1 0 ... \n", "6370669 0 0 0 1 0 ... \n", "6099073 0 0 0 1 0 ... \n", "8341750 0 0 0 10 0 ... \n", "10546308 0 0 0 1 0 ... \n", "5109458 0 0 0 1 0 ... \n", "989919 0 0 0 1 0 ... \n", "9173011 0 0 0 1 0 ... \n", "801178 0 0 0 1 0 ... \n", "10581997 0 0 0 1 0 ... \n", "9831120 0 0 0 1 0 ... \n", "7689620 0 0 0 1 0 ... \n", "10215555 0 0 0 1 0 ... \n", "10985986 0 0 0 1 0 ... \n", "8567972 0 0 0 1 0 ... \n", "1041978 0 0 0 1 0 ... \n", "2749172 0 0 0 1 0 ... \n", "3274872 0 0 0 1 0 ... \n", "7963350 0 0 0 1 0 ... \n", "4392500 0 0 0 1 0 ... \n", "3243302 0 0 0 1 0 ... \n", "6263478 0 0 0 1 0 ... \n", "4230883 0 0 0 1 0 ... \n", "3659063 0 0 0 1 0 ... \n", "5659388 0 0 0 1 0 ... \n", "8567973 0 0 0 1 0 ... \n", "9724190 0 0 0 1 0 ... \n", "880090 0 0 0 1 0 ... 
\n", "\n", " n_valid_education n_valid_employment biography_length \\\n", "7579770 0.0 0.0 445 \n", "4173344 0.0 0.0 445 \n", "6370669 0.0 0.0 445 \n", "6099073 0.0 0.0 445 \n", "8341750 0.0 0.0 392 \n", "10546308 0.0 0.0 445 \n", "5109458 0.0 0.0 445 \n", "989919 0.0 0.0 561 \n", "9173011 0.0 0.0 445 \n", "801178 0.0 0.0 445 \n", "10581997 0.0 0.0 445 \n", "9831120 0.0 0.0 445 \n", "7689620 0.0 0.0 445 \n", "10215555 0.0 0.0 445 \n", "10985986 0.0 0.0 445 \n", "8567972 0.0 0.0 445 \n", "1041978 0.0 0.0 445 \n", "2749172 0.0 0.0 540 \n", "3274872 0.0 0.0 445 \n", "7963350 0.0 0.0 445 \n", "4392500 0.0 0.0 445 \n", "3243302 0.0 0.0 445 \n", "6263478 0.0 0.0 552 \n", "4230883 0.0 0.0 445 \n", "3659063 0.0 0.0 445 \n", "5659388 0.0 0.0 445 \n", "8567973 0.0 0.0 445 \n", "9724190 0.0 0.0 543 \n", "880090 0.0 0.0 445 \n", "\n", " biography_n_sentences biography_n_words date_diff ref_year \\\n", "7579770 4.0 85.0 0.002694 2020 \n", "4173344 4.0 85.0 0.005405 2020 \n", "6370669 4.0 85.0 0.006984 2020 \n", "6099073 4.0 85.0 0.025714 2020 \n", "8341750 3.0 74.0 0.015018 2021 \n", "10546308 4.0 85.0 0.002717 2020 \n", "5109458 4.0 85.0 0.004633 2020 \n", "989919 5.0 110.0 0.002649 2020 \n", "9173011 4.0 85.0 0.005271 2020 \n", "801178 4.0 85.0 0.008069 2020 \n", "10581997 4.0 85.0 0.020527 2020 \n", "9831120 4.0 85.0 0.004996 2020 \n", "7689620 4.0 85.0 0.003671 2020 \n", "10215555 4.0 85.0 0.000964 2020 \n", "10985986 4.0 85.0 0.003662 2020 \n", "8567972 4.0 85.0 0.005248 2020 \n", "1041978 4.0 85.0 0.006524 2020 \n", "2749172 4.0 103.0 0.001172 2020 \n", "3274872 4.0 85.0 0.017478 2020 \n", "7963350 4.0 85.0 0.003572 2020 \n", "4392500 4.0 85.0 0.004414 2020 \n", "3243302 4.0 85.0 0.003391 2020 \n", "6263478 4.0 107.0 0.005174 2020 \n", "4230883 4.0 85.0 0.001681 2020 \n", "3659063 4.0 85.0 0.010618 2020 \n", "5659388 4.0 85.0 0.011685 2020 \n", "8567973 4.0 85.0 0.003278 2020 \n", "9724190 4.0 107.0 6.993352 2020 \n", "880090 4.0 85.0 0.001572 2020 \n", "\n", " date_stale 
label prediction \n", "7579770 184.261009 -1.0 -1 \n", "4173344 191.303842 -1.0 -1 \n", "6370669 178.167846 -1.0 -1 \n", "6099073 142.184268 -1.0 -1 \n", "8341750 46.549118 -1.0 -1 \n", "10546308 203.149999 -1.0 -1 \n", "5109458 140.166676 -1.0 -1 \n", "989919 146.098129 -1.0 -1 \n", "9173011 169.293431 -1.0 -1 \n", "801178 131.828965 -1.0 -1 \n", "10581997 129.861984 -1.0 -1 \n", "9831120 167.149854 -1.0 -1 \n", "7689620 203.189280 -1.0 -1 \n", "10215555 192.363398 -1.0 -1 \n", "10985986 195.359312 -1.0 -1 \n", "8567972 196.358010 -1.0 -1 \n", "1041978 167.270642 -1.0 -1 \n", "2749172 142.989201 -1.0 -1 \n", "3274872 126.967926 -1.0 -1 \n", "7963350 190.312686 -1.0 -1 \n", "4392500 206.150937 -1.0 -1 \n", "3243302 140.230023 -1.0 -1 \n", "6263478 140.365511 -1.0 -1 \n", "4230883 153.354072 -1.0 -1 \n", "3659063 131.658167 -1.0 -1 \n", "5659388 140.250630 -1.0 -1 \n", "8567973 184.270401 -1.0 -1 \n", "9724190 135.124112 -1.0 -1 \n", "880090 143.298327 -1.0 -1 \n", "\n", "[29 rows x 24 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# every test row whose index appears in fake_df, whatever the model predicted\n", "test_x.loc[test_x.index.isin(fake_df.index)]" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Rows listed in `fake_heap_index.csv`\n", "\n", "In the saved run, 29 test rows have an index that appears in `fake_df`. All 29 carry label -1 and prediction -1, and the preceding filter (misclassified rows that are also in `fake_df`) is empty, so the model flagged every one of them as an outlier." ] } ],
"metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }