{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "\n", "from sklearn.preprocessing import MultiLabelBinarizer\n", "from sklearn.svm import OneClassSVM \n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import f1_score" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load the pickled DataFrame of features; the path is relative to the notebook's working directory.\n", "# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.\n", "df = pd.read_pickle('../data/processed/features.pkl')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidslabeln_emailsn_urls...n_employmentn_ext_work_sourcen_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stale
0FalseFalse00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN0.00961820181153.980551
1TrueTrue00000False<NA><NA>...1<NA>NaN1.0<NA>NaNNaN715.0780252018406.980815
2TrueTrue00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN48.0016312019456.736688
3TrueTrue00000False<NA><NA>...1<NA>NaN0.0<NA>NaNNaN1863.0424642015217.817512
4TrueTrue00000False<NA><NA>...2<NA>NaN1.0<NA>NaNNaN827.37213520141779.456397
..................................................................
10989644TrueTrue00000False<NA><NA>...1<NA>1.01.01182.023.00.0511162020139.242812
10989645TrueTrue77010True<NA><NA>...222.00.0<NA>NaNNaN2131.9780002015158.560081
10989646TrueTrue00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN0.0005972020139.226289
10989647TrueTrue00000False<NA><NA>...1<NA>2.01.0<NA>NaNNaN1433.2228302016150.839463
10989648TrueTrue00000False<NA><NA>...<NA><NA>NaNNaN<NA>NaNNaN1591.5425582016152.263413
\n", "

10989649 rows × 23 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 False False 0 0 0 \n", "1 True True 0 0 0 \n", "2 True True 0 0 0 \n", "3 True True 0 0 0 \n", "4 True True 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 True True 0 0 0 \n", "10989645 True True 7 7 0 \n", "10989646 True True 0 0 0 \n", "10989647 True True 0 0 0 \n", "10989648 True True 0 0 0 \n", "\n", " n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n", "0 0 0 False ... \n", "1 0 0 False ... 1 \n", "2 0 0 False ... \n", "3 0 0 False ... 1 \n", "4 0 0 False ... 2 \n", "... ... ... ... ... ... ... ... \n", "10989644 0 0 False ... 1 \n", "10989645 1 0 True ... 2 \n", "10989646 0 0 False ... \n", "10989647 0 0 False ... 1 \n", "10989648 0 0 False ... \n", "\n", " n_ext_work_source n_valid_education n_valid_employment \\\n", "0 NaN NaN \n", "1 NaN 1.0 \n", "2 NaN NaN \n", "3 NaN 0.0 \n", "4 NaN 1.0 \n", "... ... ... ... \n", "10989644 1.0 1.0 \n", "10989645 2 2.0 0.0 \n", "10989646 NaN NaN \n", "10989647 2.0 1.0 \n", "10989648 NaN NaN \n", "\n", " biography_length biography_n_sentences biography_n_words \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "... ... ... ... \n", "10989644 118 2.0 23.0 \n", "10989645 NaN NaN \n", "10989646 NaN NaN \n", "10989647 NaN NaN \n", "10989648 NaN NaN \n", "\n", " date_diff ref_year date_stale \n", "0 0.009618 2018 1153.980551 \n", "1 715.078025 2018 406.980815 \n", "2 48.001631 2019 456.736688 \n", "3 1863.042464 2015 217.817512 \n", "4 827.372135 2014 1779.456397 \n", "... ... ... ... 
\n", "10989644 0.051116 2020 139.242812 \n", "10989645 2131.978000 2015 158.560081 \n", "10989646 0.000597 2020 139.226289 \n", "10989647 1433.222830 2016 150.839463 \n", "10989648 1591.542558 2016 152.263413 \n", "\n", "[10989649 rows x 23 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# replace every missing value with 0, re-binding df to the filled frame\n", "df = df.fillna(0)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidslabeln_emailsn_urls...n_employmentn_ext_work_sourcen_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stale
0FalseFalse00000False00...000.00.000.00.00.00961820181153.980551
1TrueTrue00000False00...100.01.000.00.0715.0780252018406.980815
2TrueTrue00000False00...000.00.000.00.048.0016312019456.736688
3TrueTrue00000False00...100.00.000.00.01863.0424642015217.817512
4TrueTrue00000False00...200.01.000.00.0827.37213520141779.456397
..................................................................
10989644TrueTrue00000False00...101.01.01182.023.00.0511162020139.242812
10989645TrueTrue77010True00...222.00.000.00.02131.9780002015158.560081
10989646TrueTrue00000False00...000.00.000.00.00.0005972020139.226289
10989647TrueTrue00000False00...102.01.000.00.01433.2228302016150.839463
10989648TrueTrue00000False00...000.00.000.00.01591.5425582016152.263413
\n", "

10989649 rows × 23 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "0 False False 0 0 0 \n", "1 True True 0 0 0 \n", "2 True True 0 0 0 \n", "3 True True 0 0 0 \n", "4 True True 0 0 0 \n", "... ... ... ... ... ... \n", "10989644 True True 0 0 0 \n", "10989645 True True 7 7 0 \n", "10989646 True True 0 0 0 \n", "10989647 True True 0 0 0 \n", "10989648 True True 0 0 0 \n", "\n", " n_pmc n_other_pids label n_emails n_urls ... n_employment \\\n", "0 0 0 False 0 0 ... 0 \n", "1 0 0 False 0 0 ... 1 \n", "2 0 0 False 0 0 ... 0 \n", "3 0 0 False 0 0 ... 1 \n", "4 0 0 False 0 0 ... 2 \n", "... ... ... ... ... ... ... ... \n", "10989644 0 0 False 0 0 ... 1 \n", "10989645 1 0 True 0 0 ... 2 \n", "10989646 0 0 False 0 0 ... 0 \n", "10989647 0 0 False 0 0 ... 1 \n", "10989648 0 0 False 0 0 ... 0 \n", "\n", " n_ext_work_source n_valid_education n_valid_employment \\\n", "0 0 0.0 0.0 \n", "1 0 0.0 1.0 \n", "2 0 0.0 0.0 \n", "3 0 0.0 0.0 \n", "4 0 0.0 1.0 \n", "... ... ... ... \n", "10989644 0 1.0 1.0 \n", "10989645 2 2.0 0.0 \n", "10989646 0 0.0 0.0 \n", "10989647 0 2.0 1.0 \n", "10989648 0 0.0 0.0 \n", "\n", " biography_length biography_n_sentences biography_n_words \\\n", "0 0 0.0 0.0 \n", "1 0 0.0 0.0 \n", "2 0 0.0 0.0 \n", "3 0 0.0 0.0 \n", "4 0 0.0 0.0 \n", "... ... ... ... \n", "10989644 118 2.0 23.0 \n", "10989645 0 0.0 0.0 \n", "10989646 0 0.0 0.0 \n", "10989647 0 0.0 0.0 \n", "10989648 0 0.0 0.0 \n", "\n", " date_diff ref_year date_stale \n", "0 0.009618 2018 1153.980551 \n", "1 715.078025 2018 406.980815 \n", "2 48.001631 2019 456.736688 \n", "3 1863.042464 2015 217.817512 \n", "4 827.372135 2014 1779.456397 \n", "... ... ... ... 
\n", "10989644 0.051116 2020 139.242812 \n", "10989645 2131.978000 2015 158.560081 \n", "10989646 0.000597 2020 139.226289 \n", "10989647 1433.222830 2016 150.839463 \n", "10989648 1591.542558 2016 152.263413 \n", "\n", "[10989649 rows x 23 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2075872, 23)" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 1].shape" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(8913777, 23)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 0].shape" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "# split into train/test sets\n", "x = df.loc[:, df.columns != 'label']\n", "y = df['label']\n", "train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=100000, test_size=1000000, random_state=2, stratify=y)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(18889, 22)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "train_x[train_y==1].shape" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(188893, 22)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_x[test_y==1].shape" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "OneClassSVM()" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# define outlier detection model\n", "model = OneClassSVM(gamma='scale', nu=0.5)\n", "\n", "# fit on the inlier class only (label == 1); note this is the minority class\n", "# (about 19% of the rows), not the majority class.\n", "# train_x itself is left unfiltered so this cell and the shape check above can be re-run.\n", "inlier_train_x = train_x[train_y == 1]\n", "model.fit(inlier_train_x)" ] },
{ "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "F1 Score: 0.872\n" ] } ], "source": [ "# detect outliers in the test set; select the feature columns explicitly so this\n", "# cell still works after the label/prediction columns are attached to test_x below\n", "y_hat = model.predict(test_x[train_x.columns])\n", "\n", "# mark inliers 1, outliers -1 (label == 1 -> 1, everything else -> -1).\n", "# Built from a boolean mask and re-bound instead of written in place, so re-running is safe.\n", "test_y = (test_y == 1).map({True: 1, False: -1})\n", "\n", "# calculate score (the outlier class, -1, is treated as the positive class)\n", "score = f1_score(test_y, y_hat, pos_label=-1)\n", "print('F1 Score: %.3f' % score)" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "# attach the recoded labels; assign() returns a new frame, which avoids the\n", "# SettingWithCopyWarning raised when writing into the split result in place\n", "test_x = test_x.assign(label=test_y.values)" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# attach the model's predictions (1 = inlier, -1 = outlier) next to the labels\n", "test_x = test_x.assign(prediction=y_hat)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_ids...n_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stalelabelprediction
8422958TrueTrue73006001...2.01.000.00.01149.7601282016498.053074-1.01
30426TrueTrue55000000...1.03.000.00.02446.8154562014112.0646791.0-1
1941223TrueTrue00000000...0.00.000.00.0811.5730402017486.567693-1.01
9232681TrueTrue11000000...0.00.000.00.0888.5201572017359.397781-1.01
7646644FalseFalse00000000...0.00.000.00.00.00003020171443.7318371.0-1
..................................................................
4769520TrueTrue00000000...0.00.000.00.0781.732331201946.115046-1.01
2817268TrueTrue97005001...3.00.000.00.01782.5437472014544.549425-1.01
4840353TrueTrue00000000...0.00.000.00.0659.8645952017634.8460071.0-1
9717615TrueTrue00000001...0.00.000.00.01971.810335201538.063916-1.01
2290714FalseFalse00000000...0.00.000.00.0947.2162242018104.732935-1.01
\n", "

205377 rows × 24 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "8422958 True True 7 3 0 \n", "30426 True True 5 5 0 \n", "1941223 True True 0 0 0 \n", "9232681 True True 1 1 0 \n", "7646644 False False 0 0 0 \n", "... ... ... ... ... ... \n", "4769520 True True 0 0 0 \n", "2817268 True True 9 7 0 \n", "4840353 True True 0 0 0 \n", "9717615 True True 0 0 0 \n", "2290714 False False 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids ... n_valid_education \\\n", "8422958 0 6 0 0 1 ... 2.0 \n", "30426 0 0 0 0 0 ... 1.0 \n", "1941223 0 0 0 0 0 ... 0.0 \n", "9232681 0 0 0 0 0 ... 0.0 \n", "7646644 0 0 0 0 0 ... 0.0 \n", "... ... ... ... ... ... ... ... \n", "4769520 0 0 0 0 0 ... 0.0 \n", "2817268 0 5 0 0 1 ... 3.0 \n", "4840353 0 0 0 0 0 ... 0.0 \n", "9717615 0 0 0 0 1 ... 0.0 \n", "2290714 0 0 0 0 0 ... 0.0 \n", "\n", " n_valid_employment biography_length biography_n_sentences \\\n", "8422958 1.0 0 0.0 \n", "30426 3.0 0 0.0 \n", "1941223 0.0 0 0.0 \n", "9232681 0.0 0 0.0 \n", "7646644 0.0 0 0.0 \n", "... ... ... ... \n", "4769520 0.0 0 0.0 \n", "2817268 0.0 0 0.0 \n", "4840353 0.0 0 0.0 \n", "9717615 0.0 0 0.0 \n", "2290714 0.0 0 0.0 \n", "\n", " biography_n_words date_diff ref_year date_stale label \\\n", "8422958 0.0 1149.760128 2016 498.053074 -1.0 \n", "30426 0.0 2446.815456 2014 112.064679 1.0 \n", "1941223 0.0 811.573040 2017 486.567693 -1.0 \n", "9232681 0.0 888.520157 2017 359.397781 -1.0 \n", "7646644 0.0 0.000030 2017 1443.731837 1.0 \n", "... ... ... ... ... ... \n", "4769520 0.0 781.732331 2019 46.115046 -1.0 \n", "2817268 0.0 1782.543747 2014 544.549425 -1.0 \n", "4840353 0.0 659.864595 2017 634.846007 1.0 \n", "9717615 0.0 1971.810335 2015 38.063916 -1.0 \n", "2290714 0.0 947.216224 2018 104.732935 -1.0 \n", "\n", " prediction \n", "8422958 1 \n", "30426 -1 \n", "1941223 1 \n", "9232681 1 \n", "7646644 -1 \n", "... ... 
\n", "4769520 1 \n", "2817268 1 \n", "4840353 -1 \n", "9717615 1 \n", "2290714 1 \n", "\n", "[205377 rows x 24 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# test rows where the recoded label (1 / -1) differs from the model's prediction\n", "test_x[test_x.label != test_x.prediction]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# read the CSV and use its 'index' column as the row index of fake_df\n", "# NOTE(review): presumably these are row indices of known fake records that match df's index - confirm\n", "fake_df = pd.read_csv('../data/processed/fake_heap_index.csv', index_col='index')" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_ids...n_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stalelabelprediction
\n", "

0 rows × 24 columns

\n", "
" ], "text/plain": [ "Empty DataFrame\n", "Columns: [verified_email, verified_primary_email, n_works, n_doi, n_arxiv, n_pmc, n_other_pids, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment, n_ext_work_source, n_valid_education, n_valid_employment, biography_length, biography_n_sentences, biography_n_words, date_diff, ref_year, date_stale, label, prediction]\n", "Index: []\n", "\n", "[0 rows x 24 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# misclassified test rows (label != prediction) whose index also appears in fake_df's index\n", "test_x[(test_x.label != test_x.prediction) & (test_x.index.isin(fake_df.index))]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_ids...n_valid_educationn_valid_employmentbiography_lengthbiography_n_sentencesbiography_n_wordsdate_diffref_yeardate_stalelabelprediction
1319584TrueTrue00000010...0.00.06374.0113.00.0008522020143.341818-1.0-1
7579770TrueTrue00000010...0.00.04454.085.00.0026942020184.261009-1.0-1
9173011TrueTrue00000010...0.00.04454.085.00.0052712020169.293431-1.0-1
1209389TrueTrue00000010...0.00.04454.085.00.0060712020146.328318-1.0-1
4173344TrueTrue00000010...0.00.04454.085.00.0054052020191.303842-1.0-1
10985986TrueTrue00000010...0.00.04454.085.00.0036622020195.359312-1.0-1
6208696TrueTrue00000010...0.00.04454.085.00.0012872020142.360796-1.0-1
801178TrueTrue00000010...0.00.04454.085.00.0080692020131.828965-1.0-1
4392500TrueTrue00000010...0.00.04454.085.00.0044142020206.150937-1.0-1
7963350TrueTrue00000010...0.00.04454.085.00.0035722020190.312686-1.0-1
5659388TrueTrue00000010...0.00.04454.085.00.0116852020140.250630-1.0-1
2749172TrueTrue00000010...0.00.05404.0103.00.0011722020142.989201-1.0-1
4230883TrueTrue00000010...0.00.04454.085.00.0016812020153.354072-1.0-1
6370669TrueTrue00000010...0.00.04454.085.00.0069842020178.167846-1.0-1
5109458TrueTrue00000010...0.00.04454.085.00.0046332020140.166676-1.0-1
7689620TrueTrue00000010...0.00.04454.085.00.0036712020203.189280-1.0-1
9831120TrueTrue00000010...0.00.04454.085.00.0049962020167.149854-1.0-1
6263478TrueTrue00000010...0.00.05524.0107.00.0051742020140.365511-1.0-1
10581997TrueTrue00000010...0.00.04454.085.00.0205272020129.861984-1.0-1
3243302TrueTrue00000010...0.00.04454.085.00.0033912020140.230023-1.0-1
3659063TrueTrue00000010...0.00.04454.085.00.0106182020131.658167-1.0-1
6099073TrueTrue00000010...0.00.04454.085.00.0257142020142.184268-1.0-1
3953358TrueTrue00000000...0.00.04454.085.00.0051452020185.115723-1.0-1
9724190TrueTrue00000010...0.00.05434.0107.06.9933522020135.124112-1.0-1
1041978TrueTrue00000010...0.00.04454.085.00.0065242020167.270642-1.0-1
880090TrueTrue00000010...0.00.04454.085.00.0015722020143.298327-1.0-1
8492341TrueTrue00000010...0.00.04454.085.00.0061722020123.742413-1.0-1
8952735TrueTrue00000010...0.00.04454.085.00.0047852020188.359673-1.0-1
3274872TrueTrue00000010...0.00.04454.085.00.0174782020126.967926-1.0-1
989919TrueTrue00000010...0.00.05615.0110.00.0026492020146.098129-1.0-1
\n", "

30 rows × 24 columns

\n", "
" ], "text/plain": [ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", "1319584 True True 0 0 0 \n", "7579770 True True 0 0 0 \n", "9173011 True True 0 0 0 \n", "1209389 True True 0 0 0 \n", "4173344 True True 0 0 0 \n", "10985986 True True 0 0 0 \n", "6208696 True True 0 0 0 \n", "801178 True True 0 0 0 \n", "4392500 True True 0 0 0 \n", "7963350 True True 0 0 0 \n", "5659388 True True 0 0 0 \n", "2749172 True True 0 0 0 \n", "4230883 True True 0 0 0 \n", "6370669 True True 0 0 0 \n", "5109458 True True 0 0 0 \n", "7689620 True True 0 0 0 \n", "9831120 True True 0 0 0 \n", "6263478 True True 0 0 0 \n", "10581997 True True 0 0 0 \n", "3243302 True True 0 0 0 \n", "3659063 True True 0 0 0 \n", "6099073 True True 0 0 0 \n", "3953358 True True 0 0 0 \n", "9724190 True True 0 0 0 \n", "1041978 True True 0 0 0 \n", "880090 True True 0 0 0 \n", "8492341 True True 0 0 0 \n", "8952735 True True 0 0 0 \n", "3274872 True True 0 0 0 \n", "989919 True True 0 0 0 \n", "\n", " n_pmc n_other_pids n_emails n_urls n_ids ... \\\n", "1319584 0 0 0 1 0 ... \n", "7579770 0 0 0 1 0 ... \n", "9173011 0 0 0 1 0 ... \n", "1209389 0 0 0 1 0 ... \n", "4173344 0 0 0 1 0 ... \n", "10985986 0 0 0 1 0 ... \n", "6208696 0 0 0 1 0 ... \n", "801178 0 0 0 1 0 ... \n", "4392500 0 0 0 1 0 ... \n", "7963350 0 0 0 1 0 ... \n", "5659388 0 0 0 1 0 ... \n", "2749172 0 0 0 1 0 ... \n", "4230883 0 0 0 1 0 ... \n", "6370669 0 0 0 1 0 ... \n", "5109458 0 0 0 1 0 ... \n", "7689620 0 0 0 1 0 ... \n", "9831120 0 0 0 1 0 ... \n", "6263478 0 0 0 1 0 ... \n", "10581997 0 0 0 1 0 ... \n", "3243302 0 0 0 1 0 ... \n", "3659063 0 0 0 1 0 ... \n", "6099073 0 0 0 1 0 ... \n", "3953358 0 0 0 0 0 ... \n", "9724190 0 0 0 1 0 ... \n", "1041978 0 0 0 1 0 ... \n", "880090 0 0 0 1 0 ... \n", "8492341 0 0 0 1 0 ... \n", "8952735 0 0 0 1 0 ... \n", "3274872 0 0 0 1 0 ... \n", "989919 0 0 0 1 0 ... 
\n", "\n", " n_valid_education n_valid_employment biography_length \\\n", "1319584 0.0 0.0 637 \n", "7579770 0.0 0.0 445 \n", "9173011 0.0 0.0 445 \n", "1209389 0.0 0.0 445 \n", "4173344 0.0 0.0 445 \n", "10985986 0.0 0.0 445 \n", "6208696 0.0 0.0 445 \n", "801178 0.0 0.0 445 \n", "4392500 0.0 0.0 445 \n", "7963350 0.0 0.0 445 \n", "5659388 0.0 0.0 445 \n", "2749172 0.0 0.0 540 \n", "4230883 0.0 0.0 445 \n", "6370669 0.0 0.0 445 \n", "5109458 0.0 0.0 445 \n", "7689620 0.0 0.0 445 \n", "9831120 0.0 0.0 445 \n", "6263478 0.0 0.0 552 \n", "10581997 0.0 0.0 445 \n", "3243302 0.0 0.0 445 \n", "3659063 0.0 0.0 445 \n", "6099073 0.0 0.0 445 \n", "3953358 0.0 0.0 445 \n", "9724190 0.0 0.0 543 \n", "1041978 0.0 0.0 445 \n", "880090 0.0 0.0 445 \n", "8492341 0.0 0.0 445 \n", "8952735 0.0 0.0 445 \n", "3274872 0.0 0.0 445 \n", "989919 0.0 0.0 561 \n", "\n", " biography_n_sentences biography_n_words date_diff ref_year \\\n", "1319584 4.0 113.0 0.000852 2020 \n", "7579770 4.0 85.0 0.002694 2020 \n", "9173011 4.0 85.0 0.005271 2020 \n", "1209389 4.0 85.0 0.006071 2020 \n", "4173344 4.0 85.0 0.005405 2020 \n", "10985986 4.0 85.0 0.003662 2020 \n", "6208696 4.0 85.0 0.001287 2020 \n", "801178 4.0 85.0 0.008069 2020 \n", "4392500 4.0 85.0 0.004414 2020 \n", "7963350 4.0 85.0 0.003572 2020 \n", "5659388 4.0 85.0 0.011685 2020 \n", "2749172 4.0 103.0 0.001172 2020 \n", "4230883 4.0 85.0 0.001681 2020 \n", "6370669 4.0 85.0 0.006984 2020 \n", "5109458 4.0 85.0 0.004633 2020 \n", "7689620 4.0 85.0 0.003671 2020 \n", "9831120 4.0 85.0 0.004996 2020 \n", "6263478 4.0 107.0 0.005174 2020 \n", "10581997 4.0 85.0 0.020527 2020 \n", "3243302 4.0 85.0 0.003391 2020 \n", "3659063 4.0 85.0 0.010618 2020 \n", "6099073 4.0 85.0 0.025714 2020 \n", "3953358 4.0 85.0 0.005145 2020 \n", "9724190 4.0 107.0 6.993352 2020 \n", "1041978 4.0 85.0 0.006524 2020 \n", "880090 4.0 85.0 0.001572 2020 \n", "8492341 4.0 85.0 0.006172 2020 \n", "8952735 4.0 85.0 0.004785 2020 \n", "3274872 4.0 85.0 0.017478 2020 
\n", "989919 5.0 110.0 0.002649 2020 \n", "\n", " date_stale label prediction \n", "1319584 143.341818 -1.0 -1 \n", "7579770 184.261009 -1.0 -1 \n", "9173011 169.293431 -1.0 -1 \n", "1209389 146.328318 -1.0 -1 \n", "4173344 191.303842 -1.0 -1 \n", "10985986 195.359312 -1.0 -1 \n", "6208696 142.360796 -1.0 -1 \n", "801178 131.828965 -1.0 -1 \n", "4392500 206.150937 -1.0 -1 \n", "7963350 190.312686 -1.0 -1 \n", "5659388 140.250630 -1.0 -1 \n", "2749172 142.989201 -1.0 -1 \n", "4230883 153.354072 -1.0 -1 \n", "6370669 178.167846 -1.0 -1 \n", "5109458 140.166676 -1.0 -1 \n", "7689620 203.189280 -1.0 -1 \n", "9831120 167.149854 -1.0 -1 \n", "6263478 140.365511 -1.0 -1 \n", "10581997 129.861984 -1.0 -1 \n", "3243302 140.230023 -1.0 -1 \n", "3659063 131.658167 -1.0 -1 \n", "6099073 142.184268 -1.0 -1 \n", "3953358 185.115723 -1.0 -1 \n", "9724190 135.124112 -1.0 -1 \n", "1041978 167.270642 -1.0 -1 \n", "880090 143.298327 -1.0 -1 \n", "8492341 123.742413 -1.0 -1 \n", "8952735 188.359673 -1.0 -1 \n", "3274872 126.967926 -1.0 -1 \n", "989919 146.098129 -1.0 -1 \n", "\n", "[30 rows x 24 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# every test row whose index appears in fake_df's index, shown with its label and prediction\n", "test_x[(test_x.index.isin(fake_df.index))]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }