fake-orcid-analysis/notebooks/03-Machine Learning.ipynb

1029 lines
33 KiB
Plaintext
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"from urllib.parse import urlparse\n",
"import tldextract\n",
"\n",
"import pandas as pd\n",
"from sklearn.preprocessing import MultiLabelBinarizer\n",
"from sklearn.svm import OneClassSVM \n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.metrics import f1_score\n"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Read the pickled feature table from the processed-data folder.\n",
"# NOTE: unpickling can execute arbitrary code, so only load files you produced or trust.\n",
"features_path = '../data/processed/features.pkl'\n",
"df = pd.read_pickle(features_path)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>n_keywords</th>\n",
" <th>n_employment</th>\n",
" <th>n_education</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 0 0 0 0 0 \n",
"1 1 1 0 0 0 \n",
"2 1 1 0 0 0 \n",
"3 1 1 0 0 0 \n",
"4 1 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 1 1 0 0 0 \n",
"10989645 1 1 7 7 0 \n",
"10989646 1 1 0 0 0 \n",
"10989647 1 1 0 0 0 \n",
"10989648 1 1 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
"0 0 0 NaN NaN NaN NaN \n",
"1 0 0 NaN NaN NaN NaN \n",
"2 0 0 NaN NaN NaN NaN \n",
"3 0 0 NaN NaN NaN NaN \n",
"4 0 0 NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... \n",
"10989644 0 0 NaN NaN NaN NaN \n",
"10989645 1 0 NaN NaN NaN NaN \n",
"10989646 0 0 NaN NaN NaN NaN \n",
"10989647 0 0 NaN NaN NaN NaN \n",
"10989648 0 0 NaN NaN NaN NaN \n",
"\n",
" n_employment n_education label \n",
"0 NaN NaN 0 \n",
"1 1.0 NaN 0 \n",
"2 NaN NaN 0 \n",
"3 1.0 NaN 0 \n",
"4 2.0 NaN 0 \n",
"... ... ... ... \n",
"10989644 1.0 2.0 0 \n",
"10989645 2.0 2.0 1 \n",
"10989646 NaN NaN 0 \n",
"10989647 1.0 2.0 0 \n",
"10989648 NaN NaN 0 \n",
"\n",
"[10989649 rows x 14 columns]"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the freshly loaded frame; several of the count columns still hold NaN here.\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Replace every NaN with 0.\n",
"# Rebinding the name (instead of inplace=True) keeps this cell safe to re-run and\n",
"# avoids silently mutating the object that the previous cell displayed.\n",
"df = df.fillna(0)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>n_keywords</th>\n",
" <th>n_employment</th>\n",
" <th>n_education</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>7</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>2.0</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 14 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"0 0 0 0 0 0 \n",
"1 1 1 0 0 0 \n",
"2 1 1 0 0 0 \n",
"3 1 1 0 0 0 \n",
"4 1 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"10989644 1 1 0 0 0 \n",
"10989645 1 1 7 7 0 \n",
"10989646 1 1 0 0 0 \n",
"10989647 1 1 0 0 0 \n",
"10989648 1 1 0 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
"0 0 0 0.0 0.0 0.0 0.0 \n",
"1 0 0 0.0 0.0 0.0 0.0 \n",
"2 0 0 0.0 0.0 0.0 0.0 \n",
"3 0 0 0.0 0.0 0.0 0.0 \n",
"4 0 0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... \n",
"10989644 0 0 0.0 0.0 0.0 0.0 \n",
"10989645 1 0 0.0 0.0 0.0 0.0 \n",
"10989646 0 0 0.0 0.0 0.0 0.0 \n",
"10989647 0 0 0.0 0.0 0.0 0.0 \n",
"10989648 0 0 0.0 0.0 0.0 0.0 \n",
"\n",
" n_employment n_education label \n",
"0 0.0 0.0 0 \n",
"1 1.0 0.0 0 \n",
"2 0.0 0.0 0 \n",
"3 1.0 0.0 0 \n",
"4 2.0 0.0 0 \n",
"... ... ... ... \n",
"10989644 1.0 2.0 0 \n",
"10989645 2.0 2.0 1 \n",
"10989646 0.0 0.0 0 \n",
"10989647 1.0 2.0 0 \n",
"10989648 0.0 0.0 0 \n",
"\n",
"[10989649 rows x 14 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Re-display the frame to confirm the NaN entries were replaced by 0.\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"verified_email 2664886\n",
"verified_primary_email 2664886\n",
"n_works 2664886\n",
"n_doi 2664886\n",
"n_arxiv 2664886\n",
"n_pmc 2664886\n",
"n_other_pids 2664886\n",
"n_emails 2664886\n",
"n_urls 2664886\n",
"n_ids 2664886\n",
"n_keywords 2664886\n",
"n_employment 2664886\n",
"n_education 2664886\n",
"label 2664886\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 1].count()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"verified_email 8324763\n",
"verified_primary_email 8324763\n",
"n_works 8324763\n",
"n_doi 8324763\n",
"n_arxiv 8324763\n",
"n_pmc 8324763\n",
"n_other_pids 8324763\n",
"n_emails 8324763\n",
"n_urls 8324763\n",
"n_ids 8324763\n",
"n_keywords 8324763\n",
"n_employment 8324763\n",
"n_education 8324763\n",
"label 8324763\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.label == 0].count()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Feature matrix: the contiguous column range verified_email .. n_education.\n",
"X = df.loc[:, 'verified_email':'n_education']\n",
"# Target vector: the label column.\n",
"y = df.loc[:, 'label']\n",
"\n",
"# Stratified 50/50 train/test split; the fixed seed makes the split repeatable.\n",
"trainX, testX, trainy, testy = train_test_split(\n",
"    X,\n",
"    y,\n",
"    train_size=0.5,\n",
"    random_state=2,\n",
"    stratify=y,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>n_works</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>n_emails</th>\n",
" <th>n_urls</th>\n",
" <th>n_ids</th>\n",
" <th>n_keywords</th>\n",
" <th>n_employment</th>\n",
" <th>n_education</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>6325067</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6140551</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3258315</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>2.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10948983</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10089158</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2398808</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3622839</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1389679</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6594722</th>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4589084</th>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>11</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5494824 rows × 13 columns</p>\n",
"</div>"
],
"text/plain": [
" verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
"6325067 1 1 0 0 0 \n",
"6140551 1 1 0 0 0 \n",
"3258315 1 1 0 0 0 \n",
"10948983 1 1 0 0 0 \n",
"10089158 1 1 0 0 0 \n",
"... ... ... ... ... ... \n",
"2398808 0 0 1 1 0 \n",
"3622839 1 1 0 0 0 \n",
"1389679 1 1 0 0 0 \n",
"6594722 0 0 0 0 0 \n",
"4589084 1 1 11 0 0 \n",
"\n",
" n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
"6325067 0 0 0.0 0.0 1.0 0.0 \n",
"6140551 0 0 0.0 0.0 0.0 0.0 \n",
"3258315 0 0 0.0 1.0 0.0 2.0 \n",
"10948983 0 0 0.0 0.0 0.0 0.0 \n",
"10089158 0 0 0.0 0.0 0.0 0.0 \n",
"... ... ... ... ... ... ... \n",
"2398808 0 0 0.0 0.0 0.0 0.0 \n",
"3622839 0 0 0.0 0.0 0.0 0.0 \n",
"1389679 0 0 0.0 0.0 0.0 0.0 \n",
"6594722 0 0 0.0 0.0 0.0 0.0 \n",
"4589084 0 0 0.0 0.0 0.0 0.0 \n",
"\n",
" n_employment n_education \n",
"6325067 1.0 1.0 \n",
"6140551 0.0 0.0 \n",
"3258315 0.0 0.0 \n",
"10948983 1.0 0.0 \n",
"10089158 0.0 0.0 \n",
"... ... ... \n",
"2398808 0.0 0.0 \n",
"3622839 1.0 1.0 \n",
"1389679 0.0 0.0 \n",
"6594722 0.0 0.0 \n",
"4589084 1.0 0.0 \n",
"\n",
"[5494824 rows x 13 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect the training half of the feature matrix (the label column is not part of it).\n",
"trainX"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One-class SVM used as an outlier detector: it is trained on a single class and\n",
"# its predict() returns +1 for inliers and -1 for outliers.\n",
"model = OneClassSVM(gamma='scale', nu=0.5)\n",
"\n",
"# Fit only on the training rows labelled 1. The class counts shown earlier in this\n",
"# notebook put label 1 at 2,664,886 rows versus 8,324,763 for label 0, so this is\n",
"# the smaller class, not the majority one.\n",
"# A separate name is used so that trainX keeps the full training split and the\n",
"# cell can be re-run without filtering an already-filtered frame.\n",
"# NOTE(review): confirm that label 1 is really the class meant to act as 'inliers'.\n",
"# NOTE(review): a kernel SVM on this many rows may be very slow; consider fitting\n",
"# on a random sample first.\n",
"trainX_inliers = trainX.loc[trainy == 1]\n",
"model.fit(trainX_inliers)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Predict on the held-out half: +1 means inlier, -1 means outlier.\n",
"yhat = model.predict(testX)\n",
"\n",
"# Put the ground truth on the same scale: label 0 -> -1 (outlier), label 1 stays 1.\n",
"# A new Series is built instead of assigning into testy, so the split output is\n",
"# left untouched and the cell gives the same result every time it is run.\n",
"testy_signed = testy.replace({0: -1})\n",
"\n",
"# F1 is computed with the outlier class (-1) as the positive label.\n",
"score = f1_score(testy_signed, yhat, pos_label=-1)\n",
"print('F1 Score: %.3f' % score)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}