diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb
index 4a624ae..51d6a70 100644
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@@ -764,7 +764,7 @@
}
],
"source": [
- "df.count() #10916574"
+ "df.count()"
]
},
{
@@ -16260,6 +16260,28 @@
"fig.show()"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df[['verified_email', \n",
+ " 'verified_primary_email', \n",
+ " 'n_works', \n",
+ " 'n_doi',\n",
+ " 'n_arxiv', \n",
+ " 'n_pmc', \n",
+ " 'n_other_pids', \n",
+ " 'n_emails', \n",
+ " 'n_urls', \n",
+ " 'n_ids', \n",
+ " 'n_keywords', \n",
+ " 'n_employment', \n",
+ " 'n_education', \n",
+ " 'label']].to_pickle('../data/processed/features.pkl')"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
@@ -16884,12 +16906,21 @@
"# (df.n_works > 0) & (df.n_ids > 1)"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Serialise "
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ " "
+ ]
}
],
"metadata": {
diff --git a/notebooks/03-Feature extraction.ipynb b/notebooks/03-Feature extraction.ipynb
deleted file mode 100644
index e25ef16..0000000
--- a/notebooks/03-Feature extraction.ipynb
+++ /dev/null
@@ -1,2422 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Todo in data\n",
- "- Column names -> no space\n",
- "- If a list is empty, serialise [] in the csv\n",
- "- If a string is empty, serialise '' in the csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {},
- "outputs": [],
- "source": [
- "import ast\n",
- "from urllib.parse import urlparse\n",
- "import tldextract\n",
- "\n",
- "import pandas as pd\n",
- "from sklearn.preprocessing import MultiLabelBinarizer\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "mlb = MultiLabelBinarizer()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Notable Solid ORCID iDs for debug purposes\n",
- "AM = '0000-0002-5193-7851'\n",
- "PP = '0000-0002-8588-4196'\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {},
- "outputs": [],
- "source": [
- "# Notable fake ORCID iDs for debug purposes\n",
- "SCAFFOLD = '0000-0001-5004-7761'\n",
- "WHATSAPP = '0000-0001-6997-9470'\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header = 0,\n",
- " dtype = {\"orcid\": pd.StringDtype(), \n",
- " \"claimed\": bool, \n",
- " \"verifyed email\": bool, \n",
- " \"verified primary email\": bool,\n",
- " \"given names\": pd.StringDtype(),\n",
- " \"family name\": pd.StringDtype(),\n",
- " \"biography\": pd.StringDtype(),\n",
- " \"other names\": pd.StringDtype(),\n",
- " \"researcher urls\": pd.StringDtype(),\n",
- " \"primary email\": pd.StringDtype(),\n",
- " \"other emails\": pd.StringDtype(),\n",
- " \"keywords\": pd.StringDtype(),\n",
- " \"eternal identifiers\": pd.StringDtype(),\n",
- " \"education\": pd.StringDtype(),\n",
- " \"employments\": pd.StringDtype(),\n",
- " \"number of works\": pd.Int16Dtype(),\n",
- " \"works source\": pd.StringDtype()})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "
\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " works source | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0000-0001-5000-2053 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Jorge | \n",
- " Jaramillo Sanchez | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0000-0001-5000-6548 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Wiseman | \n",
- " Bekelesi | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0000-0001-5000-7962 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " ALICE | \n",
- " INDIMULI | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0000-0001-5000-8586 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " shim | \n",
- " ji yun | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0000-0001-5001-0256 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Sandro | \n",
- " Caramaschi | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "0 0000-0001-5000-2053 True False False \n",
- "1 0000-0001-5000-6548 True False False \n",
- "2 0000-0001-5000-7962 True True True \n",
- "3 0000-0001-5000-8586 True False False \n",
- "4 0000-0001-5001-0256 True False False \n",
- "\n",
- " given names family name biography other names researcher urls \\\n",
- "0 Jorge Jaramillo Sanchez \n",
- "1 Wiseman Bekelesi \n",
- "2 ALICE INDIMULI \n",
- "3 shim ji yun \n",
- "4 Sandro Caramaschi \n",
- "\n",
- " primary email other emails keywords external identifiers education \\\n",
- "0 NaN \n",
- "1 NaN \n",
- "2 NaN \n",
- "3 NaN \n",
- "4 NaN \n",
- "\n",
- " employments number of works works source \n",
- "0 0 \n",
- "1 0 \n",
- "2 0 \n",
- "3 0 \n",
- "4 0 "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.head(5)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " works source | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 8840413 | \n",
- " 0000-0002-5193-7851 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Andrea | \n",
- " Mannocci | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [[\"Personal website\", \"https://andremann.githu... | \n",
- " andrea.mannocci@isti.cnr.it | \n",
- " <NA> | \n",
- " [\"Data science \", \"science of science\", \"schol... | \n",
- " [[\"Scopus Author ID\", \"55233589900\"]] | \n",
- " [[\"Information engineering\", \"Ph.D.\", \"Univers... | \n",
- " [[\"Research Associate\", \"Istituto di Scienza e... | \n",
- " 37 | \n",
- " [\"Scopus - Elsevier\", \"Crossref Metadata Searc... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "8840413 0000-0002-5193-7851 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "8840413 Andrea Mannocci \n",
- "\n",
- " researcher urls \\\n",
- "8840413 [[\"Personal website\", \"https://andremann.githu... \n",
- "\n",
- " primary email other emails \\\n",
- "8840413 andrea.mannocci@isti.cnr.it \n",
- "\n",
- " keywords \\\n",
- "8840413 [\"Data science \", \"science of science\", \"schol... \n",
- "\n",
- " external identifiers \\\n",
- "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
- "\n",
- " education \\\n",
- "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n",
- "\n",
- " employments number of works \\\n",
- "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
- "\n",
- " works source \n",
- "8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... "
- ]
- },
- "execution_count": 7,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Extracting works source"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "def extract_work_source(lst):\n",
- " extracted = []\n",
- " for s in lst:\n",
- " if 'Scopus - Elsevier' in s or 'Crossref' in s:\n",
- " extracted.append(s)\n",
- " return extracted"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " Crossref | \n",
- " Crossref Metadata Search | \n",
- " Scopus - Elsevier | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 8840413 | \n",
- " 0000-0002-5193-7851 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Andrea | \n",
- " Mannocci | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [[\"Personal website\", \"https://andremann.githu... | \n",
- " andrea.mannocci@isti.cnr.it | \n",
- " <NA> | \n",
- " [\"Data science \", \"science of science\", \"schol... | \n",
- " [[\"Scopus Author ID\", \"55233589900\"]] | \n",
- " [[\"Information engineering\", \"Ph.D.\", \"Univers... | \n",
- " [[\"Research Associate\", \"Istituto di Scienza e... | \n",
- " 37 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "8840413 0000-0002-5193-7851 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "8840413 Andrea Mannocci \n",
- "\n",
- " researcher urls \\\n",
- "8840413 [[\"Personal website\", \"https://andremann.githu... \n",
- "\n",
- " primary email other emails \\\n",
- "8840413 andrea.mannocci@isti.cnr.it \n",
- "\n",
- " keywords \\\n",
- "8840413 [\"Data science \", \"science of science\", \"schol... \n",
- "\n",
- " external identifiers \\\n",
- "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
- "\n",
- " education \\\n",
- "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n",
- "\n",
- " employments number of works \\\n",
- "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
- "\n",
- " Crossref Crossref Metadata Search Scopus - Elsevier \n",
- "8840413 1 1 1 "
- ]
- },
- "execution_count": 13,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Education"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['n_education'] = df['education'].str.len()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.drop('education', axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 17,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " employments | \n",
- " number of works | \n",
- " Crossref | \n",
- " Crossref Metadata Search | \n",
- " Scopus - Elsevier | \n",
- " n_education | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 8840413 | \n",
- " 0000-0002-5193-7851 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Andrea | \n",
- " Mannocci | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [[\"Personal website\", \"https://andremann.githu... | \n",
- " andrea.mannocci@isti.cnr.it | \n",
- " <NA> | \n",
- " [\"Data science \", \"science of science\", \"schol... | \n",
- " [[\"Scopus Author ID\", \"55233589900\"]] | \n",
- " [[\"Research Associate\", \"Istituto di Scienza e... | \n",
- " 37 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "8840413 0000-0002-5193-7851 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "8840413 Andrea Mannocci \n",
- "\n",
- " researcher urls \\\n",
- "8840413 [[\"Personal website\", \"https://andremann.githu... \n",
- "\n",
- " primary email other emails \\\n",
- "8840413 andrea.mannocci@isti.cnr.it \n",
- "\n",
- " keywords \\\n",
- "8840413 [\"Data science \", \"science of science\", \"schol... \n",
- "\n",
- " external identifiers \\\n",
- "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n",
- "\n",
- " employments number of works \\\n",
- "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n",
- "\n",
- " Crossref Crossref Metadata Search Scopus - Elsevier n_education \n",
- "8840413 1 1 1 4 "
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Employment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['n_employments'] = df['employments'].str.len()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.drop('employments', axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " number of works | \n",
- " Crossref | \n",
- " Crossref Metadata Search | \n",
- " Scopus - Elsevier | \n",
- " n_education | \n",
- " n_employments | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 8840413 | \n",
- " 0000-0002-5193-7851 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Andrea | \n",
- " Mannocci | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [[\"Personal website\", \"https://andremann.githu... | \n",
- " andrea.mannocci@isti.cnr.it | \n",
- " <NA> | \n",
- " [\"Data science \", \"science of science\", \"schol... | \n",
- " [[\"Scopus Author ID\", \"55233589900\"]] | \n",
- " 37 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 5 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "8840413 0000-0002-5193-7851 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "8840413 Andrea Mannocci \n",
- "\n",
- " researcher urls \\\n",
- "8840413 [[\"Personal website\", \"https://andremann.githu... \n",
- "\n",
- " primary email other emails \\\n",
- "8840413 andrea.mannocci@isti.cnr.it \n",
- "\n",
- " keywords \\\n",
- "8840413 [\"Data science \", \"science of science\", \"schol... \n",
- "\n",
- " external identifiers number of works Crossref \\\n",
- "8840413 [[\"Scopus Author ID\", \"55233589900\"]] 37 1 \n",
- "\n",
- " Crossref Metadata Search Scopus - Elsevier n_education \\\n",
- "8840413 1 1 4 \n",
- "\n",
- " n_employments \n",
- "8840413 5 "
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# External IDs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [],
- "source": [
- "# def extract_ids(lst):\n",
- "# extracted = []\n",
- "# for id in lst:\n",
- "# extracted.append(id[0])\n",
- "# return extracted"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [],
- "source": [
- "# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "metadata": {},
- "outputs": [],
- "source": [
- "# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['n_ext_ids'] = df['external identifiers'].str.len()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.drop(['external identifiers'], axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " number of works | \n",
- " Crossref | \n",
- " Crossref Metadata Search | \n",
- " Scopus - Elsevier | \n",
- " n_education | \n",
- " n_employments | \n",
- " n_ext_ids | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 8840413 | \n",
- " 0000-0002-5193-7851 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Andrea | \n",
- " Mannocci | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [[\"Personal website\", \"https://andremann.githu... | \n",
- " andrea.mannocci@isti.cnr.it | \n",
- " <NA> | \n",
- " [\"Data science \", \"science of science\", \"schol... | \n",
- " 37 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 5 | \n",
- " 1 | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "8840413 0000-0002-5193-7851 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "8840413 Andrea Mannocci \n",
- "\n",
- " researcher urls \\\n",
- "8840413 [[\"Personal website\", \"https://andremann.githu... \n",
- "\n",
- " primary email other emails \\\n",
- "8840413 andrea.mannocci@isti.cnr.it \n",
- "\n",
- " keywords number of works \\\n",
- "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n",
- "\n",
- " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n",
- "8840413 1 1 1 4 \n",
- "\n",
- " n_employments n_ext_ids \n",
- "8840413 5 1 "
- ]
- },
- "execution_count": 28,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Extracting email domains"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['primary email'] = df['primary email'].fillna('')\n",
- "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 30,
- "metadata": {},
- "outputs": [],
- "source": [
- "def extract_email_domains(row):\n",
- " domains = []\n",
- " if len(row['primary email']) > 0:\n",
- " domains.append(row['primary email'].split('@')[1])\n",
- " for email in row['other emails']:\n",
- " domains.append(email.split('@')[1])\n",
- " return domains"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 31,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 32,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "34 [seh.ox.ac.uk, bsg.ox.ac.uk]\n",
- "47 [foxmail.com]\n",
- "103 [fvtm.bu.edu.eg]\n",
- "297 [unipa.it]\n",
- "299 [nhs.net]\n",
- " ... \n",
- "10746811 [gva.es, gmail.com]\n",
- "10746850 [cinvestav.mx]\n",
- "10746920 [gmail.com, hotmail.com]\n",
- "10746975 [mail.ru]\n",
- "10746988 [ucm.es]\n",
- "Name: email_domains, Length: 141118, dtype: object"
- ]
- },
- "execution_count": 32,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['email_domains'].str.len() != 0]['email_domains']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Extracting URL domains"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 35,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 36,
- "metadata": {},
- "outputs": [],
- "source": [
- "def extract_url_domains(lst):\n",
- " domains = []\n",
- " for e in lst:\n",
- " # e[0] is a string describing the url\n",
- " # e[1] is the url\n",
- " ext = tldextract.extract(e[1])\n",
- " domains.append(ext.registered_domain)\n",
- " return domains"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 37,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 38,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "5 [researchgate.net]\n",
- "14 [tigerscaffolds.co.nz]\n",
- "15 [corticalbrain.com]\n",
- "29 [cnpq.br]\n",
- "30 [sksahu.net]\n",
- " ... \n",
- "10746945 [telegra.ph]\n",
- "10746950 [twitter.com, urbanfoodpolicy.com]\n",
- "10746955 [openlearning.com]\n",
- "10746984 [panaximco.vn]\n",
- "10746987 [swansea.ac.uk]\n",
- "Name: url_domains, Length: 688572, dtype: object"
- ]
- },
- "execution_count": 38,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['url_domains'].str.len() != 0]['url_domains']"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": [
- "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 39,
- "metadata": {},
- "outputs": [],
- "source": [
- "df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 40,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " keywords | \n",
- " number of works | \n",
- " Crossref | \n",
- " Crossref Metadata Search | \n",
- " Scopus - Elsevier | \n",
- " n_education | \n",
- " n_employments | \n",
- " n_ext_ids | \n",
- " email_domains | \n",
- " url_domains | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 8840413 | \n",
- " 0000-0002-5193-7851 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Andrea | \n",
- " Mannocci | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [\"Data science \", \"science of science\", \"schol... | \n",
- " 37 | \n",
- " 1 | \n",
- " 1 | \n",
- " 1 | \n",
- " 4 | \n",
- " 5 | \n",
- " 1 | \n",
- " [isti.cnr.it] | \n",
- " [github.io, twitter.com, linkedin.com] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "8840413 0000-0002-5193-7851 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "8840413 Andrea Mannocci \n",
- "\n",
- " keywords number of works \\\n",
- "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n",
- "\n",
- " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n",
- "8840413 1 1 1 4 \n",
- "\n",
- " n_employments n_ext_ids email_domains \\\n",
- "8840413 5 1 [isti.cnr.it] \n",
- "\n",
- " url_domains \n",
- "8840413 [github.io, twitter.com, linkedin.com] "
- ]
- },
- "execution_count": 40,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == AM]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Fixing keywords"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 18,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 19,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " works source | \n",
- " email_domains | \n",
- " url_domains | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 9601705 | \n",
- " 0000-0002-8588-4196 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Pedro | \n",
- " Príncipe | \n",
- " Pedro Príncipe is an information, documentatio... | \n",
- " [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [open access, open science, libraries, reposit... | \n",
- " [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] | \n",
- " <NA> | \n",
- " [[\"Librarian / Project manager\", \"Universidade... | \n",
- " 5 | \n",
- " [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "9601705 0000-0002-8588-4196 True True True \n",
- "\n",
- " given names family name \\\n",
- "9601705 Pedro Príncipe \n",
- "\n",
- " biography \\\n",
- "9601705 Pedro Príncipe is an information, documentatio... \n",
- "\n",
- " other names researcher urls \\\n",
- "9601705 [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] [] \n",
- "\n",
- " primary email other emails \\\n",
- "9601705 [] \n",
- "\n",
- " keywords \\\n",
- "9601705 [open access, open science, libraries, reposit... \n",
- "\n",
- " external identifiers education \\\n",
- "9601705 [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] \n",
- "\n",
- " employments number of works \\\n",
- "9601705 [[\"Librarian / Project manager\", \"Universidade... 5 \n",
- "\n",
- " works source email_domains \\\n",
- "9601705 [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... [] \n",
- "\n",
- " url_domains \n",
- "9601705 [] "
- ]
- },
- "execution_count": 19,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == PP]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 20,
- "metadata": {},
- "outputs": [],
- "source": [
- "def fix_keywords(lst):\n",
- " fixed = []\n",
- " for k in lst:\n",
- " split = k.split(',')\n",
- " fixed.extend(split)\n",
- " return fixed"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "['open access',\n",
- " ' open science',\n",
- " ' libraries',\n",
- " ' repositories',\n",
- " ' social web',\n",
- " '']"
- ]
- },
- "execution_count": 21,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "test = ['open access, open science, libraries, repositories, social web,']\n",
- "fix_keywords(test)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 22,
- "metadata": {},
- "outputs": [],
- "source": [
- "df['fixed_keywords'] = df['keywords'].apply(lambda lst: fix_keywords(lst))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " works source | \n",
- " email_domains | \n",
- " url_domains | \n",
- " fixed_keywords | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 9517099 | \n",
- " 0000-0001-6997-9470 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " other | \n",
- " whatsapp | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [[Otherwhatsapp, https://otherwhatsapp.com/], ... | \n",
- " | \n",
- " [] | \n",
- " [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [otherwhatsapp.com, im-creator.com, facebook.c... | \n",
- " [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email verified primary email \\\n",
- "9517099 0000-0001-6997-9470 True True True \n",
- "\n",
- " given names family name biography other names \\\n",
- "9517099 other whatsapp \n",
- "\n",
- " researcher urls primary email \\\n",
- "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... \n",
- "\n",
- " other emails keywords \\\n",
- "9517099 [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n",
- "\n",
- " external identifiers education employments number of works \\\n",
- "9517099 NaN 0 \n",
- "\n",
- " works source email_domains \\\n",
- "9517099 [] \n",
- "\n",
- " url_domains \\\n",
- "9517099 [otherwhatsapp.com, im-creator.com, facebook.c... \n",
- "\n",
- " fixed_keywords \n",
- "9517099 [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... "
- ]
- },
- "execution_count": 23,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df['orcid'] == WHATSAPP]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 24,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " works source | \n",
- " email_domains | \n",
- " url_domains | \n",
- " fixed_keywords | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0000-0001-5000-2053 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Jorge | \n",
- " Jaramillo Sanchez | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0000-0001-5000-6548 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Wiseman | \n",
- " Bekelesi | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0000-0001-5000-7962 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " ALICE | \n",
- " INDIMULI | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0000-0001-5000-8586 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " shim | \n",
- " ji yun | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0000-0001-5001-0256 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Sandro | \n",
- " Caramaschi | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 10747035 | \n",
- " 0000-0003-4998-1551 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Animesh | \n",
- " Ghosh | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747036 | \n",
- " 0000-0003-4998-4111 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Hawa | \n",
- " Liberna | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747037 | \n",
- " 0000-0003-4998-6045 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Tongyi | \n",
- " Men | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747038 | \n",
- " 0000-0003-4998-8868 | \n",
- " True | \n",
- " True | \n",
- " False | \n",
- " Charldon | \n",
- " Wilken | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747039 | \n",
- " 0000-0003-4999-7916 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Tapas Bapu | \n",
- " B.R. | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 0 | \n",
- " <NA> | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- "
\n",
- "
10747040 rows × 19 columns
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email \\\n",
- "0 0000-0001-5000-2053 True False \n",
- "1 0000-0001-5000-6548 True False \n",
- "2 0000-0001-5000-7962 True True \n",
- "3 0000-0001-5000-8586 True False \n",
- "4 0000-0001-5001-0256 True False \n",
- "... ... ... ... \n",
- "10747035 0000-0003-4998-1551 True False \n",
- "10747036 0000-0003-4998-4111 True False \n",
- "10747037 0000-0003-4998-6045 True False \n",
- "10747038 0000-0003-4998-8868 True True \n",
- "10747039 0000-0003-4999-7916 True True \n",
- "\n",
- " verified primary email given names family name biography \\\n",
- "0 False Jorge Jaramillo Sanchez \n",
- "1 False Wiseman Bekelesi \n",
- "2 True ALICE INDIMULI \n",
- "3 False shim ji yun \n",
- "4 False Sandro Caramaschi \n",
- "... ... ... ... ... \n",
- "10747035 False Animesh Ghosh \n",
- "10747036 False Hawa Liberna \n",
- "10747037 False Tongyi Men \n",
- "10747038 False Charldon Wilken \n",
- "10747039 True Tapas Bapu B.R. \n",
- "\n",
- " other names researcher urls primary email other emails \\\n",
- "0 [] [] \n",
- "1 [] [] \n",
- "2 [] [] \n",
- "3 [] [] \n",
- "4 [] [] \n",
- "... ... ... ... ... \n",
- "10747035 [] [] \n",
- "10747036 [] [] \n",
- "10747037 [] [] \n",
- "10747038 [] [] \n",
- "10747039 [] [] \n",
- "\n",
- " external identifiers education employments number of works \\\n",
- "0 NaN 0 \n",
- "1 NaN 0 \n",
- "2 NaN 0 \n",
- "3 NaN 0 \n",
- "4 NaN 0 \n",
- "... ... ... ... ... \n",
- "10747035 NaN 0 \n",
- "10747036 NaN 0 \n",
- "10747037 NaN 0 \n",
- "10747038 NaN 0 \n",
- "10747039 NaN 0 \n",
- "\n",
- " works source email_domains url_domains fixed_keywords \n",
- "0 [] [] [] \n",
- "1 [] [] [] \n",
- "2 [] [] [] \n",
- "3 [] [] [] \n",
- "4 [] [] [] \n",
- "... ... ... ... ... \n",
- "10747035 [] [] [] \n",
- "10747036 [] [] [] \n",
- "10747037 [] [] [] \n",
- "10747038 [] [] [] \n",
- "10747039 [] [] [] \n",
- "\n",
- "[10747040 rows x 19 columns]"
- ]
- },
- "execution_count": 24,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df.drop('keywords', axis=1, inplace=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Fixes for other columns with lists inside"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "metadata": {},
- "outputs": [],
- "source": [
- "# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n",
- "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
- "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
- "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
- "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
- "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
- "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n",
- "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# Feature extraction"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 28,
- "metadata": {},
- "outputs": [],
- "source": [
- "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n",
- "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " claimed | \n",
- " verifyed email | \n",
- " verified primary email | \n",
- " given names | \n",
- " family name | \n",
- " biography | \n",
- " other names | \n",
- " researcher urls | \n",
- " primary email | \n",
- " other emails | \n",
- " keywords | \n",
- " external identifiers | \n",
- " education | \n",
- " employments | \n",
- " number of works | \n",
- " works source | \n",
- " email_domains | \n",
- " url_domains | \n",
- " fixed_keywords | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 0000-0001-5000-2053 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Jorge | \n",
- " Jaramillo Sanchez | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 0000-0001-5000-6548 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Wiseman | \n",
- " Bekelesi | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 0000-0001-5000-7962 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " ALICE | \n",
- " INDIMULI | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 0000-0001-5000-8586 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " shim | \n",
- " ji yun | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 4 | \n",
- " 0000-0001-5001-0256 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Sandro | \n",
- " Caramaschi | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 10747035 | \n",
- " 0000-0003-4998-1551 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Animesh | \n",
- " Ghosh | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747036 | \n",
- " 0000-0003-4998-4111 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Hawa | \n",
- " Liberna | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747037 | \n",
- " 0000-0003-4998-6045 | \n",
- " True | \n",
- " False | \n",
- " False | \n",
- " Tongyi | \n",
- " Men | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747038 | \n",
- " 0000-0003-4998-8868 | \n",
- " True | \n",
- " True | \n",
- " False | \n",
- " Charldon | \n",
- " Wilken | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 10747039 | \n",
- " 0000-0003-4999-7916 | \n",
- " True | \n",
- " True | \n",
- " True | \n",
- " Tapas Bapu | \n",
- " B.R. | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [] | \n",
- " | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " 0 | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- " [] | \n",
- "
\n",
- " \n",
- "
\n",
- "
10747040 rows × 20 columns
\n",
- "
"
- ],
- "text/plain": [
- " orcid claimed verifyed email \\\n",
- "0 0000-0001-5000-2053 True False \n",
- "1 0000-0001-5000-6548 True False \n",
- "2 0000-0001-5000-7962 True True \n",
- "3 0000-0001-5000-8586 True False \n",
- "4 0000-0001-5001-0256 True False \n",
- "... ... ... ... \n",
- "10747035 0000-0003-4998-1551 True False \n",
- "10747036 0000-0003-4998-4111 True False \n",
- "10747037 0000-0003-4998-6045 True False \n",
- "10747038 0000-0003-4998-8868 True True \n",
- "10747039 0000-0003-4999-7916 True True \n",
- "\n",
- " verified primary email given names family name biography \\\n",
- "0 False Jorge Jaramillo Sanchez \n",
- "1 False Wiseman Bekelesi \n",
- "2 True ALICE INDIMULI \n",
- "3 False shim ji yun \n",
- "4 False Sandro Caramaschi \n",
- "... ... ... ... ... \n",
- "10747035 False Animesh Ghosh \n",
- "10747036 False Hawa Liberna \n",
- "10747037 False Tongyi Men \n",
- "10747038 False Charldon Wilken \n",
- "10747039 True Tapas Bapu B.R. \n",
- "\n",
- " other names researcher urls primary email other emails keywords \\\n",
- "0 [] [] [] \n",
- "1 [] [] [] \n",
- "2 [] [] [] \n",
- "3 [] [] [] \n",
- "4 [] [] [] \n",
- "... ... ... ... ... ... \n",
- "10747035 [] [] [] \n",
- "10747036 [] [] [] \n",
- "10747037 [] [] [] \n",
- "10747038 [] [] [] \n",
- "10747039 [] [] [] \n",
- "\n",
- " external identifiers education employments number of works \\\n",
- "0 [] [] [] 0 \n",
- "1 [] [] [] 0 \n",
- "2 [] [] [] 0 \n",
- "3 [] [] [] 0 \n",
- "4 [] [] [] 0 \n",
- "... ... ... ... ... \n",
- "10747035 [] [] [] 0 \n",
- "10747036 [] [] [] 0 \n",
- "10747037 [] [] [] 0 \n",
- "10747038 [] [] [] 0 \n",
- "10747039 [] [] [] 0 \n",
- "\n",
- " works source email_domains url_domains fixed_keywords \n",
- "0 [] [] [] [] \n",
- "1 [] [] [] [] \n",
- "2 [] [] [] [] \n",
- "3 [] [] [] [] \n",
- "4 [] [] [] [] \n",
- "... ... ... ... ... \n",
- "10747035 [] [] [] [] \n",
- "10747036 [] [] [] [] \n",
- "10747037 [] [] [] [] \n",
- "10747038 [] [] [] [] \n",
- "10747039 [] [] [] [] \n",
- "\n",
- "[10747040 rows x 20 columns]"
- ]
- },
- "execution_count": 29,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df"
- ]
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}
diff --git a/notebooks/03-Machine Learning.ipynb b/notebooks/03-Machine Learning.ipynb
new file mode 100644
index 0000000..5fa2601
--- /dev/null
+++ b/notebooks/03-Machine Learning.ipynb
@@ -0,0 +1,469 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Todo in data\n",
+ "- Column names -> no space\n",
+ "- If a list is empty, serialise [] in the csv\n",
+ "- If a string is empty, serialise '' in the csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "from urllib.parse import urlparse\n",
+ "\n",
+ "import pandas as pd\n",
+ "import tldextract\n",
+ "from sklearn.metrics import f1_score\n",
+ "from sklearn.model_selection import train_test_split\n",
+ "from sklearn.preprocessing import MultiLabelBinarizer\n",
+ "from sklearn.svm import OneClassSVM"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = pd.read_pickle('../data/processed/features.pkl')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " verified_email | \n",
+ " verified_primary_email | \n",
+ " n_works | \n",
+ " n_doi | \n",
+ " n_arxiv | \n",
+ " n_pmc | \n",
+ " n_other_pids | \n",
+ " n_emails | \n",
+ " n_urls | \n",
+ " n_ids | \n",
+ " n_keywords | \n",
+ " n_employment | \n",
+ " n_education | \n",
+ " label | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 10989644 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 10989645 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 2.0 | \n",
+ " 2.0 | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 10989646 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 10989647 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1.0 | \n",
+ " 2.0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " 10989648 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10989649 rows × 14 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " verified_email verified_primary_email n_works n_doi n_arxiv \\\n",
+ "0 0 0 0 0 0 \n",
+ "1 1 1 0 0 0 \n",
+ "2 1 1 0 0 0 \n",
+ "3 1 1 0 0 0 \n",
+ "4 1 1 0 0 0 \n",
+ "... ... ... ... ... ... \n",
+ "10989644 1 1 0 0 0 \n",
+ "10989645 1 1 7 7 0 \n",
+ "10989646 1 1 0 0 0 \n",
+ "10989647 1 1 0 0 0 \n",
+ "10989648 1 1 0 0 0 \n",
+ "\n",
+ " n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n",
+ "0 0 0 NaN NaN NaN NaN \n",
+ "1 0 0 NaN NaN NaN NaN \n",
+ "2 0 0 NaN NaN NaN NaN \n",
+ "3 0 0 NaN NaN NaN NaN \n",
+ "4 0 0 NaN NaN NaN NaN \n",
+ "... ... ... ... ... ... ... \n",
+ "10989644 0 0 NaN NaN NaN NaN \n",
+ "10989645 1 0 NaN NaN NaN NaN \n",
+ "10989646 0 0 NaN NaN NaN NaN \n",
+ "10989647 0 0 NaN NaN NaN NaN \n",
+ "10989648 0 0 NaN NaN NaN NaN \n",
+ "\n",
+ " n_employment n_education label \n",
+ "0 NaN NaN 0 \n",
+ "1 1.0 NaN 0 \n",
+ "2 NaN NaN 0 \n",
+ "3 1.0 NaN 0 \n",
+ "4 2.0 NaN 0 \n",
+ "... ... ... ... \n",
+ "10989644 1.0 2.0 0 \n",
+ "10989645 2.0 2.0 1 \n",
+ "10989646 NaN NaN 0 \n",
+ "10989647 1.0 2.0 0 \n",
+ "10989648 NaN NaN 0 \n",
+ "\n",
+ "[10989649 rows x 14 columns]"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = df.fillna(0)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "verified_email 2664886\n",
+ "verified_primary_email 2664886\n",
+ "n_works 2664886\n",
+ "n_doi 2664886\n",
+ "n_arxiv 2664886\n",
+ "n_pmc 2664886\n",
+ "n_other_pids 2664886\n",
+ "n_emails 2664886\n",
+ "n_urls 2664886\n",
+ "n_ids 2664886\n",
+ "n_keywords 2664886\n",
+ "n_employment 2664886\n",
+ "n_education 2664886\n",
+ "label 2664886\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df.label == 1].count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "verified_email 8324763\n",
+ "verified_primary_email 8324763\n",
+ "n_works 8324763\n",
+ "n_doi 8324763\n",
+ "n_arxiv 8324763\n",
+ "n_pmc 8324763\n",
+ "n_other_pids 8324763\n",
+ "n_emails 8324763\n",
+ "n_urls 8324763\n",
+ "n_ids 8324763\n",
+ "n_keywords 8324763\n",
+ "n_employment 8324763\n",
+ "n_education 8324763\n",
+ "label 8324763\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df[df.label == 0].count()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# split into train/test sets (stratified so both halves keep the label ratio)\n",
+ "X = df.loc[:, 'verified_email':'n_education']\n",
+ "y = df['label']\n",
+ "trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n",
+ "\n",
+ "# define outlier detection model; nu upper-bounds the fraction of training errors\n",
+ "model = OneClassSVM(gamma='scale', nu=0.01)\n",
+ "\n",
+ "# fit only on label == 1 rows, i.e. the class scored as inliers below (this is the\n",
+ "# smaller class, not the majority); trainX is not rebound so it stays aligned with trainy\n",
+ "model.fit(trainX[trainy == 1])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# detect outliers in the test set (OneClassSVM.predict returns 1 for inliers, -1 for outliers)\n",
+ "yhat = model.predict(testX)\n",
+ "\n",
+ "# map labels to the same convention: label 1 -> 1 (inlier), label 0 -> -1 (outlier);\n",
+ "# built as a new Series so testy is not mutated and the cell can be re-run safely\n",
+ "testy_signed = testy.where(testy == 1, -1)\n",
+ "\n",
+ "# calculate score, treating the outlier class (-1) as the positive class\n",
+ "score = f1_score(testy_signed, yhat, pos_label=-1)\n",
+ "print('F1 Score: %.3f' % score)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}