From 8288d877fa7113f8f5efe25e279fa79f1d581381 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Fri, 26 Mar 2021 09:16:11 +0100 Subject: [PATCH] first tries with rudimental ML --- notebooks/01-Exploration.ipynb | 35 +- notebooks/03-Feature extraction.ipynb | 2422 ------------------------- notebooks/03-Machine Learning.ipynb | 468 +++++ 3 files changed, 501 insertions(+), 2424 deletions(-) delete mode 100644 notebooks/03-Feature extraction.ipynb create mode 100644 notebooks/03-Machine Learning.ipynb diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 4a624ae..51d6a70 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -764,7 +764,7 @@ } ], "source": [ - "df.count() #10916574" + "df.count()" ] }, { @@ -16260,6 +16260,28 @@ "fig.show()" ] }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "df[['verified_email', \n", + " 'verified_primary_email', \n", + " 'n_works', \n", + " 'n_doi',\n", + " 'n_arxiv', \n", + " 'n_pmc', \n", + " 'n_other_pids', \n", + " 'n_emails', \n", + " 'n_urls', \n", + " 'n_ids', \n", + " 'n_keywords', \n", + " 'n_employment', \n", + " 'n_education', \n", + " 'label']].to_pickle('../data/processed/features.pkl')" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -16884,12 +16906,21 @@ "# (df.n_works > 0) & (df.n_ids > 1)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Serialise " + ] + }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + " " + ] } ], "metadata": { diff --git a/notebooks/03-Feature extraction.ipynb b/notebooks/03-Feature extraction.ipynb deleted file mode 100644 index e25ef16..0000000 --- a/notebooks/03-Feature extraction.ipynb +++ /dev/null @@ -1,2422 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Todo in data\n", - "- Column names -> no space\n", - "- If a list is empty, 
serialise [] in the csv\n", - "- If a string is empty, serialise '' in the csv" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "from urllib.parse import urlparse\n", - "import tldextract\n", - "\n", - "import pandas as pd\n", - "from sklearn.preprocessing import MultiLabelBinarizer\n" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "mlb = MultiLabelBinarizer()" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [], - "source": [ - "# Notable Solid ORCID iDs for debug purposes\n", - "AM = '0000-0002-5193-7851'\n", - "PP = '0000-0002-8588-4196'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "# Notable fake ORCID iDs for debug purposes\n", - "SCAFFOLD = '0000-0001-5004-7761'\n", - "WHATSAPP = '0000-0001-6997-9470'\n" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header = 0,\n", - " dtype = {\"orcid\": pd.StringDtype(), \n", - " \"claimed\": bool, \n", - " \"verifyed email\": bool, \n", - " \"verified primary email\": bool,\n", - " \"given names\": pd.StringDtype(),\n", - " \"family name\": pd.StringDtype(),\n", - " \"biography\": pd.StringDtype(),\n", - " \"other names\": pd.StringDtype(),\n", - " \"researcher urls\": pd.StringDtype(),\n", - " \"primary email\": pd.StringDtype(),\n", - " \"other emails\": pd.StringDtype(),\n", - " \"keywords\": pd.StringDtype(),\n", - " \"eternal identifiers\": pd.StringDtype(),\n", - " \"education\": pd.StringDtype(),\n", - " \"employments\": pd.StringDtype(),\n", - " \"number of works\": pd.Int16Dtype(),\n", - " \"works source\": pd.StringDtype()})" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { 
- "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks source
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA><NA><NA><NA><NA>NaN<NA><NA>0<NA>
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "0 0000-0001-5000-2053 True False False \n", - "1 0000-0001-5000-6548 True False False \n", - "2 0000-0001-5000-7962 True True True \n", - "3 0000-0001-5000-8586 True False False \n", - "4 0000-0001-5001-0256 True False False \n", - "\n", - " given names family name biography other names researcher urls \\\n", - "0 Jorge Jaramillo Sanchez \n", - "1 Wiseman Bekelesi \n", - "2 ALICE INDIMULI \n", - "3 shim ji yun \n", - "4 Sandro Caramaschi \n", - "\n", - " primary email other emails keywords external identifiers education \\\n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "\n", - " employments number of works works source \n", - "0 0 \n", - "1 0 \n", - "2 0 \n", - "3 0 \n", - "4 0 " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks source
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Information engineering\", \"Ph.D.\", \"Univers...[[\"Research Associate\", \"Istituto di Scienza e...37[\"Scopus - Elsevier\", \"Crossref Metadata Searc...
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "8840413 0000-0002-5193-7851 True True True \n", - "\n", - " given names family name biography other names \\\n", - "8840413 Andrea Mannocci \n", - "\n", - " researcher urls \\\n", - "8840413 [[\"Personal website\", \"https://andremann.githu... \n", - "\n", - " primary email other emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it \n", - "\n", - " keywords \\\n", - "8840413 [\"Data science \", \"science of science\", \"schol... \n", - "\n", - " external identifiers \\\n", - "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", - "\n", - " education \\\n", - "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n", - "\n", - " employments number of works \\\n", - "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", - "\n", - " works source \n", - "8840413 [\"Scopus - Elsevier\", \"Crossref Metadata Searc... " - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extracting works source" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_work_source(lst):\n", - " extracted = []\n", - " for s in lst:\n", - " if 'Scopus - Elsevier' in s or 'Crossref' in s:\n", - " extracted.append(s)\n", - " return extracted" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "df['extracted_works_source'] = df['works source'].apply(lambda x: extract_work_source(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "df = 
pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_works_source']), columns=mlb.classes_)], axis = 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(['works source', 'extracted_works_source'], axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksCrossrefCrossref Metadata SearchScopus - Elsevier
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Information engineering\", \"Ph.D.\", \"Univers...[[\"Research Associate\", \"Istituto di Scienza e...37111
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "8840413 0000-0002-5193-7851 True True True \n", - "\n", - " given names family name biography other names \\\n", - "8840413 Andrea Mannocci \n", - "\n", - " researcher urls \\\n", - "8840413 [[\"Personal website\", \"https://andremann.githu... \n", - "\n", - " primary email other emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it \n", - "\n", - " keywords \\\n", - "8840413 [\"Data science \", \"science of science\", \"schol... \n", - "\n", - " external identifiers \\\n", - "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", - "\n", - " education \\\n", - "8840413 [[\"Information engineering\", \"Ph.D.\", \"Univers... \n", - "\n", - " employments number of works \\\n", - "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", - "\n", - " Crossref Crossref Metadata Search Scopus - Elsevier \n", - "8840413 1 1 1 " - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Education" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df['n_education'] = df['education'].str.len()" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop('education', axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifiersemploymentsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_education
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]][[\"Research Associate\", \"Istituto di Scienza e...371114
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "8840413 0000-0002-5193-7851 True True True \n", - "\n", - " given names family name biography other names \\\n", - "8840413 Andrea Mannocci \n", - "\n", - " researcher urls \\\n", - "8840413 [[\"Personal website\", \"https://andremann.githu... \n", - "\n", - " primary email other emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it \n", - "\n", - " keywords \\\n", - "8840413 [\"Data science \", \"science of science\", \"schol... \n", - "\n", - " external identifiers \\\n", - "8840413 [[\"Scopus Author ID\", \"55233589900\"]] \n", - "\n", - " employments number of works \\\n", - "8840413 [[\"Research Associate\", \"Istituto di Scienza e... 37 \n", - "\n", - " Crossref Crossref Metadata Search Scopus - Elsevier n_education \n", - "8840413 1 1 1 4 " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Employment" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [], - "source": [ - "df['n_employments'] = df['employments'].str.len()" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop('employments', axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifiersnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employments
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...[[\"Scopus Author ID\", \"55233589900\"]]3711145
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "8840413 0000-0002-5193-7851 True True True \n", - "\n", - " given names family name biography other names \\\n", - "8840413 Andrea Mannocci \n", - "\n", - " researcher urls \\\n", - "8840413 [[\"Personal website\", \"https://andremann.githu... \n", - "\n", - " primary email other emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it \n", - "\n", - " keywords \\\n", - "8840413 [\"Data science \", \"science of science\", \"schol... \n", - "\n", - " external identifiers number of works Crossref \\\n", - "8840413 [[\"Scopus Author ID\", \"55233589900\"]] 37 1 \n", - "\n", - " Crossref Metadata Search Scopus - Elsevier n_education \\\n", - "8840413 1 1 4 \n", - "\n", - " n_employments \n", - "8840413 5 " - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# External IDs" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "# def extract_ids(lst):\n", - "# extracted = []\n", - "# for id in lst:\n", - "# extracted.append(id[0])\n", - "# return extracted" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [], - "source": [ - "# df['extracted_identifiers'] = df['external identifiers'].apply(lambda x: extract_ids(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "# df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['extracted_identifiers']), columns=mlb.classes_)], axis = 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - 
"metadata": {}, - "outputs": [], - "source": [ - "df['n_ext_ids'] = df['external identifiers'].str.len()" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(['external identifiers'], axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employmentsn_ext_ids
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[[\"Personal website\", \"https://andremann.githu...andrea.mannocci@isti.cnr.it<NA>[\"Data science \", \"science of science\", \"schol...37111451
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "8840413 0000-0002-5193-7851 True True True \n", - "\n", - " given names family name biography other names \\\n", - "8840413 Andrea Mannocci \n", - "\n", - " researcher urls \\\n", - "8840413 [[\"Personal website\", \"https://andremann.githu... \n", - "\n", - " primary email other emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it \n", - "\n", - " keywords number of works \\\n", - "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n", - "\n", - " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n", - "8840413 1 1 1 4 \n", - "\n", - " n_employments n_ext_ids \n", - "8840413 5 1 " - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extracting email domains" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [], - "source": [ - "df['primary email'] = df['primary email'].fillna('')\n", - "df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_email_domains(row):\n", - " domains = []\n", - " if len(row['primary email']) > 0:\n", - " domains.append(row['primary email'].split('@')[1])\n", - " for email in row['other emails']:\n", - " domains.append(email.split('@')[1])\n", - " return domains" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "metadata": {}, - "outputs": [], - "source": [ - "df['email_domains'] = df[['primary email','other emails']].apply(lambda row: extract_email_domains(row), axis=1)" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "34 [seh.ox.ac.uk, bsg.ox.ac.uk]\n", - "47 
[foxmail.com]\n", - "103 [fvtm.bu.edu.eg]\n", - "297 [unipa.it]\n", - "299 [nhs.net]\n", - " ... \n", - "10746811 [gva.es, gmail.com]\n", - "10746850 [cinvestav.mx]\n", - "10746920 [gmail.com, hotmail.com]\n", - "10746975 [mail.ru]\n", - "10746988 [ucm.es]\n", - "Name: email_domains, Length: 141118, dtype: object" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['email_domains'].str.len() != 0]['email_domains']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['email_domains']), columns=mlb.classes_)], axis = 1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(['primary email', 'other emails', 'email_domains'], axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Extracting URL domains" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "metadata": {}, - "outputs": [], - "source": [ - "df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "metadata": {}, - "outputs": [], - "source": [ - "def extract_url_domains(lst):\n", - " domains = []\n", - " for e in lst:\n", - " # e[0] is a string describing the url\n", - " # e[1] is the url\n", - " ext = tldextract.extract(e[1])\n", - " domains.append(ext.registered_domain)\n", - " return domains" - ] - }, - { - "cell_type": "code", - "execution_count": 37, - "metadata": {}, - "outputs": [], - "source": [ - "df['url_domains'] = df['researcher urls'].apply(lambda lst: extract_url_domains(lst))" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "5 [researchgate.net]\n", - "14 [tigerscaffolds.co.nz]\n", - "15 [corticalbrain.com]\n", - "29 [cnpq.br]\n", - "30 [sksahu.net]\n", - " ... \n", - "10746945 [telegra.ph]\n", - "10746950 [twitter.com, urbanfoodpolicy.com]\n", - "10746955 [openlearning.com]\n", - "10746984 [panaximco.vn]\n", - "10746987 [swansea.ac.uk]\n", - "Name: url_domains, Length: 688572, dtype: object" - ] - }, - "execution_count": 38, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['url_domains'].str.len() != 0]['url_domains']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.concat([df, pd.DataFrame(mlb.fit_transform(df['url_domains']), columns=mlb.classes_)], axis = 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop(['researcher urls', 'url_domains'], axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother nameskeywordsnumber of worksCrossrefCrossref Metadata SearchScopus - Elseviern_educationn_employmentsn_ext_idsemail_domainsurl_domains
88404130000-0002-5193-7851TrueTrueTrueAndreaMannocci<NA><NA>[\"Data science \", \"science of science\", \"schol...37111451[isti.cnr.it][github.io, twitter.com, linkedin.com]
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "8840413 0000-0002-5193-7851 True True True \n", - "\n", - " given names family name biography other names \\\n", - "8840413 Andrea Mannocci \n", - "\n", - " keywords number of works \\\n", - "8840413 [\"Data science \", \"science of science\", \"schol... 37 \n", - "\n", - " Crossref Crossref Metadata Search Scopus - Elsevier n_education \\\n", - "8840413 1 1 1 4 \n", - "\n", - " n_employments n_ext_ids email_domains \\\n", - "8840413 5 1 [isti.cnr.it] \n", - "\n", - " url_domains \n", - "8840413 [github.io, twitter.com, linkedin.com] " - ] - }, - "execution_count": 40, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fixing keywords" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Sometimes, different keywords are provided as a continuum (multiplexed in just one keyword). E.g." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domains
96017050000-0002-8588-4196TrueTrueTruePedroPríncipePedro Príncipe is an information, documentatio...[\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"][][][open access, open science, libraries, reposit...[[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]]<NA>[[\"Librarian / Project manager\", \"Universidade...5[\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"...[][]
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "9601705 0000-0002-8588-4196 True True True \n", - "\n", - " given names family name \\\n", - "9601705 Pedro Príncipe \n", - "\n", - " biography \\\n", - "9601705 Pedro Príncipe is an information, documentatio... \n", - "\n", - " other names researcher urls \\\n", - "9601705 [\"Pedro Miguel de Oliveira Bento Pr\\u00edncipe\"] [] \n", - "\n", - " primary email other emails \\\n", - "9601705 [] \n", - "\n", - " keywords \\\n", - "9601705 [open access, open science, libraries, reposit... \n", - "\n", - " external identifiers education \\\n", - "9601705 [[\"Ci\\u00eancia ID\", \"C915-48B2-6C87\"]] \n", - "\n", - " employments number of works \\\n", - "9601705 [[\"Librarian / Project manager\", \"Universidade... 5 \n", - "\n", - " works source email_domains \\\n", - "9601705 [\"CI\\u00caNCIAVITAE\", \"Pedro Pr\\u00edncipe\", \"... [] \n", - "\n", - " url_domains \n", - "9601705 [] " - ] - }, - "execution_count": 19, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == PP]" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "def fix_keywords(lst):\n", - " fixed = []\n", - " for k in lst:\n", - " split = k.split(',')\n", - " fixed.extend(split)\n", - " return fixed" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['open access',\n", - " ' open science',\n", - " ' libraries',\n", - " ' repositories',\n", - " ' social web',\n", - " '']" - ] - }, - "execution_count": 21, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "test = ['open access, open science, libraries, repositories, social web,']\n", - "fix_keywords(test)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [], - "source": [ - "df['fixed_keywords'] = 
df['keywords'].apply(lambda lst: fix_keywords(lst))" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
95170990000-0001-6997-9470TrueTrueTrueotherwhatsapp<NA><NA>[[Otherwhatsapp, https://otherwhatsapp.com/], ...[][Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...NaN<NA><NA>0<NA>[][otherwhatsapp.com, im-creator.com, facebook.c...[Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...
\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email verified primary email \\\n", - "9517099 0000-0001-6997-9470 True True True \n", - "\n", - " given names family name biography other names \\\n", - "9517099 other whatsapp \n", - "\n", - " researcher urls primary email \\\n", - "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... \n", - "\n", - " other emails keywords \\\n", - "9517099 [] [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n", - "\n", - " external identifiers education employments number of works \\\n", - "9517099 NaN 0 \n", - "\n", - " works source email_domains \\\n", - "9517099 [] \n", - "\n", - " url_domains \\\n", - "9517099 [otherwhatsapp.com, im-creator.com, facebook.c... \n", - "\n", - " fixed_keywords \n", - "9517099 [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... " - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == WHATSAPP]" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
............................................................
107470350000-0003-4998-1551TrueFalseFalseAnimeshGhosh<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470360000-0003-4998-4111TrueFalseFalseHawaLiberna<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470370000-0003-4998-6045TrueFalseFalseTongyiMen<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470380000-0003-4998-8868TrueTrueFalseCharldonWilken<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
107470390000-0003-4999-7916TrueTrueTrueTapas BapuB.R.<NA><NA>[][]NaN<NA><NA>0<NA>[][][]
\n", - "

10747040 rows × 19 columns

\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email \\\n", - "0 0000-0001-5000-2053 True False \n", - "1 0000-0001-5000-6548 True False \n", - "2 0000-0001-5000-7962 True True \n", - "3 0000-0001-5000-8586 True False \n", - "4 0000-0001-5001-0256 True False \n", - "... ... ... ... \n", - "10747035 0000-0003-4998-1551 True False \n", - "10747036 0000-0003-4998-4111 True False \n", - "10747037 0000-0003-4998-6045 True False \n", - "10747038 0000-0003-4998-8868 True True \n", - "10747039 0000-0003-4999-7916 True True \n", - "\n", - " verified primary email given names family name biography \\\n", - "0 False Jorge Jaramillo Sanchez \n", - "1 False Wiseman Bekelesi \n", - "2 True ALICE INDIMULI \n", - "3 False shim ji yun \n", - "4 False Sandro Caramaschi \n", - "... ... ... ... ... \n", - "10747035 False Animesh Ghosh \n", - "10747036 False Hawa Liberna \n", - "10747037 False Tongyi Men \n", - "10747038 False Charldon Wilken \n", - "10747039 True Tapas Bapu B.R. \n", - "\n", - " other names researcher urls primary email other emails \\\n", - "0 [] [] \n", - "1 [] [] \n", - "2 [] [] \n", - "3 [] [] \n", - "4 [] [] \n", - "... ... ... ... ... \n", - "10747035 [] [] \n", - "10747036 [] [] \n", - "10747037 [] [] \n", - "10747038 [] [] \n", - "10747039 [] [] \n", - "\n", - " external identifiers education employments number of works \\\n", - "0 NaN 0 \n", - "1 NaN 0 \n", - "2 NaN 0 \n", - "3 NaN 0 \n", - "4 NaN 0 \n", - "... ... ... ... ... \n", - "10747035 NaN 0 \n", - "10747036 NaN 0 \n", - "10747037 NaN 0 \n", - "10747038 NaN 0 \n", - "10747039 NaN 0 \n", - "\n", - " works source email_domains url_domains fixed_keywords \n", - "0 [] [] [] \n", - "1 [] [] [] \n", - "2 [] [] [] \n", - "3 [] [] [] \n", - "4 [] [] [] \n", - "... ... ... ... ... 
\n", - "10747035 [] [] [] \n", - "10747036 [] [] [] \n", - "10747037 [] [] [] \n", - "10747038 [] [] [] \n", - "10747039 [] [] [] \n", - "\n", - "[10747040 rows x 19 columns]" - ] - }, - "execution_count": 24, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.drop('keywords', axis=1, inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Fixes for other columns with lists inside" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "# df['other names'] = df['other names'].apply(lambda x: ast.literal_eval(x))\n", - "# df['other emails'] = df['other emails'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", - "# df['researcher urls'] = df['researcher urls'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", - "# df['keywords'] = df['keywords'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", - "# df['education'] = df['education'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", - "# df['employments'] = df['employments'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", - "# df['external identifiers'] = df['external identifiers'].fillna('[]').apply(lambda x: ast.literal_eval(x))\n", - "# df['works source'] = df['works source'].fillna('[]').apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Feature extraction" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "# df['email_encoding'] = mlb.fit_transform(df['email_domains'])\n", - "# df['url_encoding'] = mlb.fit_transform(df['url_domains'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverifyed emailverified primary emailgiven namesfamily namebiographyother namesresearcher urlsprimary emailother emailskeywordsexternal identifierseducationemploymentsnumber of worksworks sourceemail_domainsurl_domainsfixed_keywords
00000-0001-5000-2053TrueFalseFalseJorgeJaramillo Sanchez<NA><NA>[][][][][][]0[][][][]
10000-0001-5000-6548TrueFalseFalseWisemanBekelesi<NA><NA>[][][][][][]0[][][][]
20000-0001-5000-7962TrueTrueTrueALICEINDIMULI<NA><NA>[][][][][][]0[][][][]
30000-0001-5000-8586TrueFalseFalseshimji yun<NA><NA>[][][][][][]0[][][][]
40000-0001-5001-0256TrueFalseFalseSandroCaramaschi<NA><NA>[][][][][][]0[][][][]
...............................................................
107470350000-0003-4998-1551TrueFalseFalseAnimeshGhosh<NA><NA>[][][][][][]0[][][][]
107470360000-0003-4998-4111TrueFalseFalseHawaLiberna<NA><NA>[][][][][][]0[][][][]
107470370000-0003-4998-6045TrueFalseFalseTongyiMen<NA><NA>[][][][][][]0[][][][]
107470380000-0003-4998-8868TrueTrueFalseCharldonWilken<NA><NA>[][][][][][]0[][][][]
107470390000-0003-4999-7916TrueTrueTrueTapas BapuB.R.<NA><NA>[][][][][][]0[][][][]
\n", - "

10747040 rows × 20 columns

\n", - "
" - ], - "text/plain": [ - " orcid claimed verifyed email \\\n", - "0 0000-0001-5000-2053 True False \n", - "1 0000-0001-5000-6548 True False \n", - "2 0000-0001-5000-7962 True True \n", - "3 0000-0001-5000-8586 True False \n", - "4 0000-0001-5001-0256 True False \n", - "... ... ... ... \n", - "10747035 0000-0003-4998-1551 True False \n", - "10747036 0000-0003-4998-4111 True False \n", - "10747037 0000-0003-4998-6045 True False \n", - "10747038 0000-0003-4998-8868 True True \n", - "10747039 0000-0003-4999-7916 True True \n", - "\n", - " verified primary email given names family name biography \\\n", - "0 False Jorge Jaramillo Sanchez \n", - "1 False Wiseman Bekelesi \n", - "2 True ALICE INDIMULI \n", - "3 False shim ji yun \n", - "4 False Sandro Caramaschi \n", - "... ... ... ... ... \n", - "10747035 False Animesh Ghosh \n", - "10747036 False Hawa Liberna \n", - "10747037 False Tongyi Men \n", - "10747038 False Charldon Wilken \n", - "10747039 True Tapas Bapu B.R. \n", - "\n", - " other names researcher urls primary email other emails keywords \\\n", - "0 [] [] [] \n", - "1 [] [] [] \n", - "2 [] [] [] \n", - "3 [] [] [] \n", - "4 [] [] [] \n", - "... ... ... ... ... ... \n", - "10747035 [] [] [] \n", - "10747036 [] [] [] \n", - "10747037 [] [] [] \n", - "10747038 [] [] [] \n", - "10747039 [] [] [] \n", - "\n", - " external identifiers education employments number of works \\\n", - "0 [] [] [] 0 \n", - "1 [] [] [] 0 \n", - "2 [] [] [] 0 \n", - "3 [] [] [] 0 \n", - "4 [] [] [] 0 \n", - "... ... ... ... ... \n", - "10747035 [] [] [] 0 \n", - "10747036 [] [] [] 0 \n", - "10747037 [] [] [] 0 \n", - "10747038 [] [] [] 0 \n", - "10747039 [] [] [] 0 \n", - "\n", - " works source email_domains url_domains fixed_keywords \n", - "0 [] [] [] [] \n", - "1 [] [] [] [] \n", - "2 [] [] [] [] \n", - "3 [] [] [] [] \n", - "4 [] [] [] [] \n", - "... ... ... ... ... 
\n", - "10747035 [] [] [] [] \n", - "10747036 [] [] [] [] \n", - "10747037 [] [] [] [] \n", - "10747038 [] [] [] [] \n", - "10747039 [] [] [] [] \n", - "\n", - "[10747040 rows x 20 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -} diff --git a/notebooks/03-Machine Learning.ipynb b/notebooks/03-Machine Learning.ipynb new file mode 100644 index 0000000..5fa2601 --- /dev/null +++ b/notebooks/03-Machine Learning.ipynb @@ -0,0 +1,468 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Todo in data\n", + "- Column names -> no space\n", + "- If a list is empty, serialise [] in the csv\n", + "- If a string is empty, serialise '' in the csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "from urllib.parse import urlparse\n", + "import tldextract\n", + "\n", + "import pandas as pd\n", + "from sklearn.preprocessing import MultiLabelBinarizer\n", + "from sklearn.svm import OneClassSVM \n", + "from sklearn.model_selection import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_pickle('../data/processed/features.pkl')" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
verified_emailverified_primary_emailn_worksn_doin_arxivn_pmcn_other_pidsn_emailsn_urlsn_idsn_keywordsn_employmentn_educationlabel
00000000NaNNaNNaNNaNNaNNaN0
11100000NaNNaNNaNNaN1.0NaN0
21100000NaNNaNNaNNaNNaNNaN0
31100000NaNNaNNaNNaN1.0NaN0
41100000NaNNaNNaNNaN2.0NaN0
.............................................
109896441100000NaNNaNNaNNaN1.02.00
109896451177010NaNNaNNaNNaN2.02.01
109896461100000NaNNaNNaNNaNNaNNaN0
109896471100000NaNNaNNaNNaN1.02.00
109896481100000NaNNaNNaNNaNNaNNaN0
\n", + "

10989649 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " verified_email verified_primary_email n_works n_doi n_arxiv \\\n", + "0 0 0 0 0 0 \n", + "1 1 1 0 0 0 \n", + "2 1 1 0 0 0 \n", + "3 1 1 0 0 0 \n", + "4 1 1 0 0 0 \n", + "... ... ... ... ... ... \n", + "10989644 1 1 0 0 0 \n", + "10989645 1 1 7 7 0 \n", + "10989646 1 1 0 0 0 \n", + "10989647 1 1 0 0 0 \n", + "10989648 1 1 0 0 0 \n", + "\n", + " n_pmc n_other_pids n_emails n_urls n_ids n_keywords \\\n", + "0 0 0 NaN NaN NaN NaN \n", + "1 0 0 NaN NaN NaN NaN \n", + "2 0 0 NaN NaN NaN NaN \n", + "3 0 0 NaN NaN NaN NaN \n", + "4 0 0 NaN NaN NaN NaN \n", + "... ... ... ... ... ... ... \n", + "10989644 0 0 NaN NaN NaN NaN \n", + "10989645 1 0 NaN NaN NaN NaN \n", + "10989646 0 0 NaN NaN NaN NaN \n", + "10989647 0 0 NaN NaN NaN NaN \n", + "10989648 0 0 NaN NaN NaN NaN \n", + "\n", + " n_employment n_education label \n", + "0 NaN NaN 0 \n", + "1 1.0 NaN 0 \n", + "2 NaN NaN 0 \n", + "3 1.0 NaN 0 \n", + "4 2.0 NaN 0 \n", + "... ... ... ... \n", + "10989644 1.0 2.0 0 \n", + "10989645 2.0 2.0 1 \n", + "10989646 NaN NaN 0 \n", + "10989647 1.0 2.0 0 \n", + "10989648 NaN NaN 0 \n", + "\n", + "[10989649 rows x 14 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.fillna(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "verified_email 2664886\n", + "verified_primary_email 2664886\n", + "n_works 2664886\n", + "n_doi 2664886\n", + "n_arxiv 2664886\n", + "n_pmc 2664886\n", + "n_other_pids 2664886\n", + "n_emails 2664886\n", + "n_urls 2664886\n", + "n_ids 2664886\n", + "n_keywords 2664886\n", + "n_employment 2664886\n", + "n_education 2664886\n", + "label 2664886\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "df[df.label == 1].count()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "verified_email 8324763\n", + "verified_primary_email 8324763\n", + "n_works 8324763\n", + "n_doi 8324763\n", + "n_arxiv 8324763\n", + "n_pmc 8324763\n", + "n_other_pids 8324763\n", + "n_emails 8324763\n", + "n_urls 8324763\n", + "n_ids 8324763\n", + "n_keywords 8324763\n", + "n_employment 8324763\n", + "n_education 8324763\n", + "label 8324763\n", + "dtype: int64" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.label == 0].count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# split into train/test sets\n", + "X = df.loc[:,'verified_email':'n_education']\n", + "y = df['label']\n", + "trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)\n", + "\n", + "# define outlier detection model\n", + "model = OneClassSVM(gamma='scale', nu=0.01)\n", + "\n", + "# fit on the inlier class only (label == 1, which is the minority class per the counts above)\n", + "trainX = trainX[trainy==1]\n", + "model.fit(trainX)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from sklearn.metrics import f1_score\n", + "\n", + "# predict on the test set: OneClassSVM.predict returns +1 for inliers and -1 for outliers\n", + "yhat = model.predict(testX)\n", + "\n", + "# map the labels to that convention (1 -> 1, 0 -> -1) without mutating testy in place\n", + "testy_signed = testy.where(testy == 1, -1)\n", + "\n", + "score = f1_score(testy_signed, yhat, pos_label=-1)\n", + "print('F1 Score: %.3f' % score)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name":
"python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}