{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploratory analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "TODO:\n", "- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)\n", "- Build a taxonomy of cases (e.g., author publishing with an empty ORCID record, author publishing but not on OpenAIRE, etc.)\n", "- Is the temporal dimension of any use?\n", "- Can we access private info thanks to the OpenAIRE-ORCID agreement?\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import glob\n", "\n", "import pandas as pd\n", "import ast\n", "import tldextract\n", "import numpy as np\n", "\n", "import antispam\n", "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.graph_objs as go\n", "import plotly.express as px\n", "\n", "init_notebook_mode(connected=True)\n", "\n", "# Notebook-wide chart settings; change them only through set_top_n()\n", "TOP_N = 0\n", "TOP_RANGE = [0, 0]\n", "\n", "def set_top_n(n):\n", "    \"\"\"Set how many top-ranked entries the bar charts show.\n", "\n", "    Updates two notebook-level globals read by the plotting cells:\n", "    ``TOP_N`` (how many leading rows to slice, also quoted in the chart\n", "    title) and ``TOP_RANGE`` (the x-axis range ``[-0.5, n - 0.5]``).\n", "\n", "    Args:\n", "        n: number of entries to show; must not be negative.\n", "\n", "    Raises:\n", "        ValueError: if ``n`` is negative, which would otherwise yield an\n", "            inverted axis range and a slice that drops rows from the end.\n", "    \"\"\"\n", "    global TOP_N, TOP_RANGE\n", "    if n < 0:\n", "        raise ValueError('n must be non-negative, got %r' % (n,))\n", "    TOP_N = n\n", "    TOP_RANGE = [-.5, n - 1 + .5]\n", "\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notable solid ORCID iDs for explorative purposes:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "AM = '0000-0002-5193-7851'\n", "PP = '0000-0002-8588-4196'\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notable anomalies:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "JOURNAL = '0000-0003-1815-5732'\n", "NOINFO = '0000-0001-5009-2052'\n", "VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n", "WORK_MISUSE = '0000-0001-7870-1120'\n", "# TODO: find a group-shared ORCID iD, if possible" ] }, { "cell_type": "markdown", "metadata": {}, 
"source": [ "Notable fake ORCID iDs:" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "SCAFFOLD = '0000-0001-5004-7761'\n", "WHATSAPP = '0000-0001-6997-9470'\n", "PENIS = '0000-0002-3399-7287'\n", "BITCOIN = '0000-0002-7518-6845'\n", "FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment\n", "CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)\n", "PLUMBER = '0000-0002-1700-8311' # URL > 10 + works " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
00000-0001-6097-3953FalseFalse<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2018-03-02t09:29:16.528z2018-03-02t09:43:07.551z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA><NA>
10000-0001-6112-5550TrueTrue<NA><NA><NA>[v.i. yurtaev; v. yurtaev]<NA>NaNNaNNaN[[professor, peoples friendship university of ...0NaN2018-04-03t07:50:23.358z2020-03-18t09:42:44.753z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1
20000-0001-6152-2695TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2019-12-11t15:31:56.388z2020-01-28t15:34:17.309z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA><NA>
30000-0001-6220-5683TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[research scientist, new york university abu ...0NaN2015-08-18t12:36:45.307z2020-09-23t13:37:54.180z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1
40000-0001-7071-8294TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[researcher (academic), universidad de zarago...0NaN2014-03-10t13:22:01.966z2016-06-14t22:17:54.470z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>2
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email given_names \\\n", "0 0000-0001-6097-3953 False False \n", "1 0000-0001-6112-5550 True True \n", "2 0000-0001-6152-2695 True True \n", "3 0000-0001-6220-5683 True True \n", "4 0000-0001-7071-8294 True True \n", "\n", " family_name biography other_names primary_email keywords \\\n", "0 NaN NaN \n", "1 [v.i. yurtaev; v. yurtaev] NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "\n", " external_ids education employment \\\n", "0 NaN NaN NaN \n", "1 NaN NaN [[professor, peoples friendship university of ... \n", "2 NaN NaN NaN \n", "3 NaN NaN [[research scientist, new york university abu ... \n", "4 NaN NaN [[researcher (academic), universidad de zarago... \n", "\n", " n_works works_source activation_date last_update_date \\\n", "0 0 NaN 2018-03-02t09:29:16.528z 2018-03-02t09:43:07.551z \n", "1 0 NaN 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z \n", "2 0 NaN 2019-12-11t15:31:56.388z 2020-01-28t15:34:17.309z \n", "3 0 NaN 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z \n", "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "0 0 0 0 0 False NaN \n", "1 0 0 0 0 False NaN \n", "2 0 0 0 0 False NaN \n", "3 0 0 0 0 False NaN \n", "4 0 0 0 0 False NaN \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "\n", " n_education n_employment \n", "0 \n", "1 1 \n", "2 \n", "3 1 \n", "4 2 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parts = glob.glob('../data/processed/dataset.pkl.*')\n", "if not parts:\n", "    # Without this guard pd.concat fails with an opaque 'No objects to concatenate'\n", "    raise FileNotFoundError('no dataset parts match ../data/processed/dataset.pkl.*')\n", "\n", "# NOTE: unpickling can execute code stored in the file; only load parts from a trusted source\n", "df = pd.concat(pd.read_pickle(part) for part in sorted(parts))\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notable profiles inspection" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { 
"text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
30732610000-0002-5193-7851TrueTrueandreamannoccidata scientist & researcher; scholarly knowled...NaNandrea.mannocci@isti.cnr.it[open science, data science, science of scienc...[[scopus author id, 55233589900]][[information engineering, ph.d., università d...[[research associate, istituto di scienza e te...37[scopus - elsevier, crossref metadata search, ...2017-09-12t14:28:33.467z2021-03-17t15:40:07.776z340060Trueisti.cnr.itNaN[github.io, twitter.com, linkedin.com]<NA>31545
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "3073261 0000-0002-5193-7851 True True \n", "\n", " given_names family_name \\\n", "3073261 andrea mannocci \n", "\n", " biography other_names \\\n", "3073261 data scientist & researcher; scholarly knowled... NaN \n", "\n", " primary_email \\\n", "3073261 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", "3073261 [open science, data science, science of scienc... \n", "\n", " external_ids \\\n", "3073261 [[scopus author id, 55233589900]] \n", "\n", " education \\\n", "3073261 [[information engineering, ph.d., università d... \n", "\n", " employment n_works \\\n", "3073261 [[research associate, istituto di scienza e te... 37 \n", "\n", " works_source \\\n", "3073261 [scopus - elsevier, crossref metadata search, ... \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "3073261 2017-09-12t14:28:33.467z 2021-03-17t15:40:07.776z 34 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "3073261 0 60 True isti.cnr.it NaN \n", "\n", " url_domains n_emails n_urls n_ids \\\n", "3073261 [github.io, twitter.com, linkedin.com] 3 1 \n", "\n", " n_keywords n_education n_employment \n", "3073261 5 4 5 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Row of one of the 'solid' reference iDs\n", "df[df['orcid'].eq(AM)]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
98872720000-0001-6997-9470TrueTrueotherwhatsapp<NA>NaN<NA>[whatsapp gb baixar, whatsapp gb 2020, whatsap...NaNNaNNaN0NaN2020-10-07t10:37:12.237z2020-10-08t02:32:03.935z0000FalseNaNNaN[otherwhatsapp.com, im-creator.com, facebook.c...<NA>27<NA>4<NA><NA>
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "9887272 0000-0001-6997-9470 True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", "9887272 other whatsapp NaN \n", "\n", " keywords external_ids \\\n", "9887272 [whatsapp gb baixar, whatsapp gb 2020, whatsap... NaN \n", "\n", " education employment n_works works_source activation_date \\\n", "9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "9887272 2020-10-08t02:32:03.935z 0 0 0 0 False \n", "\n", " primary_email_domain other_email_domains \\\n", "9887272 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... 27 \n", "\n", " n_ids n_keywords n_education n_employment \n", "9887272 4 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Row of one of the known fake iDs\n", "df[df['orcid'].eq(WHATSAPP)]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "orcid 10989649\n", "verified_email 10989649\n", "verified_primary_email 10989649\n", "given_names 10959039\n", "family_name 10671715\n", "biography 354015\n", "other_names 554684\n", "primary_email 124722\n", "keywords 649637\n", "external_ids 1308598\n", "education 2441645\n", "employment 2680488\n", "n_works 10989649\n", "works_source 2740939\n", "activation_date 10989649\n", "last_update_date 10989649\n", "n_doi 10989649\n", "n_arxiv 10989649\n", "n_pmc 10989649\n", "n_other_pids 10989649\n", "label 10989649\n", "primary_email_domain 124722\n", "other_email_domains 48615\n", "url_domains 715067\n", "n_emails 48615\n", "n_urls 715067\n", "n_ids 1308598\n", "n_keywords 649637\n", "n_education 2441645\n", "n_employment 2680488\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Non-null entries per column\n", "df.count()" ] }, { "cell_type": "code", "execution_count": 9, 
"metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 10989649\n", "unique 10989649\n", "top 0000-0001-5242-3687\n", "freq 1\n", "Name: orcid, dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['orcid'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Primary email" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 124722\n", "unique 124718\n", "top opercin@erbakan.edu.tr\n", "freq 2\n", "Name: primary_email, dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['primary_email'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Dupe emails" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1681787 opercin@erbakan.edu.tr\n", "5590332 patrick.davey@monash.edu\n", "9316843 maykin@owasp.org\n", "10375852 andycheng2026@163.com\n", "Name: primary_email, dtype: string" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Second and later occurrences of each repeated address (duplicated() keeps the first).\n", "# The mask is computed on the NA-free series itself, so indexer and data share one index.\n", "df['primary_email'].dropna().loc[lambda emails: emails.duplicated()]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
75439810000-0002-0836-2271TrueTruemaykinwarasart<NA>NaNmaykin@owasp.orgNaNNaNNaNNaN0NaN2020-09-15t04:43:55.709z2020-09-15t05:17:28.509z0000Falseowasp.org[dga.or.th]NaN1<NA><NA><NA><NA><NA>
93168430000-0001-9855-1676TrueTruemaykinwarasart<NA>NaNmaykin@owasp.orgNaNNaNNaNNaN0NaN2020-10-23t17:51:51.925z2021-01-01t15:00:52.053z0000Falseowasp.org[dga.or.th, ieee.org]NaN2<NA><NA><NA><NA><NA>
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "7543981 0000-0002-0836-2271 True True \n", "9316843 0000-0001-9855-1676 True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", "7543981 maykin warasart NaN maykin@owasp.org \n", "9316843 maykin warasart NaN maykin@owasp.org \n", "\n", " keywords external_ids education employment n_works works_source \\\n", "7543981 NaN NaN NaN NaN 0 NaN \n", "9316843 NaN NaN NaN NaN 0 NaN \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "7543981 2020-09-15t04:43:55.709z 2020-09-15t05:17:28.509z 0 0 \n", "9316843 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain \\\n", "7543981 0 0 False owasp.org \n", "9316843 0 0 False owasp.org \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids \\\n", "7543981 [dga.or.th] NaN 1 \n", "9316843 [dga.or.th, ieee.org] NaN 2 \n", "\n", " n_keywords n_education n_employment \n", "7543981 \n", "9316843 " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All profiles sharing this primary email\n", "df[df['primary_email'].eq('maykin@owasp.org')]" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
3478520000-0002-2232-9638TrueTrueosmanperçin<NA>NaNopercin@erbakan.edu.trNaNNaNNaNNaN0NaN2015-01-12t13:47:55.549z2020-01-27t07:38:24.269z0000Falseerbakan.edu.trNaNNaN<NA><NA><NA><NA><NA><NA>
16817870000-0003-0033-0918TrueTrueosmanperçin<NA>NaNopercin@erbakan.edu.trNaNNaNNaN[[, necmettin erbakan university, konya, , tr,...0NaN2015-10-13t05:47:12.014z2020-12-25t13:52:03.976z0000Falseerbakan.edu.trNaNNaN<NA><NA><NA><NA><NA>1
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "347852 0000-0002-2232-9638 True True \n", "1681787 0000-0003-0033-0918 True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", "347852 osman perçin NaN opercin@erbakan.edu.tr \n", "1681787 osman perçin NaN opercin@erbakan.edu.tr \n", "\n", " keywords external_ids education \\\n", "347852 NaN NaN NaN \n", "1681787 NaN NaN NaN \n", "\n", " employment n_works \\\n", "347852 NaN 0 \n", "1681787 [[, necmettin erbakan university, konya, , tr,... 0 \n", "\n", " works_source activation_date last_update_date \\\n", "347852 NaN 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z \n", "1681787 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "347852 0 0 0 0 False erbakan.edu.tr \n", "1681787 0 0 0 0 False erbakan.edu.tr \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "347852 NaN NaN \n", "1681787 NaN NaN \n", "\n", " n_education n_employment \n", "347852 \n", "1681787 1 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All profiles sharing this primary email\n", "df[df['primary_email'].eq('opercin@erbakan.edu.tr')]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
9540850000-0002-9158-1757TrueTruepatrickdavey<NA>NaNpatrick.davey@monash.edu[radiochemistry, inorganic chemistry, bioinorg...NaNNaN[[phd student, monash university, melbourne, ,...0NaN2019-05-09t23:01:02.170z2019-08-20t03:00:17.844z0000Falsemonash.eduNaNNaN<NA><NA><NA>4<NA>1
55903320000-0002-8774-0030TrueTruepatrickdavey<NA>NaNpatrick.davey@monash.eduNaNNaNNaN[[phd student, monash university, melbourne, v...1[crossref]2018-09-11t10:47:10.997z2021-02-09t06:21:44.138z1000Truemonash.eduNaNNaN<NA><NA><NA><NA><NA>1
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "954085 0000-0002-9158-1757 True True \n", "5590332 0000-0002-8774-0030 True True \n", "\n", " given_names family_name biography other_names \\\n", "954085 patrick davey NaN \n", "5590332 patrick davey NaN \n", "\n", " primary_email \\\n", "954085 patrick.davey@monash.edu \n", "5590332 patrick.davey@monash.edu \n", "\n", " keywords external_ids \\\n", "954085 [radiochemistry, inorganic chemistry, bioinorg... NaN \n", "5590332 NaN NaN \n", "\n", " education employment n_works \\\n", "954085 NaN [[phd student, monash university, melbourne, ,... 0 \n", "5590332 NaN [[phd student, monash university, melbourne, v... 1 \n", "\n", " works_source activation_date last_update_date \\\n", "954085 NaN 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z \n", "5590332 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "954085 0 0 0 0 False monash.edu \n", "5590332 1 0 0 0 True monash.edu \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "954085 NaN NaN 4 \n", "5590332 NaN NaN \n", "\n", " n_education n_employment \n", "954085 1 \n", "5590332 1 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# All profiles sharing this primary email\n", "df[df['primary_email'].eq('patrick.davey@monash.edu')]" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 124722\n", "unique 17160\n", "top gmail.com\n", "freq 26750\n", "Name: primary_email_domain, dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['primary_email_domain'].describe()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcid
primary_email_domain
gmail.com26750
hotmail.com3801
yahoo.com2625
163.com2132
yuhs.ac1134
......
imf.csic.es1
imf.org1
imfd.tu-freiberg.de1
imft.fr1
zzuli.edu.cn1
\n", "

17160 rows × 1 columns

\n", "
" ], "text/plain": [ " orcid\n", "primary_email_domain \n", "gmail.com 26750\n", "hotmail.com 3801\n", "yahoo.com 2625\n", "163.com 2132\n", "yuhs.ac 1134\n", "... ...\n", "imf.csic.es 1\n", "imf.org 1\n", "imfd.tu-freiberg.de 1\n", "imft.fr 1\n", "zzuli.edu.cn 1\n", "\n", "[17160 rows x 1 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Profiles per primary-email domain, most frequent first;\n", "# after count() the 'orcid' column holds the number of profiles.\n", "top_primary_emails = (\n", "    df[['primary_email_domain', 'orcid']]\n", "    .groupby('primary_email_domain')\n", "    .count()\n", "    .sort_values('orcid', ascending=False)\n", ")\n", "top_primary_emails" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "gmail.com", "hotmail.com", "yahoo.com", "163.com", "yuhs.ac", "qq.com", "outlook.com", "126.com", "bu.edu", "usgs.gov", "mail.ru", "usp.br", "yahoo.com.br", "ua.pt", "umich.edu", "ust.hk", "foxmail.com", "uomustansiriyah.edu.iq", "yandex.ru", "uq.edu.au", "ukr.net", "unesp.br", "ucl.ac.uk", "ieee.org", "naver.com", "stcatz.ox.ac.uk", "st-annes.ox.ac.uk", "yahoo.fr", "ucm.es", "live.com" ], "y": [ 26750, 3801, 2625, 2132, 1134, 1059, 948, 766, 629, 586, 579, 464, 459, 302, 290, 277, 260, 248, 244, 235, 226, 218, 210, 205, 188, 184, 184, 174, 174, 165 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": 
"#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], 
"type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, 
"#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, 
"bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top-30 email domains" }, "xaxis": { "range": [ -0.5, 29.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(30)\n", "data = [\n", " go.Bar(\n", " x=top_primary_emails[:TOP_N].index,\n", " y=top_primary_emails[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top-%s email domains' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Other emails" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
2510000-0002-5916-446XTrueTrueantonio gilvanteixeira júnior<NA>[teixeira, antônio gilvan, júnior, antonio gil...gilvan.junior@aluno.ufca.edu.br[ethicis; medicine; infectology; neurology; ne...[[scopus author id, 56647743200], [scopus auth...[[faculty of health and life sciences, , unive...NaN14[antonio gilvan teixeira júnior, scopus - else...2016-05-18t11:26:36.642z2016-09-20t18:25:05.728z13008Falsealuno.ufca.edu.br[liverpool.ac.uk][researchgate.net, academia.edu, cnpq.br]13411<NA>
3160000-0002-8742-947XTrueTrueaarontan shing loong<NA>NaNaaron.tanshingloong@wadh.ox.ac.ukNaNNaN[[ruskin school of art; wadham college, , univ...NaN0NaN2015-10-05t23:10:08.771z2016-06-14t19:55:50.313z0000Falsewadh.ox.ac.uk[rsa.ox.ac.uk]NaN1<NA><NA><NA>1<NA>
4330000-0001-9097-2281TrueTrueabhisheksolanki<NA>NaN<NA>NaNNaNNaN[[senior engineer, robert bosch (india), benga...1[abhishek solanki]2019-04-22t04:43:06.232z2020-07-02t14:18:28.305z0000FalseNaN[in.bosch.com][github.com, linkedin.com]12<NA><NA><NA>2
4970000-0002-8614-3007TrueTrueadamarra<NA>NaN<NA>NaNNaNNaNNaN0NaN2017-11-15t06:33:45.625z2017-11-15t06:44:02.998z0000FalseNaN[hct.ac.ae]NaN1<NA><NA><NA><NA><NA>
8690000-0001-9884-5498TrueTruealbertoronzani<NA>NaNalberto@aronza.comNaNNaNNaN[[research scientist, vtt technical research c...19[crossref metadata search, alberto ronzani, cr...2014-04-16t13:21:54.287z2020-09-28t15:10:37.439z18003Truearonza.com[vtt.fi]NaN1<NA><NA><NA><NA>1
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "251 0000-0002-5916-446X True True \n", "316 0000-0002-8742-947X True True \n", "433 0000-0001-9097-2281 True True \n", "497 0000-0002-8614-3007 True True \n", "869 0000-0001-9884-5498 True True \n", "\n", " given_names family_name biography \\\n", "251 antonio gilvan teixeira júnior \n", "316 aaron tan shing loong \n", "433 abhishek solanki \n", "497 adam arra \n", "869 alberto ronzani \n", "\n", " other_names \\\n", "251 [teixeira, antônio gilvan, júnior, antonio gil... \n", "316 NaN \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " primary_email \\\n", "251 gilvan.junior@aluno.ufca.edu.br \n", "316 aaron.tanshingloong@wadh.ox.ac.uk \n", "433 \n", "497 \n", "869 alberto@aronza.com \n", "\n", " keywords \\\n", "251 [ethicis; medicine; infectology; neurology; ne... \n", "316 NaN \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " external_ids \\\n", "251 [[scopus author id, 56647743200], [scopus auth... \n", "316 NaN \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " education \\\n", "251 [[faculty of health and life sciences, , unive... \n", "316 [[ruskin school of art; wadham college, , univ... \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " employment n_works \\\n", "251 NaN 14 \n", "316 NaN 0 \n", "433 [[senior engineer, robert bosch (india), benga... 1 \n", "497 NaN 0 \n", "869 [[research scientist, vtt technical research c... 19 \n", "\n", " works_source \\\n", "251 [antonio gilvan teixeira júnior, scopus - else... \n", "316 NaN \n", "433 [abhishek solanki] \n", "497 NaN \n", "869 [crossref metadata search, alberto ronzani, cr... 
\n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "251 2016-05-18t11:26:36.642z 2016-09-20t18:25:05.728z 13 0 \n", "316 2015-10-05t23:10:08.771z 2016-06-14t19:55:50.313z 0 0 \n", "433 2019-04-22t04:43:06.232z 2020-07-02t14:18:28.305z 0 0 \n", "497 2017-11-15t06:33:45.625z 2017-11-15t06:44:02.998z 0 0 \n", "869 2014-04-16t13:21:54.287z 2020-09-28t15:10:37.439z 18 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "251 0 8 False aluno.ufca.edu.br [liverpool.ac.uk] \n", "316 0 0 False wadh.ox.ac.uk [rsa.ox.ac.uk] \n", "433 0 0 False NaN [in.bosch.com] \n", "497 0 0 False NaN [hct.ac.ae] \n", "869 0 3 True aronza.com [vtt.fi] \n", "\n", " url_domains n_emails n_urls n_ids \\\n", "251 [researchgate.net, academia.edu, cnpq.br] 1 3 4 \n", "316 NaN 1 \n", "433 [github.com, linkedin.com] 1 2 \n", "497 NaN 1 \n", "869 NaN 1 \n", "\n", " n_keywords n_education n_employment \n", "251 1 1 \n", "316 1 \n", "433 2 \n", "497 \n", "869 1 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.other_email_domains.notna()].head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "0000-0003-4171-3835", "0000-0001-6239-2968", "0000-0003-2151-4089", "0000-0003-2290-2817", "0000-0001-9084-3156", "0000-0001-6349-1044", "0000-0002-2085-1908", "0000-0003-4147-212X", "0000-0002-9599-6909", "0000-0001-9311-0687", "0000-0003-1502-3910", "0000-0002-9821-8424", "0000-0003-4327-6827", "0000-0002-1929-6054", "0000-0002-8390-8238", "0000-0002-1615-8633", "0000-0003-0671-1543", "0000-0003-4499-7300", 
"0000-0002-5341-6531", "0000-0002-8565-194X", "0000-0002-0776-9547", "0000-0001-8420-9204", "0000-0002-7396-1561", "0000-0002-3165-132X", "0000-0002-2567-3741", "0000-0003-2657-8225", "0000-0003-4685-5621", "0000-0001-5548-8259", "0000-0003-0391-3430", "0000-0003-2526-0928" ], "y": [ 12, 9, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] 
], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": 
"" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 
0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { 
"standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 30 ORCID iDs by email" }, "xaxis": { "range": [ -0.5, 29.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(30)\n", "data = [\n", " go.Bar(\n", " x=emails_by_orcid[:TOP_N]['orcid'],\n", " y=emails_by_orcid[:TOP_N]['n_emails']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top %s ORCID iDs by email' % TOP_N, \n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "top_other_emails = df[['orcid', 'other_email_domains']]\\\n", " .explode('other_email_domains')\\\n", " .reset_index(drop=True)\\\n", " .groupby('other_email_domains')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "gmail.com", "hotmail.com", "yahoo.com", "qq.com", "163.com", "outlook.com", "126.com", "usp.br", "ieee.org", "yahoo.com.br", "mail.ru", "unesp.br", "sbs.ox.ac.uk", "yuhs.ac", "naver.com", "icloud.com", "foxmail.com", "uq.edu.au", "ua.pt", "cam.ac.uk", "imperial.ac.uk", "ukr.net", "law.ox.ac.uk", "mit.edu", "monash.edu", "stanford.edu", "ucl.ac.uk", "education.ox.ac.uk", "ucm.es", "conted.ox.ac.uk" ], "y": [ 11198, 1550, 1303, 785, 780, 433, 262, 236, 226, 151, 148, 141, 136, 134, 132, 119, 98, 96, 90, 84, 79, 75, 75, 74, 70, 70, 69, 67, 66, 65 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", 
"minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 
0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, 
"colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, 
"hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 30 other email domains" }, "xaxis": { "range": [ -0.5, 29.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(30)\n", "data = [\n", " go.Bar(\n", " x=top_other_emails[:TOP_N].index,\n", " y=top_other_emails[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top %s other email domains' % TOP_N, \n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This somehow makes sense, legitimate users could put the gmail account as primary for login purposes and have institutional addresses as other email addresses. It makes also the life easier upon relocation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Email speculation" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
4330000-0001-9097-2281TrueTrueabhisheksolanki<NA>NaN<NA>NaNNaNNaN[[senior engineer, robert bosch (india), benga...1[abhishek solanki]2019-04-22t04:43:06.232z2020-07-02t14:18:28.305z0000FalseNaN[in.bosch.com][github.com, linkedin.com]12<NA><NA><NA>2
4970000-0002-8614-3007TrueTrueadamarra<NA>NaN<NA>NaNNaNNaNNaN0NaN2017-11-15t06:33:45.625z2017-11-15t06:44:02.998z0000FalseNaN[hct.ac.ae]NaN1<NA><NA><NA><NA><NA>
8980000-0003-3728-6439TrueTruealejandraecheverry velásquezalejandra echeverry is an industrial electrici...NaN<NA>[control, technology, science, innovation, eng...NaN[[, electrical engineer, institución universit...[[professor, institución universitaria pascual...1[crossref]2019-03-31t00:00:42.929z2020-09-06t02:18:54.290z1000TrueNaN[pascualbravo.edu.co]NaN1<NA><NA>711
17190000-0001-8330-7443TrueTrueandreatesoniero<NA>NaN<NA>NaN[[researcherid, d-9056-2015]][[department of geophysics, master of science ...[[postdoctoral associate, yale university, new...4[andrea tesoniero]2015-03-09t11:59:06.093z2020-08-20t15:03:23.447z4002FalseNaN[yale.edu]NaN1<NA>1<NA>42
68290000-0001-9670-515XTrueTrueesma esinyildirim<NA>NaN<NA>[pharmacognosy, natural chemistry, chemical en...NaN[[business management, master of science, ista...NaN0NaN2020-07-26t10:38:03.721z2020-07-26t10:52:26.539z0000FalseNaN[gmail.com]NaN1<NA><NA>33<NA>
.............................................................................................
109858160000-0003-1204-6009TrueTruenathanwalk<NA>NaN<NA>NaNNaN[[department of physics, doctor of philosophy,...[[, university of oxford, oxford, oxfordshire,...10[crossref metadata search]2016-07-28t14:24:16.844z2020-10-13t11:47:50.621z10000TrueNaN[cs.ox.ac.uk][fu-berlin.de]11<NA><NA>32
109860270000-0002-3472-7668TrueTruerafvandevelde<NA>NaN<NA>NaNNaN[[chemical engineering technology, master, kat...[[phd researcher, katholieke universiteit leuv...0NaN2020-10-14t13:56:44.779z2020-10-16t14:21:40.673z0000FalseNaN[kuleuven.be][linkedin.com]11<NA><NA>21
109875010000-0002-9602-0529TrueTruecarlos augustofinelli<NA>NaN<NA>NaNNaNNaNNaN1[crossref]2013-09-16t16:52:06.120z2020-12-01t22:47:08.074z1000TrueNaN[cecot.com.br]NaN1<NA><NA><NA><NA><NA>
109878290000-0003-4402-5982TrueTruefilipede almeida araújo<NA>NaN<NA>NaNNaN[[materials science, msc. materials science, m...[[co-owner, aeft acessory, manaus, amazonas, b...0NaN2020-03-02t20:11:01.699z2020-12-04t13:53:39.404z0000FalseNaN[ime.eb.br]NaN1<NA><NA><NA>21
109884440000-0002-1734-7241TrueTruemanareldeenahmed<NA>NaN<NA>[deep learning, atomistic simulation, graphene...NaNNaN[[post-doctor, zhejiang university, hangzhou, ...6[manareldeen ahmed]2017-02-17t13:18:36.540z2020-12-04t02:04:36.668z6003TrueNaN[hotmail.com]NaN1<NA><NA>5<NA>1
\n", "

19814 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "433 0000-0001-9097-2281 True True \n", "497 0000-0002-8614-3007 True True \n", "898 0000-0003-3728-6439 True True \n", "1719 0000-0001-8330-7443 True True \n", "6829 0000-0001-9670-515X True True \n", "... ... ... ... \n", "10985816 0000-0003-1204-6009 True True \n", "10986027 0000-0002-3472-7668 True True \n", "10987501 0000-0002-9602-0529 True True \n", "10987829 0000-0003-4402-5982 True True \n", "10988444 0000-0002-1734-7241 True True \n", "\n", " given_names family_name \\\n", "433 abhishek solanki \n", "497 adam arra \n", "898 alejandra echeverry velásquez \n", "1719 andrea tesoniero \n", "6829 esma esin yildirim \n", "... ... ... \n", "10985816 nathan walk \n", "10986027 raf vandevelde \n", "10987501 carlos augusto finelli \n", "10987829 filipe de almeida araújo \n", "10988444 manareldeen ahmed \n", "\n", " biography other_names \\\n", "433 NaN \n", "497 NaN \n", "898 alejandra echeverry is an industrial electrici... NaN \n", "1719 NaN \n", "6829 NaN \n", "... ... ... \n", "10985816 NaN \n", "10986027 NaN \n", "10987501 NaN \n", "10987829 NaN \n", "10988444 NaN \n", "\n", " primary_email keywords \\\n", "433 NaN \n", "497 NaN \n", "898 [control, technology, science, innovation, eng... \n", "1719 NaN \n", "6829 [pharmacognosy, natural chemistry, chemical en... \n", "... ... ... \n", "10985816 NaN \n", "10986027 NaN \n", "10987501 NaN \n", "10987829 NaN \n", "10988444 [deep learning, atomistic simulation, graphene... \n", "\n", " external_ids \\\n", "433 NaN \n", "497 NaN \n", "898 NaN \n", "1719 [[researcherid, d-9056-2015]] \n", "6829 NaN \n", "... ... \n", "10985816 NaN \n", "10986027 NaN \n", "10987501 NaN \n", "10987829 NaN \n", "10988444 NaN \n", "\n", " education \\\n", "433 NaN \n", "497 NaN \n", "898 [[, electrical engineer, institución universit... \n", "1719 [[department of geophysics, master of science ... \n", "6829 [[business management, master of science, ista... 
\n", "... ... \n", "10985816 [[department of physics, doctor of philosophy,... \n", "10986027 [[chemical engineering technology, master, kat... \n", "10987501 NaN \n", "10987829 [[materials science, msc. materials science, m... \n", "10988444 NaN \n", "\n", " employment n_works \\\n", "433 [[senior engineer, robert bosch (india), benga... 1 \n", "497 NaN 0 \n", "898 [[professor, institución universitaria pascual... 1 \n", "1719 [[postdoctoral associate, yale university, new... 4 \n", "6829 NaN 0 \n", "... ... ... \n", "10985816 [[, university of oxford, oxford, oxfordshire,... 10 \n", "10986027 [[phd researcher, katholieke universiteit leuv... 0 \n", "10987501 NaN 1 \n", "10987829 [[co-owner, aeft acessory, manaus, amazonas, b... 0 \n", "10988444 [[post-doctor, zhejiang university, hangzhou, ... 6 \n", "\n", " works_source activation_date \\\n", "433 [abhishek solanki] 2019-04-22t04:43:06.232z \n", "497 NaN 2017-11-15t06:33:45.625z \n", "898 [crossref] 2019-03-31t00:00:42.929z \n", "1719 [andrea tesoniero] 2015-03-09t11:59:06.093z \n", "6829 NaN 2020-07-26t10:38:03.721z \n", "... ... ... \n", "10985816 [crossref metadata search] 2016-07-28t14:24:16.844z \n", "10986027 NaN 2020-10-14t13:56:44.779z \n", "10987501 [crossref] 2013-09-16t16:52:06.120z \n", "10987829 NaN 2020-03-02t20:11:01.699z \n", "10988444 [manareldeen ahmed] 2017-02-17t13:18:36.540z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", "433 2020-07-02t14:18:28.305z 0 0 0 0 \n", "497 2017-11-15t06:44:02.998z 0 0 0 0 \n", "898 2020-09-06t02:18:54.290z 1 0 0 0 \n", "1719 2020-08-20t15:03:23.447z 4 0 0 2 \n", "6829 2020-07-26t10:52:26.539z 0 0 0 0 \n", "... ... ... ... ... ... 
\n", "10985816 2020-10-13t11:47:50.621z 10 0 0 0 \n", "10986027 2020-10-16t14:21:40.673z 0 0 0 0 \n", "10987501 2020-12-01t22:47:08.074z 1 0 0 0 \n", "10987829 2020-12-04t13:53:39.404z 0 0 0 0 \n", "10988444 2020-12-04t02:04:36.668z 6 0 0 3 \n", "\n", " label primary_email_domain other_email_domains \\\n", "433 False NaN [in.bosch.com] \n", "497 False NaN [hct.ac.ae] \n", "898 True NaN [pascualbravo.edu.co] \n", "1719 False NaN [yale.edu] \n", "6829 False NaN [gmail.com] \n", "... ... ... ... \n", "10985816 True NaN [cs.ox.ac.uk] \n", "10986027 False NaN [kuleuven.be] \n", "10987501 True NaN [cecot.com.br] \n", "10987829 False NaN [ime.eb.br] \n", "10988444 True NaN [hotmail.com] \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "433 [github.com, linkedin.com] 1 2 \n", "497 NaN 1 \n", "898 NaN 1 7 \n", "1719 NaN 1 1 \n", "6829 NaN 1 3 \n", "... ... ... ... ... ... \n", "10985816 [fu-berlin.de] 1 1 \n", "10986027 [linkedin.com] 1 1 \n", "10987501 NaN 1 \n", "10987829 NaN 1 \n", "10988444 NaN 1 5 \n", "\n", " n_education n_employment \n", "433 2 \n", "497 \n", "898 1 1 \n", "1719 4 2 \n", "6829 3 \n", "... ... ... \n", "10985816 3 2 \n", "10986027 2 1 \n", "10987501 \n", "10987829 2 1 \n", "10988444 1 \n", "\n", "[19814 rows x 30 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.primary_email.isna() & df.other_email_domains.notna()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## URLs" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
60000-0001-7402-0096TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[, kth royal institute of technology, stockho...0NaN2015-01-11t15:13:06.467z2016-06-14t23:55:59.896z0000FalseNaNNaN[kth.se]<NA>1<NA><NA><NA>1
110000-0001-8377-3508TrueTrue<NA><NA><NA>[fontana, milena da silva]<NA>[educação; informática; matemática.]NaNNaN[[, instituto federal de educação, ciência e t...0NaN2018-05-23t23:39:04.534z2019-10-16t02:50:11.007z0000FalseNaNNaN[cnpq.br]<NA>1<NA>1<NA>3
290000-0002-2638-4108TrueTrue<NA><NA>investigador de la universidad de oviedo. depa...NaN<NA>[constitutional history, history of political ...[[scopus author id, 54394231000]][[public law, ph doctor, university of oviedo,...[[professor of constitutional law, university ...1[crossref]2013-03-25t14:38:06.016z2020-07-01t13:10:37.025z1000FalseNaNNaN[unioviedo.es]<NA>11311
460000-0003-1435-6545TrueTrue<NA><NA><NA>NaN<NA>[prostate cancer, migration, culture cell][[researcherid, p-2223-2018]][[morfologia, , universidade estadual paulista...[[, universidade estadual paulista (unesp), in...0NaN2018-08-09t12:12:24.405z2020-04-22t01:38:03.184z0000FalseNaNNaN[cnpq.br, linkedin.com]<NA>21311
1580000-0003-1284-9741TrueTruealex percy antoniomanriquez paisig<NA>NaN<NA>NaNNaNNaNNaN0NaN2020-09-08t20:04:33.906z2020-09-08t20:25:55.432z0000FalseNaNNaN[youtube.com]<NA>1<NA><NA><NA><NA>
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "6 0000-0001-7402-0096 True True \n", "11 0000-0001-8377-3508 True True \n", "29 0000-0002-2638-4108 True True \n", "46 0000-0003-1435-6545 True True \n", "158 0000-0003-1284-9741 True True \n", "\n", " given_names family_name \\\n", "6 \n", "11 \n", "29 \n", "46 \n", "158 alex percy antonio manriquez paisig \n", "\n", " biography \\\n", "6 \n", "11 \n", "29 investigador de la universidad de oviedo. depa... \n", "46 \n", "158 \n", "\n", " other_names primary_email \\\n", "6 NaN \n", "11 [fontana, milena da silva] \n", "29 NaN \n", "46 NaN \n", "158 NaN \n", "\n", " keywords \\\n", "6 NaN \n", "11 [educação; informática; matemática.] \n", "29 [constitutional history, history of political ... \n", "46 [prostate cancer, migration, culture cell] \n", "158 NaN \n", "\n", " external_ids \\\n", "6 NaN \n", "11 NaN \n", "29 [[scopus author id, 54394231000]] \n", "46 [[researcherid, p-2223-2018]] \n", "158 NaN \n", "\n", " education \\\n", "6 NaN \n", "11 NaN \n", "29 [[public law, ph doctor, university of oviedo,... \n", "46 [[morfologia, , universidade estadual paulista... \n", "158 NaN \n", "\n", " employment n_works works_source \\\n", "6 [[, kth royal institute of technology, stockho... 0 NaN \n", "11 [[, instituto federal de educação, ciência e t... 0 NaN \n", "29 [[professor of constitutional law, university ... 1 [crossref] \n", "46 [[, universidade estadual paulista (unesp), in... 
0 NaN \n", "158 NaN 0 NaN \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "6 2015-01-11t15:13:06.467z 2016-06-14t23:55:59.896z 0 0 \n", "11 2018-05-23t23:39:04.534z 2019-10-16t02:50:11.007z 0 0 \n", "29 2013-03-25t14:38:06.016z 2020-07-01t13:10:37.025z 1 0 \n", "46 2018-08-09t12:12:24.405z 2020-04-22t01:38:03.184z 0 0 \n", "158 2020-09-08t20:04:33.906z 2020-09-08t20:25:55.432z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "6 0 0 False NaN NaN \n", "11 0 0 False NaN NaN \n", "29 0 0 False NaN NaN \n", "46 0 0 False NaN NaN \n", "158 0 0 False NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "6 [kth.se] 1 \n", "11 [cnpq.br] 1 1 \n", "29 [unioviedo.es] 1 1 3 \n", "46 [cnpq.br, linkedin.com] 2 1 3 \n", "158 [youtube.com] 1 \n", "\n", " n_education n_employment \n", "6 1 \n", "11 3 \n", "29 1 1 \n", "46 1 1 \n", "158 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.url_domains.notna()].head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidn_urls
32265180000-0002-1234-835X219
42060550000-0001-7478-4539174
49018700000-0002-7392-3792169
81842600000-0002-6938-9638152
27436480000-0002-5710-4041114
.........
109896440000-0002-1686-1935<NA>
109896450000-0002-3800-6331<NA>
109896460000-0002-8783-5814<NA>
109896470000-0002-7584-2283<NA>
109896480000-0003-0529-3538<NA>
\n", "

10989649 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid n_urls\n", "3226518 0000-0002-1234-835X 219\n", "4206055 0000-0001-7478-4539 174\n", "4901870 0000-0002-7392-3792 169\n", "8184260 0000-0002-6938-9638 152\n", "2743648 0000-0002-5710-4041 114\n", "... ... ...\n", "10989644 0000-0002-1686-1935 \n", "10989645 0000-0002-3800-6331 \n", "10989646 0000-0002-8783-5814 \n", "10989647 0000-0002-7584-2283 \n", "10989648 0000-0003-0529-3538 \n", "\n", "[10989649 rows x 2 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)\n", "urls_by_orcid" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "0000-0002-1234-835X", "0000-0001-7478-4539", "0000-0002-7392-3792", "0000-0002-6938-9638", "0000-0002-5710-4041", "0000-0003-2450-090X", "0000-0002-3920-7389", "0000-0002-6689-4129", "0000-0001-5384-0001", "0000-0002-4621-5571", "0000-0001-9131-1266", "0000-0002-7754-8889", "0000-0002-5250-1144", "0000-0002-9025-8632", "0000-0002-7456-3848", "0000-0003-0176-1293", "0000-0003-0321-7339", "0000-0002-8493-0402", "0000-0002-9965-2425", "0000-0001-8873-6677", "0000-0002-3997-5070", "0000-0002-1856-6905", "0000-0002-4316-1467", "0000-0002-4062-3603", "0000-0003-0594-2462", "0000-0001-5880-7091", "0000-0003-1524-6268", "0000-0002-0752-7513", "0000-0003-2593-7134", "0000-0002-1298-5252", "0000-0003-1761-3842", "0000-0003-2383-8386", "0000-0003-3546-2312", "0000-0002-2886-9248", "0000-0003-2183-8112", "0000-0002-1929-6054", "0000-0003-4948-9268", "0000-0003-2407-3557", "0000-0002-9276-6921", "0000-0003-1484-6958", "0000-0002-7568-3403", "0000-0002-4305-4215", "0000-0002-4004-6666", "0000-0003-0796-0234", "0000-0001-7133-6896", "0000-0002-8208-0897", "0000-0002-9071-5450", 
"0000-0003-4993-5555", "0000-0003-0930-6121", "0000-0002-8116-9611", "0000-0002-5139-2660", "0000-0002-3277-9659", "0000-0002-8122-879X", "0000-0001-9559-1103", "0000-0003-2862-6315", "0000-0002-2000-8339", "0000-0001-5300-4601", "0000-0002-6254-8683", "0000-0002-6547-0172", "0000-0003-4808-6619", "0000-0003-3933-0229", "0000-0002-0971-9375", "0000-0003-0694-1154", "0000-0003-1585-1134", "0000-0002-4659-5391", "0000-0002-2916-2893", "0000-0001-6783-2037", "0000-0001-6461-2573", "0000-0003-4501-3756", "0000-0001-5549-6822", "0000-0002-8940-3177", "0000-0003-4326-9336", "0000-0001-8096-4333", "0000-0001-8978-4830", "0000-0002-5946-1595", "0000-0002-6680-1703", "0000-0002-8593-9257", "0000-0002-7653-4899", "0000-0003-1904-4188", "0000-0002-5196-4905", "0000-0001-8808-4867", "0000-0001-6921-0426", "0000-0003-1815-1993", "0000-0002-7843-8497", "0000-0003-1675-2840", "0000-0001-8644-2114", "0000-0003-0907-9870", "0000-0001-7784-0583", "0000-0001-7550-5802", "0000-0001-8986-2528", "0000-0002-5265-6074", "0000-0001-9102-8639", "0000-0002-0696-8560", "0000-0001-6979-4273", "0000-0002-7179-6953", "0000-0002-3334-9386", "0000-0001-6714-009X", "0000-0001-7193-5039", "0000-0002-5241-1026", "0000-0001-7608-9433" ], "y": [ 219, 174, 169, 152, 114, 114, 111, 104, 104, 90, 83, 83, 81, 81, 80, 80, 80, 76, 73, 72, 71, 70, 69, 69, 68, 68, 68, 68, 67, 67, 66, 66, 65, 64, 61, 61, 61, 59, 57, 57, 57, 57, 57, 57, 57, 56, 55, 55, 55, 55, 51, 50, 50, 50, 49, 49, 48, 48, 48, 48, 47, 47, 46, 46, 46, 45, 45, 45, 45, 44, 43, 43, 43, 43, 42, 42, 42, 41, 41, 41, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 36, 36, 36, 36 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", 
"gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 
0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { 
"outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, 
"hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 100 ORCID iDs with URLs" }, "xaxis": { "range": [ -0.5, 99.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(100)\n", "data = [\n", " go.Bar(\n", " x=urls_by_orcid[:TOP_N]['orcid'],\n", " y=urls_by_orcid[:TOP_N]['n_urls']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top %s ORCID iDs with URLs' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "top_urls = df[['orcid', 'url_domains']]\\\n", " .explode('url_domains')\\\n", " .reset_index(drop=True)\\\n", " .groupby('url_domains')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "linkedin.com", "researchgate.net", "google.com", "cnpq.br", "academia.edu", "twitter.com", "facebook.com", "publons.com", "wordpress.com", "mendeley.com", "instagram.com", "github.io", "google.com.ua", "blogspot.com", "github.com", "google.es", "helsinki.fi", "unirioja.es", "youtube.com", "wixsite.com", "ku.dk", "", "scopus.com", "weebly.com", "us.es", "kth.se", "cityu.edu.hk", "au.dk", "kcl.ac.uk", "man.ac.uk", "google.com.au", "ucl.ac.uk", "sdu.dk", "ugr.es", "researcherid.com", "mq.edu.au", "ntu.edu.tw", "dtu.dk", "rug.nl", "colciencias.gov.co", "google.co.in", "bris.ac.uk", "uwa.edu.au", "uc3m.es", "vub.be", "bu.edu", "monash.edu", "google.co.uk", "aau.dk", "lancs.ac.uk" ], "y": [ 78418, 67823, 44804, 24635, 21174, 19046, 15368, 10751, 9043, 6960, 6040, 5516, 5371, 5272, 5252, 5163, 4730, 4590, 4470, 4140, 3771, 3620, 3586, 3122, 3037, 2957, 2795, 2746, 2724, 2689, 2610, 2586, 2478, 2231, 2134, 2133, 2094, 2002, 1975, 1929, 1917, 1840, 1820, 1804, 1803, 1803, 1772, 
1656, 1653, 1650 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, 
"#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } 
], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, 
"#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top-50 URL domains" }, "xaxis": { "range": [ -0.5, 49.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, 
"text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(50)\n", "data = [\n", " go.Bar(\n", " x=top_urls[:TOP_N].index,\n", " y=top_urls[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top-%s URL domains' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## URLs speculation" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
10257130000-0003-2407-3557TrueTrueabdulazizabdul aziz was born on may 25, 1973, in brebes...[abdul aziz, aziz, abdul, aziz, a., aziz, abd,...<NA>[metodologi penelitian, ilmu ekonomi, ekonomi ...NaN[[ilmu ekonomi, dr, universitas borobudur, jak...[[assisten professor/dr, institut agama islam ...72[base - bielefeld academic search engine, abdu...2016-09-12t04:41:24.842z2021-01-26t11:58:33.039z190077FalseNaNNaN[google.com, syekhnurjati.ac.id, orcid.org, bl...<NA>59<NA>431
27436480000-0002-5710-4041TrueTrueryszardromaniukprofessor of electronics and communications en...[r.romaniuk, r.s.romaniuk, ryszard romaniuk, r...rrom@ise.pw.edu.pl[electronics, measurement systems, research sy...[[isni, 0000000071432485], [researcherid, b-91...[[faculty of electronics and information techn...[[professor, institute director, politechnika ...5008[inspire-hep, researcherid, isni2orcid search ...2013-01-20t12:09:21.600z2021-03-16t19:37:31.650z12212501742Trueise.pw.edu.pl[ise.pw.edu.pl, elka.pw.edu.pl, cern.ch][google.pl, publons.com, scopus.com, mendeley....31143511
30117240000-0003-2450-090XTrueTrueeduardbabulakprofessor eduard babulak is accomplished inter...[professor eduard babulak]<NA>[quality of service provision assessment, next...[[scopus author id, 6506867432], [researcherid...[[information technology, doctor habilitated (...[[consultant, horizon 2020 framework programme...274[the lens, base - bielefeld academic search en...2013-04-03t08:02:30.013z2021-02-28t10:07:13.231z19901174FalseNaNNaN[worldassessmentcouncil.org, spseke.sk, bcs.or...<NA>11458622
38810640000-0002-3920-7389TrueTrueа.гусевsurname, name gusev alexander leonidovichdate...[alexander l. gusev , alexander leonidovich gu...<NA>[technologies of production, technologies of i...[[researcherid, f-8048-2014], [scopus author i...[[chemical technology and cryogenic-vacuum tec...[[general director, scientific technical centr...472[publons, datacite, scopus - elsevier, a.l. gu...2014-05-14t00:01:28.030z2021-01-16t13:44:14.134z370021FalseNaNNaN[youtube.com, isjaee.com, researchgate.net, re...<NA>11121627
74660620000-0002-1929-6054TrueTruefranklin américocanaza choquedocente-investigador social. maestrando en der...[franklin américo canaza-choque , franklin a. ...leo_123fa@hotmail.com[filosofía; educación; políticas de desarrollo...[[researcherid, p-8613-2018], [loop profile, 8...[[facultad de ciencias de la educación , maest...[[investigador social, universidad católica de...39[researcherid, base - bielefeld academic searc...2017-09-15t19:45:43.483z2021-03-23t20:12:47.297z300034Truehotmail.com[gmail.com, gmail.com, hotmail.com, baldwin.ed...[concytec.gob.pe, redalyc.org, redalyc.org, un...5614211
75170960000-0003-4948-9268TrueTruegustavoduperrégustavo norberto duperré graduated in arts and...[gustavo norberto duperré, duperré, g. n., gus...gustavo.duperre@usal.edu.ar[computer science, sciences of antiquity, cont...[[scopus author id, 57195936346], [researcheri...[[programme in history, history of art and ter...[[titular professor, dirección general de cult...41[gustavo duperré, scopus - elsevier, publons, ...2020-02-22t15:49:52.386z2021-03-12t15:13:44.065z130034Falseusal.edu.arNaN[icomos.ro, unirioja.es, unirioja.es, unc.edu....<NA>6121165
80682750000-0003-2183-8112TrueTruepelayo munhozoleapós-doutorado em gestão ambiental pela univers...[ munhoz, pelayo olea, olea, pelayo, olea, p...<NA>[empreendedorismo, sustentabilidade, inovação][[scopus author id, 55175503300], [researcheri...[[, postdoctoral in environmental sustainabili...[[professor, universidade federal do rio grand...1109[the lens, pelayo munhoz olea, dimensions, bas...2013-02-04t17:25:34.723z2021-03-19t18:51:01.128z79801582TrueNaNNaN[cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c...<NA>612379
81842600000-0002-6938-9638TrueTrueadolfocatral sanabriamy education is in computer science, mathemati...NaN<NA>NaN[[loop profile, 747193]][[education, capacitación para la enseñanza en...NaN2023[base - bielefeld academic search engine, data...2019-05-07t19:27:02.210z2020-12-10t23:39:15.236z20220016FalseNaNNaN[researchgate.net, youtube.com, linkedin.com, ...<NA>1521<NA>6<NA>
87912560000-0002-9025-8632TrueTruebuycannabisdispensarywe procure and deliver premium cannabis strain...[we procure and deliver premium cannabis strai...<NA>[marijuana dispensary, cannabis, canabis dispe...NaNNaNNaN10[goowonderland dispensary]2020-12-09t21:19:46.004z2020-12-10t01:17:28.772z0000FalseNaNNaN[goowonderland.com, goowonderland.com, goowond...<NA>81<NA>7<NA><NA>
101745090000-0002-9965-2425TrueTruejaroslawspychalajaroslaw spychala has received a doctoral degr...[jaroslaw jozef spychala]<NA>[photochemistry, medicinal and pharmaceutical ...[[scopus author id, 7006745874]][[department of chemistry, postdoctoral associ...[[assistant professor, adam mickiewicz univers...29[scopus - elsevier]2014-09-18t12:34:14.242z2020-02-11t14:31:25.544z150029TrueNaNNaN[biowebspin.com, biowebspin.com, google.com, l...<NA>731442
102578080000-0002-4062-3603TrueTruejuan de diosbeltrán mancillajuan de dios beltrán mancilla (*) filósofo aut...[juan de dios beltrán mancilla, filósofo autod...<NA>[filosofia medicina arquitectura economía dere...NaN[[, diplomado en practicas directivas para or...[[inspector general jornada vespertina // de 2...11[juan de dios beltr´´án mancilla]2020-04-19t21:06:33.495z2021-02-10t20:13:07.698z0007FalseNaNNaN[yumpu.com, ijopm.org, google.com, blogspot.co...<NA>69<NA>186
104862120000-0002-3997-5070TrueTruedr. parameshacharib ddr. parameshachari b dacm distinguished speake...[dr. parameshachari b d]<NA>[mysore region coordinator|ieee bangalore sect...[[researcherid, f-7045-2018], [scopus author i...[[electronics and communication engineering, p...[[acm distinguished speaker (volunteer), assoc...93[publons, multidisciplinary digital publishing...2016-08-24t11:00:30.403z2021-03-23t07:16:22.582z470048FalseNaNNaN[geethashishu.in, geethashishu.in, acm.org, go...<NA>7136510
106526320000-0003-2593-7134TrueTrueaanjaelaniall my papers can be downloaded from portal:re...[jaelani, a., jaelani, aan]aan_jaelani@syekhnurjati.ac.id[islamic economics, islamic finance and bankin...[[scopus author id, 57195963463], [loop profil...[[post graduate, s3/dr, universitas islam nege...[[dr, institut agama islam negeri syekh nurjat...79[publons, aan jaelani, scopus - elsevier, dime...2016-03-02t18:37:44.989z2021-03-19t10:11:57.908z8800193Truesyekhnurjati.ac.id[gmail.com][microsoft.com, twitter.com, academia.edu, aca...1674721
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "1025713 0000-0003-2407-3557 True True \n", "2743648 0000-0002-5710-4041 True True \n", "3011724 0000-0003-2450-090X True True \n", "3881064 0000-0002-3920-7389 True True \n", "7466062 0000-0002-1929-6054 True True \n", "7517096 0000-0003-4948-9268 True True \n", "8068275 0000-0003-2183-8112 True True \n", "8184260 0000-0002-6938-9638 True True \n", "8791256 0000-0002-9025-8632 True True \n", "10174509 0000-0002-9965-2425 True True \n", "10257808 0000-0002-4062-3603 True True \n", "10486212 0000-0002-3997-5070 True True \n", "10652632 0000-0003-2593-7134 True True \n", "\n", " given_names family_name \\\n", "1025713 abdul aziz \n", "2743648 ryszard romaniuk \n", "3011724 eduard babulak \n", "3881064 а. гусев \n", "7466062 franklin américo canaza choque \n", "7517096 gustavo duperré \n", "8068275 pelayo munhoz olea \n", "8184260 adolfo catral sanabria \n", "8791256 buycannabis dispensary \n", "10174509 jaroslaw spychala \n", "10257808 juan de dios beltrán mancilla \n", "10486212 dr. parameshachari b d \n", "10652632 aan jaelani \n", "\n", " biography \\\n", "1025713 abdul aziz was born on may 25, 1973, in brebes... \n", "2743648 professor of electronics and communications en... \n", "3011724 professor eduard babulak is accomplished inter... \n", "3881064 surname, name gusev alexander leonidovichdate... \n", "7466062 docente-investigador social. maestrando en der... \n", "7517096 gustavo norberto duperré graduated in arts and... \n", "8068275 pós-doutorado em gestão ambiental pela univers... \n", "8184260 my education is in computer science, mathemati... \n", "8791256 we procure and deliver premium cannabis strain... \n", "10174509 jaroslaw spychala has received a doctoral degr... \n", "10257808 juan de dios beltrán mancilla (*) filósofo aut... \n", "10486212 dr. parameshachari b dacm distinguished speake... \n", "10652632 all my papers can be downloaded from portal:re... 
\n", "\n", " other_names \\\n", "1025713 [abdul aziz, aziz, abdul, aziz, a., aziz, abd,... \n", "2743648 [r.romaniuk, r.s.romaniuk, ryszard romaniuk, r... \n", "3011724 [professor eduard babulak] \n", "3881064 [alexander l. gusev , alexander leonidovich gu... \n", "7466062 [franklin américo canaza-choque , franklin a. ... \n", "7517096 [gustavo norberto duperré, duperré, g. n., gus... \n", "8068275 [ munhoz, pelayo olea, olea, pelayo, olea, p... \n", "8184260 NaN \n", "8791256 [we procure and deliver premium cannabis strai... \n", "10174509 [jaroslaw jozef spychala] \n", "10257808 [juan de dios beltrán mancilla, filósofo autod... \n", "10486212 [dr. parameshachari b d] \n", "10652632 [jaelani, a., jaelani, aan] \n", "\n", " primary_email \\\n", "1025713 \n", "2743648 rrom@ise.pw.edu.pl \n", "3011724 \n", "3881064 \n", "7466062 leo_123fa@hotmail.com \n", "7517096 gustavo.duperre@usal.edu.ar \n", "8068275 \n", "8184260 \n", "8791256 \n", "10174509 \n", "10257808 \n", "10486212 \n", "10652632 aan_jaelani@syekhnurjati.ac.id \n", "\n", " keywords \\\n", "1025713 [metodologi penelitian, ilmu ekonomi, ekonomi ... \n", "2743648 [electronics, measurement systems, research sy... \n", "3011724 [quality of service provision assessment, next... \n", "3881064 [technologies of production, technologies of i... \n", "7466062 [filosofía; educación; políticas de desarrollo... \n", "7517096 [computer science, sciences of antiquity, cont... \n", "8068275 [empreendedorismo, sustentabilidade, inovação] \n", "8184260 NaN \n", "8791256 [marijuana dispensary, cannabis, canabis dispe... \n", "10174509 [photochemistry, medicinal and pharmaceutical ... \n", "10257808 [filosofia medicina arquitectura economía dere... \n", "10486212 [mysore region coordinator|ieee bangalore sect... \n", "10652632 [islamic economics, islamic finance and bankin... \n", "\n", " external_ids \\\n", "1025713 NaN \n", "2743648 [[isni, 0000000071432485], [researcherid, b-91... 
\n", "3011724 [[scopus author id, 6506867432], [researcherid... \n", "3881064 [[researcherid, f-8048-2014], [scopus author i... \n", "7466062 [[researcherid, p-8613-2018], [loop profile, 8... \n", "7517096 [[scopus author id, 57195936346], [researcheri... \n", "8068275 [[scopus author id, 55175503300], [researcheri... \n", "8184260 [[loop profile, 747193]] \n", "8791256 NaN \n", "10174509 [[scopus author id, 7006745874]] \n", "10257808 NaN \n", "10486212 [[researcherid, f-7045-2018], [scopus author i... \n", "10652632 [[scopus author id, 57195963463], [loop profil... \n", "\n", " education \\\n", "1025713 [[ilmu ekonomi, dr, universitas borobudur, jak... \n", "2743648 [[faculty of electronics and information techn... \n", "3011724 [[information technology, doctor habilitated (... \n", "3881064 [[chemical technology and cryogenic-vacuum tec... \n", "7466062 [[facultad de ciencias de la educación , maest... \n", "7517096 [[programme in history, history of art and ter... \n", "8068275 [[, postdoctoral in environmental sustainabili... \n", "8184260 [[education, capacitación para la enseñanza en... \n", "8791256 NaN \n", "10174509 [[department of chemistry, postdoctoral associ... \n", "10257808 [[, diplomado en practicas directivas para or... \n", "10486212 [[electronics and communication engineering, p... \n", "10652632 [[post graduate, s3/dr, universitas islam nege... \n", "\n", " employment n_works \\\n", "1025713 [[assisten professor/dr, institut agama islam ... 72 \n", "2743648 [[professor, institute director, politechnika ... 5008 \n", "3011724 [[consultant, horizon 2020 framework programme... 274 \n", "3881064 [[general director, scientific technical centr... 472 \n", "7466062 [[investigador social, universidad católica de... 39 \n", "7517096 [[titular professor, dirección general de cult... 41 \n", "8068275 [[professor, universidade federal do rio grand... 
1109 \n", "8184260 NaN 2023 \n", "8791256 NaN 10 \n", "10174509 [[assistant professor, adam mickiewicz univers... 29 \n", "10257808 [[inspector general jornada vespertina // de 2... 11 \n", "10486212 [[acm distinguished speaker (volunteer), assoc... 93 \n", "10652632 [[dr, institut agama islam negeri syekh nurjat... 79 \n", "\n", " works_source \\\n", "1025713 [base - bielefeld academic search engine, abdu... \n", "2743648 [inspire-hep, researcherid, isni2orcid search ... \n", "3011724 [the lens, base - bielefeld academic search en... \n", "3881064 [publons, datacite, scopus - elsevier, a.l. gu... \n", "7466062 [researcherid, base - bielefeld academic searc... \n", "7517096 [gustavo duperré, scopus - elsevier, publons, ... \n", "8068275 [the lens, pelayo munhoz olea, dimensions, bas... \n", "8184260 [base - bielefeld academic search engine, data... \n", "8791256 [goowonderland dispensary] \n", "10174509 [scopus - elsevier] \n", "10257808 [juan de dios beltr´´án mancilla] \n", "10486212 [publons, multidisciplinary digital publishing... \n", "10652632 [publons, aan jaelani, scopus - elsevier, dime... 
\n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "1025713 2016-09-12t04:41:24.842z 2021-01-26t11:58:33.039z 19 0 \n", "2743648 2013-01-20t12:09:21.600z 2021-03-16t19:37:31.650z 1221 25 \n", "3011724 2013-04-03t08:02:30.013z 2021-02-28t10:07:13.231z 199 0 \n", "3881064 2014-05-14t00:01:28.030z 2021-01-16t13:44:14.134z 37 0 \n", "7466062 2017-09-15t19:45:43.483z 2021-03-23t20:12:47.297z 30 0 \n", "7517096 2020-02-22t15:49:52.386z 2021-03-12t15:13:44.065z 13 0 \n", "8068275 2013-02-04t17:25:34.723z 2021-03-19t18:51:01.128z 798 0 \n", "8184260 2019-05-07t19:27:02.210z 2020-12-10t23:39:15.236z 2022 0 \n", "8791256 2020-12-09t21:19:46.004z 2020-12-10t01:17:28.772z 0 0 \n", "10174509 2014-09-18t12:34:14.242z 2020-02-11t14:31:25.544z 15 0 \n", "10257808 2020-04-19t21:06:33.495z 2021-02-10t20:13:07.698z 0 0 \n", "10486212 2016-08-24t11:00:30.403z 2021-03-23t07:16:22.582z 47 0 \n", "10652632 2016-03-02t18:37:44.989z 2021-03-19t10:11:57.908z 88 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain \\\n", "1025713 0 77 False NaN \n", "2743648 0 1742 True ise.pw.edu.pl \n", "3011724 1 174 False NaN \n", "3881064 0 21 False NaN \n", "7466062 0 34 True hotmail.com \n", "7517096 0 34 False usal.edu.ar \n", "8068275 1 582 True NaN \n", "8184260 0 16 False NaN \n", "8791256 0 0 False NaN \n", "10174509 0 29 True NaN \n", "10257808 0 7 False NaN \n", "10486212 0 48 False NaN \n", "10652632 0 193 True syekhnurjati.ac.id \n", "\n", " other_email_domains \\\n", "1025713 NaN \n", "2743648 [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] \n", "3011724 NaN \n", "3881064 NaN \n", "7466062 [gmail.com, gmail.com, hotmail.com, baldwin.ed... \n", "7517096 NaN \n", "8068275 NaN \n", "8184260 NaN \n", "8791256 NaN \n", "10174509 NaN \n", "10257808 NaN \n", "10486212 NaN \n", "10652632 [gmail.com] \n", "\n", " url_domains n_emails n_urls \\\n", "1025713 [google.com, syekhnurjati.ac.id, orcid.org, bl... 59 \n", "2743648 [google.pl, publons.com, scopus.com, mendeley.... 
3 114 \n", "3011724 [worldassessmentcouncil.org, spseke.sk, bcs.or... 114 \n", "3881064 [youtube.com, isjaee.com, researchgate.net, re... 111 \n", "7466062 [concytec.gob.pe, redalyc.org, redalyc.org, un... 5 61 \n", "7517096 [icomos.ro, unirioja.es, unirioja.es, unc.edu.... 61 \n", "8068275 [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... 61 \n", "8184260 [researchgate.net, youtube.com, linkedin.com, ... 152 \n", "8791256 [goowonderland.com, goowonderland.com, goowond... 81 \n", "10174509 [biowebspin.com, biowebspin.com, google.com, l... 73 \n", "10257808 [yumpu.com, ijopm.org, google.com, blogspot.co... 69 \n", "10486212 [geethashishu.in, geethashishu.in, acm.org, go... 71 \n", "10652632 [microsoft.com, twitter.com, academia.edu, aca... 1 67 \n", "\n", " n_ids n_keywords n_education n_employment \n", "1025713 4 3 1 \n", "2743648 3 5 1 1 \n", "3011724 5 8 6 22 \n", "3881064 2 16 2 7 \n", "7466062 4 2 1 1 \n", "7517096 2 11 6 5 \n", "8068275 2 3 7 9 \n", "8184260 1 6 \n", "8791256 7 \n", "10174509 1 4 4 2 \n", "10257808 1 8 6 \n", "10486212 3 6 5 10 \n", "10652632 4 7 2 1 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
474390000-0002-5967-2835TrueTrueoleksiygoryayinov<NA>[алексей николаевич горяинов, о.м.горяїнов, а....<NA>[diagnostics, transport, logistics][[researcherid, i-7977-2016]][[, дистанционный курс «ctl.sc2x: supply chain...[[docent, kharkiv petro vasylenko national tec...274[oleksiy goryayinov]2014-08-03t18:06:42.925z2021-03-22t13:56:48.311z0000FalseNaNNaN[khntusg.com.ua, khntusg.com.ua, google.com.ua...<NA>1313147
725570000-0002-3505-2797TrueTruenurulmalahayatigoogle scholarNaN<NA>NaN[[researcherid, q-3861-2017]][[civil and transportation engineering , maste...[[senior lecturer, universitas syiah kuala, ba...6[nurul malahayati]2017-10-01t00:46:31.324z2019-08-19t15:52:47.253z3003FalseNaNNaN[google.com, ristekdikti.go.id, unsyiah.ac.id,...<NA>161<NA>21
940810000-0003-3670-9620TrueTruecarlosbarreraim individual inventor, and this is my work; s...[retrodynamic, novelinflow]<NA>[energy, technology, gearturbine, imploturboco...[[loop profile, 394457]]NaNNaN1[carlos barrera]2016-08-29t20:32:10.362z2021-02-09t04:56:35.554z0000FalseNaNNaN[blogspot.mx, behance.net, authorstream.com, d...<NA>2418<NA><NA>
2616730000-0002-5441-0465TrueTruenuriahernández-león<NA>[nuria h. león, nuria hernández león, hernánde...<NA>[training, icts, business management, research...NaN[[, course: social skills, university of salam...[[merchandise reception and expedition trainer...11[nuria hernández-león]2015-11-28t07:18:58.442z2021-03-05t16:37:47.403z1004FalseNaNNaN[feriaempresamujer.com, escueladenegociosydire...<NA>16<NA>71916
3262110000-0002-7781-6767TrueTruemohd nazriismailborn in penang, malaysia in 1971, dr. mohd had...[ndum (national defence university of malaysia)]<NA>[wsn, manet, simulation and modelling, network...[[scopus author id, 24372977800], [researcheri...NaN[[lecturer, universiti pertahanan nasional mal...35[scopus - elsevier]2016-09-06t02:25:52.974z2020-10-20t06:55:55.051z240035TrueNaNNaN[google.com.my, researchgate.net, academia.edu...<NA>16210<NA>4
.............................................................................................
105798010000-0001-5087-6965TrueTruerobertoharasystematics, evolutionary biology, and the his...[r. o’hara, r.j. o’hara, robert o’hara, robert...<NA>[history and philosophy of science, evolutiona...[[isni, 0000000138200102], [researcherid, b-47...[[biology, ph.d., harvard university, cambridg...NaN45[robert j. o’hara]2014-09-21t02:45:19.620z2020-07-09t06:51:09.228z230072TrueNaNNaN[rjohara.net, google.com, collegiateway.org, r...<NA>12351<NA>
105908820000-0002-3318-9861TrueTrueshaguftaperveenprof. dr. shagufta perveen is a professor at k...NaNshagufta792000@yahoo.com[shagufta perveen professor, shagufta perveen ...NaN[[hej research institute of chemistry, phd che...[[professor, king saud university college of p...66[scopus - elsevier]2015-12-21t10:34:06.771z2021-02-22t14:58:30.893z560066Trueyahoo.com[msu.edu, ksu.edu.sa][shaguftaperveen.com, researchgate.net, ksu.ed...211<NA>2537
107660620000-0001-8960-9004TrueTruesusanbastani<NA>[s. bastani, سوسن باستانی]sbastani@alzahra.ac.ir[social networks, fuzzy logic, online and offl...[[scopus author id, 16642098400]][[sociology, ph.d., university of toronto, tor...[[professor, alzahra university, tehran, vanak...20[scopus - elsevier]2019-07-10t06:50:46.255z2020-10-07t04:08:01.961z190033Truealzahra.ac.ir[gmail.com, gmail.com][scopus.com, google.com, publons.com, zenodo.o...2111434
108078390000-0002-4379-6454TrueTruecaroline wanjirukariukicaroline holds a phd in economics from curtin ...NaN<NA>[development economics, applied econometrics, ...NaN[[economics, doctor of philosophy , curtin uni...[[director, educational development, strathmor...4[caroline wanjiru kariuki]2020-03-18t10:18:04.007z2021-02-11t14:40:38.515z1000FalseNaNNaN[scopus.com, mendeley.com, publons.com, resear...<NA>13<NA>436
109119660000-0003-2311-0600TrueTruemyokyaw hlaing<NA>[dr myo kyaw hlaing]<NA>[economic geology]NaNNaN[[lecturer, union of myanmar ministry of educa...2[myo kyaw hlaing]2018-12-26t12:51:57.801z2021-01-26t14:36:47.421z1002FalseNaNNaN[facebook.com, linkedin.com, instagram.com, re...<NA>12<NA>1<NA>2
\n", "

140 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "47439 0000-0002-5967-2835 True True \n", "72557 0000-0002-3505-2797 True True \n", "94081 0000-0003-3670-9620 True True \n", "261673 0000-0002-5441-0465 True True \n", "326211 0000-0002-7781-6767 True True \n", "... ... ... ... \n", "10579801 0000-0001-5087-6965 True True \n", "10590882 0000-0002-3318-9861 True True \n", "10766062 0000-0001-8960-9004 True True \n", "10807839 0000-0002-4379-6454 True True \n", "10911966 0000-0003-2311-0600 True True \n", "\n", " given_names family_name \\\n", "47439 oleksiy goryayinov \n", "72557 nurul malahayati \n", "94081 carlos barrera \n", "261673 nuria hernández-león \n", "326211 mohd nazri ismail \n", "... ... ... \n", "10579801 robert ohara \n", "10590882 shagufta perveen \n", "10766062 susan bastani \n", "10807839 caroline wanjiru kariuki \n", "10911966 myo kyaw hlaing \n", "\n", " biography \\\n", "47439 \n", "72557 google scholar \n", "94081 im individual inventor, and this is my work; s... \n", "261673 \n", "326211 born in penang, malaysia in 1971, dr. mohd had... \n", "... ... \n", "10579801 systematics, evolutionary biology, and the his... \n", "10590882 prof. dr. shagufta perveen is a professor at k... \n", "10766062 \n", "10807839 caroline holds a phd in economics from curtin ... \n", "10911966 \n", "\n", " other_names \\\n", "47439 [алексей николаевич горяинов, о.м.горяїнов, а.... \n", "72557 NaN \n", "94081 [retrodynamic, novelinflow] \n", "261673 [nuria h. león, nuria hernández león, hernánde... \n", "326211 [ndum (national defence university of malaysia)] \n", "... ... \n", "10579801 [r. o’hara, r.j. o’hara, robert o’hara, robert... \n", "10590882 NaN \n", "10766062 [s. bastani, سوسن باستانی] \n", "10807839 NaN \n", "10911966 [dr myo kyaw hlaing] \n", "\n", " primary_email \\\n", "47439 \n", "72557 \n", "94081 \n", "261673 \n", "326211 \n", "... ... 
\n", "10579801 \n", "10590882 shagufta792000@yahoo.com \n", "10766062 sbastani@alzahra.ac.ir \n", "10807839 \n", "10911966 \n", "\n", " keywords \\\n", "47439 [diagnostics, transport, logistics] \n", "72557 NaN \n", "94081 [energy, technology, gearturbine, imploturboco... \n", "261673 [training, icts, business management, research... \n", "326211 [wsn, manet, simulation and modelling, network... \n", "... ... \n", "10579801 [history and philosophy of science, evolutiona... \n", "10590882 [shagufta perveen professor, shagufta perveen ... \n", "10766062 [social networks, fuzzy logic, online and offl... \n", "10807839 [development economics, applied econometrics, ... \n", "10911966 [economic geology] \n", "\n", " external_ids \\\n", "47439 [[researcherid, i-7977-2016]] \n", "72557 [[researcherid, q-3861-2017]] \n", "94081 [[loop profile, 394457]] \n", "261673 NaN \n", "326211 [[scopus author id, 24372977800], [researcheri... \n", "... ... \n", "10579801 [[isni, 0000000138200102], [researcherid, b-47... \n", "10590882 NaN \n", "10766062 [[scopus author id, 16642098400]] \n", "10807839 NaN \n", "10911966 NaN \n", "\n", " education \\\n", "47439 [[, дистанционный курс «ctl.sc2x: supply chain... \n", "72557 [[civil and transportation engineering , maste... \n", "94081 NaN \n", "261673 [[, course: social skills, university of salam... \n", "326211 NaN \n", "... ... \n", "10579801 [[biology, ph.d., harvard university, cambridg... \n", "10590882 [[hej research institute of chemistry, phd che... \n", "10766062 [[sociology, ph.d., university of toronto, tor... \n", "10807839 [[economics, doctor of philosophy , curtin uni... \n", "10911966 NaN \n", "\n", " employment n_works \\\n", "47439 [[docent, kharkiv petro vasylenko national tec... 274 \n", "72557 [[senior lecturer, universitas syiah kuala, ba... 6 \n", "94081 NaN 1 \n", "261673 [[merchandise reception and expedition trainer... 11 \n", "326211 [[lecturer, universiti pertahanan nasional mal... 35 \n", "... ... ... 
\n", "10579801 NaN 45 \n", "10590882 [[professor, king saud university college of p... 66 \n", "10766062 [[professor, alzahra university, tehran, vanak... 20 \n", "10807839 [[director, educational development, strathmor... 4 \n", "10911966 [[lecturer, union of myanmar ministry of educa... 2 \n", "\n", " works_source activation_date \\\n", "47439 [oleksiy goryayinov] 2014-08-03t18:06:42.925z \n", "72557 [nurul malahayati] 2017-10-01t00:46:31.324z \n", "94081 [carlos barrera] 2016-08-29t20:32:10.362z \n", "261673 [nuria hernández-león] 2015-11-28t07:18:58.442z \n", "326211 [scopus - elsevier] 2016-09-06t02:25:52.974z \n", "... ... ... \n", "10579801 [robert j. o’hara] 2014-09-21t02:45:19.620z \n", "10590882 [scopus - elsevier] 2015-12-21t10:34:06.771z \n", "10766062 [scopus - elsevier] 2019-07-10t06:50:46.255z \n", "10807839 [caroline wanjiru kariuki] 2020-03-18t10:18:04.007z \n", "10911966 [myo kyaw hlaing] 2018-12-26t12:51:57.801z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", "47439 2021-03-22t13:56:48.311z 0 0 0 0 \n", "72557 2019-08-19t15:52:47.253z 3 0 0 3 \n", "94081 2021-02-09t04:56:35.554z 0 0 0 0 \n", "261673 2021-03-05t16:37:47.403z 1 0 0 4 \n", "326211 2020-10-20t06:55:55.051z 24 0 0 35 \n", "... ... ... ... ... ... \n", "10579801 2020-07-09t06:51:09.228z 23 0 0 72 \n", "10590882 2021-02-22t14:58:30.893z 56 0 0 66 \n", "10766062 2020-10-07t04:08:01.961z 19 0 0 33 \n", "10807839 2021-02-11t14:40:38.515z 1 0 0 0 \n", "10911966 2021-01-26t14:36:47.421z 1 0 0 2 \n", "\n", " label primary_email_domain other_email_domains \\\n", "47439 False NaN NaN \n", "72557 False NaN NaN \n", "94081 False NaN NaN \n", "261673 False NaN NaN \n", "326211 True NaN NaN \n", "... ... ... ... 
\n", "10579801 True NaN NaN \n", "10590882 True yahoo.com [msu.edu, ksu.edu.sa] \n", "10766062 True alzahra.ac.ir [gmail.com, gmail.com] \n", "10807839 False NaN NaN \n", "10911966 False NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "47439 [khntusg.com.ua, khntusg.com.ua, google.com.ua... 13 \n", "72557 [google.com, ristekdikti.go.id, unsyiah.ac.id,... 16 \n", "94081 [blogspot.mx, behance.net, authorstream.com, d... 24 \n", "261673 [feriaempresamujer.com, escueladenegociosydire... 16 \n", "326211 [google.com.my, researchgate.net, academia.edu... 16 \n", "... ... ... ... \n", "10579801 [rjohara.net, google.com, collegiateway.org, r... 12 \n", "10590882 [shaguftaperveen.com, researchgate.net, ksu.ed... 2 11 \n", "10766062 [scopus.com, google.com, publons.com, zenodo.o... 2 11 \n", "10807839 [scopus.com, mendeley.com, publons.com, resear... 13 \n", "10911966 [facebook.com, linkedin.com, instagram.com, re... 12 \n", "\n", " n_ids n_keywords n_education n_employment \n", "47439 1 3 14 7 \n", "72557 1 2 1 \n", "94081 1 8 \n", "261673 7 19 16 \n", "326211 2 10 4 \n", "... ... ... ... ... \n", "10579801 3 5 1 \n", "10590882 25 3 7 \n", "10766062 1 4 3 4 \n", "10807839 4 3 6 \n", "10911966 1 2 \n", "\n", "[140 rows x 30 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
00000-0002-5967-2835TrueTrueoleksiygoryayinov<NA>[алексей николаевич горяинов, о.м.горяїнов, а....<NA>[diagnostics, transport, logistics][[researcherid, i-7977-2016]][[, дистанционный курс «ctl.sc2x: supply chain...[[docent, kharkiv petro vasylenko national tec...274oleksiy goryayinov2014-08-03t18:06:42.925z2021-03-22t13:56:48.311z0000FalseNaNNaN[khntusg.com.ua, khntusg.com.ua, google.com.ua...<NA>1313147
10000-0002-3505-2797TrueTruenurulmalahayatigoogle scholarNaN<NA>NaN[[researcherid, q-3861-2017]][[civil and transportation engineering , maste...[[senior lecturer, universitas syiah kuala, ba...6nurul malahayati2017-10-01t00:46:31.324z2019-08-19t15:52:47.253z3003FalseNaNNaN[google.com, ristekdikti.go.id, unsyiah.ac.id,...<NA>161<NA>21
20000-0003-3670-9620TrueTruecarlosbarreraim individual inventor, and this is my work; s...[retrodynamic, novelinflow]<NA>[energy, technology, gearturbine, imploturboco...[[loop profile, 394457]]NaNNaN1carlos barrera2016-08-29t20:32:10.362z2021-02-09t04:56:35.554z0000FalseNaNNaN[blogspot.mx, behance.net, authorstream.com, d...<NA>2418<NA><NA>
30000-0002-5441-0465TrueTruenuriahernández-león<NA>[nuria h. león, nuria hernández león, hernánde...<NA>[training, icts, business management, research...NaN[[, course: social skills, university of salam...[[merchandise reception and expedition trainer...11nuria hernández-león2015-11-28t07:18:58.442z2021-03-05t16:37:47.403z1004FalseNaNNaN[feriaempresamujer.com, escueladenegociosydire...<NA>16<NA>71916
40000-0002-7781-6767TrueTruemohd nazriismailborn in penang, malaysia in 1971, dr. mohd had...[ndum (national defence university of malaysia)]<NA>[wsn, manet, simulation and modelling, network...[[scopus author id, 24372977800], [researcheri...NaN[[lecturer, universiti pertahanan nasional mal...35scopus - elsevier2016-09-06t02:25:52.974z2020-10-20t06:55:55.051z240035TrueNaNNaN[google.com.my, researchgate.net, academia.edu...<NA>16210<NA>4
.............................................................................................
1350000-0001-5087-6965TrueTruerobertoharasystematics, evolutionary biology, and the his...[r. o’hara, r.j. o’hara, robert o’hara, robert...<NA>[history and philosophy of science, evolutiona...[[isni, 0000000138200102], [researcherid, b-47...[[biology, ph.d., harvard university, cambridg...NaN45robert j. o’hara2014-09-21t02:45:19.620z2020-07-09t06:51:09.228z230072TrueNaNNaN[rjohara.net, google.com, collegiateway.org, r...<NA>12351<NA>
1360000-0002-3318-9861TrueTrueshaguftaperveenprof. dr. shagufta perveen is a professor at k...NaNshagufta792000@yahoo.com[shagufta perveen professor, shagufta perveen ...NaN[[hej research institute of chemistry, phd che...[[professor, king saud university college of p...66scopus - elsevier2015-12-21t10:34:06.771z2021-02-22t14:58:30.893z560066Trueyahoo.com[msu.edu, ksu.edu.sa][shaguftaperveen.com, researchgate.net, ksu.ed...211<NA>2537
1370000-0001-8960-9004TrueTruesusanbastani<NA>[s. bastani, سوسن باستانی]sbastani@alzahra.ac.ir[social networks, fuzzy logic, online and offl...[[scopus author id, 16642098400]][[sociology, ph.d., university of toronto, tor...[[professor, alzahra university, tehran, vanak...20scopus - elsevier2019-07-10t06:50:46.255z2020-10-07t04:08:01.961z190033Truealzahra.ac.ir[gmail.com, gmail.com][scopus.com, google.com, publons.com, zenodo.o...2111434
1380000-0002-4379-6454TrueTruecaroline wanjirukariukicaroline holds a phd in economics from curtin ...NaN<NA>[development economics, applied econometrics, ...NaN[[economics, doctor of philosophy , curtin uni...[[director, educational development, strathmor...4caroline wanjiru kariuki2020-03-18t10:18:04.007z2021-02-11t14:40:38.515z1000FalseNaNNaN[scopus.com, mendeley.com, publons.com, resear...<NA>13<NA>436
1390000-0003-2311-0600TrueTruemyokyaw hlaing<NA>[dr myo kyaw hlaing]<NA>[economic geology]NaNNaN[[lecturer, union of myanmar ministry of educa...2myo kyaw hlaing2018-12-26t12:51:57.801z2021-01-26t14:36:47.421z1002FalseNaNNaN[facebook.com, linkedin.com, instagram.com, re...<NA>12<NA>1<NA>2
\n", "

140 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "0 0000-0002-5967-2835 True True \n", "1 0000-0002-3505-2797 True True \n", "2 0000-0003-3670-9620 True True \n", "3 0000-0002-5441-0465 True True \n", "4 0000-0002-7781-6767 True True \n", ".. ... ... ... \n", "135 0000-0001-5087-6965 True True \n", "136 0000-0002-3318-9861 True True \n", "137 0000-0001-8960-9004 True True \n", "138 0000-0002-4379-6454 True True \n", "139 0000-0003-2311-0600 True True \n", "\n", " given_names family_name \\\n", "0 oleksiy goryayinov \n", "1 nurul malahayati \n", "2 carlos barrera \n", "3 nuria hernández-león \n", "4 mohd nazri ismail \n", ".. ... ... \n", "135 robert ohara \n", "136 shagufta perveen \n", "137 susan bastani \n", "138 caroline wanjiru kariuki \n", "139 myo kyaw hlaing \n", "\n", " biography \\\n", "0 \n", "1 google scholar \n", "2 im individual inventor, and this is my work; s... \n", "3 \n", "4 born in penang, malaysia in 1971, dr. mohd had... \n", ".. ... \n", "135 systematics, evolutionary biology, and the his... \n", "136 prof. dr. shagufta perveen is a professor at k... \n", "137 \n", "138 caroline holds a phd in economics from curtin ... \n", "139 \n", "\n", " other_names \\\n", "0 [алексей николаевич горяинов, о.м.горяїнов, а.... \n", "1 NaN \n", "2 [retrodynamic, novelinflow] \n", "3 [nuria h. león, nuria hernández león, hernánde... \n", "4 [ndum (national defence university of malaysia)] \n", ".. ... \n", "135 [r. o’hara, r.j. o’hara, robert o’hara, robert... \n", "136 NaN \n", "137 [s. bastani, سوسن باستانی] \n", "138 NaN \n", "139 [dr myo kyaw hlaing] \n", "\n", " primary_email \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", ".. ... \n", "135 \n", "136 shagufta792000@yahoo.com \n", "137 sbastani@alzahra.ac.ir \n", "138 \n", "139 \n", "\n", " keywords \\\n", "0 [diagnostics, transport, logistics] \n", "1 NaN \n", "2 [energy, technology, gearturbine, imploturboco... \n", "3 [training, icts, business management, research... 
\n", "4 [wsn, manet, simulation and modelling, network... \n", ".. ... \n", "135 [history and philosophy of science, evolutiona... \n", "136 [shagufta perveen professor, shagufta perveen ... \n", "137 [social networks, fuzzy logic, online and offl... \n", "138 [development economics, applied econometrics, ... \n", "139 [economic geology] \n", "\n", " external_ids \\\n", "0 [[researcherid, i-7977-2016]] \n", "1 [[researcherid, q-3861-2017]] \n", "2 [[loop profile, 394457]] \n", "3 NaN \n", "4 [[scopus author id, 24372977800], [researcheri... \n", ".. ... \n", "135 [[isni, 0000000138200102], [researcherid, b-47... \n", "136 NaN \n", "137 [[scopus author id, 16642098400]] \n", "138 NaN \n", "139 NaN \n", "\n", " education \\\n", "0 [[, дистанционный курс «ctl.sc2x: supply chain... \n", "1 [[civil and transportation engineering , maste... \n", "2 NaN \n", "3 [[, course: social skills, university of salam... \n", "4 NaN \n", ".. ... \n", "135 [[biology, ph.d., harvard university, cambridg... \n", "136 [[hej research institute of chemistry, phd che... \n", "137 [[sociology, ph.d., university of toronto, tor... \n", "138 [[economics, doctor of philosophy , curtin uni... \n", "139 NaN \n", "\n", " employment n_works \\\n", "0 [[docent, kharkiv petro vasylenko national tec... 274 \n", "1 [[senior lecturer, universitas syiah kuala, ba... 6 \n", "2 NaN 1 \n", "3 [[merchandise reception and expedition trainer... 11 \n", "4 [[lecturer, universiti pertahanan nasional mal... 35 \n", ".. ... ... \n", "135 NaN 45 \n", "136 [[professor, king saud university college of p... 66 \n", "137 [[professor, alzahra university, tehran, vanak... 20 \n", "138 [[director, educational development, strathmor... 4 \n", "139 [[lecturer, union of myanmar ministry of educa... 
2 \n", "\n", " works_source activation_date \\\n", "0 oleksiy goryayinov 2014-08-03t18:06:42.925z \n", "1 nurul malahayati 2017-10-01t00:46:31.324z \n", "2 carlos barrera 2016-08-29t20:32:10.362z \n", "3 nuria hernández-león 2015-11-28t07:18:58.442z \n", "4 scopus - elsevier 2016-09-06t02:25:52.974z \n", ".. ... ... \n", "135 robert j. o’hara 2014-09-21t02:45:19.620z \n", "136 scopus - elsevier 2015-12-21t10:34:06.771z \n", "137 scopus - elsevier 2019-07-10t06:50:46.255z \n", "138 caroline wanjiru kariuki 2020-03-18t10:18:04.007z \n", "139 myo kyaw hlaing 2018-12-26t12:51:57.801z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "0 2021-03-22t13:56:48.311z 0 0 0 0 False \n", "1 2019-08-19t15:52:47.253z 3 0 0 3 False \n", "2 2021-02-09t04:56:35.554z 0 0 0 0 False \n", "3 2021-03-05t16:37:47.403z 1 0 0 4 False \n", "4 2020-10-20t06:55:55.051z 24 0 0 35 True \n", ".. ... ... ... ... ... ... \n", "135 2020-07-09t06:51:09.228z 23 0 0 72 True \n", "136 2021-02-22t14:58:30.893z 56 0 0 66 True \n", "137 2020-10-07t04:08:01.961z 19 0 0 33 True \n", "138 2021-02-11t14:40:38.515z 1 0 0 0 False \n", "139 2021-01-26t14:36:47.421z 1 0 0 2 False \n", "\n", " primary_email_domain other_email_domains \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", ".. ... ... \n", "135 NaN NaN \n", "136 yahoo.com [msu.edu, ksu.edu.sa] \n", "137 alzahra.ac.ir [gmail.com, gmail.com] \n", "138 NaN NaN \n", "139 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "0 [khntusg.com.ua, khntusg.com.ua, google.com.ua... 13 \n", "1 [google.com, ristekdikti.go.id, unsyiah.ac.id,... 16 \n", "2 [blogspot.mx, behance.net, authorstream.com, d... 24 \n", "3 [feriaempresamujer.com, escueladenegociosydire... 16 \n", "4 [google.com.my, researchgate.net, academia.edu... 16 \n", ".. ... ... ... \n", "135 [rjohara.net, google.com, collegiateway.org, r... 12 \n", "136 [shaguftaperveen.com, researchgate.net, ksu.ed... 
2 11 \n", "137 [scopus.com, google.com, publons.com, zenodo.o... 2 11 \n", "138 [scopus.com, mendeley.com, publons.com, resear... 13 \n", "139 [facebook.com, linkedin.com, instagram.com, re... 12 \n", "\n", " n_ids n_keywords n_education n_employment \n", "0 1 3 14 7 \n", "1 1 2 1 \n", "2 1 8 \n", "3 7 19 16 \n", "4 2 10 4 \n", ".. ... ... ... ... \n", "135 3 5 1 \n", "136 25 3 7 \n", "137 1 4 3 4 \n", "138 4 3 6 \n", "139 1 2 \n", "\n", "[140 rows x 30 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", "exploded_sources" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
00000-0002-5967-2835TrueTrueoleksiygoryayinov<NA>[алексей николаевич горяинов, о.м.горяїнов, а....<NA>[diagnostics, transport, logistics][[researcherid, i-7977-2016]][[, дистанционный курс «ctl.sc2x: supply chain...[[docent, kharkiv petro vasylenko national tec...274oleksiy goryayinov2014-08-03t18:06:42.925z2021-03-22t13:56:48.311z0000FalseNaNNaN[khntusg.com.ua, khntusg.com.ua, google.com.ua...<NA>1313147
10000-0002-3505-2797TrueTruenurulmalahayatigoogle scholarNaN<NA>NaN[[researcherid, q-3861-2017]][[civil and transportation engineering , maste...[[senior lecturer, universitas syiah kuala, ba...6nurul malahayati2017-10-01t00:46:31.324z2019-08-19t15:52:47.253z3003FalseNaNNaN[google.com, ristekdikti.go.id, unsyiah.ac.id,...<NA>161<NA>21
20000-0003-3670-9620TrueTruecarlosbarreraim individual inventor, and this is my work; s...[retrodynamic, novelinflow]<NA>[energy, technology, gearturbine, imploturboco...[[loop profile, 394457]]NaNNaN1carlos barrera2016-08-29t20:32:10.362z2021-02-09t04:56:35.554z0000FalseNaNNaN[blogspot.mx, behance.net, authorstream.com, d...<NA>2418<NA><NA>
30000-0002-5441-0465TrueTruenuriahernández-león<NA>[nuria h. león, nuria hernández león, hernánde...<NA>[training, icts, business management, research...NaN[[, course: social skills, university of salam...[[merchandise reception and expedition trainer...11nuria hernández-león2015-11-28t07:18:58.442z2021-03-05t16:37:47.403z1004FalseNaNNaN[feriaempresamujer.com, escueladenegociosydire...<NA>16<NA>71916
50000-0001-7010-2908TrueTrueclarasarmentoclara sarmento holds an aggregation in cultura...NaN<NA>[portuguese culture and literature, cultural a...[[ciência id, d418-d6f8-7d49]][[ao abrigo da bolsa santander ie best practic...[[presidente da comissão de acreditação do nov...275clara sarmento2013-12-12t00:33:58.190z2020-10-12t14:43:00.749z170060TrueNaNNaN[iscap.pt, google.pt, academia.edu, researchga...<NA>1316837
.............................................................................................
1330000-0003-1020-1351TrueTruesheikh saifullahahmedsheikh saifullah ahmed is a full-time lecturer...NaNsaifullahahmedku@gmail.com[south asian literature, postmodern literature...NaN[[english discipline , ma & ba in english , kh...[[lecturer , international university of busin...3sheikh saifullah ahmed2020-04-08t21:00:11.201z2021-02-12t20:45:32.247z2003Falsegmail.comNaN[academia.edu, iubat.edu, google.com, research...<NA>12<NA>511
1340000-0001-7228-5680TrueTruetextprotocol<NA>NaN<NA>NaNNaNNaN[[engineer, textprotocol.org, palo alto, ca, u...1text protocol2021-03-09t10:30:32.237z2021-03-21t17:17:40.500z0000FalseNaNNaN[about.me, figma.com, github.com, gitlab.com, ...<NA>15<NA><NA><NA>1
1350000-0001-5087-6965TrueTruerobertoharasystematics, evolutionary biology, and the his...[r. o’hara, r.j. o’hara, robert o’hara, robert...<NA>[history and philosophy of science, evolutiona...[[isni, 0000000138200102], [researcherid, b-47...[[biology, ph.d., harvard university, cambridg...NaN45robert j. o’hara2014-09-21t02:45:19.620z2020-07-09t06:51:09.228z230072TrueNaNNaN[rjohara.net, google.com, collegiateway.org, r...<NA>12351<NA>
1380000-0002-4379-6454TrueTruecaroline wanjirukariukicaroline holds a phd in economics from curtin ...NaN<NA>[development economics, applied econometrics, ...NaN[[economics, doctor of philosophy , curtin uni...[[director, educational development, strathmor...4caroline wanjiru kariuki2020-03-18t10:18:04.007z2021-02-11t14:40:38.515z1000FalseNaNNaN[scopus.com, mendeley.com, publons.com, resear...<NA>13<NA>436
1390000-0003-2311-0600TrueTruemyokyaw hlaing<NA>[dr myo kyaw hlaing]<NA>[economic geology]NaNNaN[[lecturer, union of myanmar ministry of educa...2myo kyaw hlaing2018-12-26t12:51:57.801z2021-01-26t14:36:47.421z1002FalseNaNNaN[facebook.com, linkedin.com, instagram.com, re...<NA>12<NA>1<NA>2
\n", "

113 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "0 0000-0002-5967-2835 True True \n", "1 0000-0002-3505-2797 True True \n", "2 0000-0003-3670-9620 True True \n", "3 0000-0002-5441-0465 True True \n", "5 0000-0001-7010-2908 True True \n", ".. ... ... ... \n", "133 0000-0003-1020-1351 True True \n", "134 0000-0001-7228-5680 True True \n", "135 0000-0001-5087-6965 True True \n", "138 0000-0002-4379-6454 True True \n", "139 0000-0003-2311-0600 True True \n", "\n", " given_names family_name \\\n", "0 oleksiy goryayinov \n", "1 nurul malahayati \n", "2 carlos barrera \n", "3 nuria hernández-león \n", "5 clara sarmento \n", ".. ... ... \n", "133 sheikh saifullah ahmed \n", "134 text protocol \n", "135 robert ohara \n", "138 caroline wanjiru kariuki \n", "139 myo kyaw hlaing \n", "\n", " biography \\\n", "0 \n", "1 google scholar \n", "2 im individual inventor, and this is my work; s... \n", "3 \n", "5 clara sarmento holds an aggregation in cultura... \n", ".. ... \n", "133 sheikh saifullah ahmed is a full-time lecturer... \n", "134 \n", "135 systematics, evolutionary biology, and the his... \n", "138 caroline holds a phd in economics from curtin ... \n", "139 \n", "\n", " other_names \\\n", "0 [алексей николаевич горяинов, о.м.горяїнов, а.... \n", "1 NaN \n", "2 [retrodynamic, novelinflow] \n", "3 [nuria h. león, nuria hernández león, hernánde... \n", "5 NaN \n", ".. ... \n", "133 NaN \n", "134 NaN \n", "135 [r. o’hara, r.j. o’hara, robert o’hara, robert... \n", "138 NaN \n", "139 [dr myo kyaw hlaing] \n", "\n", " primary_email \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "5 \n", ".. ... \n", "133 saifullahahmedku@gmail.com \n", "134 \n", "135 \n", "138 \n", "139 \n", "\n", " keywords \\\n", "0 [diagnostics, transport, logistics] \n", "1 NaN \n", "2 [energy, technology, gearturbine, imploturboco... \n", "3 [training, icts, business management, research... \n", "5 [portuguese culture and literature, cultural a... \n", ".. ... 
\n", "133 [south asian literature, postmodern literature... \n", "134 NaN \n", "135 [history and philosophy of science, evolutiona... \n", "138 [development economics, applied econometrics, ... \n", "139 [economic geology] \n", "\n", " external_ids \\\n", "0 [[researcherid, i-7977-2016]] \n", "1 [[researcherid, q-3861-2017]] \n", "2 [[loop profile, 394457]] \n", "3 NaN \n", "5 [[ciência id, d418-d6f8-7d49]] \n", ".. ... \n", "133 NaN \n", "134 NaN \n", "135 [[isni, 0000000138200102], [researcherid, b-47... \n", "138 NaN \n", "139 NaN \n", "\n", " education \\\n", "0 [[, дистанционный курс «ctl.sc2x: supply chain... \n", "1 [[civil and transportation engineering , maste... \n", "2 NaN \n", "3 [[, course: social skills, university of salam... \n", "5 [[ao abrigo da bolsa santander ie best practic... \n", ".. ... \n", "133 [[english discipline , ma & ba in english , kh... \n", "134 NaN \n", "135 [[biology, ph.d., harvard university, cambridg... \n", "138 [[economics, doctor of philosophy , curtin uni... \n", "139 NaN \n", "\n", " employment n_works \\\n", "0 [[docent, kharkiv petro vasylenko national tec... 274 \n", "1 [[senior lecturer, universitas syiah kuala, ba... 6 \n", "2 NaN 1 \n", "3 [[merchandise reception and expedition trainer... 11 \n", "5 [[presidente da comissão de acreditação do nov... 275 \n", ".. ... ... \n", "133 [[lecturer , international university of busin... 3 \n", "134 [[engineer, textprotocol.org, palo alto, ca, u... 1 \n", "135 NaN 45 \n", "138 [[director, educational development, strathmor... 4 \n", "139 [[lecturer, union of myanmar ministry of educa... 2 \n", "\n", " works_source activation_date \\\n", "0 oleksiy goryayinov 2014-08-03t18:06:42.925z \n", "1 nurul malahayati 2017-10-01t00:46:31.324z \n", "2 carlos barrera 2016-08-29t20:32:10.362z \n", "3 nuria hernández-león 2015-11-28t07:18:58.442z \n", "5 clara sarmento 2013-12-12t00:33:58.190z \n", ".. ... ... 
\n", "133 sheikh saifullah ahmed 2020-04-08t21:00:11.201z \n", "134 text protocol 2021-03-09t10:30:32.237z \n", "135 robert j. o’hara 2014-09-21t02:45:19.620z \n", "138 caroline wanjiru kariuki 2020-03-18t10:18:04.007z \n", "139 myo kyaw hlaing 2018-12-26t12:51:57.801z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "0 2021-03-22t13:56:48.311z 0 0 0 0 False \n", "1 2019-08-19t15:52:47.253z 3 0 0 3 False \n", "2 2021-02-09t04:56:35.554z 0 0 0 0 False \n", "3 2021-03-05t16:37:47.403z 1 0 0 4 False \n", "5 2020-10-12t14:43:00.749z 17 0 0 60 True \n", ".. ... ... ... ... ... ... \n", "133 2021-02-12t20:45:32.247z 2 0 0 3 False \n", "134 2021-03-21t17:17:40.500z 0 0 0 0 False \n", "135 2020-07-09t06:51:09.228z 23 0 0 72 True \n", "138 2021-02-11t14:40:38.515z 1 0 0 0 False \n", "139 2021-01-26t14:36:47.421z 1 0 0 2 False \n", "\n", " primary_email_domain other_email_domains \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "5 NaN NaN \n", ".. ... ... \n", "133 gmail.com NaN \n", "134 NaN NaN \n", "135 NaN NaN \n", "138 NaN NaN \n", "139 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "0 [khntusg.com.ua, khntusg.com.ua, google.com.ua... 13 \n", "1 [google.com, ristekdikti.go.id, unsyiah.ac.id,... 16 \n", "2 [blogspot.mx, behance.net, authorstream.com, d... 24 \n", "3 [feriaempresamujer.com, escueladenegociosydire... 16 \n", "5 [iscap.pt, google.pt, academia.edu, researchga... 13 \n", ".. ... ... ... \n", "133 [academia.edu, iubat.edu, google.com, research... 12 \n", "134 [about.me, figma.com, github.com, gitlab.com, ... 15 \n", "135 [rjohara.net, google.com, collegiateway.org, r... 12 \n", "138 [scopus.com, mendeley.com, publons.com, resear... 13 \n", "139 [facebook.com, linkedin.com, instagram.com, re... 12 \n", "\n", " n_ids n_keywords n_education n_employment \n", "0 1 3 14 7 \n", "1 1 2 1 \n", "2 1 8 \n", "3 7 19 16 \n", "5 1 6 8 37 \n", ".. ... ... ... ... 
\n", "133 5 1 1 \n", "134 1 \n", "135 3 5 1 \n", "138 4 3 6 \n", "139 1 2 \n", "\n", "[113 rows x 30 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Works source" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "def remove_own_source(lst, given, family):\n", " res = []\n", " for ws in lst:\n", " if ws.lower().find(given.lower()) == -1:\n", " if pd.notna(family):\n", " if ws.lower().find(family.lower()) == -1:\n", " res.append(ws)\n", " else:\n", " res.append(ws)\n", " return res" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\\\n", " .apply(lambda x: remove_own_source(x['works_source'], x['given_names'], x['family_name']), axis=1)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "df['n_ext_work_source'] = df.ext_works_source.str.len()" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\\\n", " .explode('ext_works_source').reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)\\\n", " .reset_index()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "crossref", "scopus - 
elsevier", "crossref metadata search", "multidisciplinary digital publishing institute", "europe pubmed central", "researcherid", "publons", "ciênciavitae", "base - bielefeld academic search engine", "datacite", "redalyc", "mla international bibliography", "deutsche nationalbibliothek (dnb)", "nasa astrophysics data system", "national information processing institute ", "f1000", "inspire-hep", "university of helsinki", "hal", "igi global", "airiti", "university of copenhagen", "universidade federal de uberlândia", "aarhus university", "universidad del país vasco", "university of manchester - pure", "kings college london", "university of southern denmark", "wellcome open research", "macquarie university" ], "y": [ 1460841, 902231, 297684, 281664, 181605, 158148, 39786, 32315, 20699, 16107, 9640, 8059, 7855, 7403, 6509, 5221, 4872, 4152, 4136, 3833, 3725, 3127, 2718, 2311, 2271, 2227, 2199, 2185, 2113, 2053 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, 
"#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { 
"outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { 
"colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": 
"white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 30 works_source" }, "xaxis": { "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data = [\n", " go.Bar(\n", " x=grouped_ext_sources[:30].ext_works_source,\n", " y=grouped_ext_sources[:30].orcid\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top 30 works_source',\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ext_works_sourceorcid
0crossref1460841
1scopus - elsevier902231
2crossref metadata search297684
3multidisciplinary digital publishing institute281664
4europe pubmed central181605
.........
337uta - oa journal global insight3
338francis crick institute3
339anna3
340santos3
341universitäts- und stadtbibliothek köln3
\n", "

342 rows × 2 columns

\n", "
" ], "text/plain": [ " ext_works_source orcid\n", "0 crossref 1460841\n", "1 scopus - elsevier 902231\n", "2 crossref metadata search 297684\n", "3 multidisciplinary digital publishing institute 281664\n", "4 europe pubmed central 181605\n", ".. ... ...\n", "337 uta - oa journal global insight 3\n", "338 francis crick institute 3\n", "339 anna 3\n", "340 santos 3\n", "341 universitäts- und stadtbibliothek köln 3\n", "\n", "[342 rows x 2 columns]" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Keep the sources that occur more than twice among the exploded external sources.\n", "authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]\n", "authoritative_sources" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\\\n", " .isin(authoritative_sources['ext_works_source'])" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "orcid_authoritative_source = exploded_external_sources\\\n", " .groupby('orcid')['authoritative']\\\n", " .any()\\\n", " .reset_index()[['orcid', 'authoritative']]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "# Drop any 'authoritative' column left by a previous run: join() raises on\n", "# overlapping column names, so without this the cell cannot be re-executed.\n", "df = df.drop(columns=['authoritative'], errors='ignore')\\\n", " .set_index('orcid')\\\n", " .join(orcid_authoritative_source.set_index('orcid'))\\\n", " .reset_index()" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "df.loc[df.authoritative.isna(), 'authoritative'] = False" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritative
00000-0001-6097-3953FalseFalse<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2018-03-02t09:29:16.528z2018-03-02t09:43:07.551z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalse
10000-0001-6112-5550TrueTrue<NA><NA><NA>[v.i. yurtaev; v. yurtaev]<NA>NaNNaNNaN[[professor, peoples friendship university of ...0NaN2018-04-03t07:50:23.358z2020-03-18t09:42:44.753z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalse
20000-0001-6152-2695TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2019-12-11t15:31:56.388z2020-01-28t15:34:17.309z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalse
30000-0001-6220-5683TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[research scientist, new york university abu ...0NaN2015-08-18t12:36:45.307z2020-09-23t13:37:54.180z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalse
40000-0001-7071-8294TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[researcher (academic), universidad de zarago...0NaN2014-03-10t13:22:01.966z2016-06-14t22:17:54.470z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>2NaNNaNFalse
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email given_names \\\n", "0 0000-0001-6097-3953 False False \n", "1 0000-0001-6112-5550 True True \n", "2 0000-0001-6152-2695 True True \n", "3 0000-0001-6220-5683 True True \n", "4 0000-0001-7071-8294 True True \n", "\n", " family_name biography other_names primary_email keywords \\\n", "0 NaN NaN \n", "1 [v.i. yurtaev; v. yurtaev] NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "\n", " external_ids education employment \\\n", "0 NaN NaN NaN \n", "1 NaN NaN [[professor, peoples friendship university of ... \n", "2 NaN NaN NaN \n", "3 NaN NaN [[research scientist, new york university abu ... \n", "4 NaN NaN [[researcher (academic), universidad de zarago... \n", "\n", " n_works works_source activation_date last_update_date \\\n", "0 0 NaN 2018-03-02t09:29:16.528z 2018-03-02t09:43:07.551z \n", "1 0 NaN 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z \n", "2 0 NaN 2019-12-11t15:31:56.388z 2020-01-28t15:34:17.309z \n", "3 0 NaN 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z \n", "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "0 0 0 0 0 False NaN \n", "1 0 0 0 0 False NaN \n", "2 0 0 0 0 False NaN \n", "3 0 0 0 0 False NaN \n", "4 0 0 0 0 False NaN \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", "\n", " n_education n_employment ext_works_source n_ext_work_source authoritative \n", "0 NaN NaN False \n", "1 1 NaN NaN False \n", "2 NaN NaN False \n", "3 1 NaN NaN False \n", "4 2 NaN NaN False " ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## External IDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "External IDs should come from reliable sources. 
ORCiD registrants cannot add them freely." ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.308598e+06\n", "mean 1.359082e+00\n", "std 6.643235e-01\n", "min 1.000000e+00\n", "25% 1.000000e+00\n", "50% 1.000000e+00\n", "75% 2.000000e+00\n", "max 8.000000e+01\n", "Name: n_ids, dtype: float64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.n_ids.describe()" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritative
38962260000-0002-9554-6633TrueTruejohn awilliams<NA>NaN<NA>NaN[[scopus author id,  55553733518], [scopus aut...NaN[[, aston university, birmingham, , gb, 1722, ...92[aston research explorer]2014-11-20t09:42:10.690z2021-03-17t01:00:51.203z8000208TrueNaNNaN[aston.ac.uk]<NA>180<NA><NA>1[aston research explorer]1.0True
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "3896226 0000-0002-9554-6633 True True \n", "\n", " given_names family_name biography other_names primary_email keywords \\\n", "3896226 john a williams NaN NaN \n", "\n", " external_ids education \\\n", "3896226 [[scopus author id,  55553733518], [scopus aut... NaN \n", "\n", " employment n_works \\\n", "3896226 [[, aston university, birmingham, , gb, 1722, ... 92 \n", "\n", " works_source activation_date \\\n", "3896226 [aston research explorer] 2014-11-20t09:42:10.690z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "3896226 2021-03-17t01:00:51.203z 80 0 0 208 True \n", "\n", " primary_email_domain other_email_domains url_domains n_emails \\\n", "3896226 NaN NaN [aston.ac.uk] \n", "\n", " n_urls n_ids n_keywords n_education n_employment \\\n", "3896226 1 80 1 \n", "\n", " ext_works_source n_ext_work_source authoritative \n", "3896226 [aston research explorer] 1.0 True " ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.n_ids == df.n_ids.max()]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidexternal_idsprovider
90000-0001-8315-2066[researcherid, k-4630-2014]researcherid
290000-0002-2638-4108[scopus author id, 54394231000]scopus author id
460000-0003-1435-6545[researcherid, p-2223-2018]researcherid
500000-0003-2259-7023[scopus author id, 57189297461]scopus author id
640000-0002-7397-5824[scopus author id, 8399842800]scopus author id
\n", "
" ], "text/plain": [ " orcid external_ids provider\n", "9 0000-0001-8315-2066 [researcherid, k-4630-2014] researcherid\n", "29 0000-0002-2638-4108 [scopus author id, 54394231000] scopus author id\n", "46 0000-0003-1435-6545 [researcherid, p-2223-2018] researcherid\n", "50 0000-0003-2259-7023 [scopus author id, 57189297461] scopus author id\n", "64 0000-0002-7397-5824 [scopus author id, 8399842800] scopus author id" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ids[ids.provider.notna()].head()" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "scopus author id", "researcherid", "loop profile", "ciência id", "researcher name resolver id", "sciprofile", "中国科学家在线", "isni", "gnd", "pitt id", "technical university of denmark cwis", "researcher id", "id dialnet", "digital author id", "scopus author id: ", "authenticusid", "hku researcherpage", "uow scholars", "cti vitae", "scopus author id:", "hkust profile", "chalmers id", "scopus id", "iauthor", "google scholar", "digital author id (dai)", "authid", "dai", "us epa vivo", "scopus id", "authenticus", "smithsonian profiles", "github", "escientist", "vivo cornell", "researcherid:", "id dialnet:", "dialnet id", "sciprofiles", "kaken", "une researcher id", "researcherid: ", "orcid", "scienceopen", "profile system identifier", "orcid id", "custom" ], "y": [ 1037239, 545399, 118645, 37042, 7954, 5164, 4811, 3089, 2999, 2679, 2483, 1452, 1169, 1126, 1077, 878, 741, 646, 582, 547, 523, 430, 256, 212, 201, 180, 175, 155, 146, 127, 83, 61, 51, 49, 46, 39, 7, 6, 5, 5, 4, 3, 2, 1, 1, 1, 1 ] 
} ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 
0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], 
"scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" 
], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "IDs provided by providers" }, "xaxis": { "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data = [\n", " go.Bar(\n", " x=top_ids_providers.index,\n", " y=top_ids_providers['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='IDs provided by providers',\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([nan, 'researcherid', 'scopus author id', 'loop profile', 'gnd',\n", " 'ciência id', 'researcher name resolver id', 'pitt id',\n", " 'id dialnet', 'isni', 'technical university of denmark cwis',\n", " 'chalmers id', 'scopus author id: ', 'scopus author id:',\n", " 'hkust profile', 'hku researcherpage', '中国科学家在线', 'uow scholars',\n", " 'sciprofile', 'cti vitae', 'digital author id', 'researcher id',\n", " 'authenticusid', 'authid', 'authenticus', 'scopus id',\n", " 'digital author id (dai)', 'researcherid:', 'vivo cornell',\n", " 'us epa vivo', 'escientist', 'github', 'iauthor', 'orcid id',\n", " 'dai', 'scopus id', 'smithsonian profiles', 'google scholar',\n", " 'kaken', 'dialnet id', 'researcherid: ', 'une researcher id',\n", " 'sciprofiles', 'id dialnet:', 'scienceopen', 'orcid',\n", " 'profile system identifier', 'custom'], dtype=object)" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.unique(ids['provider'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Keywords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This field is problematic, as users can be nasty and put multiple keywords in a single entry as opposed to entering them as separate keywords. Look at this:" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidn_keywords
37517140000-0002-0673-0341154
86979260000-0003-3343-5660148
11545230000-0002-6075-3501140
65129710000-0002-7060-4112140
15151970000-0001-5287-1949132
.........
109896440000-0002-1686-1935<NA>
109896450000-0002-3800-6331<NA>
109896460000-0002-8783-5814<NA>
109896470000-0002-7584-2283<NA>
109896480000-0003-0529-3538<NA>
\n", "

10989649 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid n_keywords\n", "3751714 0000-0002-0673-0341 154\n", "8697926 0000-0003-3343-5660 148\n", "1154523 0000-0002-6075-3501 140\n", "6512971 0000-0002-7060-4112 140\n", "1515197 0000-0001-5287-1949 132\n", "... ... ...\n", "10989644 0000-0002-1686-1935 \n", "10989645 0000-0002-3800-6331 \n", "10989646 0000-0002-8783-5814 \n", "10989647 0000-0002-7584-2283 \n", "10989648 0000-0003-0529-3538 \n", "\n", "[10989649 rows x 2 columns]" ] }, "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)\n", "keywords_by_orcid" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "0000-0002-0673-0341", "0000-0003-3343-5660", "0000-0002-6075-3501", "0000-0002-7060-4112", "0000-0001-5287-1949", "0000-0002-9638-8091", "0000-0002-4071-0301", "0000-0001-9462-5666", "0000-0002-0929-2412", "0000-0002-0115-7195", "0000-0002-4235-4259", "0000-0003-0076-6287", "0000-0001-9715-9357", "0000-0002-1878-9762", "0000-0001-6307-6027", "0000-0003-2273-9888", "0000-0002-0937-7061", "0000-0002-1770-9660", "0000-0001-5696-1052", "0000-0003-2998-5520", "0000-0003-1799-0971", "0000-0002-0156-3580", "0000-0002-9625-6742", "0000-0003-1399-7156", "0000-0001-9985-1697", "0000-0001-6537-7683", "0000-0002-8401-8018", "0000-0003-4246-8579", "0000-0001-7857-4133", "0000-0002-7710-0355", "0000-0001-5869-2204", "0000-0002-8083-7382", "0000-0001-8670-4372", "0000-0001-7654-5013", "0000-0002-4488-2880", "0000-0003-4374-6374", "0000-0001-6939-3859", "0000-0003-2509-2549", "0000-0002-3186-8860", "0000-0002-0441-1507", "0000-0001-5230-715X", "0000-0003-0209-180X", "0000-0001-9336-6850", "0000-0002-0463-0048", "0000-0001-5458-7167", "0000-0002-9381-2264", 
"0000-0002-8227-5387", "0000-0002-3061-3364", "0000-0002-9293-0189", "0000-0002-3123-3021", "0000-0003-1071-4296", "0000-0003-3340-6413", "0000-0003-3584-6834", "0000-0002-8644-8396", "0000-0002-2935-1934", "0000-0002-1718-1632", "0000-0002-8659-6321", "0000-0002-8449-2211", "0000-0003-1693-3190", "0000-0001-5637-1124", "0000-0001-5167-7466", "0000-0002-3532-043X", "0000-0001-6861-9561", "0000-0003-4608-3844", "0000-0003-4505-3678", "0000-0003-4673-1063", "0000-0001-8174-8835", "0000-0002-6347-9464", "0000-0002-8918-2781", "0000-0003-4511-7942", "0000-0003-2532-2906", "0000-0001-9280-6017", "0000-0002-5274-7742", "0000-0001-9586-0780", "0000-0003-3720-1183", "0000-0001-5819-4555", "0000-0002-1103-9651", "0000-0001-8135-2304", "0000-0002-8499-1045", "0000-0003-2550-1859", "0000-0002-8665-9281", "0000-0001-7818-3212", "0000-0003-1863-0265", "0000-0001-8733-5230", "0000-0003-2218-1343", "0000-0002-5306-7781", "0000-0001-7728-4046", "0000-0003-4486-2684", "0000-0002-4982-5236", "0000-0001-5300-3932", "0000-0003-3342-6123", "0000-0002-8072-1152", "0000-0002-3494-2624", "0000-0002-0715-0461", "0000-0002-3907-3552", "0000-0001-5556-8275", "0000-0002-3597-3350", "0000-0002-2252-672X", "0000-0001-7392-9361", "0000-0001-8689-185X" ], "y": [ 154, 148, 140, 140, 132, 124, 115, 106, 105, 102, 100, 94, 92, 92, 88, 86, 78, 77, 75, 75, 72, 71, 70, 68, 68, 68, 67, 66, 64, 64, 63, 62, 61, 61, 61, 60, 60, 56, 55, 54, 54, 53, 53, 53, 53, 53, 52, 52, 52, 51, 51, 51, 50, 50, 50, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 44, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { 
"endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 
0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" 
} ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, 
"subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Keywords provided by ORCiD" }, "xaxis": { "range": [ -0.5, 99.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(100)\n", "data = [\n", " go.Bar(\n", " x=keywords_by_orcid[:TOP_N]['orcid'],\n", " y=keywords_by_orcid[:TOP_N]['n_keywords']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Keywords provided by ORCiD',\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [], "source": [ "top_keywords = df[['orcid', 'keywords']]\\\n", " .explode('keywords')\\\n", " .reset_index(drop=True)\\\n", " .groupby('keywords')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "machine learning", "bioinformatics", "education", "molecular biology", "cancer", "ecology", "artificial intelligence", "epidemiology", "public health", "microbiology", "neuroscience", "immunology", "genetics", "climate change", "remote sensing", "biochemistry", "genomics", "biotechnology", "nanotechnology", "sustainability", "educación", "gis", "deep learning", "psychology", "computer vision", "marketing", "nutrition", "innovation", "data science", "statistics", "data mining", "nanomaterials", "image processing", "robotics", "management", "optimization", "renewable energy", "chemistry", "biomaterials", "diabetes", "gender", "educação", "architecture", "catalysis", "history", "electrochemistry", "evolution", "research", "energy", "biodiversity" ], "y": [ 8574, 5424, 5191, 4557, 4163, 3923, 3839, 3789, 3676, 3550, 3495, 3468, 3343, 3337, 3279, 3003, 2794, 2681, 2674, 2654, 2526, 2511, 2466, 2381, 2309, 2213, 2199, 2154, 2153, 2144, 2108, 2100, 2099, 2086, 2081, 2071, 2009, 
2005, 2002, 1998, 1997, 1873, 1835, 1813, 1813, 1800, 1797, 1789, 1770, 1717 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], 
[ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { 
"outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 
0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top-50 keywords occurrence" }, "xaxis": { "tickangle": 45, 
"tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(50)\n", "data = [\n", " go.Bar(\n", " x=top_keywords[:TOP_N].index,\n", " y=top_keywords[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top-%s keywords occurrence' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Education" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.753340e+06\n", "mean 1.913072e+00\n", "std 1.197388e+00\n", "min 1.000000e+00\n", "25% 1.000000e+00\n", "50% 2.000000e+00\n", "75% 3.000000e+00\n", "max 2.000000e+02\n", "Name: n_education, dtype: float64" ] }, "execution_count": 120, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.n_education.describe()" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_scoren_valid_employment
25685390000-0002-1927-0292TrueTruephd. carmen mgalvez-sánchezmy name is carmen maria galvez sánchez. i´m a ...NaN<NA>[qualitative research, fibromyalgia, quantitat...[[loop profile, 509331], [scopus author id, 57...[[psychology, 2019-2020 course. degree in psyc...[[researcher and teaching staff. postdoctoral ...35[phd. carmen m galvez-sánchez, multidisciplina...2016-04-18t14:28:57.237z2021-03-06t14:17:33.246z24007TrueNaNNaNNaN<NA><NA>252003[multidisciplinary digital publishing institut...4.0True0.9999481
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "2568539 0000-0002-1927-0292 True True \n", "\n", " given_names family_name \\\n", "2568539 phd. carmen m galvez-sánchez \n", "\n", " biography other_names \\\n", "2568539 my name is carmen maria galvez sánchez. i´m a ... NaN \n", "\n", " primary_email keywords \\\n", "2568539 [qualitative research, fibromyalgia, quantitat... \n", "\n", " external_ids \\\n", "2568539 [[loop profile, 509331], [scopus author id, 57... \n", "\n", " education \\\n", "2568539 [[psychology, 2019-2020 course. degree in psyc... \n", "\n", " employment n_works \\\n", "2568539 [[researcher and teaching staff. postdoctoral ... 35 \n", "\n", " works_source \\\n", "2568539 [phd. carmen m galvez-sánchez, multidisciplina... \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "2568539 2016-04-18t14:28:57.237z 2021-03-06t14:17:33.246z 24 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "2568539 0 7 True NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords n_education \\\n", "2568539 NaN 2 5 200 \n", "\n", " n_employment ext_works_source \\\n", "2568539 3 [multidisciplinary digital publishing institut... \n", "\n", " n_ext_work_source authoritative spam_score n_valid_employment \n", "2568539 4.0 True 0.999948 1 " ] }, "execution_count": 121, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.n_education == df.n_education.max()]" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcideducation
280000-0002-2343-910X[aeronautics and astronautics, phd, massachuse...
280000-0002-2343-910X[aeronautics and astronautics, sm, massachuset...
280000-0002-2343-910X[mechanical engineering and material science, ...
290000-0002-2638-4108[public law, ph doctor, university of oviedo, ...
460000-0003-1435-6545[morfologia, , universidade estadual paulista ...
.........
109896440000-0002-1686-1935[, , south china agricultural university, guan...
109896450000-0002-3800-6331[richard gilder graduate school, phd in compar...
109896450000-0002-3800-6331[geological sciences and history (dual major),...
109896470000-0002-7584-2283[school of electronics and information, master...
109896470000-0002-7584-2283[ department of electrical engineering, bachel...
\n", "

4434439 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid \\\n", "28 0000-0002-2343-910X \n", "28 0000-0002-2343-910X \n", "28 0000-0002-2343-910X \n", "29 0000-0002-2638-4108 \n", "46 0000-0003-1435-6545 \n", "... ... \n", "10989644 0000-0002-1686-1935 \n", "10989645 0000-0002-3800-6331 \n", "10989645 0000-0002-3800-6331 \n", "10989647 0000-0002-7584-2283 \n", "10989647 0000-0002-7584-2283 \n", "\n", " education \n", "28 [aeronautics and astronautics, phd, massachuse... \n", "28 [aeronautics and astronautics, sm, massachuset... \n", "28 [mechanical engineering and material science, ... \n", "29 [public law, ph doctor, university of oviedo, ... \n", "46 [morfologia, , universidade estadual paulista ... \n", "... ... \n", "10989644 [, , south china agricultural university, guan... \n", "10989645 [richard gilder graduate school, phd in compar... \n", "10989645 [geological sciences and history (dual major),... \n", "10989647 [school of electronics and information, master... \n", "10989647 [ department of electrical engineering, bachel... \n", "\n", "[4434439 rows x 2 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_education = df[['orcid', 'education']].explode('education').dropna()\n", "exploded_education" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [], "source": [ "exploded_education[['degree', 'role', 'university', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_education.education.tolist(), index=exploded_education.index)" ] }, { "cell_type": "code", "execution_count": 130, "metadata": {}, "outputs": [], "source": [ "exploded_education.id.replace('', pd.NA, inplace=True)" ] }, { "cell_type": "code", "execution_count": 132, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidid
00000-0001-5000-01623
10000-0001-5000-01702
20000-0001-5000-02183
30000-0001-5000-02261
40000-0001-5000-03060
.........
24416400000-0003-4999-97191
24416410000-0003-4999-97351
24416420000-0003-4999-992X2
24416430000-0003-4999-99382
24416440000-0003-4999-99541
\n", "

2441645 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid id\n", "0 0000-0001-5000-0162 3\n", "1 0000-0001-5000-0170 2\n", "2 0000-0001-5000-0218 3\n", "3 0000-0001-5000-0226 1\n", "4 0000-0001-5000-0306 0\n", "... ... ..\n", "2441640 0000-0003-4999-9719 1\n", "2441641 0000-0003-4999-9735 1\n", "2441642 0000-0003-4999-992X 2\n", "2441643 0000-0003-4999-9938 2\n", "2441644 0000-0003-4999-9954 1\n", "\n", "[2441645 rows x 2 columns]" ] }, "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_education.groupby('orcid').id.count().reset_index()" ] }, { "cell_type": "code", "execution_count": 133, "metadata": {}, "outputs": [], "source": [ "df = df.merge(exploded_education.groupby('orcid').id.count().reset_index(), on='orcid')\n", "df.rename(columns={'id': 'n_valid_education'}, inplace=True)" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_scoren_valid_employmentn_valid_education
20000-0003-1435-6545TrueTrue<NA><NA><NA>NaN<NA>[prostate cancer, migration, culture cell][[researcherid, p-2223-2018]][[morfologia, , universidade estadual paulista...[[, universidade estadual paulista (unesp), in...0NaN2018-08-09t12:12:24.405z2020-04-22t01:38:03.184z0000FalseNaNNaN[cnpq.br, linkedin.com]<NA>21311NaNNaNFalseNaN00
60000-0002-0427-9745TrueTruea. canincii am a professor of finance at bryant universi...NaN<NA>NaN[[researcherid, b-5471-2018], [scopus author i...[[finance, ph.d., university of michigan - ros...[[professor of finance, bryant university, smi...34[a. can inci]2018-01-20t02:58:05.199z2020-06-16t12:35:09.403z0000FalseNaNNaNNaN<NA><NA>2<NA>45[]0.0False4.341588e-1000
90000-0002-3380-6671TrueTrueabdulasis pata<NA>NaN<NA>NaNNaN[[agribisnis, m.si, universitas hasanuddin, ma...[[s.p, universitas muslim maros, maros, , id, ...0NaN2018-02-12t02:08:37.018z2018-02-12t02:22:33.378z0000FalseNaNNaNNaN<NA><NA><NA><NA>11NaNNaNFalseNaN00
110000-0001-6902-6549TrueTrueabubakarmuhammad<NA>NaN<NA>NaNNaN[[school of electrical and information enginee...[[lecturer, university of faisalabad, faisalab...1[multidisciplinary digital publishing institute]2017-07-06t10:29:17.738z2020-08-01t05:18:53.393z1000TrueNaNNaNNaN<NA><NA><NA><NA>11[multidisciplinary digital publishing institute]1.0TrueNaN00
120000-0002-6142-6406TrueTrueadammamadou<NA>NaN<NA>NaNNaN[[département deconomie sociologie rurale et t...[[, institut national de la recherche agronomi...0NaN2018-02-15t09:54:59.943z2018-02-15t10:19:27.869z0000FalseNaNNaNNaN<NA><NA><NA><NA>11NaNNaNFalseNaN00
...............................................................................................................
17533160000-0002-1842-4130TrueTruejosé de jesúscázares-marinero<NA>[josé cázares]<NA>[chemical biology, industrial chemistry, biote...[[researcherid, h-2597-2013], [scopus author i...[[charles friedel, postdoc, école nationale su...[[mtc, polioles, mexico, , mx, , ], [head of r...17[crossref metadata search, scopus - elsevier, ...2013-07-09t14:39:30.950z2020-12-10t17:42:20.176z170029FalseNaNNaN[linkedin.com, google.com, researchgate.net]<NA>32533[crossref metadata search, scopus - elsevier]2.0TrueNaN00
17533190000-0003-0459-4822TrueTrueluana<NA>mestranda em tecnologia na saúde e foi aluna o...[luana bastos morey]<NA>[tradução; língua espanhol; língua portuguesa;...NaN[[pós-graduação em tecnologia em saúde stricto...[[professora de espanhol e português para estr...7[luana arrial bastos]2017-05-11t13:14:59.372z2020-12-08t20:18:24.163z0000FalseNaNNaN[unidospelasaude.com.br, facebook.com, faceboo...<NA>4<NA>243[]0.0False1.000000e+0023
17533200000-0003-0057-1551TrueTruelyudmylaantypenkothe phd degree of pharmacy was received under ...[lyudmila nikolaevna antipenko (russian transl...<NA>[pharmaceutical chemistry, organic synthesis, ...[[scopus author id, 55070809900], [researcheri...[[centre for nanomaterials, advanced technolog...[[visiting scientist, north dakota state unive...35[crossref metadata search, scopus - elsevier, ...2014-02-19t08:15:15.698z2020-12-09t18:14:17.963z2801117TrueNaNNaNNaN<NA><NA>2578[crossref metadata search, scopus - elsevier, ...4.0True1.000000e+0024
17533250000-0003-4653-4705TrueTruepatriciateixeira2005 - phd, university of coimbrajuly 2009-jun...NaN<NA>[ecotoxicology, heavy metals, steroid hormones...[[researcherid, i-6863-2013], [scopus author i...[[, phd, university of coimbra, coimbra, , pt,...[[senior researcher, university of coimbra, co...95[ciênciavitae, scopus - elsevier, pg cardoso, ...2013-11-26t10:59:34.331z2020-12-02t15:28:26.221z900042FalseNaNNaNNaN<NA><NA>3713[ciênciavitae, scopus - elsevier, pg cardoso, ...4.0True7.147059e-1030
17533370000-0002-1686-1935TrueTrueyouxiawangyouxia wang (1995-), native of zunyi, guizhou ...NaN<NA>NaNNaN[[institute of animal nutrition, master degree...[[master, sichuan agricultural university , ch...0NaN2020-12-11t02:11:51.808z2020-12-11t03:25:28.263z0000FalseNaNNaNNaN<NA><NA><NA><NA>21NaNNaNFalse4.475163e-0211
\n", "

473043 rows × 36 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "2 0000-0003-1435-6545 True True \n", "6 0000-0002-0427-9745 True True \n", "9 0000-0002-3380-6671 True True \n", "11 0000-0001-6902-6549 True True \n", "12 0000-0002-6142-6406 True True \n", "... ... ... ... \n", "1753316 0000-0002-1842-4130 True True \n", "1753319 0000-0003-0459-4822 True True \n", "1753320 0000-0003-0057-1551 True True \n", "1753325 0000-0003-4653-4705 True True \n", "1753337 0000-0002-1686-1935 True True \n", "\n", " given_names family_name \\\n", "2 \n", "6 a. can inci \n", "9 abdul asis pata \n", "11 abubakar muhammad \n", "12 adam mamadou \n", "... ... ... \n", "1753316 josé de jesús cázares-marinero \n", "1753319 luana \n", "1753320 lyudmyla antypenko \n", "1753325 patricia teixeira \n", "1753337 youxia wang \n", "\n", " biography \\\n", "2 \n", "6 i am a professor of finance at bryant universi... \n", "9 \n", "11 \n", "12 \n", "... ... \n", "1753316 \n", "1753319 mestranda em tecnologia na saúde e foi aluna o... \n", "1753320 the phd degree of pharmacy was received under ... \n", "1753325 2005 - phd, university of coimbrajuly 2009-jun... \n", "1753337 youxia wang (1995-), native of zunyi, guizhou ... \n", "\n", " other_names primary_email \\\n", "2 NaN \n", "6 NaN \n", "9 NaN \n", "11 NaN \n", "12 NaN \n", "... ... ... \n", "1753316 [josé cázares] \n", "1753319 [luana bastos morey] \n", "1753320 [lyudmila nikolaevna antipenko (russian transl... \n", "1753325 NaN \n", "1753337 NaN \n", "\n", " keywords \\\n", "2 [prostate cancer, migration, culture cell] \n", "6 NaN \n", "9 NaN \n", "11 NaN \n", "12 NaN \n", "... ... \n", "1753316 [chemical biology, industrial chemistry, biote... \n", "1753319 [tradução; língua espanhol; língua portuguesa;... \n", "1753320 [pharmaceutical chemistry, organic synthesis, ... \n", "1753325 [ecotoxicology, heavy metals, steroid hormones... 
\n", "1753337 NaN \n", "\n", " external_ids \\\n", "2 [[researcherid, p-2223-2018]] \n", "6 [[researcherid, b-5471-2018], [scopus author i... \n", "9 NaN \n", "11 NaN \n", "12 NaN \n", "... ... \n", "1753316 [[researcherid, h-2597-2013], [scopus author i... \n", "1753319 NaN \n", "1753320 [[scopus author id, 55070809900], [researcheri... \n", "1753325 [[researcherid, i-6863-2013], [scopus author i... \n", "1753337 NaN \n", "\n", " education \\\n", "2 [[morfologia, , universidade estadual paulista... \n", "6 [[finance, ph.d., university of michigan - ros... \n", "9 [[agribisnis, m.si, universitas hasanuddin, ma... \n", "11 [[school of electrical and information enginee... \n", "12 [[département deconomie sociologie rurale et t... \n", "... ... \n", "1753316 [[charles friedel, postdoc, école nationale su... \n", "1753319 [[pós-graduação em tecnologia em saúde stricto... \n", "1753320 [[centre for nanomaterials, advanced technolog... \n", "1753325 [[, phd, university of coimbra, coimbra, , pt,... \n", "1753337 [[institute of animal nutrition, master degree... \n", "\n", " employment n_works \\\n", "2 [[, universidade estadual paulista (unesp), in... 0 \n", "6 [[professor of finance, bryant university, smi... 34 \n", "9 [[s.p, universitas muslim maros, maros, , id, ... 0 \n", "11 [[lecturer, university of faisalabad, faisalab... 1 \n", "12 [[, institut national de la recherche agronomi... 0 \n", "... ... ... \n", "1753316 [[mtc, polioles, mexico, , mx, , ], [head of r... 17 \n", "1753319 [[professora de espanhol e português para estr... 7 \n", "1753320 [[visiting scientist, north dakota state unive... 35 \n", "1753325 [[senior researcher, university of coimbra, co... 95 \n", "1753337 [[master, sichuan agricultural university , ch... 0 \n", "\n", " works_source \\\n", "2 NaN \n", "6 [a. can inci] \n", "9 NaN \n", "11 [multidisciplinary digital publishing institute] \n", "12 NaN \n", "... ... \n", "1753316 [crossref metadata search, scopus - elsevier, ... 
\n", "1753319 [luana arrial bastos] \n", "1753320 [crossref metadata search, scopus - elsevier, ... \n", "1753325 [ciênciavitae, scopus - elsevier, pg cardoso, ... \n", "1753337 NaN \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "2 2018-08-09t12:12:24.405z 2020-04-22t01:38:03.184z 0 0 \n", "6 2018-01-20t02:58:05.199z 2020-06-16t12:35:09.403z 0 0 \n", "9 2018-02-12t02:08:37.018z 2018-02-12t02:22:33.378z 0 0 \n", "11 2017-07-06t10:29:17.738z 2020-08-01t05:18:53.393z 1 0 \n", "12 2018-02-15t09:54:59.943z 2018-02-15t10:19:27.869z 0 0 \n", "... ... ... ... ... \n", "1753316 2013-07-09t14:39:30.950z 2020-12-10t17:42:20.176z 17 0 \n", "1753319 2017-05-11t13:14:59.372z 2020-12-08t20:18:24.163z 0 0 \n", "1753320 2014-02-19t08:15:15.698z 2020-12-09t18:14:17.963z 28 0 \n", "1753325 2013-11-26t10:59:34.331z 2020-12-02t15:28:26.221z 90 0 \n", "1753337 2020-12-11t02:11:51.808z 2020-12-11t03:25:28.263z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "2 0 0 False NaN NaN \n", "6 0 0 False NaN NaN \n", "9 0 0 False NaN NaN \n", "11 0 0 True NaN NaN \n", "12 0 0 False NaN NaN \n", "... ... ... ... ... ... \n", "1753316 0 29 False NaN NaN \n", "1753319 0 0 False NaN NaN \n", "1753320 11 17 True NaN NaN \n", "1753325 0 42 False NaN NaN \n", "1753337 0 0 False NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "2 [cnpq.br, linkedin.com] 2 \n", "6 NaN \n", "9 NaN \n", "11 NaN \n", "12 NaN \n", "... ... ... ... \n", "1753316 [linkedin.com, google.com, researchgate.net] 3 \n", "1753319 [unidospelasaude.com.br, facebook.com, faceboo... 4 \n", "1753320 NaN \n", "1753325 NaN \n", "1753337 NaN \n", "\n", " n_ids n_keywords n_education n_employment \\\n", "2 1 3 1 1 \n", "6 2 4 5 \n", "9 1 1 \n", "11 1 1 \n", "12 1 1 \n", "... ... ... ... ... 
\n", "1753316 2 5 3 3 \n", "1753319 2 4 3 \n", "1753320 2 5 7 8 \n", "1753325 3 7 1 3 \n", "1753337 2 1 \n", "\n", " ext_works_source n_ext_work_source \\\n", "2 NaN NaN \n", "6 [] 0.0 \n", "9 NaN NaN \n", "11 [multidisciplinary digital publishing institute] 1.0 \n", "12 NaN NaN \n", "... ... ... \n", "1753316 [crossref metadata search, scopus - elsevier] 2.0 \n", "1753319 [] 0.0 \n", "1753320 [crossref metadata search, scopus - elsevier, ... 4.0 \n", "1753325 [ciênciavitae, scopus - elsevier, pg cardoso, ... 4.0 \n", "1753337 NaN NaN \n", "\n", " authoritative spam_score n_valid_employment n_valid_education \n", "2 False NaN 0 0 \n", "6 False 4.341588e-10 0 0 \n", "9 False NaN 0 0 \n", "11 True NaN 0 0 \n", "12 False NaN 0 0 \n", "... ... ... ... ... \n", "1753316 True NaN 0 0 \n", "1753319 False 1.000000e+00 2 3 \n", "1753320 True 1.000000e+00 2 4 \n", "1753325 True 7.147059e-10 3 0 \n", "1753337 False 4.475163e-02 1 1 \n", "\n", "[473043 rows x 36 columns]" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Rows where n_education differs from n_valid_education\n", "df[df['n_education'] != df['n_valid_education']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Employment" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 2.680488e+06\n", "mean 1.664713e+00\n", "std 1.530077e+00\n", "min 1.000000e+00\n", "25% 1.000000e+00\n", "50% 1.000000e+00\n", "75% 2.000000e+00\n", "max 1.980000e+02\n", "Name: n_employment, dtype: float64" ] }, "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics of the n_employment column\n", "df['n_employment'].describe()" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_scoren_valid_employment
20207380000-0002-0293-964XTrueTrueben zhongtang<NA>[唐本忠]tangbenz@ust.hk[fluorescent biosensors, light-emitting molecu...[[hkust profile, tang-benzhong], [researcherid...[[department of chemistry and faculty of pharm...[[chair professor, division of biomedical engi...422[tang, benzhong, crossref]2015-03-13t00:28:33.270z2021-03-23t07:56:34.824z359000Falseust.hkNaN[ust.hk]<NA>1377198[crossref]1.0TrueNaN32
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "2020738 0000-0002-0293-964X True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", "2020738 ben zhong tang [唐本忠] tangbenz@ust.hk \n", "\n", " keywords \\\n", "2020738 [fluorescent biosensors, light-emitting molecu... \n", "\n", " external_ids \\\n", "2020738 [[hkust profile, tang-benzhong], [researcherid... \n", "\n", " education \\\n", "2020738 [[department of chemistry and faculty of pharm... \n", "\n", " employment n_works \\\n", "2020738 [[chair professor, division of biomedical engi... 422 \n", "\n", " works_source activation_date \\\n", "2020738 [tang, benzhong, crossref] 2015-03-13t00:28:33.270z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "2020738 2021-03-23t07:56:34.824z 359 0 0 0 False \n", "\n", " primary_email_domain other_email_domains url_domains n_emails \\\n", "2020738 ust.hk NaN [ust.hk] \n", "\n", " n_urls n_ids n_keywords n_education n_employment \\\n", "2020738 1 3 7 7 198 \n", "\n", " ext_works_source n_ext_work_source authoritative spam_score \\\n", "2020738 [crossref] 1.0 True NaN \n", "\n", " n_valid_employment \n", "2020738 32 " ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Row(s) holding the largest n_employment value\n", "df[df['n_employment'] == df['n_employment'].max()]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's count, for each ORCID iD, how many employments carry a valid organization identifier (Ringgold, ISNI, GRID, etc.)" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidemployment
10000-0001-6112-5550[professor, peoples friendship university of r...
30000-0001-6220-5683[research scientist, new york university abu d...
40000-0001-7071-8294[researcher (academic), universidad de zaragoz...
40000-0001-7071-8294[researcher (academic), instituto de síntesis ...
60000-0001-7402-0096[, kth royal institute of technology, stockhol...
.........
109896430000-0003-2606-0936[post-doc, institute of biochemistry and cell ...
109896440000-0002-1686-1935[master, sichuan agricultural university , che...
109896450000-0002-3800-6331[assistant professor, baruch college, city uni...
109896450000-0002-3800-6331[postdoctoral scholar, university of californi...
109896470000-0002-7584-2283[lecturer, henan institute of science and tech...
\n", "

4462243 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid \\\n", "1 0000-0001-6112-5550 \n", "3 0000-0001-6220-5683 \n", "4 0000-0001-7071-8294 \n", "4 0000-0001-7071-8294 \n", "6 0000-0001-7402-0096 \n", "... ... \n", "10989643 0000-0003-2606-0936 \n", "10989644 0000-0002-1686-1935 \n", "10989645 0000-0002-3800-6331 \n", "10989645 0000-0002-3800-6331 \n", "10989647 0000-0002-7584-2283 \n", "\n", " employment \n", "1 [professor, peoples friendship university of r... \n", "3 [research scientist, new york university abu d... \n", "4 [researcher (academic), universidad de zaragoz... \n", "4 [researcher (academic), instituto de síntesis ... \n", "6 [, kth royal institute of technology, stockhol... \n", "... ... \n", "10989643 [post-doc, institute of biochemistry and cell ... \n", "10989644 [master, sichuan agricultural university , che... \n", "10989645 [assistant professor, baruch college, city uni... \n", "10989645 [postdoctoral scholar, university of californi... \n", "10989647 [lecturer, henan institute of science and tech... \n", "\n", "[4462243 rows x 2 columns]" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One row per (orcid, employment entry); rows with a missing orcid or employment are dropped.\n", "exploded_employment = df[['orcid', 'employment']].explode('employment').dropna()\n", "exploded_employment" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [], "source": [ "# Unpack every employment entry (expected to be a 7-item list) into named columns.\n", "# The helper frame reuses exploded_employment's index so the assignment lines up row by row.\n", "exploded_employment[['role', 'institution', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_employment.employment.tolist(), index=exploded_employment.index)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "# Treat empty ids as missing so that a later count() of the id column ignores them.\n", "# Assign the result back instead of calling replace(..., inplace=True) on the selected column:\n", "# that chained in-place form does not update the frame under pandas Copy-on-Write.\n", "exploded_employment['id'] = exploded_employment['id'].replace('', pd.NA)" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidid
00000-0001-5000-00311
10000-0001-5000-01381
20000-0001-5000-01702
30000-0001-5000-02181
40000-0001-5000-02261
.........
26804830000-0003-4999-98311
26804840000-0003-4999-98901
26804850000-0003-4999-992X0
26804860000-0003-4999-99381
26804870000-0003-4999-99542
\n", "

2680488 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid id\n", "0 0000-0001-5000-0031 1\n", "1 0000-0001-5000-0138 1\n", "2 0000-0001-5000-0170 2\n", "3 0000-0001-5000-0218 1\n", "4 0000-0001-5000-0226 1\n", "... ... ..\n", "2680483 0000-0003-4999-9831 1\n", "2680484 0000-0003-4999-9890 1\n", "2680485 0000-0003-4999-992X 0\n", "2680486 0000-0003-4999-9938 1\n", "2680487 0000-0003-4999-9954 2\n", "\n", "[2680488 rows x 2 columns]" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of non-missing employment ids per ORCID iD (count() skips NA values)\n", "exploded_employment.groupby('orcid')['id'].count().reset_index()" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_scoren_valid_employment
00000-0001-6112-5550TrueTrue<NA><NA><NA>[v.i. yurtaev; v. yurtaev]<NA>NaNNaNNaN[[professor, peoples friendship university of ...0NaN2018-04-03t07:50:23.358z2020-03-18t09:42:44.753z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalseNaN1
10000-0001-6220-5683TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[research scientist, new york university abu ...0NaN2015-08-18t12:36:45.307z2020-09-23t13:37:54.180z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalseNaN0
20000-0001-7071-8294TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[researcher (academic), universidad de zarago...0NaN2014-03-10t13:22:01.966z2016-06-14t22:17:54.470z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>2NaNNaNFalseNaN1
30000-0001-7402-0096TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[, kth royal institute of technology, stockho...0NaN2015-01-11t15:13:06.467z2016-06-14t23:55:59.896z0000FalseNaNNaN[kth.se]<NA>1<NA><NA><NA>1NaNNaNFalseNaN0
40000-0001-8315-2066TrueTrue<NA><NA><NA>NaN<NA>[iron chlorosis, fertilizers, calcareous soil,...[[researcherid, k-4630-2014]]NaN[[, universidad de córdoba, córdoba, andalucía...0NaN2014-05-26t08:57:12.661z2019-03-27t07:53:48.987z0000FalseNaNNaNNaN<NA><NA>14<NA>1NaNNaNFalseNaN1
............................................................................................................
26804830000-0002-8004-688XTrueTruepaulwanjala muyoma<NA>[wanjala]<NA>[environment and sustainability]NaNNaN[[graduate teaching assistant, university of p...0NaN2016-03-07t08:53:06.561z2020-12-02t02:14:50.213z0000FalseNaNNaNNaN<NA><NA><NA>1<NA>2NaNNaNFalseNaN2
26804840000-0003-2606-0936TrueTrueluangxu<NA>[xu lu-ang, lu lu]<NA>NaNNaNNaN[[post-doc, institute of biochemistry and cell...2[scopus - elsevier, crossref]2015-10-24t03:53:23.544z2020-11-19t09:23:48.896z2001TrueNaNNaNNaN<NA><NA><NA><NA><NA>1[scopus - elsevier, crossref]2.0TrueNaN1
26804850000-0002-1686-1935TrueTrueyouxiawangyouxia wang (1995-), native of zunyi, guizhou ...NaN<NA>NaNNaN[[institute of animal nutrition, master degree...[[master, sichuan agricultural university , ch...0NaN2020-12-11t02:11:51.808z2020-12-11t03:25:28.263z0000FalseNaNNaNNaN<NA><NA><NA><NA>21NaNNaNFalse0.0447521
26804860000-0002-3800-6331TrueTruezacharycalamari<NA>NaN<NA>NaNNaN[[richard gilder graduate school, phd in compa...[[assistant professor, baruch college, city un...7[crossref metadata search, zachary t. calamari...2015-01-20t20:20:17.042z2020-11-21t19:48:36.221z7010TrueNaNNaNNaN<NA><NA><NA><NA>22[crossref metadata search, crossref]2.0TrueNaN0
26804870000-0002-7584-2283TrueTrue现刚<NA>[zuo xiangang, xiangang zuo, zuo x g, x g zuo]<NA>NaNNaN[[school of electronics and information, maste...[[lecturer, henan institute of science and tec...0NaN2016-12-27t07:45:25.073z2020-11-29t13:06:17.582z0000FalseNaNNaNNaN<NA><NA><NA><NA>21NaNNaNFalseNaN1
\n", "

2680488 rows × 35 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "0 0000-0001-6112-5550 True True \n", "1 0000-0001-6220-5683 True True \n", "2 0000-0001-7071-8294 True True \n", "3 0000-0001-7402-0096 True True \n", "4 0000-0001-8315-2066 True True \n", "... ... ... ... \n", "2680483 0000-0002-8004-688X True True \n", "2680484 0000-0003-2606-0936 True True \n", "2680485 0000-0002-1686-1935 True True \n", "2680486 0000-0002-3800-6331 True True \n", "2680487 0000-0002-7584-2283 True True \n", "\n", " given_names family_name \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "... ... ... \n", "2680483 paul wanjala muyoma \n", "2680484 luang xu \n", "2680485 youxia wang \n", "2680486 zachary calamari \n", "2680487 现刚 左 \n", "\n", " biography \\\n", "0 \n", "1 \n", "2 \n", "3 \n", "4 \n", "... ... \n", "2680483 \n", "2680484 \n", "2680485 youxia wang (1995-), native of zunyi, guizhou ... \n", "2680486 \n", "2680487 \n", "\n", " other_names primary_email \\\n", "0 [v.i. yurtaev; v. yurtaev] \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... ... \n", "2680483 [wanjala] \n", "2680484 [xu lu-ang, lu lu] \n", "2680485 NaN \n", "2680486 NaN \n", "2680487 [zuo xiangang, xiangang zuo, zuo x g, x g zuo] \n", "\n", " keywords \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 [iron chlorosis, fertilizers, calcareous soil,... \n", "... ... \n", "2680483 [environment and sustainability] \n", "2680484 NaN \n", "2680485 NaN \n", "2680486 NaN \n", "2680487 NaN \n", "\n", " external_ids \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 [[researcherid, k-4630-2014]] \n", "... ... \n", "2680483 NaN \n", "2680484 NaN \n", "2680485 NaN \n", "2680486 NaN \n", "2680487 NaN \n", "\n", " education \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... \n", "2680483 NaN \n", "2680484 NaN \n", "2680485 [[institute of animal nutrition, master degree... \n", "2680486 [[richard gilder graduate school, phd in compa... 
\n", "2680487 [[school of electronics and information, maste... \n", "\n", " employment n_works \\\n", "0 [[professor, peoples friendship university of ... 0 \n", "1 [[research scientist, new york university abu ... 0 \n", "2 [[researcher (academic), universidad de zarago... 0 \n", "3 [[, kth royal institute of technology, stockho... 0 \n", "4 [[, universidad de córdoba, córdoba, andalucía... 0 \n", "... ... ... \n", "2680483 [[graduate teaching assistant, university of p... 0 \n", "2680484 [[post-doc, institute of biochemistry and cell... 2 \n", "2680485 [[master, sichuan agricultural university , ch... 0 \n", "2680486 [[assistant professor, baruch college, city un... 7 \n", "2680487 [[lecturer, henan institute of science and tec... 0 \n", "\n", " works_source \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", "... ... \n", "2680483 NaN \n", "2680484 [scopus - elsevier, crossref] \n", "2680485 NaN \n", "2680486 [crossref metadata search, zachary t. calamari... \n", "2680487 NaN \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "0 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z 0 0 \n", "1 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z 0 0 \n", "2 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z 0 0 \n", "3 2015-01-11t15:13:06.467z 2016-06-14t23:55:59.896z 0 0 \n", "4 2014-05-26t08:57:12.661z 2019-03-27t07:53:48.987z 0 0 \n", "... ... ... ... ... \n", "2680483 2016-03-07t08:53:06.561z 2020-12-02t02:14:50.213z 0 0 \n", "2680484 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n", "2680485 2020-12-11t02:11:51.808z 2020-12-11t03:25:28.263z 0 0 \n", "2680486 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", "2680487 2016-12-27t07:45:25.073z 2020-11-29t13:06:17.582z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "0 0 0 False NaN NaN \n", "1 0 0 False NaN NaN \n", "2 0 0 False NaN NaN \n", "3 0 0 False NaN NaN \n", "4 0 0 False NaN NaN \n", "... ... ... ... ... ... 
\n", "2680483 0 0 False NaN NaN \n", "2680484 0 1 True NaN NaN \n", "2680485 0 0 False NaN NaN \n", "2680486 1 0 True NaN NaN \n", "2680487 0 0 False NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords n_education \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 [kth.se] 1 \n", "4 NaN 1 4 \n", "... ... ... ... ... ... ... \n", "2680483 NaN 1 \n", "2680484 NaN \n", "2680485 NaN 2 \n", "2680486 NaN 2 \n", "2680487 NaN 2 \n", "\n", " n_employment ext_works_source \\\n", "0 1 NaN \n", "1 1 NaN \n", "2 2 NaN \n", "3 1 NaN \n", "4 1 NaN \n", "... ... ... \n", "2680483 2 NaN \n", "2680484 1 [scopus - elsevier, crossref] \n", "2680485 1 NaN \n", "2680486 2 [crossref metadata search, crossref] \n", "2680487 1 NaN \n", "\n", " n_ext_work_source authoritative spam_score n_valid_employment \n", "0 NaN False NaN 1 \n", "1 NaN False NaN 0 \n", "2 NaN False NaN 1 \n", "3 NaN False NaN 0 \n", "4 NaN False NaN 1 \n", "... ... ... ... ... \n", "2680483 NaN False NaN 2 \n", "2680484 2.0 True NaN 1 \n", "2680485 NaN False 0.044752 1 \n", "2680486 2.0 True NaN 0 \n", "2680487 NaN False NaN 1 \n", "\n", "[2680488 rows x 35 columns]" ] }, "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of employments with a non-missing id, per ORCID iD.\n", "valid_employment_counts = exploded_employment.groupby('orcid')['id'].count().rename('n_valid_employment').reset_index()\n", "# how='left' keeps profiles that have no employment entry (the default inner merge silently dropped them);\n", "# dropping a stale n_valid_employment column first keeps the cell safe to re-run.\n", "df = df.drop(columns='n_valid_employment', errors='ignore').merge(valid_employment_counts, on='orcid', how='left')\n", "# Profiles absent from the counts have no employment with an id: record 0 rather than NaN.\n", "df['n_valid_employment'] = df['n_valid_employment'].fillna(0).astype(int)" ] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_scoren_valid_employment
10000-0001-6220-5683TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[research scientist, new york university abu ...0NaN2015-08-18t12:36:45.307z2020-09-23t13:37:54.180z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalseNaN0
20000-0001-7071-8294TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[researcher (academic), universidad de zarago...0NaN2014-03-10t13:22:01.966z2016-06-14t22:17:54.470z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>2NaNNaNFalseNaN1
30000-0001-7402-0096TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[, kth royal institute of technology, stockho...0NaN2015-01-11t15:13:06.467z2016-06-14t23:55:59.896z0000FalseNaNNaN[kth.se]<NA>1<NA><NA><NA>1NaNNaNFalseNaN0
50000-0001-8377-3508TrueTrue<NA><NA><NA>[fontana, milena da silva]<NA>[educação; informática; matemática.]NaNNaN[[, instituto federal de educação, ciência e t...0NaN2018-05-23t23:39:04.534z2019-10-16t02:50:11.007z0000FalseNaNNaN[cnpq.br]<NA>1<NA>1<NA>3NaNNaNFalseNaN0
80000-0002-6508-6998TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaN[[researcher (academic), universidad de zarago...0NaN2014-03-12t08:23:22.492z2015-07-27t15:51:38.411z0000FalseNaNNaNNaN<NA><NA><NA><NA><NA>2NaNNaNFalseNaN1
............................................................................................................
26804760000-0001-9133-2366TrueTruesørenstaugaard<NA>NaN<NA>NaNNaN[[, , aarhus universitet, aarhus, , dk, 1006, ...[[, aarhus university, aarhus c, , dk, , ], [s...29[aarhus university, crossref]2013-03-19t11:34:48.477z2020-12-07t08:03:23.190z1401035TrueNaNNaN[au.dk, au.dk]<NA>2<NA><NA>13[aarhus university, crossref]2.0TrueNaN1
26804770000-0001-8494-2123TrueTruetarunjain<NA>NaN<NA>[pet/ct specialist; nuclear medicine physician...NaNNaN[[assistant professor, mahatma gandhi medical ...0NaN2014-12-19t08:21:46.292z2020-12-09t06:03:57.055z0000FalseNaNNaNNaN<NA><NA><NA>1<NA>5NaNNaNFalseNaN4
26804790000-0002-2906-0299TrueTruetiffanymackay<NA>[tiffany russel sia]<NA>[microfluidics, gpc-1, gallium-67, pet/ct, oxy...[[researcherid, a-2121-2017]][[faculty of medicine, master in pharmaceutica...[[clinical project lead, minomic international...11[crossref, researcherid, tiffany mackay]2017-01-03t23:28:48.736z2020-12-09t17:12:20.326z11000TrueNaNNaN[oxytocin.com.au, linkedin.com]<NA>211324[crossref, researcherid]2.0TrueNaN1
26804810000-0002-4422-4036TrueTruevijaykrishnan<NA>NaN<NA>NaNNaN[[psychiatry, md, all india institute of medic...[[assistant professor, all india institute of ...2[crossref]2015-05-28t17:24:39.519z2020-11-24t08:57:22.875z2000FalseNaNNaNNaN<NA><NA><NA><NA>25[crossref]1.0TrueNaN3
26804860000-0002-3800-6331TrueTruezacharycalamari<NA>NaN<NA>NaNNaN[[richard gilder graduate school, phd in compa...[[assistant professor, baruch college, city un...7[crossref metadata search, zachary t. calamari...2015-01-20t20:20:17.042z2020-11-21t19:48:36.221z7010TrueNaNNaNNaN<NA><NA><NA><NA>22[crossref metadata search, crossref]2.0TrueNaN0
\n", "

1036967 rows × 35 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "1 0000-0001-6220-5683 True True \n", "2 0000-0001-7071-8294 True True \n", "3 0000-0001-7402-0096 True True \n", "5 0000-0001-8377-3508 True True \n", "8 0000-0002-6508-6998 True True \n", "... ... ... ... \n", "2680476 0000-0001-9133-2366 True True \n", "2680477 0000-0001-8494-2123 True True \n", "2680479 0000-0002-2906-0299 True True \n", "2680481 0000-0002-4422-4036 True True \n", "2680486 0000-0002-3800-6331 True True \n", "\n", " given_names family_name biography other_names \\\n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "5 [fontana, milena da silva] \n", "8 NaN \n", "... ... ... ... ... \n", "2680476 søren staugaard NaN \n", "2680477 tarun jain NaN \n", "2680479 tiffany mackay [tiffany russel sia] \n", "2680481 vijay krishnan NaN \n", "2680486 zachary calamari NaN \n", "\n", " primary_email keywords \\\n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "5 [educação; informática; matemática.] \n", "8 NaN \n", "... ... ... \n", "2680476 NaN \n", "2680477 [pet/ct specialist; nuclear medicine physician... \n", "2680479 [microfluidics, gpc-1, gallium-67, pet/ct, oxy... \n", "2680481 NaN \n", "2680486 NaN \n", "\n", " external_ids \\\n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "5 NaN \n", "8 NaN \n", "... ... \n", "2680476 NaN \n", "2680477 NaN \n", "2680479 [[researcherid, a-2121-2017]] \n", "2680481 NaN \n", "2680486 NaN \n", "\n", " education \\\n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "5 NaN \n", "8 NaN \n", "... ... \n", "2680476 [[, , aarhus universitet, aarhus, , dk, 1006, ... \n", "2680477 NaN \n", "2680479 [[faculty of medicine, master in pharmaceutica... \n", "2680481 [[psychiatry, md, all india institute of medic... \n", "2680486 [[richard gilder graduate school, phd in compa... \n", "\n", " employment n_works \\\n", "1 [[research scientist, new york university abu ... 0 \n", "2 [[researcher (academic), universidad de zarago... 0 \n", "3 [[, kth royal institute of technology, stockho... 
0 \n", "5 [[, instituto federal de educação, ciência e t... 0 \n", "8 [[researcher (academic), universidad de zarago... 0 \n", "... ... ... \n", "2680476 [[, aarhus university, aarhus c, , dk, , ], [s... 29 \n", "2680477 [[assistant professor, mahatma gandhi medical ... 0 \n", "2680479 [[clinical project lead, minomic international... 11 \n", "2680481 [[assistant professor, all india institute of ... 2 \n", "2680486 [[assistant professor, baruch college, city un... 7 \n", "\n", " works_source \\\n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "5 NaN \n", "8 NaN \n", "... ... \n", "2680476 [aarhus university, crossref] \n", "2680477 NaN \n", "2680479 [crossref, researcherid, tiffany mackay] \n", "2680481 [crossref] \n", "2680486 [crossref metadata search, zachary t. calamari... \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "1 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z 0 0 \n", "2 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z 0 0 \n", "3 2015-01-11t15:13:06.467z 2016-06-14t23:55:59.896z 0 0 \n", "5 2018-05-23t23:39:04.534z 2019-10-16t02:50:11.007z 0 0 \n", "8 2014-03-12t08:23:22.492z 2015-07-27t15:51:38.411z 0 0 \n", "... ... ... ... ... \n", "2680476 2013-03-19t11:34:48.477z 2020-12-07t08:03:23.190z 14 0 \n", "2680477 2014-12-19t08:21:46.292z 2020-12-09t06:03:57.055z 0 0 \n", "2680479 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n", "2680481 2015-05-28t17:24:39.519z 2020-11-24t08:57:22.875z 2 0 \n", "2680486 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "1 0 0 False NaN NaN \n", "2 0 0 False NaN NaN \n", "3 0 0 False NaN NaN \n", "5 0 0 False NaN NaN \n", "8 0 0 False NaN NaN \n", "... ... ... ... ... ... 
\n", "2680476 10 35 True NaN NaN \n", "2680477 0 0 False NaN NaN \n", "2680479 0 0 True NaN NaN \n", "2680481 0 0 False NaN NaN \n", "2680486 1 0 True NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "1 NaN \n", "2 NaN \n", "3 [kth.se] 1 \n", "5 [cnpq.br] 1 1 \n", "8 NaN \n", "... ... ... ... ... ... \n", "2680476 [au.dk, au.dk] 2 \n", "2680477 NaN 1 \n", "2680479 [oxytocin.com.au, linkedin.com] 2 1 13 \n", "2680481 NaN \n", "2680486 NaN \n", "\n", " n_education n_employment ext_works_source \\\n", "1 1 NaN \n", "2 2 NaN \n", "3 1 NaN \n", "5 3 NaN \n", "8 2 NaN \n", "... ... ... ... \n", "2680476 1 3 [aarhus university, crossref] \n", "2680477 5 NaN \n", "2680479 2 4 [crossref, researcherid] \n", "2680481 2 5 [crossref] \n", "2680486 2 2 [crossref metadata search, crossref] \n", "\n", " n_ext_work_source authoritative spam_score n_valid_employment \n", "1 NaN False NaN 0 \n", "2 NaN False NaN 1 \n", "3 NaN False NaN 0 \n", "5 NaN False NaN 0 \n", "8 NaN False NaN 1 \n", "... ... ... ... ... 
\n", "2680476 2.0 True NaN 1 \n", "2680477 NaN False NaN 4 \n", "2680479 2.0 True NaN 1 \n", "2680481 1.0 True NaN 3 \n", "2680486 2.0 True NaN 0 \n", "\n", "[1036967 rows x 35 columns]" ] }, "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.n_employment != df.n_valid_employment]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Biography" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "df['biography'] = df[df.biography.notna()]['biography'].replace('', np.NaN)" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 354015\n", "unique 337007\n", "top car title loans are a more straightforward way...\n", "freq 343\n", "Name: biography, dtype: object" ] }, "execution_count": 64, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.biography.describe()" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritative
513060000-0002-7397-7977TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan upland]NaNNaNNaN0NaN2020-11-06t06:10:20.070z2020-11-06t06:24:28.005z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
513070000-0003-4931-9736TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan saratoga]NaNNaNNaN0NaN2020-11-13t01:04:19.859z2020-11-13t01:15:12.546z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
1060240000-0001-8221-2303TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan victorville]NaNNaNNaN0NaN2020-11-05t00:38:21.096z2020-11-05t00:40:40.091z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
1087700000-0001-6736-072XTrueTruepremium cartitle loanscar title loans are a more straightforward way...NaN<NA>NaNNaNNaNNaN0NaN2020-12-08t05:38:30.786z2020-12-08t05:40:03.786z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA><NA><NA><NA>NaNNaNFalse
1087710000-0002-8727-1246TrueTruepremium cartitle loanscar title loans are a more straightforward way...[loan agency]<NA>[refinance car title loan, title loan on car, ...NaNNaNNaN0NaN2020-12-10t08:54:56.127z2020-12-10t08:57:15.791z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>4<NA><NA>NaNNaNFalse
......................................................................................................
108754160000-0002-9640-8136TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan clovis]NaNNaNNaN0NaN2020-10-22t06:11:02.945z2020-10-22t06:17:09.111z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
108782390000-0002-6926-3752TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan escondido]NaNNaNNaN0NaN2020-12-03t02:00:33.684z2020-12-03t02:02:07.054z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
109333800000-0002-3655-4713TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan san rafael]NaNNaNNaN0NaN2020-11-18t00:39:17.492z2020-11-18t00:52:19.024z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
109333810000-0002-8724-1020TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan san juan capistrano]NaNNaNNaN0NaN2020-11-19t00:31:54.080z2020-11-19t00:34:08.721z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
109859860000-0002-4601-4569TrueTruepremium cartitle loanscar title loans are a more straightforward way...[premium car title loans]<NA>[car title loan mount pleasant]NaNNaNNaN0NaN2020-10-16t00:32:26.207z2020-10-16t00:37:42.646z0000FalseNaNNaN[premiumcartitleloans.com]<NA>1<NA>1<NA><NA>NaNNaNFalse
\n", "

421 rows × 33 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "51306 0000-0002-7397-7977 True True \n", "51307 0000-0003-4931-9736 True True \n", "106024 0000-0001-8221-2303 True True \n", "108770 0000-0001-6736-072X True True \n", "108771 0000-0002-8727-1246 True True \n", "... ... ... ... \n", "10875416 0000-0002-9640-8136 True True \n", "10878239 0000-0002-6926-3752 True True \n", "10933380 0000-0002-3655-4713 True True \n", "10933381 0000-0002-8724-1020 True True \n", "10985986 0000-0002-4601-4569 True True \n", "\n", " given_names family_name \\\n", "51306 premium car title loans \n", "51307 premium car title loans \n", "106024 premium car title loans \n", "108770 premium car title loans \n", "108771 premium car title loans \n", "... ... ... \n", "10875416 premium car title loans \n", "10878239 premium car title loans \n", "10933380 premium car title loans \n", "10933381 premium car title loans \n", "10985986 premium car title loans \n", "\n", " biography \\\n", "51306 car title loans are a more straightforward way... \n", "51307 car title loans are a more straightforward way... \n", "106024 car title loans are a more straightforward way... \n", "108770 car title loans are a more straightforward way... \n", "108771 car title loans are a more straightforward way... \n", "... ... \n", "10875416 car title loans are a more straightforward way... \n", "10878239 car title loans are a more straightforward way... \n", "10933380 car title loans are a more straightforward way... \n", "10933381 car title loans are a more straightforward way... \n", "10985986 car title loans are a more straightforward way... \n", "\n", " other_names primary_email \\\n", "51306 [premium car title loans] \n", "51307 [premium car title loans] \n", "106024 [premium car title loans] \n", "108770 NaN \n", "108771 [loan agency] \n", "... ... ... 
\n", "10875416 [premium car title loans] \n", "10878239 [premium car title loans] \n", "10933380 [premium car title loans] \n", "10933381 [premium car title loans] \n", "10985986 [premium car title loans] \n", "\n", " keywords external_ids \\\n", "51306 [car title loan upland] NaN \n", "51307 [car title loan saratoga] NaN \n", "106024 [car title loan victorville] NaN \n", "108770 NaN NaN \n", "108771 [refinance car title loan, title loan on car, ... NaN \n", "... ... ... \n", "10875416 [car title loan clovis] NaN \n", "10878239 [car title loan escondido] NaN \n", "10933380 [car title loan san rafael] NaN \n", "10933381 [car title loan san juan capistrano] NaN \n", "10985986 [car title loan mount pleasant] NaN \n", "\n", " education employment n_works works_source activation_date \\\n", "51306 NaN NaN 0 NaN 2020-11-06t06:10:20.070z \n", "51307 NaN NaN 0 NaN 2020-11-13t01:04:19.859z \n", "106024 NaN NaN 0 NaN 2020-11-05t00:38:21.096z \n", "108770 NaN NaN 0 NaN 2020-12-08t05:38:30.786z \n", "108771 NaN NaN 0 NaN 2020-12-10t08:54:56.127z \n", "... ... ... ... ... ... \n", "10875416 NaN NaN 0 NaN 2020-10-22t06:11:02.945z \n", "10878239 NaN NaN 0 NaN 2020-12-03t02:00:33.684z \n", "10933380 NaN NaN 0 NaN 2020-11-18t00:39:17.492z \n", "10933381 NaN NaN 0 NaN 2020-11-19t00:31:54.080z \n", "10985986 NaN NaN 0 NaN 2020-10-16t00:32:26.207z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", "51306 2020-11-06t06:24:28.005z 0 0 0 0 \n", "51307 2020-11-13t01:15:12.546z 0 0 0 0 \n", "106024 2020-11-05t00:40:40.091z 0 0 0 0 \n", "108770 2020-12-08t05:40:03.786z 0 0 0 0 \n", "108771 2020-12-10t08:57:15.791z 0 0 0 0 \n", "... ... ... ... ... ... 
\n", "10875416 2020-10-22t06:17:09.111z 0 0 0 0 \n", "10878239 2020-12-03t02:02:07.054z 0 0 0 0 \n", "10933380 2020-11-18t00:52:19.024z 0 0 0 0 \n", "10933381 2020-11-19t00:34:08.721z 0 0 0 0 \n", "10985986 2020-10-16t00:37:42.646z 0 0 0 0 \n", "\n", " label primary_email_domain other_email_domains \\\n", "51306 False NaN NaN \n", "51307 False NaN NaN \n", "106024 False NaN NaN \n", "108770 False NaN NaN \n", "108771 False NaN NaN \n", "... ... ... ... \n", "10875416 False NaN NaN \n", "10878239 False NaN NaN \n", "10933380 False NaN NaN \n", "10933381 False NaN NaN \n", "10985986 False NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "51306 [premiumcartitleloans.com] 1 1 \n", "51307 [premiumcartitleloans.com] 1 1 \n", "106024 [premiumcartitleloans.com] 1 1 \n", "108770 [premiumcartitleloans.com] 1 \n", "108771 [premiumcartitleloans.com] 1 4 \n", "... ... ... ... ... ... \n", "10875416 [premiumcartitleloans.com] 1 1 \n", "10878239 [premiumcartitleloans.com] 1 1 \n", "10933380 [premiumcartitleloans.com] 1 1 \n", "10933381 [premiumcartitleloans.com] 1 1 \n", "10985986 [premiumcartitleloans.com] 1 1 \n", "\n", " n_education n_employment ext_works_source n_ext_work_source \\\n", "51306 NaN NaN \n", "51307 NaN NaN \n", "106024 NaN NaN \n", "108770 NaN NaN \n", "108771 NaN NaN \n", "... ... ... ... ... \n", "10875416 NaN NaN \n", "10878239 NaN NaN \n", "10933380 NaN NaN \n", "10933381 NaN NaN \n", "10985986 NaN NaN \n", "\n", " authoritative \n", "51306 False \n", "51307 False \n", "106024 False \n", "108770 False \n", "108771 False \n", "... ... 
\n", "10875416 False \n", "10878239 False \n", "10933380 False \n", "10933381 False \n", "10985986 False \n", "\n", "[421 rows x 33 columns]" ] }, "execution_count": 65, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]" ] }, { "cell_type": "code", "execution_count": 66, "metadata": {}, "outputs": [], "source": [ "def score(bio):\n", " try:\n", " return antispam.score(bio)\n", " except: # if len(bio) < 3 the filter doesn't know how to handle that\n", " return -1" ] }, { "cell_type": "code", "execution_count": 67, "metadata": {}, "outputs": [], "source": [ "df['spam_score'] = df[df.biography.notna()]['biography'].apply(lambda bio: score(bio))" ] }, { "cell_type": "code", "execution_count": 68, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidbiography
255050000-0003-0505-2734j
1384870000-0002-3417-7299.....
1395950000-0003-3794-1288m.d., ph.d.
1933400000-0001-9655-4806肿瘤
1949900000-0002-9149-0142be y
.........
109278660000-0002-7341-5480ph.d.
109760800000-0003-4041-0840/
109766890000-0002-4285-8537
109769220000-0002-1545-8773hi
109873790000-0002-6302-4224.
\n", "

348 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid biography\n", "25505 0000-0003-0505-2734 j\n", "138487 0000-0002-3417-7299 .....\n", "139595 0000-0003-3794-1288 m.d., ph.d.\n", "193340 0000-0001-9655-4806 肿瘤\n", "194990 0000-0002-9149-0142 be y\n", "... ... ...\n", "10927866 0000-0002-7341-5480 ph.d.\n", "10976080 0000-0003-4041-0840 /\n", "10976689 0000-0002-4285-8537 \n", "10976922 0000-0002-1545-8773 hi\n", "10987379 0000-0002-6302-4224 .\n", "\n", "[348 rows x 2 columns]" ] }, "execution_count": 68, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.spam_score == -1][['orcid','biography']]" ] }, { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "df['spam_score'] = df['spam_score'].replace(-1, np.NaN)" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 3.536670e+05\n", "mean 6.098044e-01\n", "std 4.476618e-01\n", "min 1.917500e-22\n", "25% 1.858235e-02\n", "50% 9.529688e-01\n", "75% 9.999992e-01\n", "max 1.000000e+00\n", "Name: spam_score, dtype: float64" ] }, "execution_count": 70, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.spam_score.describe()" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
biographyspam_score
29investigador de la universidad de oviedo. depa...1.000000
83formación académica en la temática de manejo d...1.000000
217doctor en educación, maestro en gerencia de la...1.000000
222possui graduação em psicologia pela pontifícia...1.000000
470roofing contractors in seattle waroofing contr...1.000000
.........
10989593jose ignacio peláez sánchez ha sido profesor e...0.999966
10989603mestranda em tecnologia na saúde e foi aluna o...1.000000
10989605the phd degree of pharmacy was received under ...1.000000
10989615mostafa metwaly is an assistant lecturer at th...1.000000
10989617jual obat aborsi di tangerang, obat penggugur ...0.999999
\n", "

120733 rows × 2 columns

\n", "
" ], "text/plain": [ " biography spam_score\n", "29 investigador de la universidad de oviedo. depa... 1.000000\n", "83 formación académica en la temática de manejo d... 1.000000\n", "217 doctor en educación, maestro en gerencia de la... 1.000000\n", "222 possui graduação em psicologia pela pontifícia... 1.000000\n", "470 roofing contractors in seattle waroofing contr... 1.000000\n", "... ... ...\n", "10989593 jose ignacio peláez sánchez ha sido profesor e... 0.999966\n", "10989603 mestranda em tecnologia na saúde e foi aluna o... 1.000000\n", "10989605 the phd degree of pharmacy was received under ... 1.000000\n", "10989615 mostafa metwaly is an assistant lecturer at th... 1.000000\n", "10989617 jual obat aborsi di tangerang, obat penggugur ... 0.999999\n", "\n", "[120733 rows x 2 columns]" ] }, "execution_count": 71, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.spam_score > 0.9999][['biography', 'spam_score']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "TODO: offending words, sexually explicit content" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## All VS All correlation" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "coloraxis": "coloraxis", "hovertemplate": "x: %{x}
y: %{y}
color: %{z}", "name": "0", "type": "heatmap", "x": [ "verified_email", "verified_primary_email", "n_works", "n_doi", "n_arxiv", "n_pmc", "n_other_pids", "label", "n_emails", "n_urls", "n_ids", "n_keywords", "n_education", "n_employment", "n_ext_work_source", "spam_score", "n_valid_employment", "n_valid_education" ], "xaxis": "x", "y": [ "verified_email", "verified_primary_email", "n_works", "n_doi", "n_arxiv", "n_pmc", "n_other_pids", "label", "n_emails", "n_urls", "n_ids", "n_keywords", "n_education", "n_employment", "n_ext_work_source", "spam_score", "n_valid_employment", "n_valid_education" ], "yaxis": "y", "z": [ [ 1, 0.9851287263502948, 0.032912669844356764, 0.030955329523625504, 0.002797563048323001, 0.01323756721452204, 0.026255505701943696, 0.05741408837128926, 0.01569597153718797, 0.040989235686940816, 0.061568740672497534, 0.03696794447170714, 0.05352081277803731, 0.04004869062451576, 0.09668574568788427, 0.020983848636741746, 0.04230149069451306, 0.0530823224770852 ], [ 0.9851287263502948, 1, 0.032893021887808044, 0.030952716284260284, 0.0027246992919826396, 0.013197039191265785, 0.026305729103779554, 0.056657974238298256, 0.013593908269146495, 0.04075930430401838, 0.06093122306222735, 0.036427174310613826, 0.05283100363748757, 0.039617047413162375, 0.09617813225153261, 0.020779934453739678, 0.04187757510946746, 0.05252647681980516 ], [ 0.032912669844356764, 0.032893021887808044, 1, 0.9347845603401734, 0.31289353882027604, 0.3327034204536421, 0.8256148606228672, 0.24175475571155877, 0.025014569456040185, 0.15426190915221324, 0.3477514643689929, 0.13281808409522428, 0.06673049906774123, 0.14909994555141065, 0.3919479197818211, 0.11430255435331263, 0.1530900744997813, 0.04224623793246877 ], [ 0.030955329523625504, 0.030952716284260284, 0.9347845603401734, 1, 0.3551317964752695, 0.3396714358684357, 0.792393006061876, 0.23812475099223263, 0.021917404276208984, 0.12119446342488618, 0.3286894436814909, 0.11073297656642778, 0.048318069131114486, 
0.12097737688298277, 0.37572530981238894, 0.08706480424871708, 0.1367349622160526, 0.03356612824419668 ], [ 0.002797563048323001, 0.0027246992919826396, 0.31289353882027604, 0.3551317964752695, 1, -0.0006813987707150902, 0.2282600668610334, 0.01778638684849228, 0.0013809757631778703, 0.006444532015829912, 0.004329998524016476, 0.002683817677771984, 0.0013582777338929175, 0.012499036555991529, 0.01818525196078464, 0.0005840365476183185, 0.016598234185244483, 0.0016583682855190094 ], [ 0.01323756721452204, 0.013197039191265785, 0.3327034204536421, 0.3396714358684357, -0.0006813987707150902, 1, 0.24340068611465254, 0.1079192129721149, 0.004399691864203764, 0.04628651725406843, 0.07481405788686601, 0.042315521608991075, 0.04004460862097079, 0.08040739840894055, 0.16125389614532118, 0.02926424821268392, 0.08624271168084971, 0.02840212050111673 ], [ 0.026255505701943696, 0.026305729103779554, 0.8256148606228672, 0.792393006061876, 0.2282600668610334, 0.24340068611465254, 1, 0.19116755828206128, 0.01771983580822778, 0.1252448169099988, 0.3322716889228379, 0.09961147764401061, 0.041174205561030665, 0.10958484453530018, 0.33496675669425113, 0.09286341691577868, 0.11434288833407659, 0.025414430857047858 ], [ 0.05741408837128926, 0.056657974238298256, 0.24175475571155877, 0.23812475099223263, 0.01778638684849228, 0.1079192129721149, 0.19116755828206128, 1, 0.03224992209236746, 0.11065458894869484, 0.3192781208585223, 0.10852551976660856, 0.05957549731552794, 0.10279567787680673, 0.5003162103088351, 0.04411190681146708, 0.14858846866838135, 0.05163986473435027 ], [ 0.01569597153718797, 0.013593908269146495, 0.025014569456040185, 0.021917404276208984, 0.0013809757631778703, 0.004399691864203764, 0.01771983580822778, 0.03224992209236746, 1, 0.08461189794646956, 0.05362282810243895, 0.07093322291648477, 0.03791228294028205, 0.0392669868393358, 0.07145369872483175, 0.049930352165612917, 0.03549877798968587, 0.03111432559287062 ], [ 0.040989235686940816, 0.04075930430401838, 
0.15426190915221324, 0.12119446342488618, 0.006444532015829912, 0.04628651725406843, 0.1252448169099988, 0.11065458894869484, 0.08461189794646956, 1, 0.22021014568642164, 0.3859318893600913, 0.1423827468461567, 0.18096621138083624, 0.24286823272263663, 0.21228500020230104, 0.1459962513695925, 0.10496069983968939 ], [ 0.061568740672497534, 0.06093122306222735, 0.3477514643689929, 0.3286894436814909, 0.004329998524016476, 0.07481405788686601, 0.3322716889228379, 0.3192781208585223, 0.05362282810243895, 0.22021014568642164, 1, 0.21377525703111794, 0.09573562665274538, 0.1469607584988768, 0.6666060438319036, 0.11784342341053432, 0.17059756897606349, 0.07232857049616995 ], [ 0.03696794447170714, 0.036427174310613826, 0.13281808409522428, 0.11073297656642778, 0.002683817677771984, 0.042315521608991075, 0.09961147764401061, 0.10852551976660856, 0.07093322291648477, 0.3859318893600913, 0.21377525703111794, 1, 0.15256142173955728, 0.17722420130407404, 0.22691037584731275, 0.23230106942130674, 0.14594541852321113, 0.11454051858316387 ], [ 0.05352081277803731, 0.05283100363748757, 0.06673049906774123, 0.048318069131114486, 0.0013582777338929175, 0.04004460862097079, 0.041174205561030665, 0.05957549731552794, 0.03791228294028205, 0.1423827468461567, 0.09573562665274538, 0.15256142173955728, 1, 0.35408552736376164, 0.1432327734837157, 0.13359305997797627, 0.2749705812249971, 0.7885246402196713 ], [ 0.04004869062451576, 0.039617047413162375, 0.14909994555141065, 0.12097737688298277, 0.012499036555991529, 0.08040739840894055, 0.10958484453530018, 0.10279567787680673, 0.0392669868393358, 0.18096621138083624, 0.1469607584988768, 0.17722420130407404, 0.35408552736376164, 1, 0.1930653267280024, 0.14496890815575308, 0.7528620160953167, 0.26673120117740184 ], [ 0.09668574568788427, 0.09617813225153261, 0.3919479197818211, 0.37572530981238894, 0.01818525196078464, 0.16125389614532118, 0.33496675669425113, 0.5003162103088351, 0.07145369872483175, 0.24286823272263663, 0.6666060438319036, 
0.22691037584731275, 0.1432327734837157, 0.1930653267280024, 1, 0.1297554982537589, 0.22390994050434165, 0.11768415981384495 ], [ 0.020983848636741746, 0.020779934453739678, 0.11430255435331263, 0.08706480424871708, 0.0005840365476183185, 0.02926424821268392, 0.09286341691577868, 0.04411190681146708, 0.049930352165612917, 0.21228500020230104, 0.11784342341053432, 0.23230106942130674, 0.13359305997797627, 0.14496890815575308, 0.1297554982537589, 1, 0.09868450111186694, 0.09679171001982584 ], [ 0.04230149069451306, 0.04187757510946746, 0.1530900744997813, 0.1367349622160526, 0.016598234185244483, 0.08624271168084971, 0.11434288833407659, 0.14858846866838135, 0.03549877798968587, 0.1459962513695925, 0.17059756897606349, 0.14594541852321113, 0.2749705812249971, 0.7528620160953167, 0.22390994050434165, 0.09868450111186694, 1, 0.34138499506636344 ], [ 0.0530823224770852, 0.05252647681980516, 0.04224623793246877, 0.03356612824419668, 0.0016583682855190094, 0.02840212050111673, 0.025414430857047858, 0.05163986473435027, 0.03111432559287062, 0.10496069983968939, 0.07232857049616995, 0.11454051858316387, 0.7885246402196713, 0.26673120117740184, 0.11768415981384495, 0.09679171001982584, 0.34138499506636344, 1 ] ] } ], "layout": { "coloraxis": { "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "margin": { "t": 60 }, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", 
"minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 
0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, 
"colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, 
"hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "xaxis": { "anchor": "y", "constrain": "domain", "domain": [ 0, 1 ], "scaleanchor": "y" }, "yaxis": { "anchor": "x", "autorange": "reversed", "constrain": "domain", "domain": [ 0, 1 ] } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "# df[['verified_email', \n", "# 'verified_primary_email', \n", "# 'n_works', \n", "# 'n_doi',\n", "# 'n_arxiv', \n", "# 'n_pmc', \n", "# 'n_other_pids', \n", "# 'n_emails', \n", "# 'n_urls', \n", "# 'n_ids', \n", "# 'n_keywords', \n", "# 'n_employment', \n", "# 'n_education', \n", "# 'label']].to_pickle('../data/processed/features.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Label speculation" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_score
170000-0002-0137-3066TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2017-07-25t04:34:17.338z2019-11-27t17:54:45.418z0000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
190000-0002-0461-9711TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN2[crossref]2015-08-18t12:42:01.797z2019-12-06t11:37:38.203z2000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
220000-0002-0761-9450TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN1[crossref]2020-05-13t17:15:28.405z2020-08-11t21:00:45.694z1000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
330000-0002-4447-9215TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2017-07-24t09:37:50.242z2019-11-15t08:31:24.820z0000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
440000-0003-0426-4065TrueTrue<NA><NA><NA>[eliza i. gilbert]<NA>NaNNaNNaN[[, us fish and wildlife service, albuquerque,...0NaN2017-08-07t18:32:31.802z2020-04-08t16:48:55.732z0000TrueNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalseNaN
.........................................................................................................
109896350000-0002-7340-9697TrueTruetawandamarandure<NA>NaN<NA>NaN[[scopus author id, 48261373600]][[animal science, msc sustainable agriculture,...[[lecturer, zimbabwe open university faculty o...7[scopus - elsevier]2015-11-05t08:52:08.743z2020-12-09t17:59:18.350z7007TrueNaNNaNNaN<NA><NA>1<NA>33[scopus - elsevier]1.0TrueNaN
109896360000-0002-2906-0299TrueTruetiffanymackay<NA>[tiffany russel sia]<NA>[microfluidics, gpc-1, gallium-67, pet/ct, oxy...[[researcherid, a-2121-2017]][[faculty of medicine, master in pharmaceutica...[[clinical project lead, minomic international...11[crossref, researcherid, tiffany mackay]2017-01-03t23:28:48.736z2020-12-09t17:12:20.326z11000TrueNaNNaN[oxytocin.com.au, linkedin.com]<NA>211324[crossref, researcherid]2.0TrueNaN
109896370000-0001-5896-2024TrueTruegiovanni, ltiscia<NA>NaN<NA>NaN[[scopus author id, 54948242800]]NaNNaN70[scopus - elsevier, tiscia giovanni, l, europe...2016-07-27t10:09:13.585z2020-12-07t22:23:05.706z6501752TrueNaNNaNNaN<NA><NA>1<NA><NA><NA>[scopus - elsevier, europe pubmed central, cro...3.0TrueNaN
109896430000-0003-2606-0936TrueTrueluangxu<NA>[xu lu-ang, lu lu]<NA>NaNNaNNaN[[post-doc, institute of biochemistry and cell...2[scopus - elsevier, crossref]2015-10-24t03:53:23.544z2020-11-19t09:23:48.896z2001TrueNaNNaNNaN<NA><NA><NA><NA><NA>1[scopus - elsevier, crossref]2.0TrueNaN
109896450000-0002-3800-6331TrueTruezacharycalamari<NA>NaN<NA>NaNNaN[[richard gilder graduate school, phd in compa...[[assistant professor, baruch college, city un...7[crossref metadata search, zachary t. calamari...2015-01-20t20:20:17.042z2020-11-21t19:48:36.221z7010TrueNaNNaNNaN<NA><NA><NA><NA>22[crossref metadata search, crossref]2.0TrueNaN
\n", "

2075872 rows × 34 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "17 0000-0002-0137-3066 True True \n", "19 0000-0002-0461-9711 True True \n", "22 0000-0002-0761-9450 True True \n", "33 0000-0002-4447-9215 True True \n", "44 0000-0003-0426-4065 True True \n", "... ... ... ... \n", "10989635 0000-0002-7340-9697 True True \n", "10989636 0000-0002-2906-0299 True True \n", "10989637 0000-0001-5896-2024 True True \n", "10989643 0000-0003-2606-0936 True True \n", "10989645 0000-0002-3800-6331 True True \n", "\n", " given_names family_name biography other_names \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 [eliza i. gilbert] \n", "... ... ... ... ... \n", "10989635 tawanda marandure NaN \n", "10989636 tiffany mackay [tiffany russel sia] \n", "10989637 giovanni, l tiscia NaN \n", "10989643 luang xu [xu lu-ang, lu lu] \n", "10989645 zachary calamari NaN \n", "\n", " primary_email keywords \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... ... \n", "10989635 NaN \n", "10989636 [microfluidics, gpc-1, gallium-67, pet/ct, oxy... \n", "10989637 NaN \n", "10989643 NaN \n", "10989645 NaN \n", "\n", " external_ids \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989635 [[scopus author id, 48261373600]] \n", "10989636 [[researcherid, a-2121-2017]] \n", "10989637 [[scopus author id, 54948242800]] \n", "10989643 NaN \n", "10989645 NaN \n", "\n", " education \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989635 [[animal science, msc sustainable agriculture,... \n", "10989636 [[faculty of medicine, master in pharmaceutica... \n", "10989637 NaN \n", "10989643 NaN \n", "10989645 [[richard gilder graduate school, phd in compa... \n", "\n", " employment n_works \\\n", "17 NaN 0 \n", "19 NaN 2 \n", "22 NaN 1 \n", "33 NaN 0 \n", "44 [[, us fish and wildlife service, albuquerque,... 0 \n", "... ... ... 
\n", "10989635 [[lecturer, zimbabwe open university faculty o... 7 \n", "10989636 [[clinical project lead, minomic international... 11 \n", "10989637 NaN 70 \n", "10989643 [[post-doc, institute of biochemistry and cell... 2 \n", "10989645 [[assistant professor, baruch college, city un... 7 \n", "\n", " works_source \\\n", "17 NaN \n", "19 [crossref] \n", "22 [crossref] \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989635 [scopus - elsevier] \n", "10989636 [crossref, researcherid, tiffany mackay] \n", "10989637 [scopus - elsevier, tiscia giovanni, l, europe... \n", "10989643 [scopus - elsevier, crossref] \n", "10989645 [crossref metadata search, zachary t. calamari... \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "17 2017-07-25t04:34:17.338z 2019-11-27t17:54:45.418z 0 0 \n", "19 2015-08-18t12:42:01.797z 2019-12-06t11:37:38.203z 2 0 \n", "22 2020-05-13t17:15:28.405z 2020-08-11t21:00:45.694z 1 0 \n", "33 2017-07-24t09:37:50.242z 2019-11-15t08:31:24.820z 0 0 \n", "44 2017-08-07t18:32:31.802z 2020-04-08t16:48:55.732z 0 0 \n", "... ... ... ... ... \n", "10989635 2015-11-05t08:52:08.743z 2020-12-09t17:59:18.350z 7 0 \n", "10989636 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n", "10989637 2016-07-27t10:09:13.585z 2020-12-07t22:23:05.706z 65 0 \n", "10989643 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n", "10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "17 0 0 True NaN NaN \n", "19 0 0 True NaN NaN \n", "22 0 0 True NaN NaN \n", "33 0 0 True NaN NaN \n", "44 0 0 True NaN NaN \n", "... ... ... ... ... ... \n", "10989635 0 7 True NaN NaN \n", "10989636 0 0 True NaN NaN \n", "10989637 17 52 True NaN NaN \n", "10989643 0 1 True NaN NaN \n", "10989645 1 0 True NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... ... ... ... 
\n", "10989635 NaN 1 \n", "10989636 [oxytocin.com.au, linkedin.com] 2 1 \n", "10989637 NaN 1 \n", "10989643 NaN \n", "10989645 NaN \n", "\n", " n_keywords n_education n_employment \\\n", "17 \n", "19 \n", "22 \n", "33 \n", "44 1 \n", "... ... ... ... \n", "10989635 3 3 \n", "10989636 13 2 4 \n", "10989637 \n", "10989643 1 \n", "10989645 2 2 \n", "\n", " ext_works_source \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989635 [scopus - elsevier] \n", "10989636 [crossref, researcherid] \n", "10989637 [scopus - elsevier, europe pubmed central, cro... \n", "10989643 [scopus - elsevier, crossref] \n", "10989645 [crossref metadata search, crossref] \n", "\n", " n_ext_work_source authoritative spam_score \n", "17 NaN False NaN \n", "19 NaN False NaN \n", "22 NaN False NaN \n", "33 NaN False NaN \n", "44 NaN False NaN \n", "... ... ... ... \n", "10989635 1.0 True NaN \n", "10989636 2.0 True NaN \n", "10989637 3.0 True NaN \n", "10989643 2.0 True NaN \n", "10989645 2.0 True NaN \n", "\n", "[2075872 rows x 34 columns]" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 1]" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "# (df.n_works > 0) & (df.n_ids > 1)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 10989649 entries, 0 to 10989648\n", "Data columns (total 34 columns):\n", " # Column Dtype \n", "--- ------ ----- \n", " 0 orcid object \n", " 1 verified_email bool \n", " 2 verified_primary_email bool \n", " 3 given_names string \n", " 4 family_name string \n", " 5 biography string \n", " 6 other_names object \n", " 7 primary_email string \n", " 8 keywords object \n", " 9 external_ids object \n", " 10 education object \n", " 11 employment object \n", " 12 n_works Int16 \n", " 13 works_source object \n", " 14 activation_date 
string \n", " 15 last_update_date string \n", " 16 n_doi Int16 \n", " 17 n_arxiv Int16 \n", " 18 n_pmc Int16 \n", " 19 n_other_pids Int16 \n", " 20 label bool \n", " 21 primary_email_domain object \n", " 22 other_email_domains object \n", " 23 url_domains object \n", " 24 n_emails Int16 \n", " 25 n_urls Int16 \n", " 26 n_ids Int16 \n", " 27 n_keywords Int16 \n", " 28 n_education Int16 \n", " 29 n_employment Int16 \n", " 30 ext_works_source object \n", " 31 n_ext_work_source float64\n", " 32 authoritative object \n", " 33 spam_score float64\n", "dtypes: Int16(11), bool(3), float64(2), object(12), string(6)\n", "memory usage: 2.0+ GB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }