{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Exploratory analysis" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "TODO:\n", "- Understanding why fake profiles are created can give insight into how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)\n", "- Build a taxonomy of cases (e.g., author publishing with an empty ORCID profile, author publishing but not on OpenAIRE, etc.)\n", "- Is the temporal dimension of any use?\n", "- Can we access private info thanks to the OpenAIRE-ORCID agreement?\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import glob\n", "\n", "import pandas as pd\n", "import ast\n", "import tldextract\n", "import numpy as np\n", "\n", "import antispam\n", "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.graph_objs as go\n", "import plotly.express as px\n", "\n", "init_notebook_mode(connected=True)\n", "TOP_N = 0\n", "TOP_RANGE = [0, 0]\n", "\n", "def set_top_n(n):\n", " global TOP_N, TOP_RANGE\n", " TOP_N = n\n", " TOP_RANGE = [-.5, n - 1 + .5]\n", " \n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notable solid ORCID iDs for exploratory purposes:" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "AM = '0000-0002-5193-7851'\n", "PP = '0000-0002-8588-4196'\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notable anomalies:" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "JOURNAL = '0000-0003-1815-5732'\n", "NOINFO = '0000-0001-5009-2052'\n", "VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n", "# todo: find group-shared ORCiD, if possible" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Notable fake ORCID iDs:" ] }, { 
"cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "SCAFFOLD = '0000-0001-5004-7761'\n", "WHATSAPP = '0000-0001-6997-9470'\n", "PENIS = '0000-0002-3399-7287'\n", "BITCOIN = '0000-0002-7518-6845'\n", "FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment\n", "CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)\n", "PLUMBER = '0000-0002-1700-8311' # URL > 10 + works " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load the dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
00000-0001-6097-395300NaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN2018-03-02t09:29:16.528z2018-03-02t09:43:07.551z00000NaNNaNNaNNaNNaNNaNNaNNaNNaN
10000-0001-6112-555011NaNNaNNaN[v.i. yurtaev; v. yurtaev]NaNNaNNaNNaN[[professor, peoples friendship university of ...0NaN2018-04-03t07:50:23.358z2020-03-18t09:42:44.753z00000NaNNaNNaNNaNNaNNaNNaNNaN1.0
20000-0001-6152-269511NaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN2019-12-11t15:31:56.388z2020-01-28t15:34:17.309z00000NaNNaNNaNNaNNaNNaNNaNNaNNaN
30000-0001-6220-568311NaNNaNNaNNaNNaNNaNNaNNaN[[research scientist, new york university abu ...0NaN2015-08-18t12:36:45.307z2020-09-23t13:37:54.180z00000NaNNaNNaNNaNNaNNaNNaNNaN1.0
40000-0001-7071-829411NaNNaNNaNNaNNaNNaNNaNNaN[[researcher (academic), universidad de zarago...0NaN2014-03-10t13:22:01.966z2016-06-14t22:17:54.470z00000NaNNaNNaNNaNNaNNaNNaNNaN2.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email given_names \\\n", "0 0000-0001-6097-3953 0 0 NaN \n", "1 0000-0001-6112-5550 1 1 NaN \n", "2 0000-0001-6152-2695 1 1 NaN \n", "3 0000-0001-6220-5683 1 1 NaN \n", "4 0000-0001-7071-8294 1 1 NaN \n", "\n", " family_name biography other_names primary_email keywords \\\n", "0 NaN NaN NaN NaN NaN \n", "1 NaN NaN [v.i. yurtaev; v. yurtaev] NaN NaN \n", "2 NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN \n", "\n", " external_ids education employment \\\n", "0 NaN NaN NaN \n", "1 NaN NaN [[professor, peoples friendship university of ... \n", "2 NaN NaN NaN \n", "3 NaN NaN [[research scientist, new york university abu ... \n", "4 NaN NaN [[researcher (academic), universidad de zarago... \n", "\n", " n_works works_source activation_date last_update_date \\\n", "0 0 NaN 2018-03-02t09:29:16.528z 2018-03-02t09:43:07.551z \n", "1 0 NaN 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z \n", "2 0 NaN 2019-12-11t15:31:56.388z 2020-01-28t15:34:17.309z \n", "3 0 NaN 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z \n", "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "0 0 0 0 0 0 NaN \n", "1 0 0 0 0 0 NaN \n", "2 0 0 0 0 0 NaN \n", "3 0 0 0 0 0 NaN \n", "4 0 0 0 0 0 NaN \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "0 NaN NaN NaN NaN NaN NaN \n", "1 NaN NaN NaN NaN NaN NaN \n", "2 NaN NaN NaN NaN NaN NaN \n", "3 NaN NaN NaN NaN NaN NaN \n", "4 NaN NaN NaN NaN NaN NaN \n", "\n", " n_education n_employment \n", "0 NaN NaN \n", "1 NaN 1.0 \n", "2 NaN NaN \n", "3 NaN 1.0 \n", "4 NaN 2.0 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "parts = glob.glob('../data/processed/dataset.pkl.*')\n", "\n", "df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))\n", "df.head(5)" ] }, { "cell_type": "markdown", "metadata": 
{}, "source": [ "Notable profiles inspection" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
30732610000-0002-5193-785111andreamannoccidata scientist & researcher; scholarly knowled...NaNandrea.mannocci@isti.cnr.it[research infrastructures, science of science,...[[scopus author id, 55233589900]][[information engineering, ph.d., università d...[[research associate, istituto di scienza e te...37[scopus - elsevier, crossref metadata search, ...2017-09-12t14:28:33.467z2021-03-17t15:40:07.776z3400601isti.cnr.itNaN[github.io, twitter.com, linkedin.com]NaN3.01.05.04.05.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "3073261 0000-0002-5193-7851 1 1 \n", "\n", " given_names family_name \\\n", "3073261 andrea mannocci \n", "\n", " biography other_names \\\n", "3073261 data scientist & researcher; scholarly knowled... NaN \n", "\n", " primary_email \\\n", "3073261 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", "3073261 [research infrastructures, science of science,... \n", "\n", " external_ids \\\n", "3073261 [[scopus author id, 55233589900]] \n", "\n", " education \\\n", "3073261 [[information engineering, ph.d., università d... \n", "\n", " employment n_works \\\n", "3073261 [[research associate, istituto di scienza e te... 37 \n", "\n", " works_source \\\n", "3073261 [scopus - elsevier, crossref metadata search, ... \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "3073261 2017-09-12t14:28:33.467z 2021-03-17t15:40:07.776z 34 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "3073261 0 60 1 isti.cnr.it NaN \n", "\n", " url_domains n_emails n_urls n_ids \\\n", "3073261 [github.io, twitter.com, linkedin.com] NaN 3.0 1.0 \n", "\n", " n_keywords n_education n_employment \n", "3073261 5.0 4.0 5.0 " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == AM]" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
98872720000-0001-6997-947011otherwhatsappNaNNaNNaN[whatsapp gb apk, whatsapp gb baixar, whatsapp...NaNNaNNaN0NaN2020-10-07t10:37:12.237z2020-10-08t02:32:03.935z00000NaNNaN[otherwhatsapp.com, im-creator.com, facebook.c...NaN27.0NaN4.0NaNNaN
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "9887272 0000-0001-6997-9470 1 1 \n", "\n", " given_names family_name biography other_names primary_email \\\n", "9887272 other whatsapp NaN NaN NaN \n", "\n", " keywords external_ids \\\n", "9887272 [whatsapp gb apk, whatsapp gb baixar, whatsapp... NaN \n", "\n", " education employment n_works works_source activation_date \\\n", "9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "9887272 2020-10-08t02:32:03.935z 0 0 0 0 0 \n", "\n", " primary_email_domain other_email_domains \\\n", "9887272 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... NaN 27.0 \n", "\n", " n_ids n_keywords n_education n_employment \n", "9887272 NaN 4.0 NaN NaN " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['orcid'] == WHATSAPP]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/plain": [ "orcid 10989649\n", "verified_email 10989649\n", "verified_primary_email 10989649\n", "given_names 10959039\n", "family_name 10671715\n", "biography 354015\n", "other_names 554684\n", "primary_email 124722\n", "keywords 649637\n", "external_ids 1308598\n", "education 2441645\n", "employment 2680488\n", "n_works 10989649\n", "works_source 2740939\n", "activation_date 10989649\n", "last_update_date 10989649\n", "n_doi 10989649\n", "n_arxiv 10989649\n", "n_pmc 10989649\n", "n_other_pids 10989649\n", "label 10989649\n", "primary_email_domain 124722\n", "other_email_domains 48615\n", "url_domains 715067\n", "n_emails 48615\n", "n_urls 715067\n", "n_ids 1308598\n", "n_keywords 649637\n", "n_education 2441645\n", "n_employment 2680488\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.count()" ] }, { "cell_type": "code", 
"execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 10989649\n", "unique 10989649\n", "top 0000-0002-7963-4502\n", "freq 1\n", "Name: orcid, dtype: object" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['orcid'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Primary email" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 124722\n", "unique 124718\n", "top opercin@erbakan.edu.tr\n", "freq 2\n", "Name: primary_email, dtype: object" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['primary_email'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Duplicate primary emails (same address on more than one ORCID iD)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1681787 opercin@erbakan.edu.tr\n", "5590332 patrick.davey@monash.edu\n", "9316843 maykin@owasp.org\n", "10375852 andycheng2026@163.com\n", "Name: primary_email, dtype: object" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
75439810000-0002-0836-227111maykinwarasartNaNNaNmaykin@owasp.orgNaNNaNNaNNaN0NaN2020-09-15t04:43:55.709z2020-09-15t05:17:28.509z00000owasp.org[dga.or.th]NaN1.0NaNNaNNaNNaNNaN
93168430000-0001-9855-167611maykinwarasartNaNNaNmaykin@owasp.orgNaNNaNNaNNaN0NaN2020-10-23t17:51:51.925z2021-01-01t15:00:52.053z00000owasp.org[dga.or.th, ieee.org]NaN2.0NaNNaNNaNNaNNaN
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "7543981 0000-0002-0836-2271 1 1 \n", "9316843 0000-0001-9855-1676 1 1 \n", "\n", " given_names family_name biography other_names primary_email \\\n", "7543981 maykin warasart NaN NaN maykin@owasp.org \n", "9316843 maykin warasart NaN NaN maykin@owasp.org \n", "\n", " keywords external_ids education employment n_works works_source \\\n", "7543981 NaN NaN NaN NaN 0 NaN \n", "9316843 NaN NaN NaN NaN 0 NaN \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "7543981 2020-09-15t04:43:55.709z 2020-09-15t05:17:28.509z 0 0 \n", "9316843 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain \\\n", "7543981 0 0 0 owasp.org \n", "9316843 0 0 0 owasp.org \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids \\\n", "7543981 [dga.or.th] NaN 1.0 NaN NaN \n", "9316843 [dga.or.th, ieee.org] NaN 2.0 NaN NaN \n", "\n", " n_keywords n_education n_employment \n", "7543981 NaN NaN NaN \n", "9316843 NaN NaN NaN " ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['primary_email'] == 'maykin@owasp.org']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
3478520000-0002-2232-963811osmanperçinNaNNaNopercin@erbakan.edu.trNaNNaNNaNNaN0NaN2015-01-12t13:47:55.549z2020-01-27t07:38:24.269z00000erbakan.edu.trNaNNaNNaNNaNNaNNaNNaNNaN
16817870000-0003-0033-091811osmanperçinNaNNaNopercin@erbakan.edu.trNaNNaNNaN[[, necmettin erbakan university, konya, , tr,...0NaN2015-10-13t05:47:12.014z2020-12-25t13:52:03.976z00000erbakan.edu.trNaNNaNNaNNaNNaNNaNNaN1.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "347852 0000-0002-2232-9638 1 1 \n", "1681787 0000-0003-0033-0918 1 1 \n", "\n", " given_names family_name biography other_names primary_email \\\n", "347852 osman perçin NaN NaN opercin@erbakan.edu.tr \n", "1681787 osman perçin NaN NaN opercin@erbakan.edu.tr \n", "\n", " keywords external_ids education \\\n", "347852 NaN NaN NaN \n", "1681787 NaN NaN NaN \n", "\n", " employment n_works \\\n", "347852 NaN 0 \n", "1681787 [[, necmettin erbakan university, konya, , tr,... 0 \n", "\n", " works_source activation_date last_update_date \\\n", "347852 NaN 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z \n", "1681787 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "347852 0 0 0 0 0 erbakan.edu.tr \n", "1681787 0 0 0 0 0 erbakan.edu.tr \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "347852 NaN NaN NaN NaN NaN NaN \n", "1681787 NaN NaN NaN NaN NaN NaN \n", "\n", " n_education n_employment \n", "347852 NaN NaN \n", "1681787 NaN 1.0 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['primary_email'] == 'opercin@erbakan.edu.tr']" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
9540850000-0002-9158-175711patrickdaveyNaNNaNpatrick.davey@monash.edu[radiochemistry, radiopharmaceuticals, inorgan...NaNNaN[[phd student, monash university, melbourne, ,...0NaN2019-05-09t23:01:02.170z2019-08-20t03:00:17.844z00000monash.eduNaNNaNNaNNaNNaN4.0NaN1.0
55903320000-0002-8774-003011patrickdaveyNaNNaNpatrick.davey@monash.eduNaNNaNNaN[[phd student, monash university, melbourne, v...1[crossref]2018-09-11t10:47:10.997z2021-02-09t06:21:44.138z10001monash.eduNaNNaNNaNNaNNaNNaNNaN1.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "954085 0000-0002-9158-1757 1 1 \n", "5590332 0000-0002-8774-0030 1 1 \n", "\n", " given_names family_name biography other_names \\\n", "954085 patrick davey NaN NaN \n", "5590332 patrick davey NaN NaN \n", "\n", " primary_email \\\n", "954085 patrick.davey@monash.edu \n", "5590332 patrick.davey@monash.edu \n", "\n", " keywords external_ids \\\n", "954085 [radiochemistry, radiopharmaceuticals, inorgan... NaN \n", "5590332 NaN NaN \n", "\n", " education employment n_works \\\n", "954085 NaN [[phd student, monash university, melbourne, ,... 0 \n", "5590332 NaN [[phd student, monash university, melbourne, v... 1 \n", "\n", " works_source activation_date last_update_date \\\n", "954085 NaN 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z \n", "5590332 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", "954085 0 0 0 0 0 monash.edu \n", "5590332 1 0 0 0 1 monash.edu \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", "954085 NaN NaN NaN NaN NaN 4.0 \n", "5590332 NaN NaN NaN NaN NaN NaN \n", "\n", " n_education n_employment \n", "954085 NaN 1.0 \n", "5590332 NaN 1.0 " ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df['primary_email'] == 'patrick.davey@monash.edu']" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 124722\n", "unique 17160\n", "top gmail.com\n", "freq 26750\n", "Name: primary_email_domain, dtype: object" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['primary_email_domain'].describe()" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcid
primary_email_domain
gmail.com26750
hotmail.com3801
yahoo.com2625
163.com2132
yuhs.ac1134
......
imf.csic.es1
imf.org1
imfd.tu-freiberg.de1
imft.fr1
zzuli.edu.cn1
\n", "

17160 rows × 1 columns

\n", "
" ], "text/plain": [ " orcid\n", "primary_email_domain \n", "gmail.com 26750\n", "hotmail.com 3801\n", "yahoo.com 2625\n", "163.com 2132\n", "yuhs.ac 1134\n", "... ...\n", "imf.csic.es 1\n", "imf.org 1\n", "imfd.tu-freiberg.de 1\n", "imft.fr 1\n", "zzuli.edu.cn 1\n", "\n", "[17160 rows x 1 columns]" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "top_primary_emails = df[['primary_email_domain', 'orcid']]\\\n", " .groupby('primary_email_domain')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)\n", "top_primary_emails" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "gmail.com", "hotmail.com", "yahoo.com", "163.com", "yuhs.ac", "qq.com", "outlook.com", "126.com", "bu.edu", "usgs.gov", "mail.ru", "usp.br", "yahoo.com.br", "ua.pt", "umich.edu", "ust.hk", "foxmail.com", "uomustansiriyah.edu.iq", "yandex.ru", "uq.edu.au", "ukr.net", "unesp.br", "ucl.ac.uk", "ieee.org", "naver.com", "stcatz.ox.ac.uk", "st-annes.ox.ac.uk", "yahoo.fr", "ucm.es", "live.com" ], "y": [ 26750, 3801, 2625, 2132, 1134, 1059, 948, 766, 629, 586, 579, 464, 459, 302, 290, 277, 260, 248, 244, 235, 226, 218, 210, 205, 188, 184, 184, 174, 174, 165 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": 
"#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], 
"type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, 
"#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, 
"bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top-30 email domains" }, "xaxis": { "range": [ -0.5, 29.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(30)\n", "data = [\n", " go.Bar(\n", " x=top_primary_emails[:TOP_N].index,\n", " y=top_primary_emails[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top-%s email domains' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Other emails" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
2510000-0002-5916-446X11antonio gilvanteixeira júniorNaN[teixeira, antônio gilvan, júnior, antonio gil...gilvan.junior@aluno.ufca.edu.br[ethicis; medicine; infectology; neurology; ne...[[scopus author id, 56647743200], [scopus auth...[[faculty of health and life sciences, , unive...NaN14[antonio gilvan teixeira júnior, scopus - else...2016-05-18t11:26:36.642z2016-09-20t18:25:05.728z130080aluno.ufca.edu.br[liverpool.ac.uk][researchgate.net, academia.edu, cnpq.br]1.03.04.01.01.0NaN
3160000-0002-8742-947X11aarontan shing loongNaNNaNaaron.tanshingloong@wadh.ox.ac.ukNaNNaN[[ruskin school of art; wadham college, , univ...NaN0NaN2015-10-05t23:10:08.771z2016-06-14t19:55:50.313z00000wadh.ox.ac.uk[rsa.ox.ac.uk]NaN1.0NaNNaNNaN1.0NaN
4330000-0001-9097-228111abhisheksolankiNaNNaNNaNNaNNaNNaN[[senior engineer, robert bosch (india), benga...1[abhishek solanki]2019-04-22t04:43:06.232z2020-07-02t14:18:28.305z00000NaN[in.bosch.com][github.com, linkedin.com]1.02.0NaNNaNNaN2.0
4970000-0002-8614-300711adamarraNaNNaNNaNNaNNaNNaNNaN0NaN2017-11-15t06:33:45.625z2017-11-15t06:44:02.998z00000NaN[hct.ac.ae]NaN1.0NaNNaNNaNNaNNaN
8690000-0001-9884-549811albertoronzaniNaNNaNalberto@aronza.comNaNNaNNaN[[research scientist, vtt technical research c...19[crossref metadata search, alberto ronzani, cr...2014-04-16t13:21:54.287z2020-09-28t15:10:37.439z180031aronza.com[vtt.fi]NaN1.0NaNNaNNaNNaN1.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "251 0000-0002-5916-446X 1 1 \n", "316 0000-0002-8742-947X 1 1 \n", "433 0000-0001-9097-2281 1 1 \n", "497 0000-0002-8614-3007 1 1 \n", "869 0000-0001-9884-5498 1 1 \n", "\n", " given_names family_name biography \\\n", "251 antonio gilvan teixeira júnior NaN \n", "316 aaron tan shing loong NaN \n", "433 abhishek solanki NaN \n", "497 adam arra NaN \n", "869 alberto ronzani NaN \n", "\n", " other_names \\\n", "251 [teixeira, antônio gilvan, júnior, antonio gil... \n", "316 NaN \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " primary_email \\\n", "251 gilvan.junior@aluno.ufca.edu.br \n", "316 aaron.tanshingloong@wadh.ox.ac.uk \n", "433 NaN \n", "497 NaN \n", "869 alberto@aronza.com \n", "\n", " keywords \\\n", "251 [ethicis; medicine; infectology; neurology; ne... \n", "316 NaN \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " external_ids \\\n", "251 [[scopus author id, 56647743200], [scopus auth... \n", "316 NaN \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " education \\\n", "251 [[faculty of health and life sciences, , unive... \n", "316 [[ruskin school of art; wadham college, , univ... \n", "433 NaN \n", "497 NaN \n", "869 NaN \n", "\n", " employment n_works \\\n", "251 NaN 14 \n", "316 NaN 0 \n", "433 [[senior engineer, robert bosch (india), benga... 1 \n", "497 NaN 0 \n", "869 [[research scientist, vtt technical research c... 19 \n", "\n", " works_source \\\n", "251 [antonio gilvan teixeira júnior, scopus - else... \n", "316 NaN \n", "433 [abhishek solanki] \n", "497 NaN \n", "869 [crossref metadata search, alberto ronzani, cr... 
\n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "251 2016-05-18t11:26:36.642z 2016-09-20t18:25:05.728z 13 0 \n", "316 2015-10-05t23:10:08.771z 2016-06-14t19:55:50.313z 0 0 \n", "433 2019-04-22t04:43:06.232z 2020-07-02t14:18:28.305z 0 0 \n", "497 2017-11-15t06:33:45.625z 2017-11-15t06:44:02.998z 0 0 \n", "869 2014-04-16t13:21:54.287z 2020-09-28t15:10:37.439z 18 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "251 0 8 0 aluno.ufca.edu.br [liverpool.ac.uk] \n", "316 0 0 0 wadh.ox.ac.uk [rsa.ox.ac.uk] \n", "433 0 0 0 NaN [in.bosch.com] \n", "497 0 0 0 NaN [hct.ac.ae] \n", "869 0 3 1 aronza.com [vtt.fi] \n", "\n", " url_domains n_emails n_urls n_ids \\\n", "251 [researchgate.net, academia.edu, cnpq.br] 1.0 3.0 4.0 \n", "316 NaN 1.0 NaN NaN \n", "433 [github.com, linkedin.com] 1.0 2.0 NaN \n", "497 NaN 1.0 NaN NaN \n", "869 NaN 1.0 NaN NaN \n", "\n", " n_keywords n_education n_employment \n", "251 1.0 1.0 NaN \n", "316 NaN 1.0 NaN \n", "433 NaN NaN 2.0 \n", "497 NaN NaN NaN \n", "869 NaN NaN 1.0 " ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# First rows of the profiles whose `other_email_domains` is not missing.\n", "# NOTE(review): the stored output of this cell shows personal data (names,\n", "# email addresses); consider clearing it before sharing the notebook.\n", "has_other_domains = df['other_email_domains'].notna()\n", "df.loc[has_other_domains].head()" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "# (orcid, n_emails) pairs ordered by `n_emails`, highest first.\n", "emails_by_orcid = (\n", "    df[['orcid', 'n_emails']]\n", "    .sort_values(by='n_emails', ascending=False)\n", ")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "0000-0003-4171-3835", "0000-0001-6239-2968", "0000-0003-2151-4089", "0000-0003-2290-2817", "0000-0001-9084-3156", "0000-0001-6349-1044", "0000-0002-2085-1908", "0000-0003-4147-212X", "0000-0002-9599-6909", "0000-0001-9311-0687", "0000-0003-1502-3910", "0000-0002-9821-8424", "0000-0003-4327-6827", "0000-0002-1929-6054", "0000-0002-8390-8238", 
"0000-0002-1615-8633", "0000-0003-0671-1543", "0000-0003-4499-7300", "0000-0002-5341-6531", "0000-0002-8565-194X", "0000-0002-0776-9547", "0000-0001-8420-9204", "0000-0002-7396-1561", "0000-0002-3165-132X", "0000-0002-2567-3741", "0000-0003-2657-8225", "0000-0003-4685-5621", "0000-0001-5548-8259", "0000-0003-0391-3430", "0000-0003-2526-0928" ], "y": [ 12, 9, 7, 7, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 
0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": 
"scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, 
"#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { 
"automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 30 ORCID iDs by email" }, "xaxis": { "range": [ -0.5, 29.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(30)\n", "data = [\n", " go.Bar(\n", " x=emails_by_orcid[:TOP_N]['orcid'],\n", " y=emails_by_orcid[:TOP_N]['n_emails']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top %s ORCID iDs by email' % TOP_N, \n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "top_other_emails = df[['orcid', 'other_email_domains']]\\\n", " .explode('other_email_domains')\\\n", " .reset_index(drop=True)\\\n", " .groupby('other_email_domains')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "gmail.com", "hotmail.com", "yahoo.com", "qq.com", "163.com", "outlook.com", "126.com", "usp.br", "ieee.org", "yahoo.com.br", "mail.ru", "unesp.br", "sbs.ox.ac.uk", "yuhs.ac", "naver.com", "icloud.com", "foxmail.com", "uq.edu.au", "ua.pt", "cam.ac.uk", "imperial.ac.uk", "ukr.net", "law.ox.ac.uk", "mit.edu", "monash.edu", "stanford.edu", "ucl.ac.uk", "education.ox.ac.uk", "ucm.es", "conted.ox.ac.uk" ], "y": [ 11198, 1550, 1303, 785, 780, 433, 262, 236, 226, 151, 148, 141, 136, 134, 132, 119, 98, 96, 90, 84, 79, 75, 75, 74, 70, 70, 69, 67, 66, 65 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", 
"minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 
0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, 
"colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, 
"hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 30 other email domains" }, "xaxis": { "range": [ -0.5, 29.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(30)\n", "data = [\n", " go.Bar(\n", " x=top_other_emails[:TOP_N].index,\n", " y=top_other_emails[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top %s other email domains' % TOP_N, \n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This somehow makes sense, legitimate users could put the gmail account as primary for login purposes and have institutional addresses as other email addresses. It makes also the life easier upon relocation." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Email speculation" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
4330000-0001-9097-228111abhisheksolankiNaNNaNNaNNaNNaNNaN[[senior engineer, robert bosch (india), benga...1[abhishek solanki]2019-04-22t04:43:06.232z2020-07-02t14:18:28.305z00000NaN[in.bosch.com][github.com, linkedin.com]1.02.0NaNNaNNaN2.0
4970000-0002-8614-300711adamarraNaNNaNNaNNaNNaNNaNNaN0NaN2017-11-15t06:33:45.625z2017-11-15t06:44:02.998z00000NaN[hct.ac.ae]NaN1.0NaNNaNNaNNaNNaN
8980000-0003-3728-643911alejandraecheverry velásquezalejandra echeverry is an industrial electrici...NaNNaN[innovation, renewable, control, science, ener...NaN[[, electrical engineer, institución universit...[[professor, institución universitaria pascual...1[crossref]2019-03-31t00:00:42.929z2020-09-06t02:18:54.290z10001NaN[pascualbravo.edu.co]NaN1.0NaNNaN7.01.01.0
17190000-0001-8330-744311andreatesonieroNaNNaNNaNNaN[[researcherid, d-9056-2015]][[department of geophysics, master of science ...[[postdoctoral associate, yale university, new...4[andrea tesoniero]2015-03-09t11:59:06.093z2020-08-20t15:03:23.447z40020NaN[yale.edu]NaN1.0NaN1.0NaN4.02.0
68290000-0001-9670-515X11esma esinyildirimNaNNaNNaN[pharmacognosy, natural chemistry, chemical en...NaN[[business management, master of science, ista...NaN0NaN2020-07-26t10:38:03.721z2020-07-26t10:52:26.539z00000NaN[gmail.com]NaN1.0NaNNaN3.03.0NaN
.............................................................................................
109858160000-0003-1204-600911nathanwalkNaNNaNNaNNaNNaN[[department of physics, doctor of philosophy,...[[, university of oxford, oxford, oxfordshire,...10[crossref metadata search]2016-07-28t14:24:16.844z2020-10-13t11:47:50.621z100001NaN[cs.ox.ac.uk][fu-berlin.de]1.01.0NaNNaN3.02.0
109860270000-0002-3472-766811rafvandeveldeNaNNaNNaNNaNNaN[[chemical engineering technology, master, kat...[[phd researcher, katholieke universiteit leuv...0NaN2020-10-14t13:56:44.779z2020-10-16t14:21:40.673z00000NaN[kuleuven.be][linkedin.com]1.01.0NaNNaN2.01.0
109875010000-0002-9602-052911carlos augustofinelliNaNNaNNaNNaNNaNNaNNaN1[crossref]2013-09-16t16:52:06.120z2020-12-01t22:47:08.074z10001NaN[cecot.com.br]NaN1.0NaNNaNNaNNaNNaN
109878290000-0003-4402-598211filipede almeida araújoNaNNaNNaNNaNNaN[[materials science, msc. materials science, m...[[co-owner, aeft acessory, manaus, amazonas, b...0NaN2020-03-02t20:11:01.699z2020-12-04t13:53:39.404z00000NaN[ime.eb.br]NaN1.0NaNNaNNaN2.01.0
109884440000-0002-1734-724111manareldeenahmedNaNNaNNaN[graphene, deep learning, atomistic simulation...NaNNaN[[post-doctor, zhejiang university, hangzhou, ...6[manareldeen ahmed]2017-02-17t13:18:36.540z2020-12-04t02:04:36.668z60031NaN[hotmail.com]NaN1.0NaNNaN5.0NaN1.0
\n", "

19814 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "433 0000-0001-9097-2281 1 1 \n", "497 0000-0002-8614-3007 1 1 \n", "898 0000-0003-3728-6439 1 1 \n", "1719 0000-0001-8330-7443 1 1 \n", "6829 0000-0001-9670-515X 1 1 \n", "... ... ... ... \n", "10985816 0000-0003-1204-6009 1 1 \n", "10986027 0000-0002-3472-7668 1 1 \n", "10987501 0000-0002-9602-0529 1 1 \n", "10987829 0000-0003-4402-5982 1 1 \n", "10988444 0000-0002-1734-7241 1 1 \n", "\n", " given_names family_name \\\n", "433 abhishek solanki \n", "497 adam arra \n", "898 alejandra echeverry velásquez \n", "1719 andrea tesoniero \n", "6829 esma esin yildirim \n", "... ... ... \n", "10985816 nathan walk \n", "10986027 raf vandevelde \n", "10987501 carlos augusto finelli \n", "10987829 filipe de almeida araújo \n", "10988444 manareldeen ahmed \n", "\n", " biography other_names \\\n", "433 NaN NaN \n", "497 NaN NaN \n", "898 alejandra echeverry is an industrial electrici... NaN \n", "1719 NaN NaN \n", "6829 NaN NaN \n", "... ... ... \n", "10985816 NaN NaN \n", "10986027 NaN NaN \n", "10987501 NaN NaN \n", "10987829 NaN NaN \n", "10988444 NaN NaN \n", "\n", " primary_email keywords \\\n", "433 NaN NaN \n", "497 NaN NaN \n", "898 NaN [innovation, renewable, control, science, ener... \n", "1719 NaN NaN \n", "6829 NaN [pharmacognosy, natural chemistry, chemical en... \n", "... ... ... \n", "10985816 NaN NaN \n", "10986027 NaN NaN \n", "10987501 NaN NaN \n", "10987829 NaN NaN \n", "10988444 NaN [graphene, deep learning, atomistic simulation... \n", "\n", " external_ids \\\n", "433 NaN \n", "497 NaN \n", "898 NaN \n", "1719 [[researcherid, d-9056-2015]] \n", "6829 NaN \n", "... ... \n", "10985816 NaN \n", "10986027 NaN \n", "10987501 NaN \n", "10987829 NaN \n", "10988444 NaN \n", "\n", " education \\\n", "433 NaN \n", "497 NaN \n", "898 [[, electrical engineer, institución universit... \n", "1719 [[department of geophysics, master of science ... 
\n", "6829 [[business management, master of science, ista... \n", "... ... \n", "10985816 [[department of physics, doctor of philosophy,... \n", "10986027 [[chemical engineering technology, master, kat... \n", "10987501 NaN \n", "10987829 [[materials science, msc. materials science, m... \n", "10988444 NaN \n", "\n", " employment n_works \\\n", "433 [[senior engineer, robert bosch (india), benga... 1 \n", "497 NaN 0 \n", "898 [[professor, institución universitaria pascual... 1 \n", "1719 [[postdoctoral associate, yale university, new... 4 \n", "6829 NaN 0 \n", "... ... ... \n", "10985816 [[, university of oxford, oxford, oxfordshire,... 10 \n", "10986027 [[phd researcher, katholieke universiteit leuv... 0 \n", "10987501 NaN 1 \n", "10987829 [[co-owner, aeft acessory, manaus, amazonas, b... 0 \n", "10988444 [[post-doctor, zhejiang university, hangzhou, ... 6 \n", "\n", " works_source activation_date \\\n", "433 [abhishek solanki] 2019-04-22t04:43:06.232z \n", "497 NaN 2017-11-15t06:33:45.625z \n", "898 [crossref] 2019-03-31t00:00:42.929z \n", "1719 [andrea tesoniero] 2015-03-09t11:59:06.093z \n", "6829 NaN 2020-07-26t10:38:03.721z \n", "... ... ... \n", "10985816 [crossref metadata search] 2016-07-28t14:24:16.844z \n", "10986027 NaN 2020-10-14t13:56:44.779z \n", "10987501 [crossref] 2013-09-16t16:52:06.120z \n", "10987829 NaN 2020-03-02t20:11:01.699z \n", "10988444 [manareldeen ahmed] 2017-02-17t13:18:36.540z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", "433 2020-07-02t14:18:28.305z 0 0 0 0 \n", "497 2017-11-15t06:44:02.998z 0 0 0 0 \n", "898 2020-09-06t02:18:54.290z 1 0 0 0 \n", "1719 2020-08-20t15:03:23.447z 4 0 0 2 \n", "6829 2020-07-26t10:52:26.539z 0 0 0 0 \n", "... ... ... ... ... ... 
\n", "10985816 2020-10-13t11:47:50.621z 10 0 0 0 \n", "10986027 2020-10-16t14:21:40.673z 0 0 0 0 \n", "10987501 2020-12-01t22:47:08.074z 1 0 0 0 \n", "10987829 2020-12-04t13:53:39.404z 0 0 0 0 \n", "10988444 2020-12-04t02:04:36.668z 6 0 0 3 \n", "\n", " label primary_email_domain other_email_domains \\\n", "433 0 NaN [in.bosch.com] \n", "497 0 NaN [hct.ac.ae] \n", "898 1 NaN [pascualbravo.edu.co] \n", "1719 0 NaN [yale.edu] \n", "6829 0 NaN [gmail.com] \n", "... ... ... ... \n", "10985816 1 NaN [cs.ox.ac.uk] \n", "10986027 0 NaN [kuleuven.be] \n", "10987501 1 NaN [cecot.com.br] \n", "10987829 0 NaN [ime.eb.br] \n", "10988444 1 NaN [hotmail.com] \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "433 [github.com, linkedin.com] 1.0 2.0 NaN NaN \n", "497 NaN 1.0 NaN NaN NaN \n", "898 NaN 1.0 NaN NaN 7.0 \n", "1719 NaN 1.0 NaN 1.0 NaN \n", "6829 NaN 1.0 NaN NaN 3.0 \n", "... ... ... ... ... ... \n", "10985816 [fu-berlin.de] 1.0 1.0 NaN NaN \n", "10986027 [linkedin.com] 1.0 1.0 NaN NaN \n", "10987501 NaN 1.0 NaN NaN NaN \n", "10987829 NaN 1.0 NaN NaN NaN \n", "10988444 NaN 1.0 NaN NaN 5.0 \n", "\n", " n_education n_employment \n", "433 NaN 2.0 \n", "497 NaN NaN \n", "898 1.0 1.0 \n", "1719 4.0 2.0 \n", "6829 3.0 NaN \n", "... ... ... \n", "10985816 3.0 2.0 \n", "10986027 2.0 1.0 \n", "10987501 NaN NaN \n", "10987829 2.0 1.0 \n", "10988444 NaN 1.0 \n", "\n", "[19814 rows x 30 columns]" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.primary_email.isna() & df.other_email_domains.notna()]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## URLs" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
60000-0001-7402-009611NaNNaNNaNNaNNaNNaNNaNNaN[[, kth royal institute of technology, stockho...0NaN2015-01-11t15:13:06.467z2016-06-14t23:55:59.896z00000NaNNaN[kth.se]NaN1.0NaNNaNNaN1.0
110000-0001-8377-350811NaNNaNNaN[fontana, milena da silva]NaN[educação; informática; matemática.]NaNNaN[[, instituto federal de educação, ciência e t...0NaN2018-05-23t23:39:04.534z2019-10-16t02:50:11.007z00000NaNNaN[cnpq.br]NaN1.0NaN1.0NaN3.0
290000-0002-2638-410811NaNNaNinvestigador de la universidad de oviedo. depa...NaNNaN[constitutional history, history of political ...[[scopus author id, 54394231000]][[public law, ph doctor, university of oviedo,...[[professor of constitutional law, university ...1[crossref]2013-03-25t14:38:06.016z2020-07-01t13:10:37.025z10000NaNNaN[unioviedo.es]NaN1.01.03.01.01.0
460000-0003-1435-654511NaNNaNNaNNaNNaN[prostate cancer, migration, culture cell][[researcherid, p-2223-2018]][[morfologia, , universidade estadual paulista...[[, universidade estadual paulista (unesp), in...0NaN2018-08-09t12:12:24.405z2020-04-22t01:38:03.184z00000NaNNaN[cnpq.br, linkedin.com]NaN2.01.03.01.01.0
1580000-0003-1284-974111alex percy antoniomanriquez paisigNaNNaNNaNNaNNaNNaNNaN0NaN2020-09-08t20:04:33.906z2020-09-08t20:25:55.432z00000NaNNaN[youtube.com]NaN1.0NaNNaNNaNNaN
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "6 0000-0001-7402-0096 1 1 \n", "11 0000-0001-8377-3508 1 1 \n", "29 0000-0002-2638-4108 1 1 \n", "46 0000-0003-1435-6545 1 1 \n", "158 0000-0003-1284-9741 1 1 \n", "\n", " given_names family_name \\\n", "6 NaN NaN \n", "11 NaN NaN \n", "29 NaN NaN \n", "46 NaN NaN \n", "158 alex percy antonio manriquez paisig \n", "\n", " biography \\\n", "6 NaN \n", "11 NaN \n", "29 investigador de la universidad de oviedo. depa... \n", "46 NaN \n", "158 NaN \n", "\n", " other_names primary_email \\\n", "6 NaN NaN \n", "11 [fontana, milena da silva] NaN \n", "29 NaN NaN \n", "46 NaN NaN \n", "158 NaN NaN \n", "\n", " keywords \\\n", "6 NaN \n", "11 [educação; informática; matemática.] \n", "29 [constitutional history, history of political ... \n", "46 [prostate cancer, migration, culture cell] \n", "158 NaN \n", "\n", " external_ids \\\n", "6 NaN \n", "11 NaN \n", "29 [[scopus author id, 54394231000]] \n", "46 [[researcherid, p-2223-2018]] \n", "158 NaN \n", "\n", " education \\\n", "6 NaN \n", "11 NaN \n", "29 [[public law, ph doctor, university of oviedo,... \n", "46 [[morfologia, , universidade estadual paulista... \n", "158 NaN \n", "\n", " employment n_works works_source \\\n", "6 [[, kth royal institute of technology, stockho... 0 NaN \n", "11 [[, instituto federal de educação, ciência e t... 0 NaN \n", "29 [[professor of constitutional law, university ... 1 [crossref] \n", "46 [[, universidade estadual paulista (unesp), in... 
0 NaN \n", "158 NaN 0 NaN \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "6 2015-01-11t15:13:06.467z 2016-06-14t23:55:59.896z 0 0 \n", "11 2018-05-23t23:39:04.534z 2019-10-16t02:50:11.007z 0 0 \n", "29 2013-03-25t14:38:06.016z 2020-07-01t13:10:37.025z 1 0 \n", "46 2018-08-09t12:12:24.405z 2020-04-22t01:38:03.184z 0 0 \n", "158 2020-09-08t20:04:33.906z 2020-09-08t20:25:55.432z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "6 0 0 0 NaN NaN \n", "11 0 0 0 NaN NaN \n", "29 0 0 0 NaN NaN \n", "46 0 0 0 NaN NaN \n", "158 0 0 0 NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "6 [kth.se] NaN 1.0 NaN NaN \n", "11 [cnpq.br] NaN 1.0 NaN 1.0 \n", "29 [unioviedo.es] NaN 1.0 1.0 3.0 \n", "46 [cnpq.br, linkedin.com] NaN 2.0 1.0 3.0 \n", "158 [youtube.com] NaN 1.0 NaN NaN \n", "\n", " n_education n_employment \n", "6 NaN 1.0 \n", "11 NaN 3.0 \n", "29 1.0 1.0 \n", "46 1.0 1.0 \n", "158 NaN NaN " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.url_domains.notna()].head()" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidn_urls
32265180000-0002-1234-835X219.0
42060550000-0001-7478-4539174.0
49018700000-0002-7392-3792169.0
81842600000-0002-6938-9638152.0
27436480000-0002-5710-4041114.0
.........
109896440000-0002-1686-1935NaN
109896450000-0002-3800-6331NaN
109896460000-0002-8783-5814NaN
109896470000-0002-7584-2283NaN
109896480000-0003-0529-3538NaN
\n", "

10989649 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid n_urls\n", "3226518 0000-0002-1234-835X 219.0\n", "4206055 0000-0001-7478-4539 174.0\n", "4901870 0000-0002-7392-3792 169.0\n", "8184260 0000-0002-6938-9638 152.0\n", "2743648 0000-0002-5710-4041 114.0\n", "... ... ...\n", "10989644 0000-0002-1686-1935 NaN\n", "10989645 0000-0002-3800-6331 NaN\n", "10989646 0000-0002-8783-5814 NaN\n", "10989647 0000-0002-7584-2283 NaN\n", "10989648 0000-0003-0529-3538 NaN\n", "\n", "[10989649 rows x 2 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)\n", "urls_by_orcid" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "0000-0002-1234-835X", "0000-0001-7478-4539", "0000-0002-7392-3792", "0000-0002-6938-9638", "0000-0002-5710-4041", "0000-0003-2450-090X", "0000-0002-3920-7389", "0000-0002-6689-4129", "0000-0001-5384-0001", "0000-0002-4621-5571", "0000-0001-9131-1266", "0000-0002-7754-8889", "0000-0002-5250-1144", "0000-0002-9025-8632", "0000-0002-7456-3848", "0000-0003-0176-1293", "0000-0003-0321-7339", "0000-0002-8493-0402", "0000-0002-9965-2425", "0000-0001-8873-6677", "0000-0002-3997-5070", "0000-0002-1856-6905", "0000-0002-4316-1467", "0000-0002-4062-3603", "0000-0003-0594-2462", "0000-0001-5880-7091", "0000-0003-1524-6268", "0000-0002-0752-7513", "0000-0003-2593-7134", "0000-0002-1298-5252", "0000-0003-1761-3842", "0000-0003-2383-8386", "0000-0003-3546-2312", "0000-0002-2886-9248", "0000-0003-2183-8112", "0000-0002-1929-6054", "0000-0003-4948-9268", "0000-0003-2407-3557", "0000-0002-9276-6921", "0000-0003-1484-6958", "0000-0002-7568-3403", "0000-0002-4305-4215", "0000-0002-4004-6666", "0000-0003-0796-0234", "0000-0001-7133-6896", "0000-0002-8208-0897", 
"0000-0002-9071-5450", "0000-0003-4993-5555", "0000-0003-0930-6121", "0000-0002-8116-9611", "0000-0002-5139-2660", "0000-0002-3277-9659", "0000-0002-8122-879X", "0000-0001-9559-1103", "0000-0003-2862-6315", "0000-0002-2000-8339", "0000-0001-5300-4601", "0000-0002-6254-8683", "0000-0002-6547-0172", "0000-0003-4808-6619", "0000-0003-3933-0229", "0000-0002-0971-9375", "0000-0003-0694-1154", "0000-0003-1585-1134", "0000-0002-4659-5391", "0000-0002-2916-2893", "0000-0001-6783-2037", "0000-0001-6461-2573", "0000-0003-4501-3756", "0000-0001-5549-6822", "0000-0002-8940-3177", "0000-0003-4326-9336", "0000-0001-8096-4333", "0000-0001-8978-4830", "0000-0002-5946-1595", "0000-0002-6680-1703", "0000-0002-8593-9257", "0000-0002-7653-4899", "0000-0003-1904-4188", "0000-0002-5196-4905", "0000-0001-8808-4867", "0000-0001-6921-0426", "0000-0003-1815-1993", "0000-0002-7843-8497", "0000-0003-1675-2840", "0000-0001-8644-2114", "0000-0003-0907-9870", "0000-0001-7784-0583", "0000-0001-7550-5802", "0000-0001-8986-2528", "0000-0002-5265-6074", "0000-0001-9102-8639", "0000-0002-0696-8560", "0000-0001-6979-4273", "0000-0002-7179-6953", "0000-0002-3334-9386", "0000-0001-6714-009X", "0000-0001-7193-5039", "0000-0002-5241-1026", "0000-0001-7608-9433" ], "y": [ 219, 174, 169, 152, 114, 114, 111, 104, 104, 90, 83, 83, 81, 81, 80, 80, 80, 76, 73, 72, 71, 70, 69, 69, 68, 68, 68, 68, 67, 67, 66, 66, 65, 64, 61, 61, 61, 59, 57, 57, 57, 57, 57, 57, 57, 56, 55, 55, 55, 55, 51, 50, 50, 50, 49, 49, 48, 48, 48, 48, 47, 47, 46, 46, 46, 45, 45, 45, 45, 44, 43, 43, 43, 43, 42, 42, 42, 41, 41, 41, 40, 40, 39, 39, 39, 39, 38, 38, 38, 38, 38, 37, 37, 37, 37, 37, 36, 36, 36, 36 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { 
"endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 
0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" 
} ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, 
"subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top 100 ORCID iDs with URLs" }, "xaxis": { "range": [ -0.5, 99.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(100)\n", "data = [\n", " go.Bar(\n", " x=urls_by_orcid[:TOP_N]['orcid'],\n", " y=urls_by_orcid[:TOP_N]['n_urls']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top %s ORCID iDs with URLs' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "top_urls = df[['orcid', 'url_domains']]\\\n", " .explode('url_domains')\\\n", " .reset_index(drop=True)\\\n", " .groupby('url_domains')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "linkedin.com", "researchgate.net", "google.com", "cnpq.br", "academia.edu", "twitter.com", "facebook.com", "publons.com", "wordpress.com", "mendeley.com", "instagram.com", "github.io", "google.com.ua", "blogspot.com", "github.com", "google.es", "helsinki.fi", "unirioja.es", "youtube.com", "wixsite.com", "ku.dk", "", "scopus.com", "weebly.com", "us.es", "kth.se", "cityu.edu.hk", "au.dk", "kcl.ac.uk", "man.ac.uk", "google.com.au", "ucl.ac.uk", "sdu.dk", "ugr.es", "researcherid.com", "mq.edu.au", "ntu.edu.tw", "dtu.dk", "rug.nl", "colciencias.gov.co", "google.co.in", "bris.ac.uk", "uwa.edu.au", "uc3m.es", "vub.be", "bu.edu", "monash.edu", "google.co.uk", "aau.dk", "lancs.ac.uk" ], "y": [ 78418, 67823, 44804, 24635, 21174, 19046, 15368, 10751, 9043, 6960, 6040, 5516, 5371, 5272, 5252, 5163, 4730, 4590, 4470, 4140, 3771, 3620, 3586, 3122, 3037, 2957, 2795, 2746, 2724, 2689, 2610, 2586, 2478, 2231, 2134, 2133, 2094, 2002, 1975, 1929, 1917, 1840, 1820, 1804, 1803, 1803, 1772, 
1656, 1653, 1650 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, 
"#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } 
], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, 
"#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top-50 URL domains" }, "xaxis": { "range": [ -0.5, 49.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, 
"text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(50)\n", "data = [\n", " go.Bar(\n", " x=top_urls[:TOP_N].index,\n", " y=top_urls[:TOP_N]['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Top-%s URL domains' % TOP_N,\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## URLs speculation" ] }, { "cell_type": "code", "execution_count": 29, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
10257130000-0003-2407-355711abdulazizabdul aziz was born on may 25, 1973, in brebes...[abdul aziz, aziz, abdul, aziz, a., aziz, abd,...NaN[ekonomi islam, etika bisnis islam, ilmu ekono...NaN[[ilmu ekonomi, dr, universitas borobudur, jak...[[assisten professor/dr, institut agama islam ...72[base - bielefeld academic search engine, abdu...2016-09-12t04:41:24.842z2021-01-26t11:58:33.039z1900771NaNNaN[google.com, syekhnurjati.ac.id, orcid.org, bl...NaN59.0NaN4.03.01.0
27436480000-0002-5710-404111ryszardromaniukprofessor of electronics and communications en...[r.romaniuk, r.s.romaniuk, ryszard romaniuk, r...rrom@ise.pw.edu.pl[measurement systems, electronics, photonics, ...[[isni, 0000000071432485], [researcherid, b-91...[[faculty of electronics and information techn...[[professor, institute director, politechnika ...5008[inspire-hep, researcherid, isni2orcid search ...2013-01-20t12:09:21.600z2021-03-16t19:37:31.650z122125017421ise.pw.edu.pl[ise.pw.edu.pl, elka.pw.edu.pl, cern.ch][google.pl, publons.com, scopus.com, mendeley....3.0114.03.05.01.01.0
30117240000-0003-2450-090X11eduardbabulakprofessor eduard babulak is accomplished inter...[professor eduard babulak]NaN[next generation of ict and eservices, compute...[[scopus author id, 6506867432], [researcherid...[[information technology, doctor habilitated (...[[consultant, horizon 2020 framework programme...274[the lens, base - bielefeld academic search en...2013-04-03t08:02:30.013z2021-02-28t10:07:13.231z199011740NaNNaN[worldassessmentcouncil.org, spseke.sk, bcs.or...NaN114.05.08.06.022.0
38810640000-0002-3920-738911а.гусевsurname, name gusev alexander leonidovichdate...[alexander l. gusev , alexander leonidovich gu...NaN[photochromic, electrochromic, storage and tra...[[researcherid, f-8048-2014], [scopus author i...[[chemical technology and cryogenic-vacuum tec...[[general director, scientific technical centr...472[publons, datacite, scopus - elsevier, a.l. gu...2014-05-14t00:01:28.030z2021-01-16t13:44:14.134z3700211NaNNaN[youtube.com, isjaee.com, researchgate.net, re...NaN111.02.016.02.07.0
74660620000-0002-1929-605411franklin américocanaza choquedocente-investigador social. maestrando en der...[franklin américo canaza-choque , franklin a. ...leo_123fa@hotmail.com[justicia global; democracia; derechos humanos...[[researcherid, p-8613-2018], [loop profile, 8...[[facultad de ciencias de la educación , maest...[[investigador social, universidad católica de...39[researcherid, base - bielefeld academic searc...2017-09-15t19:45:43.483z2021-03-23t20:12:47.297z3000341hotmail.com[gmail.com, gmail.com, hotmail.com, baldwin.ed...[concytec.gob.pe, redalyc.org, redalyc.org, un...5.061.04.02.01.01.0
75170960000-0003-4948-926811gustavoduperrégustavo norberto duperré graduated in arts and...[gustavo norberto duperré, duperré, g. n., gus...gustavo.duperre@usal.edu.ar[sciences of antiquity, social sciences, compu...[[scopus author id, 57195936346], [researcheri...[[programme in history, history of art and ter...[[titular professor, dirección general de cult...41[gustavo duperré, scopus - elsevier, publons, ...2020-02-22t15:49:52.386z2021-03-12t15:13:44.065z1300340usal.edu.arNaN[icomos.ro, unirioja.es, unirioja.es, unc.edu....NaN61.02.011.06.05.0
80682750000-0003-2183-811211pelayo munhozoleapós-doutorado em gestão ambiental pela univers...[ munhoz, pelayo olea, olea, pelayo, olea, p...NaN[empreendedorismo, sustentabilidade, inovação][[scopus author id, 55175503300], [researcheri...[[, postdoctoral in environmental sustainabili...[[professor, universidade federal do rio grand...1109[the lens, pelayo munhoz olea, dimensions, bas...2013-02-04t17:25:34.723z2021-03-19t18:51:01.128z798015821NaNNaN[cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c...NaN61.02.03.07.09.0
81842600000-0002-6938-963811adolfocatral sanabriamy education is in computer science, mathemati...NaNNaNNaN[[loop profile, 747193]][[education, capacitación para la enseñanza en...NaN2023[base - bielefeld academic search engine, data...2019-05-07t19:27:02.210z2020-12-10t23:39:15.236z202200161NaNNaN[researchgate.net, youtube.com, linkedin.com, ...NaN152.01.0NaN6.0NaN
87912560000-0002-9025-863211buycannabisdispensarywe procure and deliver premium cannabis strain...[we procure and deliver premium cannabis strai...NaN[canabis dispensary, cannabis, cannabis commun...NaNNaNNaN10[goowonderland dispensary]2020-12-09t21:19:46.004z2020-12-10t01:17:28.772z00000NaNNaN[goowonderland.com, goowonderland.com, goowond...NaN81.0NaN7.0NaNNaN
101745090000-0002-9965-242511jaroslawspychalajaroslaw spychala has received a doctoral degr...[jaroslaw jozef spychala]NaN[medicinal and pharmaceutical chemistry, photo...[[scopus author id, 7006745874]][[department of chemistry, postdoctoral associ...[[assistant professor, adam mickiewicz univers...29[scopus - elsevier]2014-09-18t12:34:14.242z2020-02-11t14:31:25.544z1500291NaNNaN[biowebspin.com, biowebspin.com, google.com, l...NaN73.01.04.04.02.0
102578080000-0002-4062-360311juan de diosbeltrán mancillajuan de dios beltrán mancilla (*) filósofo aut...[juan de dios beltrán mancilla, filósofo autod...NaN[filosofia medicina arquitectura economía dere...NaN[[, diplomado en practicas directivas para or...[[inspector general jornada vespertina // de 2...11[juan de dios beltr´´án mancilla]2020-04-19t21:06:33.495z2021-02-10t20:13:07.698z00070NaNNaN[yumpu.com, ijopm.org, google.com, blogspot.co...NaN69.0NaN1.08.06.0
104862120000-0002-3997-507011dr. parameshacharib ddr. parameshachari b dacm distinguished speake...[dr. parameshachari b d]NaN[professor & head |dept. of tce| gsssiet for w...[[researcherid, f-7045-2018], [scopus author i...[[electronics and communication engineering, p...[[acm distinguished speaker (volunteer), assoc...93[publons, multidisciplinary digital publishing...2016-08-24t11:00:30.403z2021-03-23t07:16:22.582z4700481NaNNaN[geethashishu.in, geethashishu.in, acm.org, go...NaN71.03.06.05.010.0
106526320000-0003-2593-713411aanjaelaniall my papers can be downloaded from portal:re...[jaelani, a., jaelani, aan]aan_jaelani@syekhnurjati.ac.id[tourism industry, islamic finance and banking...[[scopus author id, 57195963463], [loop profil...[[post graduate, s3/dr, universitas islam nege...[[dr, institut agama islam negeri syekh nurjat...79[publons, aan jaelani, scopus - elsevier, dime...2016-03-02t18:37:44.989z2021-03-19t10:11:57.908z88001931syekhnurjati.ac.id[gmail.com][microsoft.com, twitter.com, academia.edu, aca...1.067.04.07.02.01.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "1025713 0000-0003-2407-3557 1 1 \n", "2743648 0000-0002-5710-4041 1 1 \n", "3011724 0000-0003-2450-090X 1 1 \n", "3881064 0000-0002-3920-7389 1 1 \n", "7466062 0000-0002-1929-6054 1 1 \n", "7517096 0000-0003-4948-9268 1 1 \n", "8068275 0000-0003-2183-8112 1 1 \n", "8184260 0000-0002-6938-9638 1 1 \n", "8791256 0000-0002-9025-8632 1 1 \n", "10174509 0000-0002-9965-2425 1 1 \n", "10257808 0000-0002-4062-3603 1 1 \n", "10486212 0000-0002-3997-5070 1 1 \n", "10652632 0000-0003-2593-7134 1 1 \n", "\n", " given_names family_name \\\n", "1025713 abdul aziz \n", "2743648 ryszard romaniuk \n", "3011724 eduard babulak \n", "3881064 а. гусев \n", "7466062 franklin américo canaza choque \n", "7517096 gustavo duperré \n", "8068275 pelayo munhoz olea \n", "8184260 adolfo catral sanabria \n", "8791256 buycannabis dispensary \n", "10174509 jaroslaw spychala \n", "10257808 juan de dios beltrán mancilla \n", "10486212 dr. parameshachari b d \n", "10652632 aan jaelani \n", "\n", " biography \\\n", "1025713 abdul aziz was born on may 25, 1973, in brebes... \n", "2743648 professor of electronics and communications en... \n", "3011724 professor eduard babulak is accomplished inter... \n", "3881064 surname, name gusev alexander leonidovichdate... \n", "7466062 docente-investigador social. maestrando en der... \n", "7517096 gustavo norberto duperré graduated in arts and... \n", "8068275 pós-doutorado em gestão ambiental pela univers... \n", "8184260 my education is in computer science, mathemati... \n", "8791256 we procure and deliver premium cannabis strain... \n", "10174509 jaroslaw spychala has received a doctoral degr... \n", "10257808 juan de dios beltrán mancilla (*) filósofo aut... \n", "10486212 dr. parameshachari b dacm distinguished speake... \n", "10652632 all my papers can be downloaded from portal:re... \n", "\n", " other_names \\\n", "1025713 [abdul aziz, aziz, abdul, aziz, a., aziz, abd,... 
\n", "2743648 [r.romaniuk, r.s.romaniuk, ryszard romaniuk, r... \n", "3011724 [professor eduard babulak] \n", "3881064 [alexander l. gusev , alexander leonidovich gu... \n", "7466062 [franklin américo canaza-choque , franklin a. ... \n", "7517096 [gustavo norberto duperré, duperré, g. n., gus... \n", "8068275 [ munhoz, pelayo olea, olea, pelayo, olea, p... \n", "8184260 NaN \n", "8791256 [we procure and deliver premium cannabis strai... \n", "10174509 [jaroslaw jozef spychala] \n", "10257808 [juan de dios beltrán mancilla, filósofo autod... \n", "10486212 [dr. parameshachari b d] \n", "10652632 [jaelani, a., jaelani, aan] \n", "\n", " primary_email \\\n", "1025713 NaN \n", "2743648 rrom@ise.pw.edu.pl \n", "3011724 NaN \n", "3881064 NaN \n", "7466062 leo_123fa@hotmail.com \n", "7517096 gustavo.duperre@usal.edu.ar \n", "8068275 NaN \n", "8184260 NaN \n", "8791256 NaN \n", "10174509 NaN \n", "10257808 NaN \n", "10486212 NaN \n", "10652632 aan_jaelani@syekhnurjati.ac.id \n", "\n", " keywords \\\n", "1025713 [ekonomi islam, etika bisnis islam, ilmu ekono... \n", "2743648 [measurement systems, electronics, photonics, ... \n", "3011724 [next generation of ict and eservices, compute... \n", "3881064 [photochromic, electrochromic, storage and tra... \n", "7466062 [justicia global; democracia; derechos humanos... \n", "7517096 [sciences of antiquity, social sciences, compu... \n", "8068275 [empreendedorismo, sustentabilidade, inovação] \n", "8184260 NaN \n", "8791256 [canabis dispensary, cannabis, cannabis commun... \n", "10174509 [medicinal and pharmaceutical chemistry, photo... \n", "10257808 [filosofia medicina arquitectura economía dere... \n", "10486212 [professor & head |dept. of tce| gsssiet for w... \n", "10652632 [tourism industry, islamic finance and banking... \n", "\n", " external_ids \\\n", "1025713 NaN \n", "2743648 [[isni, 0000000071432485], [researcherid, b-91... \n", "3011724 [[scopus author id, 6506867432], [researcherid... 
\n", "3881064 [[researcherid, f-8048-2014], [scopus author i... \n", "7466062 [[researcherid, p-8613-2018], [loop profile, 8... \n", "7517096 [[scopus author id, 57195936346], [researcheri... \n", "8068275 [[scopus author id, 55175503300], [researcheri... \n", "8184260 [[loop profile, 747193]] \n", "8791256 NaN \n", "10174509 [[scopus author id, 7006745874]] \n", "10257808 NaN \n", "10486212 [[researcherid, f-7045-2018], [scopus author i... \n", "10652632 [[scopus author id, 57195963463], [loop profil... \n", "\n", " education \\\n", "1025713 [[ilmu ekonomi, dr, universitas borobudur, jak... \n", "2743648 [[faculty of electronics and information techn... \n", "3011724 [[information technology, doctor habilitated (... \n", "3881064 [[chemical technology and cryogenic-vacuum tec... \n", "7466062 [[facultad de ciencias de la educación , maest... \n", "7517096 [[programme in history, history of art and ter... \n", "8068275 [[, postdoctoral in environmental sustainabili... \n", "8184260 [[education, capacitación para la enseñanza en... \n", "8791256 NaN \n", "10174509 [[department of chemistry, postdoctoral associ... \n", "10257808 [[, diplomado en practicas directivas para or... \n", "10486212 [[electronics and communication engineering, p... \n", "10652632 [[post graduate, s3/dr, universitas islam nege... \n", "\n", " employment n_works \\\n", "1025713 [[assisten professor/dr, institut agama islam ... 72 \n", "2743648 [[professor, institute director, politechnika ... 5008 \n", "3011724 [[consultant, horizon 2020 framework programme... 274 \n", "3881064 [[general director, scientific technical centr... 472 \n", "7466062 [[investigador social, universidad católica de... 39 \n", "7517096 [[titular professor, dirección general de cult... 41 \n", "8068275 [[professor, universidade federal do rio grand... 1109 \n", "8184260 NaN 2023 \n", "8791256 NaN 10 \n", "10174509 [[assistant professor, adam mickiewicz univers... 
29 \n", "10257808 [[inspector general jornada vespertina // de 2... 11 \n", "10486212 [[acm distinguished speaker (volunteer), assoc... 93 \n", "10652632 [[dr, institut agama islam negeri syekh nurjat... 79 \n", "\n", " works_source \\\n", "1025713 [base - bielefeld academic search engine, abdu... \n", "2743648 [inspire-hep, researcherid, isni2orcid search ... \n", "3011724 [the lens, base - bielefeld academic search en... \n", "3881064 [publons, datacite, scopus - elsevier, a.l. gu... \n", "7466062 [researcherid, base - bielefeld academic searc... \n", "7517096 [gustavo duperré, scopus - elsevier, publons, ... \n", "8068275 [the lens, pelayo munhoz olea, dimensions, bas... \n", "8184260 [base - bielefeld academic search engine, data... \n", "8791256 [goowonderland dispensary] \n", "10174509 [scopus - elsevier] \n", "10257808 [juan de dios beltr´´án mancilla] \n", "10486212 [publons, multidisciplinary digital publishing... \n", "10652632 [publons, aan jaelani, scopus - elsevier, dime... 
\n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "1025713 2016-09-12t04:41:24.842z 2021-01-26t11:58:33.039z 19 0 \n", "2743648 2013-01-20t12:09:21.600z 2021-03-16t19:37:31.650z 1221 25 \n", "3011724 2013-04-03t08:02:30.013z 2021-02-28t10:07:13.231z 199 0 \n", "3881064 2014-05-14t00:01:28.030z 2021-01-16t13:44:14.134z 37 0 \n", "7466062 2017-09-15t19:45:43.483z 2021-03-23t20:12:47.297z 30 0 \n", "7517096 2020-02-22t15:49:52.386z 2021-03-12t15:13:44.065z 13 0 \n", "8068275 2013-02-04t17:25:34.723z 2021-03-19t18:51:01.128z 798 0 \n", "8184260 2019-05-07t19:27:02.210z 2020-12-10t23:39:15.236z 2022 0 \n", "8791256 2020-12-09t21:19:46.004z 2020-12-10t01:17:28.772z 0 0 \n", "10174509 2014-09-18t12:34:14.242z 2020-02-11t14:31:25.544z 15 0 \n", "10257808 2020-04-19t21:06:33.495z 2021-02-10t20:13:07.698z 0 0 \n", "10486212 2016-08-24t11:00:30.403z 2021-03-23t07:16:22.582z 47 0 \n", "10652632 2016-03-02t18:37:44.989z 2021-03-19t10:11:57.908z 88 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain \\\n", "1025713 0 77 1 NaN \n", "2743648 0 1742 1 ise.pw.edu.pl \n", "3011724 1 174 0 NaN \n", "3881064 0 21 1 NaN \n", "7466062 0 34 1 hotmail.com \n", "7517096 0 34 0 usal.edu.ar \n", "8068275 1 582 1 NaN \n", "8184260 0 16 1 NaN \n", "8791256 0 0 0 NaN \n", "10174509 0 29 1 NaN \n", "10257808 0 7 0 NaN \n", "10486212 0 48 1 NaN \n", "10652632 0 193 1 syekhnurjati.ac.id \n", "\n", " other_email_domains \\\n", "1025713 NaN \n", "2743648 [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] \n", "3011724 NaN \n", "3881064 NaN \n", "7466062 [gmail.com, gmail.com, hotmail.com, baldwin.ed... \n", "7517096 NaN \n", "8068275 NaN \n", "8184260 NaN \n", "8791256 NaN \n", "10174509 NaN \n", "10257808 NaN \n", "10486212 NaN \n", "10652632 [gmail.com] \n", "\n", " url_domains n_emails n_urls \\\n", "1025713 [google.com, syekhnurjati.ac.id, orcid.org, bl... NaN 59.0 \n", "2743648 [google.pl, publons.com, scopus.com, mendeley.... 
3.0 114.0 \n", "3011724 [worldassessmentcouncil.org, spseke.sk, bcs.or... NaN 114.0 \n", "3881064 [youtube.com, isjaee.com, researchgate.net, re... NaN 111.0 \n", "7466062 [concytec.gob.pe, redalyc.org, redalyc.org, un... 5.0 61.0 \n", "7517096 [icomos.ro, unirioja.es, unirioja.es, unc.edu.... NaN 61.0 \n", "8068275 [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... NaN 61.0 \n", "8184260 [researchgate.net, youtube.com, linkedin.com, ... NaN 152.0 \n", "8791256 [goowonderland.com, goowonderland.com, goowond... NaN 81.0 \n", "10174509 [biowebspin.com, biowebspin.com, google.com, l... NaN 73.0 \n", "10257808 [yumpu.com, ijopm.org, google.com, blogspot.co... NaN 69.0 \n", "10486212 [geethashishu.in, geethashishu.in, acm.org, go... NaN 71.0 \n", "10652632 [microsoft.com, twitter.com, academia.edu, aca... 1.0 67.0 \n", "\n", " n_ids n_keywords n_education n_employment \n", "1025713 NaN 4.0 3.0 1.0 \n", "2743648 3.0 5.0 1.0 1.0 \n", "3011724 5.0 8.0 6.0 22.0 \n", "3881064 2.0 16.0 2.0 7.0 \n", "7466062 4.0 2.0 1.0 1.0 \n", "7517096 2.0 11.0 6.0 5.0 \n", "8068275 2.0 3.0 7.0 9.0 \n", "8184260 1.0 NaN 6.0 NaN \n", "8791256 NaN 7.0 NaN NaN \n", "10174509 1.0 4.0 4.0 2.0 \n", "10257808 NaN 1.0 8.0 6.0 \n", "10486212 3.0 6.0 5.0 10.0 \n", "10652632 4.0 7.0 2.0 1.0 " ] }, "execution_count": 29, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]" ] }, { "cell_type": "code", "execution_count": 30, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
474390000-0002-5967-283511oleksiygoryayinovNaN[алексей николаевич горяинов, о.м.горяїнов, а....NaN[diagnostics, transport, logistics][[researcherid, i-7977-2016]][[, дистанционный курс «ctl.sc2x: supply chain...[[docent, kharkiv petro vasylenko national tec...274[oleksiy goryayinov]2014-08-03t18:06:42.925z2021-03-22t13:56:48.311z00001NaNNaN[khntusg.com.ua, khntusg.com.ua, google.com.ua...NaN13.01.03.014.07.0
725570000-0002-3505-279711nurulmalahayatigoogle scholarNaNNaNNaN[[researcherid, q-3861-2017]][[civil and transportation engineering , maste...[[senior lecturer, universitas syiah kuala, ba...6[nurul malahayati]2017-10-01t00:46:31.324z2019-08-19t15:52:47.253z30031NaNNaN[google.com, ristekdikti.go.id, unsyiah.ac.id,...NaN16.01.0NaN2.01.0
940810000-0003-3670-962011carlosbarreraim individual inventor, and this is my work; s...[retrodynamic, novelinflow]NaN[imploturbocompressor, innovation, gearturbine...[[loop profile, 394457]]NaNNaN1[carlos barrera]2016-08-29t20:32:10.362z2021-02-09t04:56:35.554z00001NaNNaN[blogspot.mx, behance.net, authorstream.com, d...NaN24.01.08.0NaNNaN
2616730000-0002-5441-046511nuriahernández-leónNaN[nuria h. león, nuria hernández león, hernánde...NaN[business management, research, human resource...NaN[[, course: social skills, university of salam...[[merchandise reception and expedition trainer...11[nuria hernández-león]2015-11-28t07:18:58.442z2021-03-05t16:37:47.403z10041NaNNaN[feriaempresamujer.com, escueladenegociosydire...NaN16.0NaN7.019.016.0
3262110000-0002-7781-676711mohd nazriismailborn in penang, malaysia in 1971, dr. mohd had...[ndum (national defence university of malaysia)]NaN[sensor, iot, voice over ip, wsn, design of ne...[[scopus author id, 24372977800], [researcheri...NaN[[lecturer, universiti pertahanan nasional mal...35[scopus - elsevier]2016-09-06t02:25:52.974z2020-10-20t06:55:55.051z2400351NaNNaN[google.com.my, researchgate.net, academia.edu...NaN16.02.010.0NaN4.0
.............................................................................................
105798010000-0001-5087-696511robertoharasystematics, evolutionary biology, and the his...[r. o’hara, r.j. o’hara, robert o’hara, robert...NaN[history and philosophy of science, ancient nu...[[isni, 0000000138200102], [researcherid, b-47...[[biology, ph.d., harvard university, cambridg...NaN45[robert j. o’hara]2014-09-21t02:45:19.620z2020-07-09t06:51:09.228z2300721NaNNaN[rjohara.net, google.com, collegiateway.org, r...NaN12.03.05.01.0NaN
105908820000-0002-3318-986111shaguftaperveenprof. dr. shagufta perveen is a professor at k...NaNshagufta792000@yahoo.com[shagufta perveen university of southampton, s...NaN[[hej research institute of chemistry, phd che...[[professor, king saud university college of p...66[scopus - elsevier]2015-12-21t10:34:06.771z2021-02-22t14:58:30.893z5600661yahoo.com[msu.edu, ksu.edu.sa][shaguftaperveen.com, researchgate.net, ksu.ed...2.011.0NaN25.03.07.0
107660620000-0001-8960-900411susanbastaniNaN[s. bastani, سوسن باستانی]sbastani@alzahra.ac.ir[online and offline communities, personal netw...[[scopus author id, 16642098400]][[sociology, ph.d., university of toronto, tor...[[professor, alzahra university, tehran, vanak...20[scopus - elsevier]2019-07-10t06:50:46.255z2020-10-07t04:08:01.961z1900331alzahra.ac.ir[gmail.com, gmail.com][scopus.com, google.com, publons.com, zenodo.o...2.011.01.04.03.04.0
108078390000-0002-4379-645411caroline wanjirukariukicaroline holds a phd in economics from curtin ...NaNNaN[applied economics, applied econometrics, deve...NaN[[economics, doctor of philosophy , curtin uni...[[director, educational development, strathmor...4[caroline wanjiru kariuki]2020-03-18t10:18:04.007z2021-02-11t14:40:38.515z10000NaNNaN[scopus.com, mendeley.com, publons.com, resear...NaN13.0NaN4.03.06.0
109119660000-0003-2311-060011myokyaw hlaingNaN[dr myo kyaw hlaing]NaN[economic geology]NaNNaN[[lecturer, union of myanmar ministry of educa...2[myo kyaw hlaing]2018-12-26t12:51:57.801z2021-01-26t14:36:47.421z10020NaNNaN[facebook.com, linkedin.com, instagram.com, re...NaN12.0NaN1.0NaN2.0
\n", "

140 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "47439 0000-0002-5967-2835 1 1 \n", "72557 0000-0002-3505-2797 1 1 \n", "94081 0000-0003-3670-9620 1 1 \n", "261673 0000-0002-5441-0465 1 1 \n", "326211 0000-0002-7781-6767 1 1 \n", "... ... ... ... \n", "10579801 0000-0001-5087-6965 1 1 \n", "10590882 0000-0002-3318-9861 1 1 \n", "10766062 0000-0001-8960-9004 1 1 \n", "10807839 0000-0002-4379-6454 1 1 \n", "10911966 0000-0003-2311-0600 1 1 \n", "\n", " given_names family_name \\\n", "47439 oleksiy goryayinov \n", "72557 nurul malahayati \n", "94081 carlos barrera \n", "261673 nuria hernández-león \n", "326211 mohd nazri ismail \n", "... ... ... \n", "10579801 robert ohara \n", "10590882 shagufta perveen \n", "10766062 susan bastani \n", "10807839 caroline wanjiru kariuki \n", "10911966 myo kyaw hlaing \n", "\n", " biography \\\n", "47439 NaN \n", "72557 google scholar \n", "94081 im individual inventor, and this is my work; s... \n", "261673 NaN \n", "326211 born in penang, malaysia in 1971, dr. mohd had... \n", "... ... \n", "10579801 systematics, evolutionary biology, and the his... \n", "10590882 prof. dr. shagufta perveen is a professor at k... \n", "10766062 NaN \n", "10807839 caroline holds a phd in economics from curtin ... \n", "10911966 NaN \n", "\n", " other_names \\\n", "47439 [алексей николаевич горяинов, о.м.горяїнов, а.... \n", "72557 NaN \n", "94081 [retrodynamic, novelinflow] \n", "261673 [nuria h. león, nuria hernández león, hernánde... \n", "326211 [ndum (national defence university of malaysia)] \n", "... ... \n", "10579801 [r. o’hara, r.j. o’hara, robert o’hara, robert... \n", "10590882 NaN \n", "10766062 [s. bastani, سوسن باستانی] \n", "10807839 NaN \n", "10911966 [dr myo kyaw hlaing] \n", "\n", " primary_email \\\n", "47439 NaN \n", "72557 NaN \n", "94081 NaN \n", "261673 NaN \n", "326211 NaN \n", "... ... 
\n", "10579801 NaN \n", "10590882 shagufta792000@yahoo.com \n", "10766062 sbastani@alzahra.ac.ir \n", "10807839 NaN \n", "10911966 NaN \n", "\n", " keywords \\\n", "47439 [diagnostics, transport, logistics] \n", "72557 NaN \n", "94081 [imploturbocompressor, innovation, gearturbine... \n", "261673 [business management, research, human resource... \n", "326211 [sensor, iot, voice over ip, wsn, design of ne... \n", "... ... \n", "10579801 [history and philosophy of science, ancient nu... \n", "10590882 [shagufta perveen university of southampton, s... \n", "10766062 [online and offline communities, personal netw... \n", "10807839 [applied economics, applied econometrics, deve... \n", "10911966 [economic geology] \n", "\n", " external_ids \\\n", "47439 [[researcherid, i-7977-2016]] \n", "72557 [[researcherid, q-3861-2017]] \n", "94081 [[loop profile, 394457]] \n", "261673 NaN \n", "326211 [[scopus author id, 24372977800], [researcheri... \n", "... ... \n", "10579801 [[isni, 0000000138200102], [researcherid, b-47... \n", "10590882 NaN \n", "10766062 [[scopus author id, 16642098400]] \n", "10807839 NaN \n", "10911966 NaN \n", "\n", " education \\\n", "47439 [[, дистанционный курс «ctl.sc2x: supply chain... \n", "72557 [[civil and transportation engineering , maste... \n", "94081 NaN \n", "261673 [[, course: social skills, university of salam... \n", "326211 NaN \n", "... ... \n", "10579801 [[biology, ph.d., harvard university, cambridg... \n", "10590882 [[hej research institute of chemistry, phd che... \n", "10766062 [[sociology, ph.d., university of toronto, tor... \n", "10807839 [[economics, doctor of philosophy , curtin uni... \n", "10911966 NaN \n", "\n", " employment n_works \\\n", "47439 [[docent, kharkiv petro vasylenko national tec... 274 \n", "72557 [[senior lecturer, universitas syiah kuala, ba... 6 \n", "94081 NaN 1 \n", "261673 [[merchandise reception and expedition trainer... 11 \n", "326211 [[lecturer, universiti pertahanan nasional mal... 35 \n", "... ... 
... \n", "10579801 NaN 45 \n", "10590882 [[professor, king saud university college of p... 66 \n", "10766062 [[professor, alzahra university, tehran, vanak... 20 \n", "10807839 [[director, educational development, strathmor... 4 \n", "10911966 [[lecturer, union of myanmar ministry of educa... 2 \n", "\n", " works_source activation_date \\\n", "47439 [oleksiy goryayinov] 2014-08-03t18:06:42.925z \n", "72557 [nurul malahayati] 2017-10-01t00:46:31.324z \n", "94081 [carlos barrera] 2016-08-29t20:32:10.362z \n", "261673 [nuria hernández-león] 2015-11-28t07:18:58.442z \n", "326211 [scopus - elsevier] 2016-09-06t02:25:52.974z \n", "... ... ... \n", "10579801 [robert j. o’hara] 2014-09-21t02:45:19.620z \n", "10590882 [scopus - elsevier] 2015-12-21t10:34:06.771z \n", "10766062 [scopus - elsevier] 2019-07-10t06:50:46.255z \n", "10807839 [caroline wanjiru kariuki] 2020-03-18t10:18:04.007z \n", "10911966 [myo kyaw hlaing] 2018-12-26t12:51:57.801z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", "47439 2021-03-22t13:56:48.311z 0 0 0 0 \n", "72557 2019-08-19t15:52:47.253z 3 0 0 3 \n", "94081 2021-02-09t04:56:35.554z 0 0 0 0 \n", "261673 2021-03-05t16:37:47.403z 1 0 0 4 \n", "326211 2020-10-20t06:55:55.051z 24 0 0 35 \n", "... ... ... ... ... ... \n", "10579801 2020-07-09t06:51:09.228z 23 0 0 72 \n", "10590882 2021-02-22t14:58:30.893z 56 0 0 66 \n", "10766062 2020-10-07t04:08:01.961z 19 0 0 33 \n", "10807839 2021-02-11t14:40:38.515z 1 0 0 0 \n", "10911966 2021-01-26t14:36:47.421z 1 0 0 2 \n", "\n", " label primary_email_domain other_email_domains \\\n", "47439 1 NaN NaN \n", "72557 1 NaN NaN \n", "94081 1 NaN NaN \n", "261673 1 NaN NaN \n", "326211 1 NaN NaN \n", "... ... ... ... \n", "10579801 1 NaN NaN \n", "10590882 1 yahoo.com [msu.edu, ksu.edu.sa] \n", "10766062 1 alzahra.ac.ir [gmail.com, gmail.com] \n", "10807839 0 NaN NaN \n", "10911966 0 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "47439 [khntusg.com.ua, khntusg.com.ua, google.com.ua... 
NaN 13.0 \n", "72557 [google.com, ristekdikti.go.id, unsyiah.ac.id,... NaN 16.0 \n", "94081 [blogspot.mx, behance.net, authorstream.com, d... NaN 24.0 \n", "261673 [feriaempresamujer.com, escueladenegociosydire... NaN 16.0 \n", "326211 [google.com.my, researchgate.net, academia.edu... NaN 16.0 \n", "... ... ... ... \n", "10579801 [rjohara.net, google.com, collegiateway.org, r... NaN 12.0 \n", "10590882 [shaguftaperveen.com, researchgate.net, ksu.ed... 2.0 11.0 \n", "10766062 [scopus.com, google.com, publons.com, zenodo.o... 2.0 11.0 \n", "10807839 [scopus.com, mendeley.com, publons.com, resear... NaN 13.0 \n", "10911966 [facebook.com, linkedin.com, instagram.com, re... NaN 12.0 \n", "\n", " n_ids n_keywords n_education n_employment \n", "47439 1.0 3.0 14.0 7.0 \n", "72557 1.0 NaN 2.0 1.0 \n", "94081 1.0 8.0 NaN NaN \n", "261673 NaN 7.0 19.0 16.0 \n", "326211 2.0 10.0 NaN 4.0 \n", "... ... ... ... ... \n", "10579801 3.0 5.0 1.0 NaN \n", "10590882 NaN 25.0 3.0 7.0 \n", "10766062 1.0 4.0 3.0 4.0 \n", "10807839 NaN 4.0 3.0 6.0 \n", "10911966 NaN 1.0 NaN 2.0 \n", "\n", "[140 rows x 30 columns]" ] }, "execution_count": 30, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" ] }, { "cell_type": "code", "execution_count": 31, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
00000-0002-5967-283511oleksiygoryayinovNaN[алексей николаевич горяинов, о.м.горяїнов, а....NaN[diagnostics, transport, logistics][[researcherid, i-7977-2016]][[, дистанционный курс «ctl.sc2x: supply chain...[[docent, kharkiv petro vasylenko national tec...274oleksiy goryayinov2014-08-03t18:06:42.925z2021-03-22t13:56:48.311z00001NaNNaN[khntusg.com.ua, khntusg.com.ua, google.com.ua...NaN13.01.03.014.07.0
10000-0002-3505-279711nurulmalahayatigoogle scholarNaNNaNNaN[[researcherid, q-3861-2017]][[civil and transportation engineering , maste...[[senior lecturer, universitas syiah kuala, ba...6nurul malahayati2017-10-01t00:46:31.324z2019-08-19t15:52:47.253z30031NaNNaN[google.com, ristekdikti.go.id, unsyiah.ac.id,...NaN16.01.0NaN2.01.0
20000-0003-3670-962011carlosbarreraim individual inventor, and this is my work; s...[retrodynamic, novelinflow]NaN[imploturbocompressor, innovation, gearturbine...[[loop profile, 394457]]NaNNaN1carlos barrera2016-08-29t20:32:10.362z2021-02-09t04:56:35.554z00001NaNNaN[blogspot.mx, behance.net, authorstream.com, d...NaN24.01.08.0NaNNaN
30000-0002-5441-046511nuriahernández-leónNaN[nuria h. león, nuria hernández león, hernánde...NaN[business management, research, human resource...NaN[[, course: social skills, university of salam...[[merchandise reception and expedition trainer...11nuria hernández-león2015-11-28t07:18:58.442z2021-03-05t16:37:47.403z10041NaNNaN[feriaempresamujer.com, escueladenegociosydire...NaN16.0NaN7.019.016.0
40000-0002-7781-676711mohd nazriismailborn in penang, malaysia in 1971, dr. mohd had...[ndum (national defence university of malaysia)]NaN[sensor, iot, voice over ip, wsn, design of ne...[[scopus author id, 24372977800], [researcheri...NaN[[lecturer, universiti pertahanan nasional mal...35scopus - elsevier2016-09-06t02:25:52.974z2020-10-20t06:55:55.051z2400351NaNNaN[google.com.my, researchgate.net, academia.edu...NaN16.02.010.0NaN4.0
.............................................................................................
1350000-0001-5087-696511robertoharasystematics, evolutionary biology, and the his...[r. o’hara, r.j. o’hara, robert o’hara, robert...NaN[history and philosophy of science, ancient nu...[[isni, 0000000138200102], [researcherid, b-47...[[biology, ph.d., harvard university, cambridg...NaN45robert j. o’hara2014-09-21t02:45:19.620z2020-07-09t06:51:09.228z2300721NaNNaN[rjohara.net, google.com, collegiateway.org, r...NaN12.03.05.01.0NaN
1360000-0002-3318-986111shaguftaperveenprof. dr. shagufta perveen is a professor at k...NaNshagufta792000@yahoo.com[shagufta perveen university of southampton, s...NaN[[hej research institute of chemistry, phd che...[[professor, king saud university college of p...66scopus - elsevier2015-12-21t10:34:06.771z2021-02-22t14:58:30.893z5600661yahoo.com[msu.edu, ksu.edu.sa][shaguftaperveen.com, researchgate.net, ksu.ed...2.011.0NaN25.03.07.0
1370000-0001-8960-900411susanbastaniNaN[s. bastani, سوسن باستانی]sbastani@alzahra.ac.ir[online and offline communities, personal netw...[[scopus author id, 16642098400]][[sociology, ph.d., university of toronto, tor...[[professor, alzahra university, tehran, vanak...20scopus - elsevier2019-07-10t06:50:46.255z2020-10-07t04:08:01.961z1900331alzahra.ac.ir[gmail.com, gmail.com][scopus.com, google.com, publons.com, zenodo.o...2.011.01.04.03.04.0
1380000-0002-4379-645411caroline wanjirukariukicaroline holds a phd in economics from curtin ...NaNNaN[applied economics, applied econometrics, deve...NaN[[economics, doctor of philosophy , curtin uni...[[director, educational development, strathmor...4caroline wanjiru kariuki2020-03-18t10:18:04.007z2021-02-11t14:40:38.515z10000NaNNaN[scopus.com, mendeley.com, publons.com, resear...NaN13.0NaN4.03.06.0
1390000-0003-2311-060011myokyaw hlaingNaN[dr myo kyaw hlaing]NaN[economic geology]NaNNaN[[lecturer, union of myanmar ministry of educa...2myo kyaw hlaing2018-12-26t12:51:57.801z2021-01-26t14:36:47.421z10020NaNNaN[facebook.com, linkedin.com, instagram.com, re...NaN12.0NaN1.0NaN2.0
\n", "

140 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "0 0000-0002-5967-2835 1 1 \n", "1 0000-0002-3505-2797 1 1 \n", "2 0000-0003-3670-9620 1 1 \n", "3 0000-0002-5441-0465 1 1 \n", "4 0000-0002-7781-6767 1 1 \n", ".. ... ... ... \n", "135 0000-0001-5087-6965 1 1 \n", "136 0000-0002-3318-9861 1 1 \n", "137 0000-0001-8960-9004 1 1 \n", "138 0000-0002-4379-6454 1 1 \n", "139 0000-0003-2311-0600 1 1 \n", "\n", " given_names family_name \\\n", "0 oleksiy goryayinov \n", "1 nurul malahayati \n", "2 carlos barrera \n", "3 nuria hernández-león \n", "4 mohd nazri ismail \n", ".. ... ... \n", "135 robert ohara \n", "136 shagufta perveen \n", "137 susan bastani \n", "138 caroline wanjiru kariuki \n", "139 myo kyaw hlaing \n", "\n", " biography \\\n", "0 NaN \n", "1 google scholar \n", "2 im individual inventor, and this is my work; s... \n", "3 NaN \n", "4 born in penang, malaysia in 1971, dr. mohd had... \n", ".. ... \n", "135 systematics, evolutionary biology, and the his... \n", "136 prof. dr. shagufta perveen is a professor at k... \n", "137 NaN \n", "138 caroline holds a phd in economics from curtin ... \n", "139 NaN \n", "\n", " other_names \\\n", "0 [алексей николаевич горяинов, о.м.горяїнов, а.... \n", "1 NaN \n", "2 [retrodynamic, novelinflow] \n", "3 [nuria h. león, nuria hernández león, hernánde... \n", "4 [ndum (national defence university of malaysia)] \n", ".. ... \n", "135 [r. o’hara, r.j. o’hara, robert o’hara, robert... \n", "136 NaN \n", "137 [s. bastani, سوسن باستانی] \n", "138 NaN \n", "139 [dr myo kyaw hlaing] \n", "\n", " primary_email \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "4 NaN \n", ".. ... \n", "135 NaN \n", "136 shagufta792000@yahoo.com \n", "137 sbastani@alzahra.ac.ir \n", "138 NaN \n", "139 NaN \n", "\n", " keywords \\\n", "0 [diagnostics, transport, logistics] \n", "1 NaN \n", "2 [imploturbocompressor, innovation, gearturbine... \n", "3 [business management, research, human resource... 
\n", "4 [sensor, iot, voice over ip, wsn, design of ne... \n", ".. ... \n", "135 [history and philosophy of science, ancient nu... \n", "136 [shagufta perveen university of southampton, s... \n", "137 [online and offline communities, personal netw... \n", "138 [applied economics, applied econometrics, deve... \n", "139 [economic geology] \n", "\n", " external_ids \\\n", "0 [[researcherid, i-7977-2016]] \n", "1 [[researcherid, q-3861-2017]] \n", "2 [[loop profile, 394457]] \n", "3 NaN \n", "4 [[scopus author id, 24372977800], [researcheri... \n", ".. ... \n", "135 [[isni, 0000000138200102], [researcherid, b-47... \n", "136 NaN \n", "137 [[scopus author id, 16642098400]] \n", "138 NaN \n", "139 NaN \n", "\n", " education \\\n", "0 [[, дистанционный курс «ctl.sc2x: supply chain... \n", "1 [[civil and transportation engineering , maste... \n", "2 NaN \n", "3 [[, course: social skills, university of salam... \n", "4 NaN \n", ".. ... \n", "135 [[biology, ph.d., harvard university, cambridg... \n", "136 [[hej research institute of chemistry, phd che... \n", "137 [[sociology, ph.d., university of toronto, tor... \n", "138 [[economics, doctor of philosophy , curtin uni... \n", "139 NaN \n", "\n", " employment n_works \\\n", "0 [[docent, kharkiv petro vasylenko national tec... 274 \n", "1 [[senior lecturer, universitas syiah kuala, ba... 6 \n", "2 NaN 1 \n", "3 [[merchandise reception and expedition trainer... 11 \n", "4 [[lecturer, universiti pertahanan nasional mal... 35 \n", ".. ... ... \n", "135 NaN 45 \n", "136 [[professor, king saud university college of p... 66 \n", "137 [[professor, alzahra university, tehran, vanak... 20 \n", "138 [[director, educational development, strathmor... 4 \n", "139 [[lecturer, union of myanmar ministry of educa... 
2 \n", "\n", " works_source activation_date \\\n", "0 oleksiy goryayinov 2014-08-03t18:06:42.925z \n", "1 nurul malahayati 2017-10-01t00:46:31.324z \n", "2 carlos barrera 2016-08-29t20:32:10.362z \n", "3 nuria hernández-león 2015-11-28t07:18:58.442z \n", "4 scopus - elsevier 2016-09-06t02:25:52.974z \n", ".. ... ... \n", "135 robert j. o’hara 2014-09-21t02:45:19.620z \n", "136 scopus - elsevier 2015-12-21t10:34:06.771z \n", "137 scopus - elsevier 2019-07-10t06:50:46.255z \n", "138 caroline wanjiru kariuki 2020-03-18t10:18:04.007z \n", "139 myo kyaw hlaing 2018-12-26t12:51:57.801z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "0 2021-03-22t13:56:48.311z 0 0 0 0 1 \n", "1 2019-08-19t15:52:47.253z 3 0 0 3 1 \n", "2 2021-02-09t04:56:35.554z 0 0 0 0 1 \n", "3 2021-03-05t16:37:47.403z 1 0 0 4 1 \n", "4 2020-10-20t06:55:55.051z 24 0 0 35 1 \n", ".. ... ... ... ... ... ... \n", "135 2020-07-09t06:51:09.228z 23 0 0 72 1 \n", "136 2021-02-22t14:58:30.893z 56 0 0 66 1 \n", "137 2020-10-07t04:08:01.961z 19 0 0 33 1 \n", "138 2021-02-11t14:40:38.515z 1 0 0 0 0 \n", "139 2021-01-26t14:36:47.421z 1 0 0 2 0 \n", "\n", " primary_email_domain other_email_domains \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "4 NaN NaN \n", ".. ... ... \n", "135 NaN NaN \n", "136 yahoo.com [msu.edu, ksu.edu.sa] \n", "137 alzahra.ac.ir [gmail.com, gmail.com] \n", "138 NaN NaN \n", "139 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "0 [khntusg.com.ua, khntusg.com.ua, google.com.ua... NaN 13.0 \n", "1 [google.com, ristekdikti.go.id, unsyiah.ac.id,... NaN 16.0 \n", "2 [blogspot.mx, behance.net, authorstream.com, d... NaN 24.0 \n", "3 [feriaempresamujer.com, escueladenegociosydire... NaN 16.0 \n", "4 [google.com.my, researchgate.net, academia.edu... NaN 16.0 \n", ".. ... ... ... \n", "135 [rjohara.net, google.com, collegiateway.org, r... NaN 12.0 \n", "136 [shaguftaperveen.com, researchgate.net, ksu.ed... 
2.0 11.0 \n", "137 [scopus.com, google.com, publons.com, zenodo.o... 2.0 11.0 \n", "138 [scopus.com, mendeley.com, publons.com, resear... NaN 13.0 \n", "139 [facebook.com, linkedin.com, instagram.com, re... NaN 12.0 \n", "\n", " n_ids n_keywords n_education n_employment \n", "0 1.0 3.0 14.0 7.0 \n", "1 1.0 NaN 2.0 1.0 \n", "2 1.0 8.0 NaN NaN \n", "3 NaN 7.0 19.0 16.0 \n", "4 2.0 10.0 NaN 4.0 \n", ".. ... ... ... ... \n", "135 3.0 5.0 1.0 NaN \n", "136 NaN 25.0 3.0 7.0 \n", "137 1.0 4.0 3.0 4.0 \n", "138 NaN 4.0 3.0 6.0 \n", "139 NaN 1.0 NaN 2.0 \n", "\n", "[140 rows x 30 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", "exploded_sources" ] }, { "cell_type": "code", "execution_count": 32, "metadata": { "scrolled": true }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
00000-0002-5967-283511oleksiygoryayinovNaN[алексей николаевич горяинов, о.м.горяїнов, а....NaN[diagnostics, transport, logistics][[researcherid, i-7977-2016]][[, дистанционный курс «ctl.sc2x: supply chain...[[docent, kharkiv petro vasylenko national tec...274oleksiy goryayinov2014-08-03t18:06:42.925z2021-03-22t13:56:48.311z00001NaNNaN[khntusg.com.ua, khntusg.com.ua, google.com.ua...NaN13.01.03.014.07.0
10000-0002-3505-279711nurulmalahayatigoogle scholarNaNNaNNaN[[researcherid, q-3861-2017]][[civil and transportation engineering , maste...[[senior lecturer, universitas syiah kuala, ba...6nurul malahayati2017-10-01t00:46:31.324z2019-08-19t15:52:47.253z30031NaNNaN[google.com, ristekdikti.go.id, unsyiah.ac.id,...NaN16.01.0NaN2.01.0
20000-0003-3670-962011carlosbarreraim individual inventor, and this is my work; s...[retrodynamic, novelinflow]NaN[imploturbocompressor, innovation, gearturbine...[[loop profile, 394457]]NaNNaN1carlos barrera2016-08-29t20:32:10.362z2021-02-09t04:56:35.554z00001NaNNaN[blogspot.mx, behance.net, authorstream.com, d...NaN24.01.08.0NaNNaN
30000-0002-5441-046511nuriahernández-leónNaN[nuria h. león, nuria hernández león, hernánde...NaN[business management, research, human resource...NaN[[, course: social skills, university of salam...[[merchandise reception and expedition trainer...11nuria hernández-león2015-11-28t07:18:58.442z2021-03-05t16:37:47.403z10041NaNNaN[feriaempresamujer.com, escueladenegociosydire...NaN16.0NaN7.019.016.0
50000-0001-7010-290811clarasarmentoclara sarmento holds an aggregation in cultura...NaNNaN[feminist and gender studies, tourism and busi...[[ciência id, d418-d6f8-7d49]][[ao abrigo da bolsa santander ie best practic...[[presidente da comissão de acreditação do nov...275clara sarmento2013-12-12t00:33:58.190z2020-10-12t14:43:00.749z1700601NaNNaN[iscap.pt, google.pt, academia.edu, researchga...NaN13.01.06.08.037.0
.............................................................................................
1330000-0003-1020-135111sheikh saifullahahmedsheikh saifullah ahmed is a full-time lecturer...NaNsaifullahahmedku@gmail.com[post-truth, critical trauma analysis, postmod...NaN[[english discipline , ma & ba in english , kh...[[lecturer , international university of busin...3sheikh saifullah ahmed2020-04-08t21:00:11.201z2021-02-12t20:45:32.247z20030gmail.comNaN[academia.edu, iubat.edu, google.com, research...NaN12.0NaN5.01.01.0
1340000-0001-7228-568011textprotocolNaNNaNNaNNaNNaNNaN[[engineer, textprotocol.org, palo alto, ca, u...1text protocol2021-03-09t10:30:32.237z2021-03-21t17:17:40.500z00000NaNNaN[about.me, figma.com, github.com, gitlab.com, ...NaN15.0NaNNaNNaN1.0
1350000-0001-5087-696511robertoharasystematics, evolutionary biology, and the his...[r. o’hara, r.j. o’hara, robert o’hara, robert...NaN[history and philosophy of science, ancient nu...[[isni, 0000000138200102], [researcherid, b-47...[[biology, ph.d., harvard university, cambridg...NaN45robert j. o’hara2014-09-21t02:45:19.620z2020-07-09t06:51:09.228z2300721NaNNaN[rjohara.net, google.com, collegiateway.org, r...NaN12.03.05.01.0NaN
1380000-0002-4379-645411caroline wanjirukariukicaroline holds a phd in economics from curtin ...NaNNaN[applied economics, applied econometrics, deve...NaN[[economics, doctor of philosophy , curtin uni...[[director, educational development, strathmor...4caroline wanjiru kariuki2020-03-18t10:18:04.007z2021-02-11t14:40:38.515z10000NaNNaN[scopus.com, mendeley.com, publons.com, resear...NaN13.0NaN4.03.06.0
1390000-0003-2311-060011myokyaw hlaingNaN[dr myo kyaw hlaing]NaN[economic geology]NaNNaN[[lecturer, union of myanmar ministry of educa...2myo kyaw hlaing2018-12-26t12:51:57.801z2021-01-26t14:36:47.421z10020NaNNaN[facebook.com, linkedin.com, instagram.com, re...NaN12.0NaN1.0NaN2.0
\n", "

113 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "0 0000-0002-5967-2835 1 1 \n", "1 0000-0002-3505-2797 1 1 \n", "2 0000-0003-3670-9620 1 1 \n", "3 0000-0002-5441-0465 1 1 \n", "5 0000-0001-7010-2908 1 1 \n", ".. ... ... ... \n", "133 0000-0003-1020-1351 1 1 \n", "134 0000-0001-7228-5680 1 1 \n", "135 0000-0001-5087-6965 1 1 \n", "138 0000-0002-4379-6454 1 1 \n", "139 0000-0003-2311-0600 1 1 \n", "\n", " given_names family_name \\\n", "0 oleksiy goryayinov \n", "1 nurul malahayati \n", "2 carlos barrera \n", "3 nuria hernández-león \n", "5 clara sarmento \n", ".. ... ... \n", "133 sheikh saifullah ahmed \n", "134 text protocol \n", "135 robert ohara \n", "138 caroline wanjiru kariuki \n", "139 myo kyaw hlaing \n", "\n", " biography \\\n", "0 NaN \n", "1 google scholar \n", "2 im individual inventor, and this is my work; s... \n", "3 NaN \n", "5 clara sarmento holds an aggregation in cultura... \n", ".. ... \n", "133 sheikh saifullah ahmed is a full-time lecturer... \n", "134 NaN \n", "135 systematics, evolutionary biology, and the his... \n", "138 caroline holds a phd in economics from curtin ... \n", "139 NaN \n", "\n", " other_names \\\n", "0 [алексей николаевич горяинов, о.м.горяїнов, а.... \n", "1 NaN \n", "2 [retrodynamic, novelinflow] \n", "3 [nuria h. león, nuria hernández león, hernánde... \n", "5 NaN \n", ".. ... \n", "133 NaN \n", "134 NaN \n", "135 [r. o’hara, r.j. o’hara, robert o’hara, robert... \n", "138 NaN \n", "139 [dr myo kyaw hlaing] \n", "\n", " primary_email \\\n", "0 NaN \n", "1 NaN \n", "2 NaN \n", "3 NaN \n", "5 NaN \n", ".. ... \n", "133 saifullahahmedku@gmail.com \n", "134 NaN \n", "135 NaN \n", "138 NaN \n", "139 NaN \n", "\n", " keywords \\\n", "0 [diagnostics, transport, logistics] \n", "1 NaN \n", "2 [imploturbocompressor, innovation, gearturbine... \n", "3 [business management, research, human resource... \n", "5 [feminist and gender studies, tourism and busi... \n", ".. ... 
\n", "133 [post-truth, critical trauma analysis, postmod... \n", "134 NaN \n", "135 [history and philosophy of science, ancient nu... \n", "138 [applied economics, applied econometrics, deve... \n", "139 [economic geology] \n", "\n", " external_ids \\\n", "0 [[researcherid, i-7977-2016]] \n", "1 [[researcherid, q-3861-2017]] \n", "2 [[loop profile, 394457]] \n", "3 NaN \n", "5 [[ciência id, d418-d6f8-7d49]] \n", ".. ... \n", "133 NaN \n", "134 NaN \n", "135 [[isni, 0000000138200102], [researcherid, b-47... \n", "138 NaN \n", "139 NaN \n", "\n", " education \\\n", "0 [[, дистанционный курс «ctl.sc2x: supply chain... \n", "1 [[civil and transportation engineering , maste... \n", "2 NaN \n", "3 [[, course: social skills, university of salam... \n", "5 [[ao abrigo da bolsa santander ie best practic... \n", ".. ... \n", "133 [[english discipline , ma & ba in english , kh... \n", "134 NaN \n", "135 [[biology, ph.d., harvard university, cambridg... \n", "138 [[economics, doctor of philosophy , curtin uni... \n", "139 NaN \n", "\n", " employment n_works \\\n", "0 [[docent, kharkiv petro vasylenko national tec... 274 \n", "1 [[senior lecturer, universitas syiah kuala, ba... 6 \n", "2 NaN 1 \n", "3 [[merchandise reception and expedition trainer... 11 \n", "5 [[presidente da comissão de acreditação do nov... 275 \n", ".. ... ... \n", "133 [[lecturer , international university of busin... 3 \n", "134 [[engineer, textprotocol.org, palo alto, ca, u... 1 \n", "135 NaN 45 \n", "138 [[director, educational development, strathmor... 4 \n", "139 [[lecturer, union of myanmar ministry of educa... 2 \n", "\n", " works_source activation_date \\\n", "0 oleksiy goryayinov 2014-08-03t18:06:42.925z \n", "1 nurul malahayati 2017-10-01t00:46:31.324z \n", "2 carlos barrera 2016-08-29t20:32:10.362z \n", "3 nuria hernández-león 2015-11-28t07:18:58.442z \n", "5 clara sarmento 2013-12-12t00:33:58.190z \n", ".. ... ... 
\n", "133 sheikh saifullah ahmed 2020-04-08t21:00:11.201z \n", "134 text protocol 2021-03-09t10:30:32.237z \n", "135 robert j. o’hara 2014-09-21t02:45:19.620z \n", "138 caroline wanjiru kariuki 2020-03-18t10:18:04.007z \n", "139 myo kyaw hlaing 2018-12-26t12:51:57.801z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "0 2021-03-22t13:56:48.311z 0 0 0 0 1 \n", "1 2019-08-19t15:52:47.253z 3 0 0 3 1 \n", "2 2021-02-09t04:56:35.554z 0 0 0 0 1 \n", "3 2021-03-05t16:37:47.403z 1 0 0 4 1 \n", "5 2020-10-12t14:43:00.749z 17 0 0 60 1 \n", ".. ... ... ... ... ... ... \n", "133 2021-02-12t20:45:32.247z 2 0 0 3 0 \n", "134 2021-03-21t17:17:40.500z 0 0 0 0 0 \n", "135 2020-07-09t06:51:09.228z 23 0 0 72 1 \n", "138 2021-02-11t14:40:38.515z 1 0 0 0 0 \n", "139 2021-01-26t14:36:47.421z 1 0 0 2 0 \n", "\n", " primary_email_domain other_email_domains \\\n", "0 NaN NaN \n", "1 NaN NaN \n", "2 NaN NaN \n", "3 NaN NaN \n", "5 NaN NaN \n", ".. ... ... \n", "133 gmail.com NaN \n", "134 NaN NaN \n", "135 NaN NaN \n", "138 NaN NaN \n", "139 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", "0 [khntusg.com.ua, khntusg.com.ua, google.com.ua... NaN 13.0 \n", "1 [google.com, ristekdikti.go.id, unsyiah.ac.id,... NaN 16.0 \n", "2 [blogspot.mx, behance.net, authorstream.com, d... NaN 24.0 \n", "3 [feriaempresamujer.com, escueladenegociosydire... NaN 16.0 \n", "5 [iscap.pt, google.pt, academia.edu, researchga... NaN 13.0 \n", ".. ... ... ... \n", "133 [academia.edu, iubat.edu, google.com, research... NaN 12.0 \n", "134 [about.me, figma.com, github.com, gitlab.com, ... NaN 15.0 \n", "135 [rjohara.net, google.com, collegiateway.org, r... NaN 12.0 \n", "138 [scopus.com, mendeley.com, publons.com, resear... NaN 13.0 \n", "139 [facebook.com, linkedin.com, instagram.com, re... NaN 12.0 \n", "\n", " n_ids n_keywords n_education n_employment \n", "0 1.0 3.0 14.0 7.0 \n", "1 1.0 NaN 2.0 1.0 \n", "2 1.0 8.0 NaN NaN \n", "3 NaN 7.0 19.0 16.0 \n", "5 1.0 6.0 8.0 37.0 \n", ".. 
... ... ... ... \n", "133 NaN 5.0 1.0 1.0 \n", "134 NaN NaN NaN 1.0 \n", "135 3.0 5.0 1.0 NaN \n", "138 NaN 4.0 3.0 6.0 \n", "139 NaN 1.0 NaN 2.0 \n", "\n", "[113 rows x 30 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Works source" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Paste from Miriam" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## External IDs" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "External IDs should come from reliable sources. ORCiD registrants cannot add them freely." ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 1.308598e+06\n", "mean 1.359082e+00\n", "std 6.643235e-01\n", "min 1.000000e+00\n", "25% 1.000000e+00\n", "50% 1.000000e+00\n", "75% 2.000000e+00\n", "max 8.000000e+01\n", "Name: n_ids, dtype: float64" ] }, "execution_count": 33, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.n_ids.describe()" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
38962260000-0002-9554-663311john awilliamsNaNNaNNaNNaN[[scopus author id,  55553733518], [scopus aut...NaN[[, aston university, birmingham, , gb, 1722, ...92[aston research explorer]2014-11-20t09:42:10.690z2021-03-17t01:00:51.203z80002081NaNNaN[aston.ac.uk]NaN1.080.0NaNNaN1.0
\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "3896226 0000-0002-9554-6633 1 1 \n", "\n", " given_names family_name biography other_names primary_email keywords \\\n", "3896226 john a williams NaN NaN NaN NaN \n", "\n", " external_ids education \\\n", "3896226 [[scopus author id,  55553733518], [scopus aut... NaN \n", "\n", " employment n_works \\\n", "3896226 [[, aston university, birmingham, , gb, 1722, ... 92 \n", "\n", " works_source activation_date \\\n", "3896226 [aston research explorer] 2014-11-20t09:42:10.690z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", "3896226 2021-03-17t01:00:51.203z 80 0 0 208 1 \n", "\n", " primary_email_domain other_email_domains url_domains n_emails \\\n", "3896226 NaN NaN [aston.ac.uk] NaN \n", "\n", " n_urls n_ids n_keywords n_education n_employment \n", "3896226 1.0 80.0 NaN NaN 1.0 " ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.n_ids == df.n_ids.max()]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidexternal_idsprovider
90000-0001-8315-2066[researcherid, k-4630-2014]researcherid
290000-0002-2638-4108[scopus author id, 54394231000]scopus author id
460000-0003-1435-6545[researcherid, p-2223-2018]researcherid
500000-0003-2259-7023[scopus author id, 57189297461]scopus author id
640000-0002-7397-5824[scopus author id, 8399842800]scopus author id
\n", "
" ], "text/plain": [ " orcid external_ids provider\n", "9 0000-0001-8315-2066 [researcherid, k-4630-2014] researcherid\n", "29 0000-0002-2638-4108 [scopus author id, 54394231000] scopus author id\n", "46 0000-0003-1435-6545 [researcherid, p-2223-2018] researcherid\n", "50 0000-0003-2259-7023 [scopus author id, 57189297461] scopus author id\n", "64 0000-0002-7397-5824 [scopus author id, 8399842800] scopus author id" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ "ids[ids.provider.notna()].head()" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "scopus author id", "researcherid", "loop profile", "ciência id", "researcher name resolver id", "sciprofile", "中国科学家在线", "isni", "gnd", "pitt id", "technical university of denmark cwis", "researcher id", "id dialnet", "digital author id", "scopus author id: ", "authenticusid", "hku researcherpage", "uow scholars", "cti vitae", "scopus author id:", "hkust profile", "chalmers id", "scopus id", "iauthor", "google scholar", "digital author id (dai)", "authid", "dai", "us epa vivo", "scopus id", "authenticus", "smithsonian profiles", "github", "escientist", "vivo cornell", "researcherid:", "id dialnet:", "dialnet id", "sciprofiles", "kaken", "une researcher id", "researcherid: ", "orcid", "scienceopen", "profile system identifier", "orcid id", "custom" ], "y": [ 1037239, 545399, 118645, 37042, 7954, 5164, 4811, 3089, 2999, 2679, 2483, 1452, 1169, 1126, 1077, 878, 741, 646, 582, 547, 523, 430, 256, 212, 201, 180, 175, 155, 146, 127, 83, 61, 51, 49, 46, 39, 7, 6, 5, 5, 4, 3, 2, 1, 1, 1, 1 ] 
} ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 
0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], 
"scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" 
], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "IDs provided by providers" }, "xaxis": { "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "data = [\n", " go.Bar(\n", " x=top_ids_providers.index,\n", " y=top_ids_providers['orcid']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='IDs provided by providers',\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([nan, 'researcherid', 'scopus author id', 'loop profile', 'gnd',\n", " 'ciência id', 'researcher name resolver id', 'pitt id',\n", " 'id dialnet', 'isni', 'technical university of denmark cwis',\n", " 'chalmers id', 'scopus author id: ', 'scopus author id:',\n", " 'hkust profile', 'hku researcherpage', '中国科学家在线', 'uow scholars',\n", " 'sciprofile', 'cti vitae', 'digital author id', 'researcher id',\n", " 'authenticusid', 'authid', 'authenticus', 'scopus id',\n", " 'digital author id (dai)', 'researcherid:', 'vivo cornell',\n", " 'us epa vivo', 'escientist', 'github', 'iauthor', 'orcid id',\n", " 'dai', 'scopus id', 'smithsonian profiles', 'google scholar',\n", " 'kaken', 'dialnet id', 'researcherid: ', 'une researcher id',\n", " 'sciprofiles', 'id dialnet:', 'scienceopen', 'orcid',\n", " 'profile system identifier', 'custom'], dtype=object)" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.unique(ids['provider'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Keywords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "This field is problematic as users can be nasty and put multiple keywords in one as opposed of having different keywords. Look this" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidn_keywords
37517140000-0002-0673-0341154.0
86979260000-0003-3343-5660148.0
11545230000-0002-6075-3501140.0
65129710000-0002-7060-4112140.0
15151970000-0001-5287-1949132.0
.........
109896440000-0002-1686-1935NaN
109896450000-0002-3800-6331NaN
109896460000-0002-8783-5814NaN
109896470000-0002-7584-2283NaN
109896480000-0003-0529-3538NaN
\n", "

10989649 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid n_keywords\n", "3751714 0000-0002-0673-0341 154.0\n", "8697926 0000-0003-3343-5660 148.0\n", "1154523 0000-0002-6075-3501 140.0\n", "6512971 0000-0002-7060-4112 140.0\n", "1515197 0000-0001-5287-1949 132.0\n", "... ... ...\n", "10989644 0000-0002-1686-1935 NaN\n", "10989645 0000-0002-3800-6331 NaN\n", "10989646 0000-0002-8783-5814 NaN\n", "10989647 0000-0002-7584-2283 NaN\n", "10989648 0000-0003-0529-3538 NaN\n", "\n", "[10989649 rows x 2 columns]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)\n", "keywords_by_orcid" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "0000-0002-0673-0341", "0000-0003-3343-5660", "0000-0002-6075-3501", "0000-0002-7060-4112", "0000-0001-5287-1949", "0000-0002-9638-8091", "0000-0002-4071-0301", "0000-0001-9462-5666", "0000-0002-0929-2412", "0000-0002-0115-7195", "0000-0002-4235-4259", "0000-0003-0076-6287", "0000-0001-9715-9357", "0000-0002-1878-9762", "0000-0001-6307-6027", "0000-0003-2273-9888", "0000-0002-0937-7061", "0000-0002-1770-9660", "0000-0001-5696-1052", "0000-0003-2998-5520", "0000-0003-1799-0971", "0000-0002-0156-3580", "0000-0002-9625-6742", "0000-0003-1399-7156", "0000-0001-9985-1697", "0000-0001-6537-7683", "0000-0002-8401-8018", "0000-0003-4246-8579", "0000-0001-7857-4133", "0000-0002-7710-0355", "0000-0001-5869-2204", "0000-0002-8083-7382", "0000-0001-8670-4372", "0000-0001-7654-5013", "0000-0002-4488-2880", "0000-0003-4374-6374", "0000-0001-6939-3859", "0000-0003-2509-2549", "0000-0002-3186-8860", "0000-0002-0441-1507", "0000-0001-5230-715X", "0000-0003-0209-180X", "0000-0001-9336-6850", "0000-0002-0463-0048", "0000-0001-5458-7167", 
"0000-0002-9381-2264", "0000-0002-8227-5387", "0000-0002-3061-3364", "0000-0002-9293-0189", "0000-0002-3123-3021", "0000-0003-1071-4296", "0000-0003-3340-6413", "0000-0003-3584-6834", "0000-0002-8644-8396", "0000-0002-2935-1934", "0000-0002-1718-1632", "0000-0002-8659-6321", "0000-0002-8449-2211", "0000-0003-1693-3190", "0000-0001-5637-1124", "0000-0001-5167-7466", "0000-0002-3532-043X", "0000-0001-6861-9561", "0000-0003-4608-3844", "0000-0003-4505-3678", "0000-0003-4673-1063", "0000-0001-8174-8835", "0000-0002-6347-9464", "0000-0002-8918-2781", "0000-0003-4511-7942", "0000-0003-2532-2906", "0000-0001-9280-6017", "0000-0002-5274-7742", "0000-0001-9586-0780", "0000-0003-3720-1183", "0000-0001-5819-4555", "0000-0002-1103-9651", "0000-0001-8135-2304", "0000-0002-8499-1045", "0000-0003-2550-1859", "0000-0002-8665-9281", "0000-0001-7818-3212", "0000-0003-1863-0265", "0000-0001-8733-5230", "0000-0003-2218-1343", "0000-0002-5306-7781", "0000-0001-7728-4046", "0000-0003-4486-2684", "0000-0002-4982-5236", "0000-0001-5300-3932", "0000-0003-3342-6123", "0000-0002-8072-1152", "0000-0002-3494-2624", "0000-0002-0715-0461", "0000-0002-3907-3552", "0000-0001-5556-8275", "0000-0002-3597-3350", "0000-0002-2252-672X", "0000-0001-7392-9361", "0000-0001-8689-185X" ], "y": [ 154, 148, 140, 140, 132, 124, 115, 106, 105, 102, 100, 94, 92, 92, 88, 86, 78, 77, 75, 75, 72, 71, 70, 68, 68, 68, 67, 66, 64, 64, 63, 62, 61, 61, 61, 60, 60, 56, 55, 54, 54, 53, 53, 53, 53, 53, 52, 52, 52, 51, 51, 51, 50, 50, 50, 50, 50, 49, 49, 49, 49, 48, 48, 48, 48, 48, 47, 47, 47, 47, 46, 46, 46, 45, 45, 45, 45, 44, 44, 44, 44, 44, 44, 44, 44, 44, 43, 43, 43, 43, 43, 43, 43, 43, 42, 42, 42, 42, 42, 42 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ 
{ "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 
0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" 
} ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, 
"subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Keywords provided by ORCiD" }, "xaxis": { "range": [ -0.5, 99.5 ], "tickangle": 45, "tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(100)\n", "data = [\n", " go.Bar(\n", " x=keywords_by_orcid[:TOP_N]['orcid'],\n", " y=keywords_by_orcid[:TOP_N]['n_keywords']\n", " )\n", "]\n", "\n", "layout = go.Layout(\n", " title='Keywords provided by ORCiD',\n", " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "top_keywords = df[['orcid', 'keywords']]\\\n", " .explode('keywords')\\\n", " .reset_index(drop=True)\\\n", " .groupby('keywords')\\\n", " .count()\\\n", " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "linkText": "Export to plot.ly", "plotlyServerURL": "https://plot.ly", "showLink": false }, "data": [ { "type": "bar", "x": [ "machine learning", "bioinformatics", "education", "molecular biology", "cancer", "ecology", "artificial intelligence", "epidemiology", "public health", "microbiology", "neuroscience", "immunology", "genetics", "climate change", "remote sensing", "biochemistry", "genomics", "biotechnology", "nanotechnology", "sustainability", "educación", "gis", "deep learning", "psychology", "computer vision", "marketing", "nutrition", "innovation", "data science", "statistics", "data mining", "nanomaterials", "image processing", "robotics", "management", "optimization", "renewable energy", "chemistry", "biomaterials", "diabetes", "gender", "educação", "architecture", "catalysis", "history", "electrochemistry", "evolution", "research", "energy", "biodiversity" ], "y": [ 8574, 5424, 5191, 4557, 4163, 3923, 3839, 3789, 3676, 3550, 3495, 3468, 3343, 3337, 3279, 3003, 2794, 2681, 2674, 2654, 2526, 2511, 2466, 2381, 2309, 2213, 2199, 2154, 2153, 2144, 2108, 2100, 2099, 2086, 2081, 2071, 2009, 
2005, 2002, 1998, 1997, 1873, 1835, 1813, 1813, 1800, 1797, 1789, 1770, 1717 ] } ], "layout": { "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], 
[ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { 
"outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 
0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top-50 keywords occurrence" }, "xaxis": { "tickangle": 45, 
"tickfont": { "size": 12 } } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "set_top_n(50)\n", "data = [\n", "    go.Bar(\n", "        x=top_keywords[:TOP_N].index,\n", "        y=top_keywords[:TOP_N]['orcid']\n", "    )\n", "]\n", "\n", "layout = go.Layout(\n", "    title='Top-%s keywords occurrence' % TOP_N,\n", "    xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", ")\n", "fig = go.Figure(data=data, layout=layout)\n", "plotly.offline.iplot(fig)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Education" ] },
{ "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def extract_education(lst):\n", "    \"\"\"Flatten education records into 'degree role university' strings.\n", "\n", "    Each record is read positionally: e[0] degree, e[1] role, e[2] university;\n", "    the remaining fields (city, region, country, id, id_scheme) are ignored.\n", "    A missing value (None, or a float such as NaN) yields an empty list.\n", "    \"\"\"\n", "    # Profiles without education hold NaN in this column, which is not iterable.\n", "    if lst is None or isinstance(lst, float):\n", "        return []\n", "    return [' '.join([e[0], e[1], e[2]]) for e in lst]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Employment" ] },
{ "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "def extract_employment(lst):\n", "    \"\"\"Flatten employment records into 'role institute' strings.\n", "\n", "    Each record is read positionally: e[0] role, e[1] institute; the remaining\n", "    fields (city, region, country, id, id_scheme) are ignored.\n", "    A missing value (None, or a float such as NaN) yields an empty list.\n", "    \"\"\"\n", "    # Profiles without employment hold NaN in this column, which is not iterable.\n", "    if lst is None or isinstance(lst, float):\n", "        return []\n", "    return [' '.join([e[0], e[1]]) for e in lst]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Biography" ] },
{ "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# Treat empty biographies as missing (np.nan: the np.NaN alias was removed in NumPy 2.0)\n", "df['biography'] = df['biography'].replace('', np.nan)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 354015\n", "unique 337007\n", "top car title loans are a more straightforward way...\n", "freq 343\n", "Name: biography, dtype: object" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.biography.describe()" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employment
513060000-0002-7397-797711premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan upland]NaNNaNNaN0NaN2020-11-06t06:10:20.070z2020-11-06t06:24:28.005z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
513070000-0003-4931-973611premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan saratoga]NaNNaNNaN0NaN2020-11-13t01:04:19.859z2020-11-13t01:15:12.546z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
1060240000-0001-8221-230311premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan victorville]NaNNaNNaN0NaN2020-11-05t00:38:21.096z2020-11-05t00:40:40.091z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
1087700000-0001-6736-072X11premium cartitle loanscar title loans are a more straightforward way...NaNNaNNaNNaNNaNNaN0NaN2020-12-08t05:38:30.786z2020-12-08t05:40:03.786z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaNNaNNaNNaN
1087710000-0002-8727-124611premium cartitle loanscar title loans are a more straightforward way...[loan agency]NaN[title loan on car, car title loan online, ref...NaNNaNNaN0NaN2020-12-10t08:54:56.127z2020-12-10t08:57:15.791z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN4.0NaNNaN
.............................................................................................
108754160000-0002-9640-813611premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan clovis]NaNNaNNaN0NaN2020-10-22t06:11:02.945z2020-10-22t06:17:09.111z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
108782390000-0002-6926-375211premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan escondido]NaNNaNNaN0NaN2020-12-03t02:00:33.684z2020-12-03t02:02:07.054z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
109333800000-0002-3655-471311premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan san rafael]NaNNaNNaN0NaN2020-11-18t00:39:17.492z2020-11-18t00:52:19.024z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
109333810000-0002-8724-102011premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan san juan capistrano]NaNNaNNaN0NaN2020-11-19t00:31:54.080z2020-11-19t00:34:08.721z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
109859860000-0002-4601-456911premium cartitle loanscar title loans are a more straightforward way...[premium car title loans]NaN[car title loan mount pleasant]NaNNaNNaN0NaN2020-10-16t00:32:26.207z2020-10-16t00:37:42.646z00000NaNNaN[premiumcartitleloans.com]NaN1.0NaN1.0NaNNaN
\n", "

421 rows × 30 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "51306 0000-0002-7397-7977 1 1 \n", "51307 0000-0003-4931-9736 1 1 \n", "106024 0000-0001-8221-2303 1 1 \n", "108770 0000-0001-6736-072X 1 1 \n", "108771 0000-0002-8727-1246 1 1 \n", "... ... ... ... \n", "10875416 0000-0002-9640-8136 1 1 \n", "10878239 0000-0002-6926-3752 1 1 \n", "10933380 0000-0002-3655-4713 1 1 \n", "10933381 0000-0002-8724-1020 1 1 \n", "10985986 0000-0002-4601-4569 1 1 \n", "\n", " given_names family_name \\\n", "51306 premium car title loans \n", "51307 premium car title loans \n", "106024 premium car title loans \n", "108770 premium car title loans \n", "108771 premium car title loans \n", "... ... ... \n", "10875416 premium car title loans \n", "10878239 premium car title loans \n", "10933380 premium car title loans \n", "10933381 premium car title loans \n", "10985986 premium car title loans \n", "\n", " biography \\\n", "51306 car title loans are a more straightforward way... \n", "51307 car title loans are a more straightforward way... \n", "106024 car title loans are a more straightforward way... \n", "108770 car title loans are a more straightforward way... \n", "108771 car title loans are a more straightforward way... \n", "... ... \n", "10875416 car title loans are a more straightforward way... \n", "10878239 car title loans are a more straightforward way... \n", "10933380 car title loans are a more straightforward way... \n", "10933381 car title loans are a more straightforward way... \n", "10985986 car title loans are a more straightforward way... \n", "\n", " other_names primary_email \\\n", "51306 [premium car title loans] NaN \n", "51307 [premium car title loans] NaN \n", "106024 [premium car title loans] NaN \n", "108770 NaN NaN \n", "108771 [loan agency] NaN \n", "... ... ... 
\n", "10875416 [premium car title loans] NaN \n", "10878239 [premium car title loans] NaN \n", "10933380 [premium car title loans] NaN \n", "10933381 [premium car title loans] NaN \n", "10985986 [premium car title loans] NaN \n", "\n", " keywords external_ids \\\n", "51306 [car title loan upland] NaN \n", "51307 [car title loan saratoga] NaN \n", "106024 [car title loan victorville] NaN \n", "108770 NaN NaN \n", "108771 [title loan on car, car title loan online, ref... NaN \n", "... ... ... \n", "10875416 [car title loan clovis] NaN \n", "10878239 [car title loan escondido] NaN \n", "10933380 [car title loan san rafael] NaN \n", "10933381 [car title loan san juan capistrano] NaN \n", "10985986 [car title loan mount pleasant] NaN \n", "\n", " education employment n_works works_source activation_date \\\n", "51306 NaN NaN 0 NaN 2020-11-06t06:10:20.070z \n", "51307 NaN NaN 0 NaN 2020-11-13t01:04:19.859z \n", "106024 NaN NaN 0 NaN 2020-11-05t00:38:21.096z \n", "108770 NaN NaN 0 NaN 2020-12-08t05:38:30.786z \n", "108771 NaN NaN 0 NaN 2020-12-10t08:54:56.127z \n", "... ... ... ... ... ... \n", "10875416 NaN NaN 0 NaN 2020-10-22t06:11:02.945z \n", "10878239 NaN NaN 0 NaN 2020-12-03t02:00:33.684z \n", "10933380 NaN NaN 0 NaN 2020-11-18t00:39:17.492z \n", "10933381 NaN NaN 0 NaN 2020-11-19t00:31:54.080z \n", "10985986 NaN NaN 0 NaN 2020-10-16t00:32:26.207z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", "51306 2020-11-06t06:24:28.005z 0 0 0 0 \n", "51307 2020-11-13t01:15:12.546z 0 0 0 0 \n", "106024 2020-11-05t00:40:40.091z 0 0 0 0 \n", "108770 2020-12-08t05:40:03.786z 0 0 0 0 \n", "108771 2020-12-10t08:57:15.791z 0 0 0 0 \n", "... ... ... ... ... ... 
\n", "10875416 2020-10-22t06:17:09.111z 0 0 0 0 \n", "10878239 2020-12-03t02:02:07.054z 0 0 0 0 \n", "10933380 2020-11-18t00:52:19.024z 0 0 0 0 \n", "10933381 2020-11-19t00:34:08.721z 0 0 0 0 \n", "10985986 2020-10-16t00:37:42.646z 0 0 0 0 \n", "\n", " label primary_email_domain other_email_domains \\\n", "51306 0 NaN NaN \n", "51307 0 NaN NaN \n", "106024 0 NaN NaN \n", "108770 0 NaN NaN \n", "108771 0 NaN NaN \n", "... ... ... ... \n", "10875416 0 NaN NaN \n", "10878239 0 NaN NaN \n", "10933380 0 NaN NaN \n", "10933381 0 NaN NaN \n", "10985986 0 NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids n_keywords \\\n", "51306 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "51307 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "106024 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "108770 [premiumcartitleloans.com] NaN 1.0 NaN NaN \n", "108771 [premiumcartitleloans.com] NaN 1.0 NaN 4.0 \n", "... ... ... ... ... ... \n", "10875416 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "10878239 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "10933380 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "10933381 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "10985986 [premiumcartitleloans.com] NaN 1.0 NaN 1.0 \n", "\n", " n_education n_employment \n", "51306 NaN NaN \n", "51307 NaN NaN \n", "106024 NaN NaN \n", "108770 NaN NaN \n", "108771 NaN NaN \n", "... ... ... 
\n", "10875416 NaN NaN \n", "10878239 NaN NaN \n", "10933380 NaN NaN \n", "10933381 NaN NaN \n", "10985986 NaN NaN \n", "\n", "[421 rows x 30 columns]" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]" ] },
{ "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "def score(bio):\n", "    \"\"\"Return antispam.score(bio), or the sentinel -1 when the filter raises.\n", "\n", "    The sentinel is kept (rather than NaN) so that the rejected biographies\n", "    can be inspected before being discarded.\n", "    \"\"\"\n", "    try:\n", "        return antispam.score(bio)\n", "    # The filter cannot handle some inputs (e.g. very short biographies).\n", "    # Catch Exception, not everything: a bare except would also swallow\n", "    # KeyboardInterrupt and make the long-running apply impossible to stop.\n", "    except Exception:\n", "        return -1" ] },
{ "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "df['spam_score'] = df[df.biography.notna()]['biography'].apply(score)" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidbiography
255050000-0003-0505-2734j
1384870000-0002-3417-7299.....
1395950000-0003-3794-1288m.d., ph.d.
1933400000-0001-9655-4806肿瘤
1949900000-0002-9149-0142be y
.........
109278660000-0002-7341-5480ph.d.
109760800000-0003-4041-0840/
109766890000-0002-4285-8537
109769220000-0002-1545-8773hi
109873790000-0002-6302-4224.
\n", "

348 rows × 2 columns

\n", "
" ], "text/plain": [ " orcid biography\n", "25505 0000-0003-0505-2734 j\n", "138487 0000-0002-3417-7299 .....\n", "139595 0000-0003-3794-1288 m.d., ph.d.\n", "193340 0000-0001-9655-4806 肿瘤\n", "194990 0000-0002-9149-0142 be y\n", "... ... ...\n", "10927866 0000-0002-7341-5480 ph.d.\n", "10976080 0000-0003-4041-0840 /\n", "10976689 0000-0002-4285-8537 \n", "10976922 0000-0002-1545-8773 hi\n", "10987379 0000-0002-6302-4224 .\n", "\n", "[348 rows x 2 columns]" ] }, "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Biographies the spam filter could not score (sentinel -1)\n", "df.loc[df.spam_score == -1, ['orcid', 'biography']]" ] },
{ "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "# Turn the -1 sentinel into a proper missing value so it does not skew the statistics\n", "# (np.nan: the np.NaN alias was removed in NumPy 2.0)\n", "df['spam_score'] = df['spam_score'].replace(-1, np.nan)" ] }, { "cell_type": "code", "execution_count": 54, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "count 3.536670e+05\n", "mean 6.098044e-01\n", "std 4.476618e-01\n", "min 1.917500e-22\n", "25% 1.858235e-02\n", "50% 9.529688e-01\n", "75% 9.999992e-01\n", "max 1.000000e+00\n", "Name: spam_score, dtype: float64" ] }, "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.spam_score.describe()" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
biographyspam_score
29investigador de la universidad de oviedo. depa...1.000000
83formación académica en la temática de manejo d...1.000000
217doctor en educación, maestro en gerencia de la...1.000000
222possui graduação em psicologia pela pontifícia...1.000000
470roofing contractors in seattle waroofing contr...1.000000
.........
10989593jose ignacio peláez sánchez ha sido profesor e...0.999966
10989603mestranda em tecnologia na saúde e foi aluna o...1.000000
10989605the phd degree of pharmacy was received under ...1.000000
10989615mostafa metwaly is an assistant lecturer at th...1.000000
10989617jual obat aborsi di tangerang, obat penggugur ...0.999999
\n", "

120733 rows × 2 columns

\n", "
" ], "text/plain": [ " biography spam_score\n", "29 investigador de la universidad de oviedo. depa... 1.000000\n", "83 formación académica en la temática de manejo d... 1.000000\n", "217 doctor en educación, maestro en gerencia de la... 1.000000\n", "222 possui graduação em psicologia pela pontifícia... 1.000000\n", "470 roofing contractors in seattle waroofing contr... 1.000000\n", "... ... ...\n", "10989593 jose ignacio peláez sánchez ha sido profesor e... 0.999966\n", "10989603 mestranda em tecnologia na saúde e foi aluna o... 1.000000\n", "10989605 the phd degree of pharmacy was received under ... 1.000000\n", "10989615 mostafa metwaly is an assistant lecturer at th... 1.000000\n", "10989617 jual obat aborsi di tangerang, obat penggugur ... 0.999999\n", "\n", "[120733 rows x 2 columns]" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Biographies with a spam score above 0.9999\n", "df.loc[df.spam_score > 0.9999, ['biography', 'spam_score']]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## All VS All correlation" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "coloraxis": "coloraxis", "hovertemplate": "x: %{x}
y: %{y}
color: %{z}", "name": "0", "type": "heatmap", "x": [ "verified_email", "verified_primary_email", "n_works", "n_doi", "n_arxiv", "n_pmc", "n_other_pids", "label", "n_emails", "n_urls", "n_ids", "n_keywords", "n_education", "n_employment", "spam_score" ], "xaxis": "x", "y": [ "verified_email", "verified_primary_email", "n_works", "n_doi", "n_arxiv", "n_pmc", "n_other_pids", "label", "n_emails", "n_urls", "n_ids", "n_keywords", "n_education", "n_employment", "spam_score" ], "yaxis": "y", "z": [ [ 1, 0.9649829131836175, 0.07899833525811681, 0.07259719921935885, 0.0064613638682561435, 0.030614701011724112, 0.0606246420123506, 0.15318397733660719, 0.01707412367245974, 0.011595665588543176, 0.08757201810408902, 0.04328019132305945, 0.06273922418819365, 0.032388380558167604, -0.0014643347702066065 ], [ 0.9649829131836175, 1, 0.08183974046700901, 0.07518160639621203, 0.0066860590291805974, 0.031712353459948744, 0.06277678931008057, 0.1599569518292285, 0.012845249565805577, 0.01225294291707495, 0.08884972976983674, 0.042965738098621045, 0.06267129593308499, 0.032109851763758655, -0.0017281238203322692 ], [ 0.07899833525811681, 0.08183974046700901, 1, 0.9378726254398347, 0.3126299250047347, 0.35108563893979355, 0.8353346326813307, 0.22974076078506034, 0.0467324699274979, 0.05677052771427962, 0.24206378753330832, 0.10059683097006519, 0.07842571839441875, 0.13044055927104833, 0.03309964849512055 ], [ 0.07259719921935885, 0.07518160639621203, 0.9378726254398347, 1, 0.35605399617723354, 0.3624050122938972, 0.8018196175347003, 0.2133388352039094, 0.04323705596335824, 0.03671920325536051, 0.22747917315452765, 0.08818211719282171, 0.059736292814786394, 0.10808879924173968, 0.023598121224384254 ], [ 0.0064613638682561435, 0.0066860590291805974, 0.3126299250047347, 0.35605399617723354, 1, 0.0009072282179230706, 0.2420914875526222, 0.019397970952505558, -0.0012892654216752134, -0.0012450878454092582, 0.005506976661225234, 0.005323525498163837, 0.002855334129447379, 
0.012045839456523506, -0.004097867638595096 ], [ 0.030614701011724112, 0.031712353459948744, 0.35108563893979355, 0.3624050122938972, 0.0009072282179230706, 1, 0.2570742999530523, 0.08736856703203819, 0.00692153906662496, 0.00937502065483787, 0.06902426238378008, 0.04403133874020783, 0.04420662755474919, 0.06519395895970276, 0.0440829672362604 ], [ 0.0606246420123506, 0.06277678931008057, 0.8353346326813307, 0.8018196175347003, 0.2420914875526222, 0.2570742999530523, 1, 0.17528852589876096, 0.031879693664395886, 0.026541715168505287, 0.23609661160546389, 0.07896831483400772, 0.05210451688345082, 0.0918979380914551, 0.02758050967439794 ], [ 0.15318397733660719, 0.1599569518292285, 0.22974076078506034, 0.2133388352039094, 0.019397970952505558, 0.08736856703203819, 0.17528852589876096, 1, 0.039331704893463555, 0.05281328942776445, 0.18491493109941123, 0.14988174988204958, 0.12335412758038665, 0.12963923566628435, 0.05150567230962376 ], [ 0.01707412367245974, 0.012845249565805577, 0.0467324699274979, 0.04323705596335824, -0.0012892654216752134, 0.00692153906662496, 0.031879693664395886, 0.039331704893463555, 1, 0.10908188290463508, 0.046252487740645075, 0.055370893918463594, 0.042441168551650255, 0.06949329940594548, -0.004708719768184376 ], [ 0.011595665588543176, 0.01225294291707495, 0.05677052771427962, 0.03671920325536051, -0.0012450878454092582, 0.00937502065483787, 0.026541715168505287, 0.05281328942776445, 0.10908188290463508, 1, 0.06981395653573894, 0.14857395261406392, 0.09579443160455722, 0.10053018560706249, 0.06160583974971729 ], [ 0.08757201810408902, 0.08884972976983674, 0.24206378753330832, 0.22747917315452765, 0.005506976661225234, 0.06902426238378008, 0.23609661160546389, 0.18491493109941123, 0.046252487740645075, 0.06981395653573894, 1, 0.08226882925030908, 0.06193758461332223, 0.10418580365255088, 0.03521147234785954 ], [ 0.04328019132305945, 0.042965738098621045, 0.10059683097006519, 0.08818211719282171, 0.005323525498163837, 0.04403133874020783, 
0.07896831483400772, 0.14988174988204958, 0.055370893918463594, 0.14857395261406392, 0.08226882925030908, 1, 0.13365374046726683, 0.15472097182119743, 0.041184832685515815 ], [ 0.06273922418819365, 0.06267129593308499, 0.07842571839441875, 0.059736292814786394, 0.002855334129447379, 0.04420662755474919, 0.05210451688345082, 0.12335412758038665, 0.042441168551650255, 0.09579443160455722, 0.06193758461332223, 0.13365374046726683, 1, 0.35408552736376164, 0.07225588511213044 ], [ 0.032388380558167604, 0.032109851763758655, 0.13044055927104833, 0.10808879924173968, 0.012045839456523506, 0.06519395895970276, 0.0918979380914551, 0.12963923566628435, 0.06949329940594548, 0.10053018560706249, 0.10418580365255088, 0.15472097182119743, 0.35408552736376164, 1, 0.03643762096668171 ], [ -0.0014643347702066065, -0.0017281238203322692, 0.03309964849512055, 0.023598121224384254, -0.004097867638595096, 0.0440829672362604, 0.02758050967439794, 0.05150567230962376, -0.004708719768184376, 0.06160583974971729, 0.03521147234785954, 0.041184832685515815, 0.07225588511213044, 0.03643762096668171, 1 ] ] } ], "layout": { "coloraxis": { "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "margin": { "t": 60 }, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": 
"white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 
0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 
0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": 
"white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "xaxis": { "anchor": "y", "constrain": "domain", "domain": [ 0, 1 ], "scaleanchor": "y" }, "yaxis": { "anchor": "x", "autorange": "reversed", "constrain": "domain", "domain": [ 0, 1 ] } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "fig = px.imshow(df.corr())\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [], "source": [ "df[['verified_email', \n", " 'verified_primary_email', \n", " 'n_works', \n", " 'n_doi',\n", " 'n_arxiv', \n", " 'n_pmc', \n", " 'n_other_pids', \n", " 'n_emails', \n", " 'n_urls', \n", " 'n_ids', \n", " 'n_keywords', \n", " 'n_employment', \n", " 'n_education', \n", " 'label']].to_pickle('../data/processed/features.pkl')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Label speculation" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentspam_score
170000-0002-0137-306611NaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN2017-07-25t04:34:17.338z2019-11-27t17:54:45.418z00001NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
190000-0002-0461-971111NaNNaNNaNNaNNaNNaNNaNNaNNaN2[crossref]2015-08-18t12:42:01.797z2019-12-06t11:37:38.203z20001NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
220000-0002-0761-945011NaNNaNNaNNaNNaNNaNNaNNaNNaN1[crossref]2020-05-13t17:15:28.405z2020-08-11t21:00:45.694z10001NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
330000-0002-4447-921511NaNNaNNaNNaNNaNNaNNaNNaNNaN0NaN2017-07-24t09:37:50.242z2019-11-15t08:31:24.820z00001NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
440000-0003-0426-406511NaNNaNNaN[eliza i. gilbert]NaNNaNNaNNaN[[, us fish and wildlife service, albuquerque,...0NaN2017-08-07t18:32:31.802z2020-04-08t16:48:55.732z00001NaNNaNNaNNaNNaNNaNNaNNaN1.0NaN
................................................................................................
109896360000-0002-2906-029911tiffanymackayNaN[tiffany russel sia]NaN[prostate cancer, oxytocin, radiolabelling, ga...[[researcherid, a-2121-2017]][[faculty of medicine, master in pharmaceutica...[[clinical project lead, minomic international...11[crossref, researcherid, tiffany mackay]2017-01-03t23:28:48.736z2020-12-09t17:12:20.326z110001NaNNaN[oxytocin.com.au, linkedin.com]NaN2.01.013.02.04.0NaN
109896370000-0001-5896-202411giovanni, ltisciaNaNNaNNaNNaN[[scopus author id, 54948242800]]NaNNaN70[scopus - elsevier, tiscia giovanni, l, europe...2016-07-27t10:09:13.585z2020-12-07t22:23:05.706z65017521NaNNaNNaNNaNNaN1.0NaNNaNNaNNaN
109896400000-0002-1070-222011viniciossantannaNaN[vinicios sant anna, vinicios sant anna, vinic...NaNNaN[[scopus author id, 57201697952]][[economics, ph.d., university of illinois at ...NaN1[crossref metadata search]2016-03-19t21:24:42.821z2020-12-10t16:34:09.722z10011NaNNaN[vpsantanna.com]NaN1.01.0NaN2.0NaNNaN
109896430000-0003-2606-093611luangxuNaN[xu lu-ang, lu lu]NaNNaNNaNNaN[[post-doc, institute of biochemistry and cell...2[scopus - elsevier, crossref]2015-10-24t03:53:23.544z2020-11-19t09:23:48.896z20011NaNNaNNaNNaNNaNNaNNaNNaN1.0NaN
109896450000-0002-3800-633111zacharycalamariNaNNaNNaNNaNNaN[[richard gilder graduate school, phd in compa...[[assistant professor, baruch college, city un...7[crossref metadata search, zachary t. calamari...2015-01-20t20:20:17.042z2020-11-21t19:48:36.221z70101NaNNaNNaNNaNNaNNaNNaN2.02.0NaN
\n", "

2664886 rows × 31 columns

\n", "
" ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", "17 0000-0002-0137-3066 1 1 \n", "19 0000-0002-0461-9711 1 1 \n", "22 0000-0002-0761-9450 1 1 \n", "33 0000-0002-4447-9215 1 1 \n", "44 0000-0003-0426-4065 1 1 \n", "... ... ... ... \n", "10989636 0000-0002-2906-0299 1 1 \n", "10989637 0000-0001-5896-2024 1 1 \n", "10989640 0000-0002-1070-2220 1 1 \n", "10989643 0000-0003-2606-0936 1 1 \n", "10989645 0000-0002-3800-6331 1 1 \n", "\n", " given_names family_name biography \\\n", "17 NaN NaN NaN \n", "19 NaN NaN NaN \n", "22 NaN NaN NaN \n", "33 NaN NaN NaN \n", "44 NaN NaN NaN \n", "... ... ... ... \n", "10989636 tiffany mackay NaN \n", "10989637 giovanni, l tiscia NaN \n", "10989640 vinicios santanna NaN \n", "10989643 luang xu NaN \n", "10989645 zachary calamari NaN \n", "\n", " other_names primary_email \\\n", "17 NaN NaN \n", "19 NaN NaN \n", "22 NaN NaN \n", "33 NaN NaN \n", "44 [eliza i. gilbert] NaN \n", "... ... ... \n", "10989636 [tiffany russel sia] NaN \n", "10989637 NaN NaN \n", "10989640 [vinicios sant anna, vinicios sant anna, vinic... NaN \n", "10989643 [xu lu-ang, lu lu] NaN \n", "10989645 NaN NaN \n", "\n", " keywords \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989636 [prostate cancer, oxytocin, radiolabelling, ga... \n", "10989637 NaN \n", "10989640 NaN \n", "10989643 NaN \n", "10989645 NaN \n", "\n", " external_ids \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989636 [[researcherid, a-2121-2017]] \n", "10989637 [[scopus author id, 54948242800]] \n", "10989640 [[scopus author id, 57201697952]] \n", "10989643 NaN \n", "10989645 NaN \n", "\n", " education \\\n", "17 NaN \n", "19 NaN \n", "22 NaN \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989636 [[faculty of medicine, master in pharmaceutica... \n", "10989637 NaN \n", "10989640 [[economics, ph.d., university of illinois at ... 
\n", "10989643 NaN \n", "10989645 [[richard gilder graduate school, phd in compa... \n", "\n", " employment n_works \\\n", "17 NaN 0 \n", "19 NaN 2 \n", "22 NaN 1 \n", "33 NaN 0 \n", "44 [[, us fish and wildlife service, albuquerque,... 0 \n", "... ... ... \n", "10989636 [[clinical project lead, minomic international... 11 \n", "10989637 NaN 70 \n", "10989640 NaN 1 \n", "10989643 [[post-doc, institute of biochemistry and cell... 2 \n", "10989645 [[assistant professor, baruch college, city un... 7 \n", "\n", " works_source \\\n", "17 NaN \n", "19 [crossref] \n", "22 [crossref] \n", "33 NaN \n", "44 NaN \n", "... ... \n", "10989636 [crossref, researcherid, tiffany mackay] \n", "10989637 [scopus - elsevier, tiscia giovanni, l, europe... \n", "10989640 [crossref metadata search] \n", "10989643 [scopus - elsevier, crossref] \n", "10989645 [crossref metadata search, zachary t. calamari... \n", "\n", " activation_date last_update_date n_doi n_arxiv \\\n", "17 2017-07-25t04:34:17.338z 2019-11-27t17:54:45.418z 0 0 \n", "19 2015-08-18t12:42:01.797z 2019-12-06t11:37:38.203z 2 0 \n", "22 2020-05-13t17:15:28.405z 2020-08-11t21:00:45.694z 1 0 \n", "33 2017-07-24t09:37:50.242z 2019-11-15t08:31:24.820z 0 0 \n", "44 2017-08-07t18:32:31.802z 2020-04-08t16:48:55.732z 0 0 \n", "... ... ... ... ... \n", "10989636 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n", "10989637 2016-07-27t10:09:13.585z 2020-12-07t22:23:05.706z 65 0 \n", "10989640 2016-03-19t21:24:42.821z 2020-12-10t16:34:09.722z 1 0 \n", "10989643 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n", "10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", "17 0 0 1 NaN NaN \n", "19 0 0 1 NaN NaN \n", "22 0 0 1 NaN NaN \n", "33 0 0 1 NaN NaN \n", "44 0 0 1 NaN NaN \n", "... ... ... ... ... ... 
\n", "10989636 0 0 1 NaN NaN \n", "10989637 17 52 1 NaN NaN \n", "10989640 0 1 1 NaN NaN \n", "10989643 0 1 1 NaN NaN \n", "10989645 1 0 1 NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids \\\n", "17 NaN NaN NaN NaN \n", "19 NaN NaN NaN NaN \n", "22 NaN NaN NaN NaN \n", "33 NaN NaN NaN NaN \n", "44 NaN NaN NaN NaN \n", "... ... ... ... ... \n", "10989636 [oxytocin.com.au, linkedin.com] NaN 2.0 1.0 \n", "10989637 NaN NaN NaN 1.0 \n", "10989640 [vpsantanna.com] NaN 1.0 1.0 \n", "10989643 NaN NaN NaN NaN \n", "10989645 NaN NaN NaN NaN \n", "\n", " n_keywords n_education n_employment spam_score \n", "17 NaN NaN NaN NaN \n", "19 NaN NaN NaN NaN \n", "22 NaN NaN NaN NaN \n", "33 NaN NaN NaN NaN \n", "44 NaN NaN 1.0 NaN \n", "... ... ... ... ... \n", "10989636 13.0 2.0 4.0 NaN \n", "10989637 NaN NaN NaN NaN \n", "10989640 NaN 2.0 NaN NaN \n", "10989643 NaN NaN 1.0 NaN \n", "10989645 NaN 2.0 2.0 NaN \n", "\n", "[2664886 rows x 31 columns]" ] }, "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[df.label == 1]" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [], "source": [ "# (df.n_works > 0) & (df.n_ids > 1)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }