{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Exploratory analysis"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"TODO:\n",
"- URLs can be found elsewhere (e.g., biographies, names, etc.)\n",
"- Check line feed handling when creating the dump\n",
"- Always keep an eye on the temporal dimension\n",
"- Why are fake ORCID records being created? [Link farming/SEO hacking, anything else?]\n",
"- Can we access private info thanks to the OpenAIRE ORCID membership? No.\n",
"- Check special cases of worksource as in https://orcid.org/0000-0002-4469-621X where \"author name VIA ResearcherID\"\n"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package punkt to /Users/andrea/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
]
},
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import glob\n",
"import ast\n",
"import re\n",
"\n",
"from datetime import datetime\n",
"import pytz\n",
"\n",
"import tldextract\n",
"\n",
"import ssl # needed because nltk.download down here fires an error\n",
"try:\n",
" _create_unverified_https_context = ssl._create_unverified_context\n",
"except AttributeError:\n",
" pass\n",
"else:\n",
" ssl._create_default_https_context = _create_unverified_https_context\n",
"\n",
"import nltk\n",
"from nltk.tokenize import sent_tokenize\n",
"from nltk.tokenize import word_tokenize\n",
"nltk.download('punkt')\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# import antispam\n",
"# import profanity_check\n",
"\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"import seaborn as sns\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"init_notebook_mode(connected=True)\n",
"TOP_N = 0\n",
"TOP_RANGE = [0, 0]\n",
"\n",
"def set_top_n(n):\n",
" global TOP_N, TOP_RANGE\n",
" TOP_N = n\n",
" TOP_RANGE = [-.5, n - 1 + .5]\n",
" \n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notable solid ORCID iDs for exploratory purposes:"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"AM = '0000-0002-5193-7851'\n",
"PP = '0000-0002-8588-4196'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notable anomalies:"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"JOURNAL = '0000-0003-1815-5732'\n",
"NOINFO = '0000-0001-5009-2052'\n",
"VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n",
"WORK_MISUSE = '0000-0001-7870-1120'\n",
"# todo: find group-shared ORCiD, if possible"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notable fake ORCID iDs:"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"FAKE_HEAP = {\n",
" 'scaffold': '0000-0001-5004-7761',\n",
" 'whatsapp': '0000-0001-6997-9470',\n",
" 'penis': '0000-0002-3399-7287',\n",
" 'bitcoin': '0000-0002-7518-6845',\n",
" 'fitness': '0000-0002-1234-835X', # URL record + employment\n",
" 'cannabis': '0000-0002-9025-8632', # URL > 70 + works (now REMOVED)\n",
" 'plumber': '0000-0002-1700-8311', # URL > 10 + works\n",
" 'furniture': '0000-0001-7478-4539',\n",
" 'cleaners': '0000-0002-7392-3792',\n",
" 'toxiburn': '0000-0001-7505-2081', # URLs in bio\n",
" 'ultraburst': '0000-0002-7037-3393', # URLs in bio\n",
" 'testoryze': '0000-0002-6361-8129', # URLs in bio\n",
" 'rlmax': '0000-0002-0393-7865',\n",
" 'eretrol': '0000-0002-6226-8905',\n",
" 'memomax': '0000-0002-2231-4233',\n",
" 'keto': '0000-0002-5521-9494',\n",
" 'baukredit': '0000-0002-5402-9920',\n",
" 'barber': '0000-0002-6766-8254'\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Load the dataset"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0000-0001-6097-3953 | \n",
" False | \n",
" False | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2018-03-02 09:29:16.528000+00:00 | \n",
" 2018-03-02 09:43:07.551000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
" 1 | \n",
" 0000-0001-6112-5550 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" [v.i. yurtaev; v. yurtaev] | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[professor, peoples friendship university of ... | \n",
" 0 | \n",
" NaN | \n",
" 2018-04-03 07:50:23.358000+00:00 | \n",
" 2020-03-18 09:42:44.753000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
"
\n",
" \n",
" 2 | \n",
" 0000-0001-6152-2695 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2019-12-11 15:31:56.388000+00:00 | \n",
" 2020-01-28 15:34:17.309000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
" 3 | \n",
" 0000-0001-7071-8294 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[researcher (academic), universidad de zarago... | \n",
" 0 | \n",
" NaN | \n",
" 2014-03-10 13:22:01.966000+00:00 | \n",
" 2016-06-14 22:17:54.470000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
"
\n",
" \n",
" 4 | \n",
" 0000-0001-7247-6831 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2019-07-19 15:57:46.116000+00:00 | \n",
" 2019-07-19 16:04:33.839000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email given_names \\\n",
"0 0000-0001-6097-3953 False False \n",
"1 0000-0001-6112-5550 True True \n",
"2 0000-0001-6152-2695 True True \n",
"3 0000-0001-7071-8294 True True \n",
"4 0000-0001-7247-6831 True True \n",
"\n",
" family_name biography other_names urls primary_email \\\n",
"0 NaN NaN \n",
"1 [v.i. yurtaev; v. yurtaev] NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" keywords external_ids education \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" employment n_works works_source \\\n",
"0 NaN 0 NaN \n",
"1 [[professor, peoples friendship university of ... 0 NaN \n",
"2 NaN 0 NaN \n",
"3 [[researcher (academic), universidad de zarago... 0 NaN \n",
"4 NaN 0 NaN \n",
"\n",
" activation_date last_update_date n_doi \\\n",
"0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
"1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
"2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
"3 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
"4 2019-07-19 15:57:46.116000+00:00 2019-07-19 16:04:33.839000+00:00 0 \n",
"\n",
" n_arxiv n_pmc n_other_pids other_urls label primary_email_domain \\\n",
"0 0 0 0 NaN False NaN \n",
"1 0 0 0 NaN False NaN \n",
"2 0 0 0 NaN False NaN \n",
"3 0 0 0 NaN False NaN \n",
"4 0 0 0 NaN False NaN \n",
"\n",
" other_email_domains url_domains other_url_domains n_emails n_urls n_ids \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" n_keywords n_education n_employment \n",
"0 \n",
"1 1 \n",
"2 \n",
"3 2 \n",
"4 "
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"parts = glob.glob('../data/processed/dataset.pkl.*')\n",
"\n",
"df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))\n",
"df.head(5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Notable records inspection"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 6462863 | \n",
" 0000-0002-5193-7851 | \n",
" True | \n",
" True | \n",
" andrea | \n",
" mannocci | \n",
" data scientist & researcher; scholarly knowled... | \n",
" NaN | \n",
" [[personal website, https://andremann.github.i... | \n",
" andrea.mannocci@isti.cnr.it | \n",
" [science of science, scholarly knowledge minin... | \n",
" [[scopus author id, 55233589900]] | \n",
" [[information engineering, ph.d., università d... | \n",
" [[research associate, istituto di scienza e te... | \n",
" 37 | \n",
" [scopus - elsevier, crossref metadata search, ... | \n",
" 2017-09-12 14:28:33.467000+00:00 | \n",
" 2021-04-24 04:37:09.879000+00:00 | \n",
" 34 | \n",
" 0 | \n",
" 0 | \n",
" 60 | \n",
" NaN | \n",
" True | \n",
" isti.cnr.it | \n",
" NaN | \n",
" {twitter.com, google.com, github.io, linkedin.... | \n",
" NaN | \n",
" <NA> | \n",
" 4 | \n",
" 1 | \n",
" 5 | \n",
" 4 | \n",
" 5 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"6462863 0000-0002-5193-7851 True True \n",
"\n",
" given_names family_name \\\n",
"6462863 andrea mannocci \n",
"\n",
" biography other_names \\\n",
"6462863 data scientist & researcher; scholarly knowled... NaN \n",
"\n",
" urls \\\n",
"6462863 [[personal website, https://andremann.github.i... \n",
"\n",
" primary_email \\\n",
"6462863 andrea.mannocci@isti.cnr.it \n",
"\n",
" keywords \\\n",
"6462863 [science of science, scholarly knowledge minin... \n",
"\n",
" external_ids \\\n",
"6462863 [[scopus author id, 55233589900]] \n",
"\n",
" education \\\n",
"6462863 [[information engineering, ph.d., università d... \n",
"\n",
" employment n_works \\\n",
"6462863 [[research associate, istituto di scienza e te... 37 \n",
"\n",
" works_source \\\n",
"6462863 [scopus - elsevier, crossref metadata search, ... \n",
"\n",
" activation_date last_update_date \\\n",
"6462863 2017-09-12 14:28:33.467000+00:00 2021-04-24 04:37:09.879000+00:00 \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids other_urls label \\\n",
"6462863 34 0 0 60 NaN True \n",
"\n",
" primary_email_domain other_email_domains \\\n",
"6462863 isti.cnr.it NaN \n",
"\n",
" url_domains other_url_domains \\\n",
"6462863 {twitter.com, google.com, github.io, linkedin.... NaN \n",
"\n",
" n_emails n_urls n_ids n_keywords n_education n_employment \n",
"6462863 4 1 5 4 5 "
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == AM]"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 10212327 | \n",
" 0000-0001-6997-9470 | \n",
" True | \n",
" True | \n",
" other | \n",
" whatsapp | \n",
" <NA> | \n",
" NaN | \n",
" [[otherwhatsapp, https://otherwhatsapp.com/], ... | \n",
" <NA> | \n",
" [whatsapp gb, whatsapp gb apk, whatsapp gb 202... | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2020-10-07 10:37:12.237000+00:00 | \n",
" 2020-10-08 02:32:03.935000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {facebook.com, last.fm, vk.com, tumblr.com, yo... | \n",
" NaN | \n",
" <NA> | \n",
" 27 | \n",
" <NA> | \n",
" 4 | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"10212327 0000-0001-6997-9470 True True \n",
"\n",
" given_names family_name biography other_names \\\n",
"10212327 other whatsapp NaN \n",
"\n",
" urls primary_email \\\n",
"10212327 [[otherwhatsapp, https://otherwhatsapp.com/], ... \n",
"\n",
" keywords external_ids \\\n",
"10212327 [whatsapp gb, whatsapp gb apk, whatsapp gb 202... NaN \n",
"\n",
" education employment n_works works_source \\\n",
"10212327 NaN NaN 0 NaN \n",
"\n",
" activation_date last_update_date \\\n",
"10212327 2020-10-07 10:37:12.237000+00:00 2020-10-08 02:32:03.935000+00:00 \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids other_urls label \\\n",
"10212327 0 0 0 0 NaN False \n",
"\n",
" primary_email_domain other_email_domains \\\n",
"10212327 NaN NaN \n",
"\n",
" url_domains other_url_domains \\\n",
"10212327 {facebook.com, last.fm, vk.com, tumblr.com, yo... NaN \n",
"\n",
" n_emails n_urls n_ids n_keywords n_education n_employment \n",
"10212327 27 4 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['orcid'] == FAKE_HEAP['whatsapp']]"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/plain": [
"orcid 11349607\n",
"verified_email 11349607\n",
"verified_primary_email 11349607\n",
"given_names 11318228\n",
"family_name 11017320\n",
"biography 376153\n",
"other_names 568998\n",
"urls 746443\n",
"primary_email 129037\n",
"keywords 664556\n",
"external_ids 1335691\n",
"education 2493817\n",
"employment 2750231\n",
"n_works 11349607\n",
"works_source 2830772\n",
"activation_date 11349607\n",
"last_update_date 11349607\n",
"n_doi 11349607\n",
"n_arxiv 11349607\n",
"n_pmc 11349607\n",
"n_other_pids 11349607\n",
"other_urls 15842\n",
"label 11349607\n",
"primary_email_domain 129037\n",
"other_email_domains 50223\n",
"url_domains 746443\n",
"other_url_domains 15842\n",
"n_emails 50223\n",
"n_urls 746443\n",
"n_ids 1335691\n",
"n_keywords 664556\n",
"n_education 2493817\n",
"n_employment 2750231\n",
"dtype: int64"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.count()"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 11349607\n",
"unique 11349607\n",
"top 0000-0003-4828-3321\n",
"freq 1\n",
"Name: orcid, dtype: object"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['orcid'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Primary email"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 129037\n",
"unique 129032\n",
"top opercin@erbakan.edu.tr\n",
"freq 2\n",
"Name: primary_email, dtype: object"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['primary_email'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Duplicate primary emails"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"1734114 opercin@erbakan.edu.tr\n",
"7790373 maykin@owasp.org\n",
"10265986 garcialopezart@gmail.com\n",
"10706978 andycheng2026@163.com\n",
"11344431 patrick.davey@monash.edu\n",
"Name: primary_email, dtype: string"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['primary_email'].dropna().loc[df['primary_email'].duplicated()]"
]
},
{
"cell_type": "code",
"execution_count": 36,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 4466844 | \n",
" 0000-0001-9855-1676 | \n",
" True | \n",
" True | \n",
" maykin | \n",
" warasart | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" maykin@owasp.org | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2020-10-23 17:51:51.925000+00:00 | \n",
" 2021-03-26 08:27:30.509000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" owasp.org | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
" 7790373 | \n",
" 0000-0002-0836-2271 | \n",
" True | \n",
" True | \n",
" maykin | \n",
" warasart | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" maykin@owasp.org | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2020-09-15 04:43:55.709000+00:00 | \n",
" 2020-09-15 05:17:28.509000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" owasp.org | \n",
" [dga.or.th] | \n",
" NaN | \n",
" NaN | \n",
" 1 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"4466844 0000-0001-9855-1676 True True \n",
"7790373 0000-0002-0836-2271 True True \n",
"\n",
" given_names family_name biography other_names urls primary_email \\\n",
"4466844 maykin warasart NaN NaN maykin@owasp.org \n",
"7790373 maykin warasart NaN NaN maykin@owasp.org \n",
"\n",
" keywords external_ids education employment n_works works_source \\\n",
"4466844 NaN NaN NaN NaN 0 NaN \n",
"7790373 NaN NaN NaN NaN 0 NaN \n",
"\n",
" activation_date last_update_date \\\n",
"4466844 2020-10-23 17:51:51.925000+00:00 2021-03-26 08:27:30.509000+00:00 \n",
"7790373 2020-09-15 04:43:55.709000+00:00 2020-09-15 05:17:28.509000+00:00 \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids other_urls label \\\n",
"4466844 0 0 0 0 NaN False \n",
"7790373 0 0 0 0 NaN False \n",
"\n",
" primary_email_domain other_email_domains url_domains \\\n",
"4466844 owasp.org NaN NaN \n",
"7790373 owasp.org [dga.or.th] NaN \n",
"\n",
" other_url_domains n_emails n_urls n_ids n_keywords n_education \\\n",
"4466844 NaN \n",
"7790373 NaN 1 \n",
"\n",
" n_employment \n",
"4466844 \n",
"7790373 "
]
},
"execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['primary_email'] == 'maykin@owasp.org']"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 358094 | \n",
" 0000-0002-2232-9638 | \n",
" True | \n",
" True | \n",
" osman | \n",
" perçin | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" opercin@erbakan.edu.tr | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2015-01-12 13:47:55.549000+00:00 | \n",
" 2020-01-27 07:38:24.269000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" erbakan.edu.tr | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
" 1734114 | \n",
" 0000-0003-0033-0918 | \n",
" True | \n",
" True | \n",
" osman | \n",
" perçin | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" opercin@erbakan.edu.tr | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[, necmettin erbakan university, konya, , tr,... | \n",
" 0 | \n",
" NaN | \n",
" 2015-10-13 05:47:12.014000+00:00 | \n",
" 2020-12-25 13:52:03.976000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" erbakan.edu.tr | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"358094 0000-0002-2232-9638 True True \n",
"1734114 0000-0003-0033-0918 True True \n",
"\n",
" given_names family_name biography other_names urls \\\n",
"358094 osman perçin NaN NaN \n",
"1734114 osman perçin NaN NaN \n",
"\n",
" primary_email keywords external_ids education \\\n",
"358094 opercin@erbakan.edu.tr NaN NaN NaN \n",
"1734114 opercin@erbakan.edu.tr NaN NaN NaN \n",
"\n",
" employment n_works \\\n",
"358094 NaN 0 \n",
"1734114 [[, necmettin erbakan university, konya, , tr,... 0 \n",
"\n",
" works_source activation_date \\\n",
"358094 NaN 2015-01-12 13:47:55.549000+00:00 \n",
"1734114 NaN 2015-10-13 05:47:12.014000+00:00 \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
"358094 2020-01-27 07:38:24.269000+00:00 0 0 0 0 \n",
"1734114 2020-12-25 13:52:03.976000+00:00 0 0 0 0 \n",
"\n",
" other_urls label primary_email_domain other_email_domains \\\n",
"358094 NaN False erbakan.edu.tr NaN \n",
"1734114 NaN False erbakan.edu.tr NaN \n",
"\n",
" url_domains other_url_domains n_emails n_urls n_ids n_keywords \\\n",
"358094 NaN NaN \n",
"1734114 NaN NaN \n",
"\n",
" n_education n_employment \n",
"358094 \n",
"1734114 1 "
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['primary_email'] == 'opercin@erbakan.edu.tr']"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 983898 | \n",
" 0000-0002-9158-1757 | \n",
" True | \n",
" True | \n",
" patrick | \n",
" davey | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" patrick.davey@monash.edu | \n",
" [inorganic chemistry, radiochemistry, radiopha... | \n",
" NaN | \n",
" NaN | \n",
" [[phd student, monash university, melbourne, ,... | \n",
" 0 | \n",
" NaN | \n",
" 2019-05-09 23:01:02.170000+00:00 | \n",
" 2019-08-20 03:00:17.844000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" monash.edu | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 4 | \n",
" <NA> | \n",
" 1 | \n",
"
\n",
" \n",
" 11344431 | \n",
" 0000-0002-8774-0030 | \n",
" True | \n",
" True | \n",
" patrick | \n",
" davey | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" patrick.davey@monash.edu | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[phd student, monash university, melbourne, v... | \n",
" 1 | \n",
" [crossref] | \n",
" 2018-09-11 10:47:10.997000+00:00 | \n",
" 2021-04-30 05:47:48.213000+00:00 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" True | \n",
" monash.edu | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"983898 0000-0002-9158-1757 True True \n",
"11344431 0000-0002-8774-0030 True True \n",
"\n",
" given_names family_name biography other_names urls \\\n",
"983898 patrick davey NaN NaN \n",
"11344431 patrick davey NaN NaN \n",
"\n",
" primary_email \\\n",
"983898 patrick.davey@monash.edu \n",
"11344431 patrick.davey@monash.edu \n",
"\n",
" keywords external_ids \\\n",
"983898 [inorganic chemistry, radiochemistry, radiopha... NaN \n",
"11344431 NaN NaN \n",
"\n",
" education employment \\\n",
"983898 NaN [[phd student, monash university, melbourne, ,... \n",
"11344431 NaN [[phd student, monash university, melbourne, v... \n",
"\n",
" n_works works_source activation_date \\\n",
"983898 0 NaN 2019-05-09 23:01:02.170000+00:00 \n",
"11344431 1 [crossref] 2018-09-11 10:47:10.997000+00:00 \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc \\\n",
"983898 2019-08-20 03:00:17.844000+00:00 0 0 0 \n",
"11344431 2021-04-30 05:47:48.213000+00:00 1 0 0 \n",
"\n",
" n_other_pids other_urls label primary_email_domain \\\n",
"983898 0 NaN False monash.edu \n",
"11344431 0 NaN True monash.edu \n",
"\n",
" other_email_domains url_domains other_url_domains n_emails n_urls \\\n",
"983898 NaN NaN NaN \n",
"11344431 NaN NaN NaN \n",
"\n",
" n_ids n_keywords n_education n_employment \n",
"983898 4 1 \n",
"11344431 1 "
]
},
"execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df['primary_email'] == 'patrick.davey@monash.edu']"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 129037\n",
"unique 17499\n",
"top gmail.com\n",
"freq 27831\n",
"Name: primary_email_domain, dtype: object"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df['primary_email_domain'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"top_primary_emails = df[['primary_email_domain', 'orcid']]\\\n",
" .groupby('primary_email_domain')\\\n",
" .count()\\\n",
" .sort_values('orcid', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"gmail.com",
"hotmail.com",
"yahoo.com",
"163.com",
"yuhs.ac",
"qq.com",
"outlook.com",
"126.com",
"bu.edu",
"mail.ru",
"usgs.gov",
"usp.br",
"yahoo.com.br",
"ua.pt",
"umich.edu",
"ust.hk",
"foxmail.com",
"uomustansiriyah.edu.iq",
"yandex.ru",
"uq.edu.au",
"ukr.net",
"unesp.br",
"ucl.ac.uk",
"ieee.org",
"naver.com",
"stcatz.ox.ac.uk",
"st-annes.ox.ac.uk",
"ucm.es",
"yahoo.fr",
"live.com"
],
"y": [
27831,
3919,
2700,
2211,
1130,
1101,
1005,
798,
635,
605,
592,
476,
473,
303,
293,
277,
268,
253,
252,
240,
238,
225,
217,
208,
201,
184,
183,
180,
180,
169
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top-30 email domains"
},
"xaxis": {
"range": [
-0.5,
29.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(30)\n",
"\n",
"# Take the first TOP_N rows of top_primary_emails (built in an earlier cell):\n",
"# the index supplies the x labels and the 'orcid' column the bar heights.\n",
"top_rows = top_primary_emails.iloc[:TOP_N]\n",
"data = [go.Bar(x=top_rows.index, y=top_rows['orcid'])]\n",
"\n",
"layout = go.Layout(\n",
"    title='Top-%s email domains' % TOP_N,\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}, 'range': TOP_RANGE}\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This makes sense: legitimate users may register a Gmail account as their primary address for login purposes and list their institutional addresses as secondary ones. It also makes life easier when they move to a new institution."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Other emails"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 50223\n",
"unique 16012\n",
"top [gmail.com]\n",
"freq 9331\n",
"Name: other_email_domains, dtype: object"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Each entry holds a collection of domains, so 'top' is the most common combination.\n",
"df.other_email_domains.describe()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the two columns needed for the chart and rank records by n_emails, largest first.\n",
"emails_by_orcid = df.loc[:, ['orcid', 'n_emails']].sort_values(by='n_emails', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"0000-0003-4171-3835",
"0000-0001-6239-2968",
"0000-0003-2151-4089",
"0000-0003-2290-2817",
"0000-0003-4147-212X",
"0000-0001-6349-1044",
"0000-0002-2085-1908",
"0000-0001-9084-3156",
"0000-0003-2657-8225",
"0000-0002-0776-9547",
"0000-0001-5548-8259",
"0000-0002-3165-132X",
"0000-0003-0391-3430",
"0000-0003-1502-3910",
"0000-0002-9599-6909",
"0000-0003-0671-1543",
"0000-0001-9311-0687",
"0000-0002-5341-6531",
"0000-0003-2272-7254",
"0000-0001-8420-9204",
"0000-0003-4499-7300",
"0000-0002-8565-194X",
"0000-0003-1196-9987",
"0000-0002-1615-8633",
"0000-0002-2567-3741",
"0000-0002-8390-8238",
"0000-0003-4685-5621",
"0000-0003-4327-6827",
"0000-0002-1929-6054",
"0000-0002-9821-8424"
],
"y": [
12,
9,
7,
7,
6,
6,
6,
6,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5,
5
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 30 ORCID iDs by email"
},
"xaxis": {
"range": [
-0.5,
29.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(30)\n",
"\n",
"# emails_by_orcid is already sorted by n_emails (descending), so its first\n",
"# TOP_N rows are the ORCID iDs with the largest n_emails values.\n",
"top_rows = emails_by_orcid.iloc[:TOP_N]\n",
"data = [go.Bar(x=top_rows['orcid'], y=top_rows['n_emails'])]\n",
"\n",
"layout = go.Layout(\n",
"    title='Top %s ORCID iDs by email' % TOP_N,\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}, 'range': TOP_RANGE}\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"# Explode the per-record domain collections into one row per (orcid, domain) pair,\n",
"# count the non-null orcid values for each domain and rank domains by that count.\n",
"top_other_emails = (\n",
"    df[['orcid', 'other_email_domains']]\n",
"    .explode('other_email_domains')\n",
"    .reset_index(drop=True)\n",
"    .groupby('other_email_domains')\n",
"    .count()\n",
"    .sort_values('orcid', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"gmail.com",
"hotmail.com",
"yahoo.com",
"qq.com",
"163.com",
"outlook.com",
"126.com",
"usp.br",
"ieee.org",
"yahoo.com.br",
"mail.ru",
"unesp.br",
"yuhs.ac",
"sbs.ox.ac.uk",
"naver.com",
"icloud.com",
"foxmail.com",
"uq.edu.au",
"ua.pt",
"cam.ac.uk",
"imperial.ac.uk",
"ukr.net",
"mit.edu",
"ucl.ac.uk",
"law.ox.ac.uk",
"stanford.edu",
"monash.edu",
"ucm.es",
"education.ox.ac.uk",
"conted.ox.ac.uk"
],
"y": [
11607,
1589,
1336,
813,
803,
457,
270,
242,
230,
155,
152,
147,
141,
136,
135,
122,
103,
93,
92,
86,
81,
77,
77,
76,
75,
74,
69,
68,
67,
64
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 30 other email domains"
},
"xaxis": {
"range": [
-0.5,
29.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(30)\n",
"\n",
"# top_other_emails is indexed by domain and sorted by its 'orcid' count (descending),\n",
"# so the first TOP_N rows are the most frequent secondary-email domains.\n",
"top_rows = top_other_emails.iloc[:TOP_N]\n",
"data = [go.Bar(x=top_rows.index, y=top_rows['orcid'])]\n",
"\n",
"layout = go.Layout(\n",
"    title='Top %s other email domains' % TOP_N,\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}, 'range': TOP_RANGE}\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## URLs"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 746443.000000\n",
"mean 1.368308\n",
"std 1.224000\n",
"min 1.000000\n",
"25% 1.000000\n",
"50% 1.000000\n",
"75% 1.000000\n",
"max 193.000000\n",
"Name: n_urls, dtype: float64"
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the per-record URL count.\n",
"df['n_urls'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
"
\n",
"
"
],
"text/plain": [
"Empty DataFrame\n",
"Columns: [orcid, verified_email, verified_primary_email, given_names, family_name, biography, other_names, urls, primary_email, keywords, external_ids, education, employment, n_works, works_source, activation_date, last_update_date, n_doi, n_arxiv, n_pmc, n_other_pids, other_urls, label, primary_email_domain, other_email_domains, url_domains, other_url_domains, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment]\n",
"Index: []"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the record(s) holding the largest URL count. A strict '>' against the\n",
"# column's own maximum can never match and always yields an empty frame.\n",
"df[df.n_urls == df.n_urls.max()]"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 5 | \n",
" 0000-0001-7402-0096 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" [[kth profile, https://www.kth.se/profile/toma... | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[, kth royal institute of technology, stockho... | \n",
" 0 | \n",
" NaN | \n",
" 2015-01-11 15:13:06.467000+00:00 | \n",
" 2016-06-14 23:55:59.896000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {kth.se} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
"
\n",
" \n",
" 10 | \n",
" 0000-0001-8377-3508 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" [fontana, milena da silva] | \n",
" [[currículo lattes, http://buscatextual.cnpq.b... | \n",
" <NA> | \n",
" [educação; informática; matemática.] | \n",
" NaN | \n",
" NaN | \n",
" [[, instituto federal de educação, ciência e t... | \n",
" 0 | \n",
" NaN | \n",
" 2018-05-23 23:39:04.534000+00:00 | \n",
" 2019-10-16 02:50:11.007000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {cnpq.br} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" 3 | \n",
"
\n",
" \n",
" 26 | \n",
" 0000-0002-2638-4108 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" investigador de la universidad de oviedo. depa... | \n",
" NaN | \n",
" [[cv ignacio fernandez sarasola, https://www.u... | \n",
" <NA> | \n",
" [constitutional history, constitutional law, h... | \n",
" [[scopus author id, 54394231000]] | \n",
" [[public law, ph doctor, university of oviedo,... | \n",
" [[professor of constitutional law, university ... | \n",
" 1 | \n",
" [crossref] | \n",
" 2013-03-25 14:38:06.016000+00:00 | \n",
" 2020-07-01 13:10:37.025000+00:00 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {unioviedo.es} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" 1 | \n",
" 3 | \n",
" 1 | \n",
" 1 | \n",
"
\n",
" \n",
" 151 | \n",
" 0000-0003-1284-9741 | \n",
" True | \n",
" True | \n",
" alex percy antonio | \n",
" manriquez paisig | \n",
" <NA> | \n",
" NaN | \n",
" [[canal de youtube, https://www.youtube.com/ch... | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2020-09-08 20:04:33.906000+00:00 | \n",
" 2020-09-08 20:25:55.432000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {youtube.com} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
"
\n",
" \n",
" 171 | \n",
" 0000-0001-7324-3247 | \n",
" True | \n",
" True | \n",
" amable | \n",
" lopez piñeiro | \n",
" <NA> | \n",
" NaN | \n",
" [[web de la universidad politécnica de madrid ... | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[, universidad politécnica de madrid - techni... | \n",
" 0 | \n",
" NaN | \n",
" 2014-12-19 11:48:29.669000+00:00 | \n",
" 2019-12-09 18:48:42.579000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {upm.es} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"5 0000-0001-7402-0096 True True \n",
"10 0000-0001-8377-3508 True True \n",
"26 0000-0002-2638-4108 True True \n",
"151 0000-0003-1284-9741 True True \n",
"171 0000-0001-7324-3247 True True \n",
"\n",
" given_names family_name \\\n",
"5 \n",
"10 \n",
"26 \n",
"151 alex percy antonio manriquez paisig \n",
"171 amable lopez piñeiro \n",
"\n",
" biography \\\n",
"5 \n",
"10 \n",
"26 investigador de la universidad de oviedo. depa... \n",
"151 \n",
"171 \n",
"\n",
" other_names \\\n",
"5 NaN \n",
"10 [fontana, milena da silva] \n",
"26 NaN \n",
"151 NaN \n",
"171 NaN \n",
"\n",
" urls primary_email \\\n",
"5 [[kth profile, https://www.kth.se/profile/toma... \n",
"10 [[currículo lattes, http://buscatextual.cnpq.b... \n",
"26 [[cv ignacio fernandez sarasola, https://www.u... \n",
"151 [[canal de youtube, https://www.youtube.com/ch... \n",
"171 [[web de la universidad politécnica de madrid ... \n",
"\n",
" keywords \\\n",
"5 NaN \n",
"10 [educação; informática; matemática.] \n",
"26 [constitutional history, constitutional law, h... \n",
"151 NaN \n",
"171 NaN \n",
"\n",
" external_ids \\\n",
"5 NaN \n",
"10 NaN \n",
"26 [[scopus author id, 54394231000]] \n",
"151 NaN \n",
"171 NaN \n",
"\n",
" education \\\n",
"5 NaN \n",
"10 NaN \n",
"26 [[public law, ph doctor, university of oviedo,... \n",
"151 NaN \n",
"171 NaN \n",
"\n",
" employment n_works works_source \\\n",
"5 [[, kth royal institute of technology, stockho... 0 NaN \n",
"10 [[, instituto federal de educação, ciência e t... 0 NaN \n",
"26 [[professor of constitutional law, university ... 1 [crossref] \n",
"151 NaN 0 NaN \n",
"171 [[, universidad politécnica de madrid - techni... 0 NaN \n",
"\n",
" activation_date last_update_date n_doi \\\n",
"5 2015-01-11 15:13:06.467000+00:00 2016-06-14 23:55:59.896000+00:00 0 \n",
"10 2018-05-23 23:39:04.534000+00:00 2019-10-16 02:50:11.007000+00:00 0 \n",
"26 2013-03-25 14:38:06.016000+00:00 2020-07-01 13:10:37.025000+00:00 1 \n",
"151 2020-09-08 20:04:33.906000+00:00 2020-09-08 20:25:55.432000+00:00 0 \n",
"171 2014-12-19 11:48:29.669000+00:00 2019-12-09 18:48:42.579000+00:00 0 \n",
"\n",
" n_arxiv n_pmc n_other_pids other_urls label primary_email_domain \\\n",
"5 0 0 0 NaN False NaN \n",
"10 0 0 0 NaN False NaN \n",
"26 0 0 0 NaN False NaN \n",
"151 0 0 0 NaN False NaN \n",
"171 0 0 0 NaN False NaN \n",
"\n",
" other_email_domains url_domains other_url_domains n_emails n_urls \\\n",
"5 NaN {kth.se} NaN 1 \n",
"10 NaN {cnpq.br} NaN 1 \n",
"26 NaN {unioviedo.es} NaN 1 \n",
"151 NaN {youtube.com} NaN 1 \n",
"171 NaN {upm.es} NaN 1 \n",
"\n",
" n_ids n_keywords n_education n_employment \n",
"5 1 \n",
"10 1 3 \n",
"26 1 3 1 1 \n",
"151 \n",
"171 1 "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.url_domains.notna()].head()"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" n_urls | \n",
"
\n",
" \n",
" \n",
" \n",
" 3326559 | \n",
" 0000-0002-1234-835X | \n",
" 193 | \n",
"
\n",
" \n",
" 4341400 | \n",
" 0000-0001-7478-4539 | \n",
" 168 | \n",
"
\n",
" \n",
" 5061057 | \n",
" 0000-0002-7392-3792 | \n",
" 152 | \n",
"
\n",
" \n",
" 8453449 | \n",
" 0000-0002-6938-9638 | \n",
" 148 | \n",
"
\n",
" \n",
" 1743760 | \n",
" 0000-0001-5384-0001 | \n",
" 104 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349602 | \n",
" 0000-0002-1686-1935 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349604 | \n",
" 0000-0002-8783-5814 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349605 | \n",
" 0000-0002-7584-2283 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349606 | \n",
" 0000-0003-0529-3538 | \n",
" <NA> | \n",
"
\n",
" \n",
"
\n",
"
11349607 rows × 2 columns
\n",
"
"
],
"text/plain": [
" orcid n_urls\n",
"3326559 0000-0002-1234-835X 193\n",
"4341400 0000-0001-7478-4539 168\n",
"5061057 0000-0002-7392-3792 152\n",
"8453449 0000-0002-6938-9638 148\n",
"1743760 0000-0001-5384-0001 104\n",
"... ... ...\n",
"11349602 0000-0002-1686-1935 \n",
"11349603 0000-0002-3800-6331 \n",
"11349604 0000-0002-8783-5814 \n",
"11349605 0000-0002-7584-2283 \n",
"11349606 0000-0003-0529-3538 \n",
"\n",
"[11349607 rows x 2 columns]"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)\n",
"urls_by_orcid"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The first three are fake, the fourth isn't. No assumption can be taken."
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"0000-0002-1234-835X",
"0000-0001-7478-4539",
"0000-0002-7392-3792",
"0000-0002-6938-9638",
"0000-0001-5384-0001",
"0000-0003-3344-7266",
"0000-0003-2450-090X",
"0000-0001-9328-4038",
"0000-0002-4621-5571",
"0000-0002-7754-8889",
"0000-0001-9131-1266",
"0000-0003-0176-1293",
"0000-0002-7456-3848",
"0000-0002-0135-9344",
"0000-0001-5880-7091",
"0000-0002-4316-1467",
"0000-0002-1856-6905",
"0000-0001-8873-6677",
"0000-0002-8493-0402",
"0000-0002-0752-7513",
"0000-0003-1524-6268",
"0000-0002-6689-4129",
"0000-0002-1298-5252",
"0000-0003-0594-2462",
"0000-0002-2886-9248",
"0000-0003-2593-7134",
"0000-0003-2383-8386",
"0000-0003-1761-3842",
"0000-0002-9025-8632",
"0000-0003-0796-0234",
"0000-0002-5139-2660",
"0000-0003-2183-8112",
"0000-0002-1929-6054",
"0000-0002-5710-4041",
"0000-0003-1097-926X",
"0000-0003-4808-6619",
"0000-0002-6547-0172",
"0000-0003-2407-3557",
"0000-0003-0694-1154",
"0000-0003-1585-1134",
"0000-0003-4948-9268",
"0000-0002-2916-2893",
"0000-0003-4326-9336",
"0000-0001-8978-4830",
"0000-0002-8940-3177",
"0000-0002-8593-9257",
"0000-0002-5946-1595",
"0000-0002-7653-4899",
"0000-0001-5300-4601",
"0000-0001-6921-0426",
"0000-0001-5898-6843",
"0000-0001-7550-5802",
"0000-0001-8644-2114",
"0000-0002-3920-7389",
"0000-0001-9102-8639",
"0000-0001-9026-4795",
"0000-0001-9119-5955",
"0000-0001-7608-9433",
"0000-0003-1436-5986",
"0000-0003-1188-2187",
"0000-0002-3997-5070",
"0000-0002-6712-7327",
"0000-0002-3482-7984",
"0000-0001-6714-009X",
"0000-0002-1292-285X",
"0000-0002-2988-3973",
"0000-0001-9135-4362",
"0000-0001-6599-5449",
"0000-0003-2281-0680",
"0000-0003-0386-5475",
"0000-0001-7449-761X",
"0000-0002-4659-5391",
"0000-0002-6674-3806",
"0000-0002-3630-2516",
"0000-0002-5928-0479",
"0000-0001-9189-0122",
"0000-0002-9658-1473",
"0000-0003-2567-0847",
"0000-0003-3123-5286",
"0000-0002-1728-5503",
"0000-0002-9616-9470",
"0000-0002-2535-4855",
"0000-0001-5960-5766",
"0000-0001-5005-6385",
"0000-0002-1652-8518",
"0000-0002-7038-3225",
"0000-0003-4757-9876",
"0000-0002-8881-8047",
"0000-0002-2198-6740",
"0000-0002-3808-0697",
"0000-0002-4027-0692",
"0000-0001-5426-478X",
"0000-0002-0535-5236",
"0000-0003-0930-6121",
"0000-0001-9059-7602",
"0000-0002-8116-9611",
"0000-0003-1310-9366",
"0000-0001-5760-6570",
"0000-0003-3655-522X",
"0000-0002-4968-5149"
],
"y": [
193,
168,
152,
148,
104,
90,
90,
89,
89,
83,
83,
78,
75,
73,
68,
66,
65,
65,
63,
62,
62,
62,
61,
61,
61,
60,
59,
59,
57,
57,
56,
55,
54,
52,
52,
48,
48,
48,
46,
46,
45,
45,
43,
43,
43,
42,
42,
41,
41,
40,
39,
38,
38,
38,
37,
36,
36,
36,
35,
35,
35,
35,
35,
35,
34,
34,
34,
34,
34,
33,
33,
33,
32,
32,
32,
32,
32,
31,
31,
31,
31,
31,
31,
31,
31,
31,
31,
30,
30,
30,
30,
30,
30,
30,
29,
29,
29,
29,
29,
29
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 100 ORCID iDs with URLs"
},
"xaxis": {
"range": [
-0.5,
99.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(100)\n",
"data = [\n",
" go.Bar(\n",
" x=urls_by_orcid[:TOP_N]['orcid'],\n",
" y=urls_by_orcid[:TOP_N]['n_urls']\n",
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top %s ORCID iDs with URLs' % TOP_N,\n",
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
"top_urls = df[['orcid', 'url_domains']]\\\n",
" .explode('url_domains')\\\n",
" .reset_index(drop=True)\\\n",
" .groupby('url_domains')\\\n",
" .count()\\\n",
" .sort_values('orcid', ascending=False)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"linkedin.com",
"researchgate.net",
"google.com",
"cnpq.br",
"academia.edu",
"twitter.com",
"facebook.com",
"publons.com",
"wordpress.com",
"mendeley.com",
"instagram.com",
"github.io",
"google.com.ua",
"github.com",
"google.es",
"blogspot.com",
"unirioja.es",
"youtube.com",
"helsinki.fi",
"wixsite.com",
"scopus.com",
"ku.dk",
"weebly.com",
"kth.se",
"cityu.edu.hk",
"man.ac.uk",
"google.com.au",
"kcl.ac.uk",
"au.dk",
"ucl.ac.uk",
"us.es",
"sdu.dk",
"dtu.dk",
"ugr.es",
"researcherid.com",
"mq.edu.au",
"google.co.in",
"rug.nl",
"colciencias.gov.co",
"bris.ac.uk",
"vub.be",
"tumblr.com",
"uwa.edu.au",
"uc3m.es",
"ntu.edu.tw",
"monash.edu",
"google.co.uk",
"orcid.org",
"lancs.ac.uk",
"pinterest.com"
],
"y": [
82404,
69706,
45301,
25478,
21525,
20299,
15959,
11144,
8866,
6855,
6627,
5660,
5501,
5440,
5265,
4988,
4565,
4550,
4455,
4174,
3665,
3569,
3082,
2947,
2761,
2658,
2638,
2590,
2571,
2502,
2490,
2375,
2279,
2172,
2133,
2061,
1953,
1940,
1934,
1845,
1826,
1809,
1781,
1722,
1704,
1693,
1661,
1605,
1598,
1588
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top-50 URL domains"
},
"xaxis": {
"range": [
-0.5,
49.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(50)\n",
"data = [\n",
" go.Bar(\n",
" x=top_urls[:TOP_N].index,\n",
" y=top_urls[:TOP_N]['orcid']\n",
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top-%s URL domains' % TOP_N,\n",
" xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Malformed URLs are left empty"
]
},
{
"cell_type": "code",
"execution_count": 54,
"metadata": {},
"outputs": [],
"source": [
"exploded_url_domains = df[['orcid', 'url_domains']].explode('url_domains')"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"orcid 71\n",
"url_domains 71\n",
"dtype: int64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exploded_url_domains[exploded_url_domains.url_domains == ''].count()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Academic URL filtering according to GRID.ac"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [],
"source": [
"def extract_domain(link):\n",
" return tldextract.extract(link).registered_domain"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
]
},
{
"cell_type": "code",
"execution_count": 58,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" link | \n",
" domain | \n",
"
\n",
" \n",
" grid_id | \n",
" | \n",
" | \n",
"
\n",
" \n",
" \n",
" \n",
" grid.1001.0 | \n",
" http://www.anu.edu.au/ | \n",
" anu.edu.au | \n",
"
\n",
" \n",
" grid.1002.3 | \n",
" http://www.monash.edu/ | \n",
" monash.edu | \n",
"
\n",
" \n",
" grid.1003.2 | \n",
" http://www.uq.edu.au/ | \n",
" uq.edu.au | \n",
"
\n",
" \n",
" grid.1004.5 | \n",
" http://mq.edu.au/ | \n",
" mq.edu.au | \n",
"
\n",
" \n",
" grid.1005.4 | \n",
" https://www.unsw.edu.au/ | \n",
" unsw.edu.au | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" grid.510886.2 | \n",
" https://cftni.org/ | \n",
" cftni.org | \n",
"
\n",
" \n",
" grid.510887.3 | \n",
" https://tinybeamfund.org/ | \n",
" tinybeamfund.org | \n",
"
\n",
" \n",
" grid.510888.c | \n",
" https://www.tmg-thinktank.com/ | \n",
" tmg-thinktank.com | \n",
"
\n",
" \n",
" grid.510889.d | \n",
" https://www.unicef.org/eap | \n",
" unicef.org | \n",
"
\n",
" \n",
" grid.510890.5 | \n",
" https://www.whitehouse.gov/ | \n",
" whitehouse.gov | \n",
"
\n",
" \n",
"
\n",
"
98933 rows × 2 columns
\n",
"
"
],
"text/plain": [
" link domain\n",
"grid_id \n",
"grid.1001.0 http://www.anu.edu.au/ anu.edu.au\n",
"grid.1002.3 http://www.monash.edu/ monash.edu\n",
"grid.1003.2 http://www.uq.edu.au/ uq.edu.au\n",
"grid.1004.5 http://mq.edu.au/ mq.edu.au\n",
"grid.1005.4 https://www.unsw.edu.au/ unsw.edu.au\n",
"... ... ...\n",
"grid.510886.2 https://cftni.org/ cftni.org\n",
"grid.510887.3 https://tinybeamfund.org/ tinybeamfund.org\n",
"grid.510888.c https://www.tmg-thinktank.com/ tmg-thinktank.com\n",
"grid.510889.d https://www.unicef.org/eap unicef.org\n",
"grid.510890.5 https://www.whitehouse.gov/ whitehouse.gov\n",
"\n",
"[98933 rows x 2 columns]"
]
},
"execution_count": 58,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid_df"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"link http://www.isti.cnr.it/\n",
"domain cnr.it\n",
"Name: grid.451498.5, dtype: object"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"grid_df.loc['grid.451498.5']"
]
},
{
"cell_type": "code",
"execution_count": 60,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" url_domains | \n",
"
\n",
" \n",
" \n",
" \n",
" 5 | \n",
" 0000-0001-7402-0096 | \n",
" kth.se | \n",
"
\n",
" \n",
" 10 | \n",
" 0000-0001-8377-3508 | \n",
" cnpq.br | \n",
"
\n",
" \n",
" 26 | \n",
" 0000-0002-2638-4108 | \n",
" unioviedo.es | \n",
"
\n",
" \n",
" 151 | \n",
" 0000-0003-1284-9741 | \n",
" youtube.com | \n",
"
\n",
" \n",
" 171 | \n",
" 0000-0001-7324-3247 | \n",
" upm.es | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349589 | \n",
" 0000-0002-5204-5302 | \n",
" apetau.com | \n",
"
\n",
" \n",
" 11349589 | \n",
" 0000-0002-5204-5302 | \n",
" facebook.com | \n",
"
\n",
" \n",
" 11349597 | \n",
" 0000-0002-2906-0299 | \n",
" oxytocin.com.au | \n",
"
\n",
" \n",
" 11349597 | \n",
" 0000-0002-2906-0299 | \n",
" linkedin.com | \n",
"
\n",
" \n",
" 11349599 | \n",
" 0000-0002-1070-2220 | \n",
" vpsantanna.com | \n",
"
\n",
" \n",
"
\n",
"
1021364 rows × 2 columns
\n",
"
"
],
"text/plain": [
" orcid url_domains\n",
"5 0000-0001-7402-0096 kth.se\n",
"10 0000-0001-8377-3508 cnpq.br\n",
"26 0000-0002-2638-4108 unioviedo.es\n",
"151 0000-0003-1284-9741 youtube.com\n",
"171 0000-0001-7324-3247 upm.es\n",
"... ... ...\n",
"11349589 0000-0002-5204-5302 apetau.com\n",
"11349589 0000-0002-5204-5302 facebook.com\n",
"11349597 0000-0002-2906-0299 oxytocin.com.au\n",
"11349597 0000-0002-2906-0299 linkedin.com\n",
"11349599 0000-0002-1070-2220 vpsantanna.com\n",
"\n",
"[1021364 rows x 2 columns]"
]
},
"execution_count": 60,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exp = df[['orcid', 'url_domains']].explode('url_domains')\n",
"exp = exp[exp.url_domains.notna()]\n",
"exp"
]
},
{
"cell_type": "code",
"execution_count": 61,
"metadata": {},
"outputs": [],
"source": [
"exp['grid'] = exp.url_domains.isin(grid_df.domain)"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [],
"source": [
"# Per-domain row counts for the domains absent from grid_df, most frequent first.\n",
"non_grid_domains = (\n",
"    exp[~exp['url_domains'].isin(grid_df['domain'])]\n",
"    .groupby('url_domains')\n",
"    .count()\n",
"    .sort_values('orcid', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 63,
"metadata": {},
"outputs": [],
"source": [
"DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',\n",
"                     'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']\n",
"# Discard every domain whose name contains one of the excluded substrings.\n",
"# A single boolean mask replaces the former loop of in-place drops: the index is\n",
"# scanned once and the frame is rebound instead of being mutated in place.\n",
"# re.escape keeps the dots literal, so this is still plain substring matching.\n",
"exclusion_pattern = '|'.join(re.escape(dex) for dex in DOMAIN_EXCLUSIONS)\n",
"is_excluded = non_grid_domains.index.str.contains(exclusion_pattern, regex=True)\n",
"non_grid_domains = non_grid_domains[~is_excluded]"
]
},
{
"cell_type": "code",
"execution_count": 64,
"metadata": {},
"outputs": [],
"source": [
"# Persist the filtered domain counts to CSV.\n",
"non_grid_domains.to_csv(path_or_buf='../data/processed/non_grid_urls.csv')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### URLs present in other parts of the ORCID records"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" urls | \n",
" other_urls | \n",
"
\n",
" \n",
" \n",
" \n",
" 593 | \n",
" 0000-0002-0316-839X | \n",
" NaN | \n",
" [http://www.dukagjinicollege.eu/libri3/001, ht... | \n",
"
\n",
" \n",
" 1078 | \n",
" 0000-0002-5231-666X | \n",
" NaN | \n",
" [http://www.pythagoreanuniverse.com/] | \n",
"
\n",
" \n",
" 1111 | \n",
" 0000-0001-5067-2321 | \n",
" NaN | \n",
" [https://doi.org/10.1155/2017/2170816.2., http... | \n",
"
\n",
" \n",
" 5636 | \n",
" 0000-0002-1080-7903 | \n",
" [[, https://www.beautybydrcat.com/body-plastic... | \n",
" [https://www.beautybydrcat.com/about-surgery-p... | \n",
"
\n",
" \n",
" 5947 | \n",
" 0000-0001-7259-8038 | \n",
" NaN | \n",
" [http://www.linkedin.com/in/edrovera] | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11344576 | \n",
" 0000-0002-8883-1217 | \n",
" NaN | \n",
" [https://aaerc.com.au] | \n",
"
\n",
" \n",
" 11345866 | \n",
" 0000-0002-7597-9511 | \n",
" NaN | \n",
" [https://videos.arynews.tv/] | \n",
"
\n",
" \n",
" 11346317 | \n",
" 0000-0002-0201-1163 | \n",
" NaN | \n",
" [https://g.page/non-surgical-body-contouring-n... | \n",
"
\n",
" \n",
" 11347666 | \n",
" 0000-0002-8506-4323 | \n",
" [[du an empire city, https://salereal.com.vn/c... | \n",
" [https://trandinhhieu.com/du-an/empire-city/] | \n",
"
\n",
" \n",
" 11348851 | \n",
" 0000-0001-7432-5404 | \n",
" [[queen mary university of london staff page, ... | \n",
" [https://peoplespalaceprojects.org.uk/en/proje... | \n",
"
\n",
" \n",
"
\n",
"
15842 rows × 3 columns
\n",
"
"
],
"text/plain": [
" orcid \\\n",
"593 0000-0002-0316-839X \n",
"1078 0000-0002-5231-666X \n",
"1111 0000-0001-5067-2321 \n",
"5636 0000-0002-1080-7903 \n",
"5947 0000-0001-7259-8038 \n",
"... ... \n",
"11344576 0000-0002-8883-1217 \n",
"11345866 0000-0002-7597-9511 \n",
"11346317 0000-0002-0201-1163 \n",
"11347666 0000-0002-8506-4323 \n",
"11348851 0000-0001-7432-5404 \n",
"\n",
" urls \\\n",
"593 NaN \n",
"1078 NaN \n",
"1111 NaN \n",
"5636 [[, https://www.beautybydrcat.com/body-plastic... \n",
"5947 NaN \n",
"... ... \n",
"11344576 NaN \n",
"11345866 NaN \n",
"11346317 NaN \n",
"11347666 [[du an empire city, https://salereal.com.vn/c... \n",
"11348851 [[queen mary university of london staff page, ... \n",
"\n",
" other_urls \n",
"593 [http://www.dukagjinicollege.eu/libri3/001, ht... \n",
"1078 [http://www.pythagoreanuniverse.com/] \n",
"1111 [https://doi.org/10.1155/2017/2170816.2., http... \n",
"5636 [https://www.beautybydrcat.com/about-surgery-p... \n",
"5947 [http://www.linkedin.com/in/edrovera] \n",
"... ... \n",
"11344576 [https://aaerc.com.au] \n",
"11345866 [https://videos.arynews.tv/] \n",
"11346317 [https://g.page/non-surgical-body-contouring-n... \n",
"11347666 [https://trandinhhieu.com/du-an/empire-city/] \n",
"11348851 [https://peoplespalaceprojects.org.uk/en/proje... \n",
"\n",
"[15842 rows x 3 columns]"
]
},
"execution_count": 65,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Records having at least one entry in other_urls.\n",
"df.loc[df['other_urls'].str.len() > 0, ['orcid', 'urls', 'other_urls']]"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" urls | \n",
" other_urls | \n",
"
\n",
" \n",
" \n",
" \n",
" 593 | \n",
" 0000-0002-0316-839X | \n",
" NaN | \n",
" [http://www.dukagjinicollege.eu/libri3/001, ht... | \n",
"
\n",
" \n",
" 1078 | \n",
" 0000-0002-5231-666X | \n",
" NaN | \n",
" [http://www.pythagoreanuniverse.com/] | \n",
"
\n",
" \n",
" 1111 | \n",
" 0000-0001-5067-2321 | \n",
" NaN | \n",
" [https://doi.org/10.1155/2017/2170816.2., http... | \n",
"
\n",
" \n",
" 5947 | \n",
" 0000-0001-7259-8038 | \n",
" NaN | \n",
" [http://www.linkedin.com/in/edrovera] | \n",
"
\n",
" \n",
" 7227 | \n",
" 0000-0001-8381-4891 | \n",
" NaN | \n",
" [https://www.fontussciences.com/lozenges] | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11343402 | \n",
" 0000-0003-1551-4613 | \n",
" NaN | \n",
" [http://2016.ifmbe.org/),, http://www.bd2decid... | \n",
"
\n",
" \n",
" 11344192 | \n",
" 0000-0002-1070-2976 | \n",
" NaN | \n",
" [http://lattes.cnpq.br/0219822413040642] | \n",
"
\n",
" \n",
" 11344576 | \n",
" 0000-0002-8883-1217 | \n",
" NaN | \n",
" [https://aaerc.com.au] | \n",
"
\n",
" \n",
" 11345866 | \n",
" 0000-0002-7597-9511 | \n",
" NaN | \n",
" [https://videos.arynews.tv/] | \n",
"
\n",
" \n",
" 11346317 | \n",
" 0000-0002-0201-1163 | \n",
" NaN | \n",
" [https://g.page/non-surgical-body-contouring-n... | \n",
"
\n",
" \n",
"
\n",
"
9413 rows × 3 columns
\n",
"
"
],
"text/plain": [
" orcid urls \\\n",
"593 0000-0002-0316-839X NaN \n",
"1078 0000-0002-5231-666X NaN \n",
"1111 0000-0001-5067-2321 NaN \n",
"5947 0000-0001-7259-8038 NaN \n",
"7227 0000-0001-8381-4891 NaN \n",
"... ... ... \n",
"11343402 0000-0003-1551-4613 NaN \n",
"11344192 0000-0002-1070-2976 NaN \n",
"11344576 0000-0002-8883-1217 NaN \n",
"11345866 0000-0002-7597-9511 NaN \n",
"11346317 0000-0002-0201-1163 NaN \n",
"\n",
" other_urls \n",
"593 [http://www.dukagjinicollege.eu/libri3/001, ht... \n",
"1078 [http://www.pythagoreanuniverse.com/] \n",
"1111 [https://doi.org/10.1155/2017/2170816.2., http... \n",
"5947 [http://www.linkedin.com/in/edrovera] \n",
"7227 [https://www.fontussciences.com/lozenges] \n",
"... ... \n",
"11343402 [http://2016.ifmbe.org/),, http://www.bd2decid... \n",
"11344192 [http://lattes.cnpq.br/0219822413040642] \n",
"11344576 [https://aaerc.com.au] \n",
"11345866 [https://videos.arynews.tv/] \n",
"11346317 [https://g.page/non-surgical-body-contouring-n... \n",
"\n",
"[9413 rows x 3 columns]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Records having entries in other_urls but a missing urls field.\n",
"df.loc[\n",
"    (df['other_urls'].str.len() > 0) & df['urls'].isna(),\n",
"    ['orcid', 'urls', 'other_urls'],\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Works source"
]
},
{
"cell_type": "code",
"execution_count": 67,
"metadata": {},
"outputs": [],
"source": [
"def remove_self_source(lst, given, family):\n",
"    \"\"\"Return the works sources in `lst` that do not mention the record owner.\n",
"\n",
"    A source is treated as self-asserted, and dropped, when its lower-cased text\n",
"    contains the lower-cased given name or family name. Names that are missing\n",
"    (NA) or blank are ignored, because an empty string is a substring of every\n",
"    source and would otherwise discard the whole list.\n",
"\n",
"    Args:\n",
"        lst: iterable of works-source strings.\n",
"        given: given name(s) of the record owner, or NA.\n",
"        family: family name of the record owner, or NA.\n",
"\n",
"    Returns:\n",
"        A new list with the remaining sources, in their original order.\n",
"    \"\"\"\n",
"    owner_names = [\n",
"        name.lower()\n",
"        for name in (given, family)\n",
"        if pd.notna(name) and name.strip()\n",
"    ]\n",
"    return [\n",
"        ws for ws in lst\n",
"        if not any(name in ws.lower() for name in owner_names)\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 68,
"metadata": {},
"outputs": [],
"source": [
"# Only rows with both a works source and a given name are processed;\n",
"# the remaining rows get a missing value through index alignment.\n",
"df['ext_works_source'] = (\n",
"    df.loc[df['works_source'].notna() & df['given_names'].notna()]\n",
"    .apply(\n",
"        lambda row: remove_self_source(row['works_source'], row['given_names'], row['family_name']),\n",
"        axis=1,\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [],
"source": [
"# Number of external works sources per record, as a nullable integer (missing stays <NA>).\n",
"df['n_ext_work_source'] = df['ext_works_source'].str.len().astype(pd.Int16Dtype())"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# One row per (orcid, external works source) pair, for records with at least one such source.\n",
"exploded_external_sources = (\n",
"    df.loc[df['ext_works_source'].str.len() > 0, ['orcid', 'ext_works_source']]\n",
"    .explode('ext_works_source')\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [],
"source": [
"# Row count per external works source, most frequent first.\n",
"grouped_ext_sources = (\n",
"    exploded_external_sources\n",
"    .groupby('ext_works_source')\n",
"    .count()\n",
"    .sort_values('orcid', ascending=False)\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"crossref",
"scopus - elsevier",
"crossref metadata search",
"multidisciplinary digital publishing institute",
"europe pubmed central",
"researcherid",
"publons",
"ciênciavitae",
"base - bielefeld academic search engine",
"datacite",
"redalyc",
"deutsche nationalbibliothek (dnb)",
"mla international bibliography",
"national information processing institute ",
"nasa astrophysics data system",
"f1000",
"inspire-hep",
"hal",
"university of helsinki",
"igi global",
"airiti",
"university of copenhagen",
"universidade federal de uberlândia",
"aarhus university",
"universidad del país vasco",
"university of manchester - pure",
"kings college london",
"university of southern denmark",
"wellcome open research",
"macquarie university"
],
"y": [
1526827,
925561,
303924,
302224,
185098,
158204,
42567,
33252,
21515,
16312,
9846,
8404,
8233,
7933,
7568,
5356,
4992,
4966,
4197,
4081,
3812,
3176,
2820,
2390,
2291,
2265,
2215,
2211,
2173,
2072
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 30 works_source"
},
"xaxis": {
"range": [
-0.5,
29.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(30)\n",
"\n",
"# Bar chart of the TOP_N most frequent external works sources.\n",
"top_sources = grouped_ext_sources.head(TOP_N)\n",
"data = [go.Bar(x=top_sources['ext_works_source'], y=top_sources['orcid'])]\n",
"\n",
"layout = go.Layout(\n",
"    title='Top %s works_source' % TOP_N,\n",
"    xaxis=dict(\n",
"        tickangle=45,\n",
"        tickfont=dict(size=12),\n",
"        range=TOP_RANGE,\n",
"    ),\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 73,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ext_works_source | \n",
" orcid | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" crossref | \n",
" 1526827 | \n",
"
\n",
" \n",
" 1 | \n",
" scopus - elsevier | \n",
" 925561 | \n",
"
\n",
" \n",
" 2 | \n",
" crossref metadata search | \n",
" 303924 | \n",
"
\n",
" \n",
" 3 | \n",
" multidisciplinary digital publishing institute | \n",
" 302224 | \n",
"
\n",
" \n",
" 4 | \n",
" europe pubmed central | \n",
" 185098 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 348 | \n",
" smithsonian institution | \n",
" 3 | \n",
"
\n",
" \n",
" 349 | \n",
" universitätsbibliothek paderborn | \n",
" 3 | \n",
"
\n",
" \n",
" 350 | \n",
" uta - oa journal global insight | \n",
" 3 | \n",
"
\n",
" \n",
" 351 | \n",
" uc viden - university college south denmark | \n",
" 3 | \n",
"
\n",
" \n",
" 352 | \n",
" silva | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
353 rows × 2 columns
\n",
"
"
],
"text/plain": [
" ext_works_source orcid\n",
"0 crossref 1526827\n",
"1 scopus - elsevier 925561\n",
"2 crossref metadata search 303924\n",
"3 multidisciplinary digital publishing institute 302224\n",
"4 europe pubmed central 185098\n",
".. ... ...\n",
"348 smithsonian institution 3\n",
"349 universitätsbibliothek paderborn 3\n",
"350 uta - oa journal global insight 3\n",
"351 uc viden - university college south denmark 3\n",
"352 silva 3\n",
"\n",
"[353 rows x 2 columns]"
]
},
"execution_count": 73,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# A source is kept as authoritative when its count in grouped_ext_sources is greater than 2.\n",
"authoritative_sources = grouped_ext_sources.loc[grouped_ext_sources['orcid'].gt(2)]\n",
"authoritative_sources"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
"# Flag each (orcid, source) row whose source is among the authoritative ones.\n",
"exploded_external_sources['authoritative'] = (\n",
"    exploded_external_sources['ext_works_source']\n",
"    .isin(authoritative_sources['ext_works_source'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [],
"source": [
"# One row per orcid: True when at least one of its sources is authoritative.\n",
"# reset_index() already yields exactly the columns ['orcid', 'authoritative'].\n",
"orcid_authoritative_source = (\n",
"    exploded_external_sources\n",
"    .groupby('orcid')['authoritative']\n",
"    .any()\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Drop any 'authoritative' column left by a previous run before merging: otherwise\n",
"# re-running this cell yields 'authoritative_x'/'authoritative_y' columns and the\n",
"# following cell, which reads df.authoritative, fails.\n",
"df = (\n",
"    df.drop(columns='authoritative', errors='ignore')\n",
"    .merge(orcid_authoritative_source, on='orcid', how='left')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"# Rows left unmatched by the left merge have a missing flag: set them to False.\n",
"df.loc[df['authoritative'].isna(), 'authoritative'] = False"
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
" ext_works_source | \n",
" n_ext_work_source | \n",
" authoritative | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0000-0001-6097-3953 | \n",
" False | \n",
" False | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2018-03-02 09:29:16.528000+00:00 | \n",
" 2018-03-02 09:43:07.551000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" 0000-0001-6112-5550 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" [v.i. yurtaev; v. yurtaev] | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[professor, peoples friendship university of ... | \n",
" 0 | \n",
" NaN | \n",
" 2018-04-03 07:50:23.358000+00:00 | \n",
" 2020-03-18 09:42:44.753000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
"
\n",
" \n",
" 2 | \n",
" 0000-0001-6152-2695 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2019-12-11 15:31:56.388000+00:00 | \n",
" 2020-01-28 15:34:17.309000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" 0000-0001-7071-8294 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[researcher (academic), universidad de zarago... | \n",
" 0 | \n",
" NaN | \n",
" 2014-03-10 13:22:01.966000+00:00 | \n",
" 2016-06-14 22:17:54.470000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" 0000-0001-7247-6831 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 0 | \n",
" NaN | \n",
" 2019-07-19 15:57:46.116000+00:00 | \n",
" 2019-07-19 16:04:33.839000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email given_names \\\n",
"0 0000-0001-6097-3953 False False \n",
"1 0000-0001-6112-5550 True True \n",
"2 0000-0001-6152-2695 True True \n",
"3 0000-0001-7071-8294 True True \n",
"4 0000-0001-7247-6831 True True \n",
"\n",
" family_name biography other_names urls primary_email \\\n",
"0 NaN NaN \n",
"1 [v.i. yurtaev; v. yurtaev] NaN \n",
"2 NaN NaN \n",
"3 NaN NaN \n",
"4 NaN NaN \n",
"\n",
" keywords external_ids education \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" employment n_works works_source \\\n",
"0 NaN 0 NaN \n",
"1 [[professor, peoples friendship university of ... 0 NaN \n",
"2 NaN 0 NaN \n",
"3 [[researcher (academic), universidad de zarago... 0 NaN \n",
"4 NaN 0 NaN \n",
"\n",
" activation_date last_update_date n_doi \\\n",
"0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
"1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
"2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
"3 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
"4 2019-07-19 15:57:46.116000+00:00 2019-07-19 16:04:33.839000+00:00 0 \n",
"\n",
" n_arxiv n_pmc n_other_pids other_urls label primary_email_domain \\\n",
"0 0 0 0 NaN False NaN \n",
"1 0 0 0 NaN False NaN \n",
"2 0 0 0 NaN False NaN \n",
"3 0 0 0 NaN False NaN \n",
"4 0 0 0 NaN False NaN \n",
"\n",
" other_email_domains url_domains other_url_domains n_emails n_urls n_ids \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" n_keywords n_education n_employment ext_works_source n_ext_work_source \\\n",
"0 NaN \n",
"1 1 NaN \n",
"2 NaN \n",
"3 2 NaN \n",
"4 NaN \n",
"\n",
" authoritative \n",
"0 False \n",
"1 False \n",
"2 False \n",
"3 False \n",
"4 False "
]
},
"execution_count": 78,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Quick look at the first rows, including the new columns.\n",
"df.head(n=5)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## External IDs"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"External IDs should come from reliable sources. ORCID registrants cannot add them freely."
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 1.335691e+06\n",
"mean 1.360596e+00\n",
"std 6.673242e-01\n",
"min 1.000000e+00\n",
"25% 1.000000e+00\n",
"50% 1.000000e+00\n",
"75% 2.000000e+00\n",
"max 8.000000e+01\n",
"Name: n_ids, dtype: float64"
]
},
"execution_count": 79,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics of the n_ids column.\n",
"df['n_ids'].describe()"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
" ext_works_source | \n",
" n_ext_work_source | \n",
" authoritative | \n",
"
\n",
" \n",
" \n",
" \n",
" 2145720 | \n",
" 0000-0002-9554-6633 | \n",
" True | \n",
" True | \n",
" john a | \n",
" williams | \n",
" <NA> | \n",
" NaN | \n",
" [[aston university profile page, https://resea... | \n",
" <NA> | \n",
" NaN | \n",
" [[scopus author id, 55553733518], [scopus aut... | \n",
" NaN | \n",
" [[, aston university, birmingham, , gb, 1722, ... | \n",
" 92 | \n",
" [aston research explorer] | \n",
" 2014-11-20 09:42:10.690000+00:00 | \n",
" 2021-04-08 00:00:40.672000+00:00 | \n",
" 80 | \n",
" 0 | \n",
" 0 | \n",
" 209 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" {aston.ac.uk} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" 80 | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" [aston research explorer] | \n",
" 1 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"2145720 0000-0002-9554-6633 True True \n",
"\n",
" given_names family_name biography other_names \\\n",
"2145720 john a williams NaN \n",
"\n",
" urls primary_email \\\n",
"2145720 [[aston university profile page, https://resea... \n",
"\n",
" keywords external_ids education \\\n",
"2145720 NaN [[scopus author id, 55553733518], [scopus aut... NaN \n",
"\n",
" employment n_works \\\n",
"2145720 [[, aston university, birmingham, , gb, 1722, ... 92 \n",
"\n",
" works_source activation_date \\\n",
"2145720 [aston research explorer] 2014-11-20 09:42:10.690000+00:00 \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
"2145720 2021-04-08 00:00:40.672000+00:00 80 0 0 209 \n",
"\n",
" other_urls label primary_email_domain other_email_domains \\\n",
"2145720 NaN True NaN NaN \n",
"\n",
" url_domains other_url_domains n_emails n_urls n_ids n_keywords \\\n",
"2145720 {aston.ac.uk} NaN 1 80 \n",
"\n",
" n_education n_employment ext_works_source \\\n",
"2145720 1 [aston research explorer] \n",
"\n",
" n_ext_work_source authoritative \n",
"2145720 1 True "
]
},
"execution_count": 80,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Record(s) with the largest n_ids value.\n",
"df.loc[df['n_ids'].eq(df['n_ids'].max())]"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"# One row per (orcid, external id) entry; records without ids keep a single NaN row.\n",
"ids = (\n",
"    df[['orcid', 'external_ids']]\n",
"    .explode('external_ids')\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"# The provider is the first element of each external id entry; rows without an id stay NaN.\n",
"ids['provider'] = ids['external_ids'].dropna().map(lambda ext_id: ext_id[0])"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" external_ids | \n",
" provider | \n",
"
\n",
" \n",
" \n",
" \n",
" 8 | \n",
" 0000-0001-8315-2066 | \n",
" [researcherid, k-4630-2014] | \n",
" researcherid | \n",
"
\n",
" \n",
" 26 | \n",
" 0000-0002-2638-4108 | \n",
" [scopus author id, 54394231000] | \n",
" scopus author id | \n",
"
\n",
" \n",
" 45 | \n",
" 0000-0003-2259-7023 | \n",
" [scopus author id, 57189297461] | \n",
" scopus author id | \n",
"
\n",
" \n",
" 59 | \n",
" 0000-0002-7397-5824 | \n",
" [scopus author id, 8399842800] | \n",
" scopus author id | \n",
"
\n",
" \n",
" 68 | \n",
" 0000-0002-0427-9745 | \n",
" [researcherid, b-5471-2018] | \n",
" researcherid | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid external_ids provider\n",
"8 0000-0001-8315-2066 [researcherid, k-4630-2014] researcherid\n",
"26 0000-0002-2638-4108 [scopus author id, 54394231000] scopus author id\n",
"45 0000-0003-2259-7023 [scopus author id, 57189297461] scopus author id\n",
"59 0000-0002-7397-5824 [scopus author id, 8399842800] scopus author id\n",
"68 0000-0002-0427-9745 [researcherid, b-5471-2018] researcherid"
]
},
"execution_count": 83,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# First rows that actually carry a provider.\n",
"ids.dropna(subset=['provider']).head()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"# Row count per external id provider, most frequent first.\n",
"top_ids_providers = (\n",
"    ids\n",
"    .groupby('provider')\n",
"    .count()\n",
"    .sort_values('orcid', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"scopus author id",
"researcherid",
"loop profile",
"ciência id",
"researcher name resolver id",
"sciprofiles",
"中国科学家在线",
"gnd",
"isni",
"pitt id",
"technical university of denmark cwis",
"researcher id",
"id dialnet",
"digital author id",
"sciprofile",
"scopus author id: ",
"authenticusid",
"hku researcherpage",
"uow scholars",
"cti vitae",
"scopus author id:",
"hkust profile",
"chalmers id",
"scopus id",
"iauthor",
"google scholar",
"digital author id (dai)",
"authid",
"dai",
"us epa vivo",
"scopus id",
"authenticus",
"smithsonian profiles",
"github",
"escientist",
"vivo cornell",
"researcherid:",
"id dialnet:",
"dialnet id",
"kaken",
"une researcher id",
"researcherid: ",
"orcid",
"scienceopen",
"profile system identifier",
"orcid id",
"custom"
],
"y": [
1064250,
547706,
124286,
37988,
8303,
6144,
4842,
3216,
3170,
2717,
2480,
1515,
1169,
1151,
1111,
1077,
898,
742,
643,
582,
547,
526,
430,
277,
212,
201,
184,
175,
160,
146,
127,
83,
61,
52,
49,
46,
39,
7,
6,
5,
4,
3,
2,
1,
1,
1,
1
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "IDs provided by providers"
},
"xaxis": {
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Bar chart of how many external ids each provider label accounts for.\n",
"data = [go.Bar(x=top_ids_providers.index, y=top_ids_providers['orcid'])]\n",
"layout = go.Layout(\n",
"    title='IDs provided by providers',\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([nan, 'researcherid', 'scopus author id', 'loop profile', 'gnd',\n",
" 'ciência id', 'researcher name resolver id', 'pitt id',\n",
" 'id dialnet', 'isni', 'technical university of denmark cwis',\n",
" 'chalmers id', 'scopus author id: ', 'scopus author id:',\n",
" 'hkust profile', 'hku researcherpage', '中国科学家在线', 'uow scholars',\n",
" 'sciprofile', 'cti vitae', 'researcher id', 'digital author id',\n",
" 'researcherid:', 'authenticus', 'authenticusid', 'scopus id',\n",
" 'sciprofiles', 'vivo cornell', 'authid', 'digital author id (dai)',\n",
" 'us epa vivo', 'github', 'dai', 'escientist',\n",
" 'smithsonian profiles', 'orcid id', 'iauthor', 'scopus id',\n",
" 'google scholar', 'kaken', 'dialnet id', 'researcherid: ',\n",
" 'une researcher id', 'id dialnet:', 'scienceopen', 'orcid',\n",
" 'profile system identifier', 'custom'], dtype=object)"
]
},
"execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Distinct provider labels, in order of first appearance (NaN marks rows without an external id).\n",
"ids['provider'].unique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Keywords"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This field is problematic: users often cram several keywords into a single entry instead of adding them as separate keywords. Look at this:"
]
},
{
"cell_type": "code",
"execution_count": 87,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" n_keywords | \n",
"
\n",
" \n",
" \n",
" \n",
" 3873190 | \n",
" 0000-0002-0673-0341 | \n",
" 154 | \n",
"
\n",
" \n",
" 8982683 | \n",
" 0000-0003-3343-5660 | \n",
" 148 | \n",
"
\n",
" \n",
" 5494950 | \n",
" 0000-0002-6075-3501 | \n",
" 140 | \n",
"
\n",
" \n",
" 6724874 | \n",
" 0000-0002-7060-4112 | \n",
" 140 | \n",
"
\n",
" \n",
" 1562610 | \n",
" 0000-0001-5287-1949 | \n",
" 132 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349602 | \n",
" 0000-0002-1686-1935 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349604 | \n",
" 0000-0002-8783-5814 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349605 | \n",
" 0000-0002-7584-2283 | \n",
" <NA> | \n",
"
\n",
" \n",
" 11349606 | \n",
" 0000-0003-0529-3538 | \n",
" <NA> | \n",
"
\n",
" \n",
"
\n",
"
11349607 rows × 2 columns
\n",
"
"
],
"text/plain": [
" orcid n_keywords\n",
"3873190 0000-0002-0673-0341 154\n",
"8982683 0000-0003-3343-5660 148\n",
"5494950 0000-0002-6075-3501 140\n",
"6724874 0000-0002-7060-4112 140\n",
"1562610 0000-0001-5287-1949 132\n",
"... ... ...\n",
"11349602 0000-0002-1686-1935 \n",
"11349603 0000-0002-3800-6331 \n",
"11349604 0000-0002-8783-5814 \n",
"11349605 0000-0002-7584-2283 \n",
"11349606 0000-0003-0529-3538 \n",
"\n",
"[11349607 rows x 2 columns]"
]
},
"execution_count": 87,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rank profiles by keyword count; rows with a missing count sort last.\n",
"keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values(\n",
"    by='n_keywords',\n",
"    ascending=False,\n",
")\n",
"keywords_by_orcid"
]
},
{
"cell_type": "code",
"execution_count": 88,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"0000-0002-0673-0341",
"0000-0003-3343-5660",
"0000-0002-6075-3501",
"0000-0002-7060-4112",
"0000-0001-5287-1949",
"0000-0002-9638-8091",
"0000-0002-4071-0301",
"0000-0003-1097-926X",
"0000-0001-9462-5666",
"0000-0002-0929-2412",
"0000-0002-0115-7195",
"0000-0002-4235-4259",
"0000-0003-0076-6287",
"0000-0001-9715-9357",
"0000-0002-1878-9762",
"0000-0001-6307-6027",
"0000-0003-2273-9888",
"0000-0002-0937-7061",
"0000-0002-1770-9660",
"0000-0001-5898-6843",
"0000-0003-2998-5520",
"0000-0001-5696-1052",
"0000-0003-1799-0971",
"0000-0002-0156-3580",
"0000-0002-9625-6742",
"0000-0003-1399-7156",
"0000-0001-6537-7683",
"0000-0001-9985-1697",
"0000-0002-8401-8018",
"0000-0003-4246-8579",
"0000-0001-7857-4133",
"0000-0002-7710-0355",
"0000-0001-5869-2204",
"0000-0002-8083-7382",
"0000-0002-4497-3176",
"0000-0001-7654-5013",
"0000-0002-4488-2880",
"0000-0001-8670-4372",
"0000-0003-4374-6374",
"0000-0001-6939-3859",
"0000-0003-2509-2549",
"0000-0002-3186-8860",
"0000-0001-5230-715X",
"0000-0002-0441-1507",
"0000-0001-9336-6850",
"0000-0003-0209-180X",
"0000-0001-5458-7167",
"0000-0002-9381-2264",
"0000-0002-0463-0048",
"0000-0002-9293-0189",
"0000-0002-8227-5387",
"0000-0002-3061-3364",
"0000-0002-3123-3021",
"0000-0003-1071-4296",
"0000-0003-3340-6413",
"0000-0002-8659-6321",
"0000-0002-1718-1632",
"0000-0003-3584-6834",
"0000-0002-8644-8396",
"0000-0002-2935-1934",
"0000-0001-5167-7466",
"0000-0003-1693-3190",
"0000-0002-8449-2211",
"0000-0002-3532-043X",
"0000-0003-4505-3678",
"0000-0001-6861-9561",
"0000-0003-4608-3844",
"0000-0003-4673-1063",
"0000-0002-6347-9464",
"0000-0001-8174-8835",
"0000-0002-8918-2781",
"0000-0003-4511-7942",
"0000-0002-5274-7742",
"0000-0001-9280-6017",
"0000-0003-2532-2906",
"0000-0001-5819-4555",
"0000-0003-3720-1183",
"0000-0001-9586-0780",
"0000-0002-1103-9651",
"0000-0003-2550-1859",
"0000-0001-8135-2304",
"0000-0003-1863-0265",
"0000-0002-8499-1045",
"0000-0001-8733-5230",
"0000-0002-8665-9281",
"0000-0002-5306-7781",
"0000-0003-2218-1343",
"0000-0001-7818-3212",
"0000-0002-3494-2624",
"0000-0003-3342-6123",
"0000-0002-8072-1152",
"0000-0002-4982-5236",
"0000-0002-0715-0461",
"0000-0001-7728-4046",
"0000-0001-5300-3932",
"0000-0003-4486-2684",
"0000-0002-2252-672X",
"0000-0001-7392-9361",
"0000-0001-5556-8275",
"0000-0002-9569-0678"
],
"y": [
154,
148,
140,
140,
132,
124,
115,
110,
106,
105,
102,
100,
94,
92,
92,
88,
86,
78,
77,
76,
75,
75,
72,
71,
70,
68,
68,
68,
67,
66,
64,
64,
63,
62,
61,
61,
61,
61,
60,
60,
56,
55,
54,
54,
53,
53,
53,
53,
53,
52,
52,
52,
51,
51,
51,
50,
50,
50,
50,
50,
49,
49,
49,
48,
48,
48,
48,
48,
47,
47,
47,
47,
46,
46,
46,
45,
45,
45,
45,
44,
44,
44,
44,
44,
44,
44,
44,
44,
43,
43,
43,
43,
43,
43,
43,
43,
42,
42,
42,
42
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Keywords provided by ORCiD"
},
"xaxis": {
"range": [
-0.5,
99.5
],
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# set_top_n (defined earlier in the notebook) is expected to refresh the\n",
"# TOP_N / TOP_RANGE globals used below -- here for the 100 profiles with most keywords.\n",
"set_top_n(100)\n",
"data = [\n",
"    go.Bar(\n",
"        x=keywords_by_orcid['orcid'].iloc[:TOP_N],\n",
"        y=keywords_by_orcid['n_keywords'].iloc[:TOP_N],\n",
"    )\n",
"]\n",
"layout = go.Layout(\n",
"    title='Keywords provided by ORCiD',\n",
"    xaxis={'tickangle': 45, 'tickfont': {'size': 12}, 'range': TOP_RANGE},\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 89,
"metadata": {},
"outputs": [],
"source": [
"# Occurrences of each keyword across all profiles, most common first.\n",
"top_keywords = (\n",
"    df[['orcid', 'keywords']]\n",
"    .explode('keywords')\n",
"    .reset_index(drop=True)\n",
"    .groupby('keywords')\n",
"    .count()\n",
"    .sort_values('orcid', ascending=False)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 90,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"machine learning",
"bioinformatics",
"education",
"molecular biology",
"cancer",
"artificial intelligence",
"ecology",
"epidemiology",
"public health",
"microbiology",
"neuroscience",
"immunology",
"climate change",
"genetics",
"remote sensing",
"biochemistry",
"genomics",
"biotechnology",
"nanotechnology",
"sustainability",
"educación",
"deep learning",
"gis",
"psychology",
"computer vision",
"marketing",
"nutrition",
"data science",
"innovation",
"statistics",
"data mining",
"nanomaterials",
"image processing",
"robotics",
"management",
"optimization",
"renewable energy",
"biomaterials",
"chemistry",
"gender",
"diabetes",
"educação",
"architecture",
"history",
"catalysis",
"electrochemistry",
"research",
"evolution",
"energy",
"biodiversity"
],
"y": [
8870,
5519,
5350,
4641,
4218,
3986,
3983,
3850,
3754,
3622,
3560,
3516,
3419,
3387,
3327,
3046,
2838,
2748,
2729,
2726,
2629,
2577,
2563,
2451,
2373,
2274,
2242,
2232,
2221,
2180,
2141,
2135,
2132,
2125,
2117,
2113,
2056,
2040,
2039,
2035,
2024,
1945,
1889,
1853,
1835,
1831,
1818,
1815,
1797,
1747
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top-50 keywords occurrence"
},
"xaxis": {
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"set_top_n(50)\n",
"data = [\n",
" go.Bar(\n",
" x=top_keywords[:TOP_N].index,\n",
" y=top_keywords[:TOP_N]['orcid']\n",
" )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
" title='Top-%s keywords occurrence' % TOP_N,\n",
" xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Education"
]
},
{
"cell_type": "code",
"execution_count": 91,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 2.493817e+06\n",
"mean 1.816203e+00\n",
"std 1.134214e+00\n",
"min 1.000000e+00\n",
"25% 1.000000e+00\n",
"50% 1.000000e+00\n",
"75% 2.000000e+00\n",
"max 2.190000e+02\n",
"Name: n_education, dtype: float64"
]
},
"execution_count": 91,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.n_education.describe()"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
" ext_works_source | \n",
" n_ext_work_source | \n",
" authoritative | \n",
"
\n",
" \n",
" \n",
" \n",
" 3171493 | \n",
" 0000-0002-1927-0292 | \n",
" True | \n",
" True | \n",
" phd. carmen m | \n",
" galvez-sánchez | \n",
" my name is carmen maria galvez sánchez. i´m a ... | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" [chronic pain, gender-based violence, qualitat... | \n",
" [[loop profile, 509331], [scopus author id, 57... | \n",
" [[psychology, 2019-2020 course. degree in psyc... | \n",
" [[research technician, fundación pública andal... | \n",
" 35 | \n",
" [phd. carmen m galvez-sánchez, multidisciplina... | \n",
" 2016-04-18 14:28:57.237000+00:00 | \n",
" 2021-04-25 18:24:06.260000+00:00 | \n",
" 24 | \n",
" 0 | \n",
" 0 | \n",
" 7 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" 5 | \n",
" 219 | \n",
" 4 | \n",
" [multidisciplinary digital publishing institut... | \n",
" 4 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"3171493 0000-0002-1927-0292 True True \n",
"\n",
" given_names family_name \\\n",
"3171493 phd. carmen m galvez-sánchez \n",
"\n",
" biography other_names urls \\\n",
"3171493 my name is carmen maria galvez sánchez. i´m a ... NaN NaN \n",
"\n",
" primary_email keywords \\\n",
"3171493 [chronic pain, gender-based violence, qualitat... \n",
"\n",
" external_ids \\\n",
"3171493 [[loop profile, 509331], [scopus author id, 57... \n",
"\n",
" education \\\n",
"3171493 [[psychology, 2019-2020 course. degree in psyc... \n",
"\n",
" employment n_works \\\n",
"3171493 [[research technician, fundación pública andal... 35 \n",
"\n",
" works_source \\\n",
"3171493 [phd. carmen m galvez-sánchez, multidisciplina... \n",
"\n",
" activation_date last_update_date \\\n",
"3171493 2016-04-18 14:28:57.237000+00:00 2021-04-25 18:24:06.260000+00:00 \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids other_urls label \\\n",
"3171493 24 0 0 7 NaN True \n",
"\n",
" primary_email_domain other_email_domains url_domains \\\n",
"3171493 NaN NaN NaN \n",
"\n",
" other_url_domains n_emails n_urls n_ids n_keywords n_education \\\n",
"3171493 NaN 2 5 219 \n",
"\n",
" n_employment ext_works_source \\\n",
"3171493 4 [multidisciplinary digital publishing institut... \n",
"\n",
" n_ext_work_source authoritative \n",
"3171493 4 True "
]
},
"execution_count": 92,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.n_education == df.n_education.max()]"
]
},
{
"cell_type": "code",
"execution_count": 93,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" education | \n",
"
\n",
" \n",
" \n",
" \n",
" 25 | \n",
" 0000-0002-2343-910X | \n",
" [aeronautics and astronautics, phd, massachuse... | \n",
"
\n",
" \n",
" 25 | \n",
" 0000-0002-2343-910X | \n",
" [aeronautics and astronautics, sm, massachuset... | \n",
"
\n",
" \n",
" 25 | \n",
" 0000-0002-2343-910X | \n",
" [mechanical engineering and material science, ... | \n",
"
\n",
" \n",
" 26 | \n",
" 0000-0002-2638-4108 | \n",
" [public law, ph doctor, university of oviedo, ... | \n",
"
\n",
" \n",
" 47 | \n",
" 0000-0003-2513-9984 | \n",
" [medicine and surgery, student, cairo universi... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349602 | \n",
" 0000-0002-1686-1935 | \n",
" [, , south china agricultural university, guan... | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" [richard gilder graduate school, phd in compar... | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" [geological sciences and history (dual major),... | \n",
"
\n",
" \n",
" 11349605 | \n",
" 0000-0002-7584-2283 | \n",
" [school of electronics and information, master... | \n",
"
\n",
" \n",
" 11349605 | \n",
" 0000-0002-7584-2283 | \n",
" [ department of electrical engineering, bachel... | \n",
"
\n",
" \n",
"
\n",
"
4529278 rows × 2 columns
\n",
"
"
],
"text/plain": [
" orcid \\\n",
"25 0000-0002-2343-910X \n",
"25 0000-0002-2343-910X \n",
"25 0000-0002-2343-910X \n",
"26 0000-0002-2638-4108 \n",
"47 0000-0003-2513-9984 \n",
"... ... \n",
"11349602 0000-0002-1686-1935 \n",
"11349603 0000-0002-3800-6331 \n",
"11349603 0000-0002-3800-6331 \n",
"11349605 0000-0002-7584-2283 \n",
"11349605 0000-0002-7584-2283 \n",
"\n",
" education \n",
"25 [aeronautics and astronautics, phd, massachuse... \n",
"25 [aeronautics and astronautics, sm, massachuset... \n",
"25 [mechanical engineering and material science, ... \n",
"26 [public law, ph doctor, university of oviedo, ... \n",
"47 [medicine and surgery, student, cairo universi... \n",
"... ... \n",
"11349602 [, , south china agricultural university, guan... \n",
"11349603 [richard gilder graduate school, phd in compar... \n",
"11349603 [geological sciences and history (dual major),... \n",
"11349605 [school of electronics and information, master... \n",
"11349605 [ department of electrical engineering, bachel... \n",
"\n",
"[4529278 rows x 2 columns]"
]
},
"execution_count": 93,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exploded_education = df[['orcid', 'education']].explode('education').dropna()\n",
"exploded_education"
]
},
{
"cell_type": "code",
"execution_count": 94,
"metadata": {},
"outputs": [],
"source": [
"exploded_education[['department', 'degree', 'university', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_education.education.tolist(), index=exploded_education.index)"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {},
"outputs": [],
"source": [
"exploded_education.id.replace('', pd.NA, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 96,
"metadata": {},
"outputs": [],
"source": [
"# exploded_education.groupby('orcid').id.count().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 97,
"metadata": {},
"outputs": [],
"source": [
"df = df.merge(exploded_education.groupby('orcid').id.count().reset_index(), on='orcid', how='left')\n",
"df.rename(columns={'id': 'n_valid_education'}, inplace=True)"
]
},
{
"cell_type": "code",
"execution_count": 98,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
" ext_works_source | \n",
" n_ext_work_source | \n",
" authoritative | \n",
" n_valid_education | \n",
"
\n",
" \n",
" \n",
" \n",
" 68 | \n",
" 0000-0002-0427-9745 | \n",
" True | \n",
" True | \n",
" a. can | \n",
" inci | \n",
" i am a professor of finance at bryant universi... | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" [[researcherid, b-5471-2018], [scopus author i... | \n",
" [[finance, ph.d., university of michigan - ros... | \n",
" [[professor of finance, bryant university, smi... | \n",
" 34 | \n",
" [a. can inci] | \n",
" 2018-01-20 02:58:05.199000+00:00 | \n",
" 2020-06-16 12:35:09.403000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" <NA> | \n",
" 4 | \n",
" 5 | \n",
" [] | \n",
" 0 | \n",
" False | \n",
" 0.0 | \n",
"
\n",
" \n",
" 82 | \n",
" 0000-0002-3380-6671 | \n",
" True | \n",
" True | \n",
" abdul | \n",
" asis pata | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[agribisnis, m.si, universitas hasanuddin, ma... | \n",
" [[s.p, universitas muslim maros, maros, , id, ... | \n",
" 0 | \n",
" NaN | \n",
" 2018-02-12 02:08:37.018000+00:00 | \n",
" 2018-02-12 02:22:33.378000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" 0.0 | \n",
"
\n",
" \n",
" 91 | \n",
" 0000-0001-6902-6549 | \n",
" True | \n",
" True | \n",
" abubakar | \n",
" muhammad | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[school of electrical and information enginee... | \n",
" [[lecturer, university of faisalabad, faisalab... | \n",
" 1 | \n",
" [multidisciplinary digital publishing institute] | \n",
" 2017-07-06 10:29:17.738000+00:00 | \n",
" 2020-08-01 05:18:53.393000+00:00 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" 1 | \n",
" [multidisciplinary digital publishing institute] | \n",
" 1 | \n",
" True | \n",
" 0.0 | \n",
"
\n",
" \n",
" 94 | \n",
" 0000-0002-6142-6406 | \n",
" True | \n",
" True | \n",
" adam | \n",
" mamadou | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[département deconomie sociologie rurale et t... | \n",
" [[, institut national de la recherche agronomi... | \n",
" 0 | \n",
" NaN | \n",
" 2018-02-15 09:54:59.943000+00:00 | \n",
" 2018-02-15 10:19:27.869000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" 0.0 | \n",
"
\n",
" \n",
" 167 | \n",
" 0000-0002-0923-8176 | \n",
" True | \n",
" True | \n",
" alicia alelí | \n",
" chaparro caso-lópez | \n",
" <NA> | \n",
" [alicia a. chaparro] | \n",
" NaN | \n",
" <NA> | \n",
" [eficacia escoalr, factores asociados al logro... | \n",
" [[researcherid, k-7451-2014], [scopus author i... | \n",
" [[facultad de psicología, doctorado en análisi... | \n",
" [[investigador titutal c, universidad autonoma... | \n",
" 45 | \n",
" [alicia alelí chaparro caso-lópez, crossref] | \n",
" 2014-09-15 00:28:08.963000+00:00 | \n",
" 2020-08-18 19:42:26.026000+00:00 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 25 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" [crossref] | \n",
" 1 | \n",
" True | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349563 | \n",
" 0000-0002-1842-4130 | \n",
" True | \n",
" True | \n",
" josé de jesús | \n",
" cázares-marinero | \n",
" <NA> | \n",
" [josé cázares] | \n",
" [[linkedin, https://www.linkedin.com/feed/], [... | \n",
" <NA> | \n",
" [biotechnology, biosurfactants, chemical biolo... | \n",
" [[researcherid, h-2597-2013], [scopus author i... | \n",
" [[charles friedel, postdoc, école nationale su... | \n",
" [[mtc, polioles, mexico, , mx, , ], [head of r... | \n",
" 17 | \n",
" [crossref metadata search, scopus - elsevier, ... | \n",
" 2013-07-09 14:39:30.950000+00:00 | \n",
" 2020-12-10 17:42:20.176000+00:00 | \n",
" 17 | \n",
" 0 | \n",
" 0 | \n",
" 29 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {google.com, researchgate.net, linkedin.com} | \n",
" NaN | \n",
" <NA> | \n",
" 3 | \n",
" 2 | \n",
" 5 | \n",
" 3 | \n",
" 3 | \n",
" [crossref metadata search, scopus - elsevier] | \n",
" 2 | \n",
" True | \n",
" 0.0 | \n",
"
\n",
" \n",
" 11349569 | \n",
" 0000-0003-0459-4822 | \n",
" True | \n",
" True | \n",
" luana | \n",
" <NA> | \n",
" mestranda em tecnologia na saúde e foi aluna o... | \n",
" [luana bastos morey] | \n",
" [[site da equipe unidos pela saúde (voluntári... | \n",
" <NA> | \n",
" [tradução; língua espanhol; língua portuguesa;... | \n",
" NaN | \n",
" [[pós-graduação em tecnologia em saúde stricto... | \n",
" [[professora de espanhol e português para estr... | \n",
" 7 | \n",
" [luana arrial bastos] | \n",
" 2017-05-11 13:14:59.372000+00:00 | \n",
" 2020-12-08 20:18:24.163000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {facebook.com, unidospelasaude.com.br} | \n",
" NaN | \n",
" <NA> | \n",
" 2 | \n",
" <NA> | \n",
" 2 | \n",
" 4 | \n",
" 3 | \n",
" [] | \n",
" 0 | \n",
" False | \n",
" 3.0 | \n",
"
\n",
" \n",
" 11349571 | \n",
" 0000-0003-0057-1551 | \n",
" True | \n",
" True | \n",
" lyudmyla | \n",
" antypenko | \n",
" the phd degree of pharmacy was received under ... | \n",
" [lyudmila nikolaevna antipenko (russian transl... | \n",
" NaN | \n",
" <NA> | \n",
" [pharmaceutical chemistry, structure elucidati... | \n",
" [[scopus author id, 55070809900], [researcheri... | \n",
" [[centre for nanomaterials, advanced technolog... | \n",
" [[visiting scientist, north dakota state unive... | \n",
" 35 | \n",
" [crossref metadata search, scopus - elsevier, ... | \n",
" 2014-02-19 08:15:15.698000+00:00 | \n",
" 2020-12-09 18:14:17.963000+00:00 | \n",
" 28 | \n",
" 0 | \n",
" 11 | \n",
" 17 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" 5 | \n",
" 7 | \n",
" 8 | \n",
" [crossref metadata search, scopus - elsevier, ... | \n",
" 4 | \n",
" True | \n",
" 4.0 | \n",
"
\n",
" \n",
" 11349585 | \n",
" 0000-0003-4653-4705 | \n",
" True | \n",
" True | \n",
" patricia | \n",
" teixeira | \n",
" 2005 - phd, university of coimbra july 2009-ju... | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" [steroid hormones, ecotoxicology, heavy metals... | \n",
" [[researcherid, i-6863-2013], [scopus author i... | \n",
" [[, phd, university of coimbra, coimbra, , pt,... | \n",
" [[senior researcher, university of coimbra, co... | \n",
" 95 | \n",
" [ciênciavitae, scopus - elsevier, pg cardoso, ... | \n",
" 2013-11-26 10:59:34.331000+00:00 | \n",
" 2020-12-02 15:28:26.221000+00:00 | \n",
" 90 | \n",
" 0 | \n",
" 0 | \n",
" 42 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" 3 | \n",
" 7 | \n",
" 1 | \n",
" 3 | \n",
" [ciênciavitae, scopus - elsevier, pg cardoso, ... | \n",
" 4 | \n",
" True | \n",
" 0.0 | \n",
"
\n",
" \n",
" 11349602 | \n",
" 0000-0002-1686-1935 | \n",
" True | \n",
" True | \n",
" youxia | \n",
" wang | \n",
" youxia wang (1995-), native of zunyi, guizhou ... | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[institute of animal nutrition, master degree... | \n",
" [[master, sichuan agricultural university , ch... | \n",
" 0 | \n",
" NaN | \n",
" 2020-12-11 02:11:51.808000+00:00 | \n",
" 2020-12-11 03:25:28.263000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" 1.0 | \n",
"
\n",
" \n",
"
\n",
"
653409 rows × 37 columns
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"68 0000-0002-0427-9745 True True \n",
"82 0000-0002-3380-6671 True True \n",
"91 0000-0001-6902-6549 True True \n",
"94 0000-0002-6142-6406 True True \n",
"167 0000-0002-0923-8176 True True \n",
"... ... ... ... \n",
"11349563 0000-0002-1842-4130 True True \n",
"11349569 0000-0003-0459-4822 True True \n",
"11349571 0000-0003-0057-1551 True True \n",
"11349585 0000-0003-4653-4705 True True \n",
"11349602 0000-0002-1686-1935 True True \n",
"\n",
" given_names family_name \\\n",
"68 a. can inci \n",
"82 abdul asis pata \n",
"91 abubakar muhammad \n",
"94 adam mamadou \n",
"167 alicia alelí chaparro caso-lópez \n",
"... ... ... \n",
"11349563 josé de jesús cázares-marinero \n",
"11349569 luana \n",
"11349571 lyudmyla antypenko \n",
"11349585 patricia teixeira \n",
"11349602 youxia wang \n",
"\n",
" biography \\\n",
"68 i am a professor of finance at bryant universi... \n",
"82 \n",
"91 \n",
"94 \n",
"167 \n",
"... ... \n",
"11349563 \n",
"11349569 mestranda em tecnologia na saúde e foi aluna o... \n",
"11349571 the phd degree of pharmacy was received under ... \n",
"11349585 2005 - phd, university of coimbra july 2009-ju... \n",
"11349602 youxia wang (1995-), native of zunyi, guizhou ... \n",
"\n",
" other_names \\\n",
"68 NaN \n",
"82 NaN \n",
"91 NaN \n",
"94 NaN \n",
"167 [alicia a. chaparro] \n",
"... ... \n",
"11349563 [josé cázares] \n",
"11349569 [luana bastos morey] \n",
"11349571 [lyudmila nikolaevna antipenko (russian transl... \n",
"11349585 NaN \n",
"11349602 NaN \n",
"\n",
" urls primary_email \\\n",
"68 NaN \n",
"82 NaN \n",
"91 NaN \n",
"94 NaN \n",
"167 NaN \n",
"... ... ... \n",
"11349563 [[linkedin, https://www.linkedin.com/feed/], [... \n",
"11349569 [[site da equipe unidos pela saúde (voluntári... \n",
"11349571 NaN \n",
"11349585 NaN \n",
"11349602 NaN \n",
"\n",
" keywords \\\n",
"68 NaN \n",
"82 NaN \n",
"91 NaN \n",
"94 NaN \n",
"167 [eficacia escoalr, factores asociados al logro... \n",
"... ... \n",
"11349563 [biotechnology, biosurfactants, chemical biolo... \n",
"11349569 [tradução; língua espanhol; língua portuguesa;... \n",
"11349571 [pharmaceutical chemistry, structure elucidati... \n",
"11349585 [steroid hormones, ecotoxicology, heavy metals... \n",
"11349602 NaN \n",
"\n",
" external_ids \\\n",
"68 [[researcherid, b-5471-2018], [scopus author i... \n",
"82 NaN \n",
"91 NaN \n",
"94 NaN \n",
"167 [[researcherid, k-7451-2014], [scopus author i... \n",
"... ... \n",
"11349563 [[researcherid, h-2597-2013], [scopus author i... \n",
"11349569 NaN \n",
"11349571 [[scopus author id, 55070809900], [researcheri... \n",
"11349585 [[researcherid, i-6863-2013], [scopus author i... \n",
"11349602 NaN \n",
"\n",
" education \\\n",
"68 [[finance, ph.d., university of michigan - ros... \n",
"82 [[agribisnis, m.si, universitas hasanuddin, ma... \n",
"91 [[school of electrical and information enginee... \n",
"94 [[département deconomie sociologie rurale et t... \n",
"167 [[facultad de psicología, doctorado en análisi... \n",
"... ... \n",
"11349563 [[charles friedel, postdoc, école nationale su... \n",
"11349569 [[pós-graduação em tecnologia em saúde stricto... \n",
"11349571 [[centre for nanomaterials, advanced technolog... \n",
"11349585 [[, phd, university of coimbra, coimbra, , pt,... \n",
"11349602 [[institute of animal nutrition, master degree... \n",
"\n",
" employment n_works \\\n",
"68 [[professor of finance, bryant university, smi... 34 \n",
"82 [[s.p, universitas muslim maros, maros, , id, ... 0 \n",
"91 [[lecturer, university of faisalabad, faisalab... 1 \n",
"94 [[, institut national de la recherche agronomi... 0 \n",
"167 [[investigador titutal c, universidad autonoma... 45 \n",
"... ... ... \n",
"11349563 [[mtc, polioles, mexico, , mx, , ], [head of r... 17 \n",
"11349569 [[professora de espanhol e português para estr... 7 \n",
"11349571 [[visiting scientist, north dakota state unive... 35 \n",
"11349585 [[senior researcher, university of coimbra, co... 95 \n",
"11349602 [[master, sichuan agricultural university , ch... 0 \n",
"\n",
" works_source \\\n",
"68 [a. can inci] \n",
"82 NaN \n",
"91 [multidisciplinary digital publishing institute] \n",
"94 NaN \n",
"167 [alicia alelí chaparro caso-lópez, crossref] \n",
"... ... \n",
"11349563 [crossref metadata search, scopus - elsevier, ... \n",
"11349569 [luana arrial bastos] \n",
"11349571 [crossref metadata search, scopus - elsevier, ... \n",
"11349585 [ciênciavitae, scopus - elsevier, pg cardoso, ... \n",
"11349602 NaN \n",
"\n",
" activation_date last_update_date \\\n",
"68 2018-01-20 02:58:05.199000+00:00 2020-06-16 12:35:09.403000+00:00 \n",
"82 2018-02-12 02:08:37.018000+00:00 2018-02-12 02:22:33.378000+00:00 \n",
"91 2017-07-06 10:29:17.738000+00:00 2020-08-01 05:18:53.393000+00:00 \n",
"94 2018-02-15 09:54:59.943000+00:00 2018-02-15 10:19:27.869000+00:00 \n",
"167 2014-09-15 00:28:08.963000+00:00 2020-08-18 19:42:26.026000+00:00 \n",
"... ... ... \n",
"11349563 2013-07-09 14:39:30.950000+00:00 2020-12-10 17:42:20.176000+00:00 \n",
"11349569 2017-05-11 13:14:59.372000+00:00 2020-12-08 20:18:24.163000+00:00 \n",
"11349571 2014-02-19 08:15:15.698000+00:00 2020-12-09 18:14:17.963000+00:00 \n",
"11349585 2013-11-26 10:59:34.331000+00:00 2020-12-02 15:28:26.221000+00:00 \n",
"11349602 2020-12-11 02:11:51.808000+00:00 2020-12-11 03:25:28.263000+00:00 \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids other_urls label \\\n",
"68 0 0 0 0 NaN False \n",
"82 0 0 0 0 NaN False \n",
"91 1 0 0 0 NaN True \n",
"94 0 0 0 0 NaN False \n",
"167 4 0 0 25 NaN True \n",
"... ... ... ... ... ... ... \n",
"11349563 17 0 0 29 NaN False \n",
"11349569 0 0 0 0 NaN False \n",
"11349571 28 0 11 17 NaN True \n",
"11349585 90 0 0 42 NaN False \n",
"11349602 0 0 0 0 NaN False \n",
"\n",
" primary_email_domain other_email_domains \\\n",
"68 NaN NaN \n",
"82 NaN NaN \n",
"91 NaN NaN \n",
"94 NaN NaN \n",
"167 NaN NaN \n",
"... ... ... \n",
"11349563 NaN NaN \n",
"11349569 NaN NaN \n",
"11349571 NaN NaN \n",
"11349585 NaN NaN \n",
"11349602 NaN NaN \n",
"\n",
" url_domains other_url_domains \\\n",
"68 NaN NaN \n",
"82 NaN NaN \n",
"91 NaN NaN \n",
"94 NaN NaN \n",
"167 NaN NaN \n",
"... ... ... \n",
"11349563 {google.com, researchgate.net, linkedin.com} NaN \n",
"11349569 {facebook.com, unidospelasaude.com.br} NaN \n",
"11349571 NaN NaN \n",
"11349585 NaN NaN \n",
"11349602 NaN NaN \n",
"\n",
" n_emails n_urls n_ids n_keywords n_education n_employment \\\n",
"68 2 4 5 \n",
"82 1 1 \n",
"91 1 1 \n",
"94 1 1 \n",
"167 2 3 4 5 \n",
"... ... ... ... ... ... ... \n",
"11349563 3 2 5 3 3 \n",
"11349569 2 2 4 3 \n",
"11349571 2 5 7 8 \n",
"11349585 3 7 1 3 \n",
"11349602 2 1 \n",
"\n",
" ext_works_source \\\n",
"68 [] \n",
"82 NaN \n",
"91 [multidisciplinary digital publishing institute] \n",
"94 NaN \n",
"167 [crossref] \n",
"... ... \n",
"11349563 [crossref metadata search, scopus - elsevier] \n",
"11349569 [] \n",
"11349571 [crossref metadata search, scopus - elsevier, ... \n",
"11349585 [ciênciavitae, scopus - elsevier, pg cardoso, ... \n",
"11349602 NaN \n",
"\n",
" n_ext_work_source authoritative n_valid_education \n",
"68 0 False 0.0 \n",
"82 False 0.0 \n",
"91 1 True 0.0 \n",
"94 False 0.0 \n",
"167 1 True 0.0 \n",
"... ... ... ... \n",
"11349563 2 True 0.0 \n",
"11349569 0 False 3.0 \n",
"11349571 4 True 4.0 \n",
"11349585 4 True 0.0 \n",
"11349602 False 1.0 \n",
"\n",
"[653409 rows x 37 columns]"
]
},
"execution_count": 98,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.n_education != df.n_valid_education]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Employment"
]
},
{
"cell_type": "code",
"execution_count": 99,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"count 2.750231e+06\n",
"mean 1.666351e+00\n",
"std 1.537579e+00\n",
"min 1.000000e+00\n",
"25% 1.000000e+00\n",
"50% 1.000000e+00\n",
"75% 2.000000e+00\n",
"max 1.980000e+02\n",
"Name: n_employment, dtype: float64"
]
},
"execution_count": 99,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.n_employment.describe()"
]
},
{
"cell_type": "code",
"execution_count": 100,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
" ext_works_source | \n",
" n_ext_work_source | \n",
" authoritative | \n",
" n_valid_education | \n",
"
\n",
" \n",
" \n",
" \n",
" 5725751 | \n",
" 0000-0002-0293-964X | \n",
" True | \n",
" True | \n",
" ben zhong | \n",
" tang | \n",
" <NA> | \n",
" [唐本忠] | \n",
" [[homepage, http://ihome.ust.hk/~tangbenz/]] | \n",
" tangbenz@ust.hk | \n",
" [light-emitting molecules, hyperbranched polym... | \n",
" [[hkust profile, tang-benzhong], [researcherid... | \n",
" [[department of chemistry and faculty of pharm... | \n",
" [[chair professor, division of biomedical engi... | \n",
" 435 | \n",
" [tang, benzhong, crossref] | \n",
" 2015-03-13 00:28:33.270000+00:00 | \n",
" 2021-05-03 16:55:47.245000+00:00 | \n",
" 372 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" ust.hk | \n",
" NaN | \n",
" {ust.hk} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" 3 | \n",
" 7 | \n",
" 7 | \n",
" 198 | \n",
" [crossref] | \n",
" 1 | \n",
" True | \n",
" 3.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"5725751 0000-0002-0293-964X True True \n",
"\n",
" given_names family_name biography other_names \\\n",
"5725751 ben zhong tang [唐本忠] \n",
"\n",
" urls primary_email \\\n",
"5725751 [[homepage, http://ihome.ust.hk/~tangbenz/]] tangbenz@ust.hk \n",
"\n",
" keywords \\\n",
"5725751 [light-emitting molecules, hyperbranched polym... \n",
"\n",
" external_ids \\\n",
"5725751 [[hkust profile, tang-benzhong], [researcherid... \n",
"\n",
" education \\\n",
"5725751 [[department of chemistry and faculty of pharm... \n",
"\n",
" employment n_works \\\n",
"5725751 [[chair professor, division of biomedical engi... 435 \n",
"\n",
" works_source activation_date \\\n",
"5725751 [tang, benzhong, crossref] 2015-03-13 00:28:33.270000+00:00 \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
"5725751 2021-05-03 16:55:47.245000+00:00 372 0 0 0 \n",
"\n",
" other_urls label primary_email_domain other_email_domains \\\n",
"5725751 NaN False ust.hk NaN \n",
"\n",
" url_domains other_url_domains n_emails n_urls n_ids n_keywords \\\n",
"5725751 {ust.hk} NaN 1 3 7 \n",
"\n",
" n_education n_employment ext_works_source n_ext_work_source \\\n",
"5725751 7 198 [crossref] 1 \n",
"\n",
" authoritative n_valid_education \n",
"5725751 True 3.0 "
]
},
"execution_count": 100,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df[df.n_employment == df.n_employment.max()]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's count how many employments have a valid assigned id by orcid (ringols, isni, grid, etc.)"
]
},
{
"cell_type": "code",
"execution_count": 101,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 0000-0001-6112-5550 | \n",
" [professor, peoples friendship university of r... | \n",
"
\n",
" \n",
" 3 | \n",
" 0000-0001-7071-8294 | \n",
" [researcher (academic), universidad de zaragoz... | \n",
"
\n",
" \n",
" 3 | \n",
" 0000-0001-7071-8294 | \n",
" [researcher (academic), instituto de síntesis ... | \n",
"
\n",
" \n",
" 5 | \n",
" 0000-0001-7402-0096 | \n",
" [, kth royal institute of technology, stockhol... | \n",
"
\n",
" \n",
" 8 | \n",
" 0000-0001-8315-2066 | \n",
" [, universidad de córdoba, córdoba, andalucía,... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349601 | \n",
" 0000-0003-2606-0936 | \n",
" [post-doc, institute of biochemistry and cell ... | \n",
"
\n",
" \n",
" 11349602 | \n",
" 0000-0002-1686-1935 | \n",
" [master, sichuan agricultural university , che... | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" [assistant professor, baruch college, city uni... | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" [postdoctoral scholar, university of californi... | \n",
"
\n",
" \n",
" 11349605 | \n",
" 0000-0002-7584-2283 | \n",
" [lecturer, henan institute of science and tech... | \n",
"
\n",
" \n",
"
\n",
"
4582849 rows × 2 columns
\n",
"
"
],
"text/plain": [
" orcid \\\n",
"1 0000-0001-6112-5550 \n",
"3 0000-0001-7071-8294 \n",
"3 0000-0001-7071-8294 \n",
"5 0000-0001-7402-0096 \n",
"8 0000-0001-8315-2066 \n",
"... ... \n",
"11349601 0000-0003-2606-0936 \n",
"11349602 0000-0002-1686-1935 \n",
"11349603 0000-0002-3800-6331 \n",
"11349603 0000-0002-3800-6331 \n",
"11349605 0000-0002-7584-2283 \n",
"\n",
" employment \n",
"1 [professor, peoples friendship university of r... \n",
"3 [researcher (academic), universidad de zaragoz... \n",
"3 [researcher (academic), instituto de síntesis ... \n",
"5 [, kth royal institute of technology, stockhol... \n",
"8 [, universidad de córdoba, córdoba, andalucía,... \n",
"... ... \n",
"11349601 [post-doc, institute of biochemistry and cell ... \n",
"11349602 [master, sichuan agricultural university , che... \n",
"11349603 [assistant professor, baruch college, city uni... \n",
"11349603 [postdoctoral scholar, university of californi... \n",
"11349605 [lecturer, henan institute of science and tech... \n",
"\n",
"[4582849 rows x 2 columns]"
]
},
"execution_count": 101,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"exploded_employment = df[['orcid', 'employment']].explode('employment').dropna()\n",
"exploded_employment"
]
},
{
"cell_type": "code",
"execution_count": 102,
"metadata": {},
"outputs": [],
"source": [
"# Spread each employment entry (a list) over one column per field\n",
"employment_fields = ['role', 'institution', 'city', 'region', 'country', 'id', 'id_scheme']\n",
"exploded_employment[employment_fields] = pd.DataFrame(\n",
"    exploded_employment['employment'].tolist(),\n",
"    index=exploded_employment.index,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [],
"source": [
"# Turn empty organisation ids into missing values so they are not counted as valid ids.\n",
"# The result is assigned back to the column: calling replace(inplace=True) on the\n",
"# attribute-accessed column is a chained in-place update, which raises a warning in\n",
"# recent pandas and leaves the frame unchanged under Copy-on-Write.\n",
"exploded_employment['id'] = exploded_employment['id'].replace('', pd.NA)"
]
},
{
"cell_type": "code",
"execution_count": 104,
"metadata": {},
"outputs": [],
"source": [
"# exploded_employment.groupby('orcid').id.count().reset_index()"
]
},
{
"cell_type": "code",
"execution_count": 105,
"metadata": {},
"outputs": [],
"source": [
"# Per-ORCID number of employment entries with a non-missing organisation id\n",
"valid_employment_counts = (\n",
"    exploded_employment.groupby('orcid')['id']\n",
"    .count()\n",
"    .rename('n_valid_employment')\n",
"    .reset_index()\n",
")\n",
"# Drop any earlier copy of the column first so that re-running this cell does not\n",
"# produce duplicate 'n_valid_employment' columns; records without any employment\n",
"# entry keep NaN after the left merge.\n",
"df = (\n",
"    df.drop(columns='n_valid_employment', errors='ignore')\n",
"    .merge(valid_employment_counts, on='orcid', how='left')\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 106,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" orcid | \n",
" verified_email | \n",
" verified_primary_email | \n",
" given_names | \n",
" family_name | \n",
" biography | \n",
" other_names | \n",
" urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
" education | \n",
" employment | \n",
" n_works | \n",
" works_source | \n",
" activation_date | \n",
" last_update_date | \n",
" n_doi | \n",
" n_arxiv | \n",
" n_pmc | \n",
" n_other_pids | \n",
" other_urls | \n",
" label | \n",
" primary_email_domain | \n",
" other_email_domains | \n",
" url_domains | \n",
" other_url_domains | \n",
" n_emails | \n",
" n_urls | \n",
" n_ids | \n",
" n_keywords | \n",
" n_education | \n",
" n_employment | \n",
" ext_works_source | \n",
" n_ext_work_source | \n",
" authoritative | \n",
" n_valid_education | \n",
" n_valid_employment | \n",
"
\n",
" \n",
" \n",
" \n",
" 3 | \n",
" 0000-0001-7071-8294 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[researcher (academic), universidad de zarago... | \n",
" 0 | \n",
" NaN | \n",
" 2014-03-10 13:22:01.966000+00:00 | \n",
" 2016-06-14 22:17:54.470000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
"
\n",
" \n",
" 5 | \n",
" 0000-0001-7402-0096 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" [[kth profile, https://www.kth.se/profile/toma... | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[, kth royal institute of technology, stockho... | \n",
" 0 | \n",
" NaN | \n",
" 2015-01-11 15:13:06.467000+00:00 | \n",
" 2016-06-14 23:55:59.896000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {kth.se} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" NaN | \n",
" 0.0 | \n",
"
\n",
" \n",
" 10 | \n",
" 0000-0001-8377-3508 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" [fontana, milena da silva] | \n",
" [[currículo lattes, http://buscatextual.cnpq.b... | \n",
" <NA> | \n",
" [educação; informática; matemática.] | \n",
" NaN | \n",
" NaN | \n",
" [[, instituto federal de educação, ciência e t... | \n",
" 0 | \n",
" NaN | \n",
" 2018-05-23 23:39:04.534000+00:00 | \n",
" 2019-10-16 02:50:11.007000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {cnpq.br} | \n",
" NaN | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" 3 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" NaN | \n",
" 0.0 | \n",
"
\n",
" \n",
" 35 | \n",
" 0000-0002-6508-6998 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" [[researcher (academic), universidad de zarago... | \n",
" 0 | \n",
" NaN | \n",
" 2014-03-12 08:23:22.492000+00:00 | \n",
" 2015-07-27 15:51:38.411000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" NaN | \n",
" 1.0 | \n",
"
\n",
" \n",
" 45 | \n",
" 0000-0003-2259-7023 | \n",
" True | \n",
" True | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" [applied oceanography] | \n",
" [[scopus author id, 57189297461]] | \n",
" NaN | \n",
" [[researcher, ministry of marine affairs and f... | \n",
" 4 | \n",
" [scopus - elsevier] | \n",
" 2019-05-02 07:29:45.597000+00:00 | \n",
" 2019-06-27 00:38:22.964000+00:00 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" 1 | \n",
" <NA> | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" NaN | \n",
" 0.0 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 11349589 | \n",
" 0000-0002-5204-5302 | \n",
" True | \n",
" True | \n",
" sabri | \n",
" alshboul | \n",
" i specialize in the cross-linguistic study of ... | \n",
" NaN | \n",
" [[a member in the association of professors of... | \n",
" <NA> | \n",
" [linguistics/english/morphology/ syntax/actfl] | \n",
" NaN | \n",
" [[linguistics, phd, university of kansas, lawr... | \n",
" [[director, language centter , zarqa , zarqa ,... | \n",
" 15 | \n",
" [sabri shehadeh yusuf alshboul] | \n",
" 2016-12-10 09:09:48.405000+00:00 | \n",
" 2020-11-28 18:51:42.007000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" {apetau.com, facebook.com} | \n",
" NaN | \n",
" <NA> | \n",
" 2 | \n",
" <NA> | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" [] | \n",
" 0 | \n",
" False | \n",
" 1.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 11349592 | \n",
" 0000-0002-6096-5272 | \n",
" True | \n",
" True | \n",
" sérgio wilson | \n",
" lima de amorim | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[estado gobierno y políticas públicas, maestr... | \n",
" [[, caixa econômica federal, rio de janeiro, r... | \n",
" 0 | \n",
" NaN | \n",
" 2018-12-29 21:47:19.630000+00:00 | \n",
" 2020-12-11 13:40:31.295000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 3 | \n",
" 1 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" 3.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 11349595 | \n",
" 0000-0001-8494-2123 | \n",
" True | \n",
" True | \n",
" tarun | \n",
" jain | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" [pet/ct specialist; nuclear medicine physician... | \n",
" NaN | \n",
" NaN | \n",
" [[assistant professor, mahatma gandhi medical ... | \n",
" 0 | \n",
" NaN | \n",
" 2014-12-19 08:21:46.292000+00:00 | \n",
" 2020-12-09 06:03:57.055000+00:00 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 1 | \n",
" <NA> | \n",
" 5 | \n",
" NaN | \n",
" <NA> | \n",
" False | \n",
" NaN | \n",
" 4.0 | \n",
"
\n",
" \n",
" 11349597 | \n",
" 0000-0002-2906-0299 | \n",
" True | \n",
" True | \n",
" tiffany | \n",
" mackay | \n",
" <NA> | \n",
" [tiffany russel sia] | \n",
" [[social neuroscience website profile, http://... | \n",
" <NA> | \n",
" [fluorine-18, radiolabelling, oxytocin, galliu... | \n",
" [[researcherid, a-2121-2017]] | \n",
" [[faculty of medicine, master in pharmaceutica... | \n",
" [[clinical project lead, minomic international... | \n",
" 11 | \n",
" [crossref, researcherid, tiffany mackay] | \n",
" 2017-01-03 23:28:48.736000+00:00 | \n",
" 2020-12-09 17:12:20.326000+00:00 | \n",
" 11 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" {oxytocin.com.au, linkedin.com} | \n",
" NaN | \n",
" <NA> | \n",
" 2 | \n",
" 1 | \n",
" 13 | \n",
" 2 | \n",
" 4 | \n",
" [crossref, researcherid] | \n",
" 2 | \n",
" True | \n",
" 2.0 | \n",
" 1.0 | \n",
"
\n",
" \n",
" 11349603 | \n",
" 0000-0002-3800-6331 | \n",
" True | \n",
" True | \n",
" zachary | \n",
" calamari | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[richard gilder graduate school, phd in compa... | \n",
" [[assistant professor, baruch college, city un... | \n",
" 7 | \n",
" [crossref metadata search, zachary t. calamari... | \n",
" 2015-01-20 20:20:17.042000+00:00 | \n",
" 2020-11-21 19:48:36.221000+00:00 | \n",
" 7 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" NaN | \n",
" True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" <NA> | \n",
" 2 | \n",
" 2 | \n",
" [crossref metadata search, crossref] | \n",
" 2 | \n",
" True | \n",
" 2.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
1065827 rows × 38 columns
\n",
"
"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"3 0000-0001-7071-8294 True True \n",
"5 0000-0001-7402-0096 True True \n",
"10 0000-0001-8377-3508 True True \n",
"35 0000-0002-6508-6998 True True \n",
"45 0000-0003-2259-7023 True True \n",
"... ... ... ... \n",
"11349589 0000-0002-5204-5302 True True \n",
"11349592 0000-0002-6096-5272 True True \n",
"11349595 0000-0001-8494-2123 True True \n",
"11349597 0000-0002-2906-0299 True True \n",
"11349603 0000-0002-3800-6331 True True \n",
"\n",
" given_names family_name \\\n",
"3 \n",
"5 \n",
"10 \n",
"35 \n",
"45