Exploratory analysis¶
TODO:
- Understanding the reasons behind fake profiles can give insight into how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Enumerate the possible cases (e.g., author publishing with an empty ORCID record, author publishing but not present in OpenAIRE, etc.); see the sketch after this list
- Is the temporal dimension of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement?
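The case enumeration above could start from a few boolean masks; a minimal sketch, assuming the count columns used later in this notebook (n_works, n_urls, n_ids). Presence in OpenAIRE is not in the dataset and would need an external lookup, so it is left out:
import pandas as pd

def enumerate_cases(df):
    # Hypothetical case flags built from count columns used later in this notebook.
    cases = pd.DataFrame({'orcid': df['orcid']})
    cases['empty_profile'] = df['n_works'].fillna(0).eq(0) & df['n_urls'].fillna(0).eq(0)
    cases['works_no_external_ids'] = df['n_works'].fillna(0).gt(0) & df['n_ids'].fillna(0).eq(0)
    cases['urls_no_works'] = df['n_urls'].fillna(0).gt(0) & df['n_works'].fillna(0).eq(0)
    return cases

# enumerate_cases(df) could then be cross-tabulated against the label column.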
In [76]:
import glob
import pandas as pd
import ast
import tldextract
import numpy as np
import antispam
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
global TOP_N, TOP_RANGE
TOP_N = n
TOP_RANGE = [-.5, n - 1 + .5]
pd.set_option('display.max_columns', None)
Notable legitimate ORCID iDs for exploratory purposes:
In [77]:
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
In [78]:
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
WORK_MISUSE = '0000-0001-7870-1120'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
In [79]:
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311' # URL > 10 + works
Load the dataset
In [80]:
parts = glob.glob('../data/processed/dataset.pkl.*')
df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))
df.head(5)
Out[80]:
Notable profiles inspection
In [6]:
df[df['orcid'] == AM]
Out[6]:
In [7]:
df[df['orcid'] == WHATSAPP]
Out[7]:
In [8]:
df.count()
Out[8]:
In [9]:
df['orcid'].describe()
Out[9]:
Primary email¶
In [10]:
df['primary_email'].describe()
Out[10]:
Duplicate emails
In [11]:
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[11]:
In [12]:
df[df['primary_email'] == 'maykin@owasp.org']
Out[12]:
In [13]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[13]:
In [14]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[14]:
In [15]:
df['primary_email_domain'].describe()
Out[15]:
In [16]:
top_primary_emails = df[['primary_email_domain', 'orcid']]\
.groupby('primary_email_domain')\
.count()\
.sort_values('orcid', ascending=False)
top_primary_emails
Out[16]:
In [17]:
set_top_n(30)
data = [
go.Bar(
x=top_primary_emails[:TOP_N].index,
y=top_primary_emails[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top-%s email domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
In [18]:
df[df.other_email_domains.notna()].head()
Out[18]:
In [19]:
emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
In [20]:
set_top_n(30)
data = [
go.Bar(
x=emails_by_orcid[:TOP_N]['orcid'],
y=emails_by_orcid[:TOP_N]['n_emails']
)
]
layout = go.Layout(
title='Top %s ORCID iDs by email' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [21]:
top_other_emails = df[['orcid', 'other_email_domains']]\
.explode('other_email_domains')\
.reset_index(drop=True)\
.groupby('other_email_domains')\
.count()\
.sort_values('orcid', ascending=False)
In [22]:
set_top_n(30)
data = [
go.Bar(
x=top_other_emails[:TOP_N].index,
y=top_other_emails[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top %s other email domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
This somehow makes sense: legitimate users may set their Gmail account as the primary address for login purposes and keep institutional addresses as other emails. It also makes life easier upon relocation.
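A minimal sketch to check how common that pattern is, assuming primary_email_domain is a single domain string and other_email_domains is a list of domains (as used in the cells above); the free-mail list is illustrative only:
FREEMAIL = {'gmail.com', 'hotmail.com', 'yahoo.com', 'outlook.com', 'qq.com', '163.com'}  # illustrative, not exhaustive

def has_institutional_other(domains):
    # True when at least one non-free-mail domain appears among the other email domains.
    return any(d not in FREEMAIL for d in domains)

freemail_primary = df['primary_email_domain'].isin(FREEMAIL) & df['other_email_domains'].notna()
# Share of free-mail primaries that also list at least one (presumably institutional) other domain.
df[freemail_primary]['other_email_domains'].apply(has_institutional_other).mean()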
Email speculation¶
In [23]:
df[df.primary_email.isna() & df.other_email_domains.notna()]
Out[23]:
URLs¶
In [24]:
df[df.url_domains.notna()].head()
Out[24]:
In [25]:
urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
urls_by_orcid
Out[25]:
In [26]:
set_top_n(100)
data = [
go.Bar(
x=urls_by_orcid[:TOP_N]['orcid'],
y=urls_by_orcid[:TOP_N]['n_urls']
)
]
layout = go.Layout(
title='Top %s ORCID iDs with URLs' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [27]:
top_urls = df[['orcid', 'url_domains']]\
.explode('url_domains')\
.reset_index(drop=True)\
.groupby('url_domains')\
.count()\
.sort_values('orcid', ascending=False)
In [28]:
set_top_n(50)
data = [
go.Bar(
x=top_urls[:TOP_N].index,
y=top_urls[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top-%s URL domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
URLs speculation¶
In [29]:
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[29]:
In [30]:
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[30]:
In [31]:
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
Out[31]:
In [32]:
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[32]:
Works source¶
In [33]:
def remove_own_source(lst, given, family):
res = []
for ws in lst:
if ws.lower().find(given.lower()) == -1:
if pd.notna(family):
if ws.lower().find(family.lower()) == -1:
res.append(ws)
else:
res.append(ws)
return res
In [34]:
df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\
.apply(lambda x: remove_own_source(x['works_source'], x['given_names'], x['family_name']), axis=1)
In [35]:
df['n_ext_work_source'] = df.ext_works_source.str.len()
In [36]:
exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\
.explode('ext_works_source').reset_index(drop=True)
In [37]:
grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\
.count()\
.sort_values('orcid', ascending=False)\
.reset_index()
In [38]:
data = [
go.Bar(
x=grouped_ext_sources[:30].ext_works_source,
y=grouped_ext_sources[:30].orcid
)
]
layout = go.Layout(
title='Top 30 works_source',
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [39]:
authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]
authoritative_sources
Out[39]:
In [40]:
exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\
.isin(authoritative_sources['ext_works_source'])
In [41]:
orcid_authoritative_source = exploded_external_sources\
.groupby('orcid')['authoritative']\
.any()\
.reset_index()[['orcid', 'authoritative']]
In [42]:
df = df.set_index('orcid').join(orcid_authoritative_source.set_index('orcid')).reset_index()
In [43]:
df.loc[df.authoritative.isna(), 'authoritative'] = False
In [44]:
df.head()
Out[44]:
External IDs¶
External IDs should come from reliable sources; ORCID registrants cannot add them freely.
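Since external IDs are asserted by trusted systems rather than by the registrant, one quick check is whether known-fake profiles carry them at all; a minimal sketch, assuming label marks fakes as in the "Label speculation" section below:
# Share of profiles with at least one external ID, split by label.
(df['n_ids'].fillna(0) > 0).groupby(df['label']).mean()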
In [45]:
df.n_ids.describe()
Out[45]:
In [46]:
df[df.n_ids == df.n_ids.max()]
Out[46]:
In [47]:
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
In [48]:
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
In [49]:
ids[ids.provider.notna()].head()
Out[49]:
In [50]:
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
In [51]:
data = [
go.Bar(
x=top_ids_providers.index,
y=top_ids_providers['orcid']
)
]
layout = go.Layout(
title='IDs provided by providers',
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [52]:
pd.unique(ids['provider'])
Out[52]:
Keywords¶
This field is problematic: instead of entering separate keywords, users can cram several keywords into a single entry. Look at the counts below; a splitting sketch follows the plots.
In [53]:
keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
keywords_by_orcid
Out[53]:
In [54]:
set_top_n(100)
data = [
go.Bar(
x=keywords_by_orcid[:TOP_N]['orcid'],
y=keywords_by_orcid[:TOP_N]['n_keywords']
)
]
layout = go.Layout(
title='Keywords provided by ORCiD',
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [55]:
top_keywords = df[['orcid', 'keywords']]\
.explode('keywords')\
.reset_index(drop=True)\
.groupby('keywords')\
.count()\
.sort_values('orcid', ascending=False)
In [56]:
set_top_n(50)
data = [
go.Bar(
x=top_keywords[:TOP_N].index,
y=top_keywords[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top-%s keywords occurrence' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
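Given the compound-keyword problem noted above, a possible normalization is to split each entry on common separators before counting; a minimal sketch, assuming keywords holds a list of strings per profile:
import re

def split_keywords(entries):
    # Split compound entries such as "machine learning; data mining, NLP" on common separators.
    out = []
    for entry in entries:
        out.extend(k.strip() for k in re.split(r'[;,/]', entry) if k.strip())
    return out

# Distribution of keyword counts after splitting, for comparison with n_keywords above.
df[df['keywords'].notna()]['keywords'].apply(split_keywords).str.len().describe()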
Education¶
In [57]:
def extract_education(lst):
educations = []
for e in lst:
# e[0] degree
# e[1] role
# e[2] university
# e[..] city, region, country, id, id_scheme
educations.append(' '.join([e[0], e[1], e[2]]))
return educations
Employment¶
In [58]:
def extract_employment(lst):
res = []
for e in lst:
# e[0] role
# e[1] institute
# e[..] city, region, country, id, id_scheme
res.append(' '.join([e[0], e[1]]))
return res
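Neither helper is applied in the cells shown here; a minimal usage sketch, assuming the raw affiliation lists live in columns named education and employment (hypothetical names) with the tuple layout described in the comments above:
# Hypothetical column names: 'education' and 'employment' are assumed to hold the raw lists.
df['education_text'] = df[df['education'].notna()]['education'].apply(extract_education)
df['employment_text'] = df[df['employment'].notna()]['employment'].apply(extract_employment)
df[['orcid', 'education_text', 'employment_text']].head()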
Biography¶
In [59]:
df['biography'] = df[df.biography.notna()]['biography'].replace('', np.NaN)
In [60]:
df.biography.describe()
Out[60]:
In [61]:
df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]
Out[61]:
In [62]:
def score(bio):
try:
return antispam.score(bio)
    except Exception:  # if len(bio) < 3 the filter doesn't know how to handle it
return -1
In [63]:
df['spam_score'] = df[df.biography.notna()]['biography'].apply(lambda bio: score(bio))
In [64]:
df[df.spam_score == -1][['orcid','biography']]
Out[64]:
In [65]:
df['spam_score'] = df['spam_score'].replace(-1, np.NaN)
In [66]:
df.spam_score.describe()
Out[66]:
In [67]:
df[df.spam_score > 0.9999][['biography', 'spam_score']]
Out[67]:
All-vs-all correlation¶
In [68]:
fig = px.imshow(df.fillna(-1).corr())
fig.show()
In [69]:
df[['verified_email',
'verified_primary_email',
'n_works',
'n_doi',
'n_arxiv',
'n_pmc',
'n_other_pids',
'n_emails',
'n_urls',
'n_ids',
'n_keywords',
'n_employment',
'n_education',
'label']].to_pickle('../data/processed/features.pkl')
Label speculation¶
In [70]:
df[df.label == 1]
Out[70]:
In [105]:
# (df.n_works > 0) & (df.n_ids > 1)
df.info()
In [104]:
df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
In [107]:
pd.Series(['2016-07-27t10:09:13.585z', '2016-07-27t10:09:13.585z', pd.NA, '2016-07-27t10:09:13.585z'])
Out[107]:
In [108]:
pd.to_datetime(df.activation_date)
In [109]:
df['label'] = df['label'].astype('bool')
In [110]:
df.info()