3.9 MiB
Exploratory analysis¶
TODO:
- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Study different cases (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)
- Temporal dimension; is it of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement? No.
import glob
import ast
from datetime import datetime
import pytz
import tldextract
import ssl # needed because nltk.download down here fires an error
# Workaround: on some machines the HTTPS certificate chain is unavailable and
# nltk.download() raises SSLError; fall back to an unverified SSL context.
# Acceptable in an exploratory notebook, not in production code.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Python builds without this private attribute verify certificates as usual.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # tokenizer models used later for biography sentence/word counts
import numpy as np
import pandas as pd
# import antispam
# import profanity_check
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
# Module-level plotting knobs: TOP_N caps bar charts at the first N
# categories; TOP_RANGE is the matching plotly x-axis range.
TOP_N = 0
TOP_RANGE = [0, 0]


def set_top_n(n):
    """Set the global top-N cutoff and the matching x-axis range."""
    global TOP_N, TOP_RANGE
    TOP_N = n
    # Bars sit on integer positions, so pad half a slot on each side.
    TOP_RANGE = [-0.5, n - 0.5]


pd.set_option('display.max_columns', None)
Notable solid ORCID iDs for explorative purposes:
# Known genuine ORCID iDs, used as ground truth when eyeballing records.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
# Anomalous-but-real profiles: not fake, yet structurally unusual.
# (Names suggest the anomaly type — e.g. JOURNAL is presumably a profile
# registered for a journal rather than a person; verify on orcid.org.)
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
WORK_MISUSE = '0000-0001-7870-1120'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
# Hand-collected fake profiles, keyed by the spam topic they advertise.
# More entries (carloan_*) are appended further below.
FAKE_HEAP = {
    'scaffold': '0000-0001-5004-7761',
    'whatsapp': '0000-0001-6997-9470',
    'penis': '0000-0002-3399-7287',
    'bitcoin': '0000-0002-7518-6845',
    'fitness': '0000-0002-1234-835X', # URL record + employment
    'cannabis': '0000-0002-9025-8632', # URL > 70 + works (now REMOVED)
    'plumber': '0000-0002-1700-8311', # URL > 10 + works
    'furniture': '0000-0001-7478-4539',
    'cleaners': '0000-0002-7392-3792'
}
Load the dataset
# The processed dataset is split across several pickle parts; sorted()
# makes the concatenation order deterministic.
parts = glob.glob('../data/processed/dataset.pkl.*')
df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))
df.head(5)
Notable profiles inspection
# Spot-check one genuine and one known-fake profile, then basic counts.
df[df['orcid'] == AM]
df[df['orcid'] == FAKE_HEAP['whatsapp']]
df.count()
df['orcid'].describe()
Primary email¶
df['primary_email'].describe()
Dupe emails
# Primary emails shared by more than one profile — inspect each case by hand.
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
df[df['primary_email'] == 'maykin@owasp.org']
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
df[df['primary_email'] == 'patrick.davey@monash.edu']
df['primary_email_domain'].describe()
# Profile count per primary-email domain, most common first.
top_primary_emails = df[['primary_email_domain', 'orcid']]\
    .groupby('primary_email_domain')\
    .count()\
    .sort_values('orcid', ascending=False)
top_primary_emails
# Bar chart of the most common primary-email domains.
set_top_n(30)
top_slice = top_primary_emails[:TOP_N]
data = [go.Bar(x=top_slice.index, y=top_slice['orcid'])]
layout = go.Layout(
    title='Top-%s email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
# Profiles exposing additional (non-primary) email addresses.
df[df.other_email_domains.notna()].head()
emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=emails_by_orcid[:TOP_N]['orcid'],
        y=emails_by_orcid[:TOP_N]['n_emails']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs by email' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Occurrences of each secondary-email domain across profiles.
top_other_emails = df[['orcid', 'other_email_domains']]\
    .explode('other_email_domains')\
    .reset_index(drop=True)\
    .groupby('other_email_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=top_other_emails[:TOP_N].index,
        y=top_other_emails[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top %s other email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
This somehow makes sense: legitimate users could set their Gmail account as the primary address for login purposes and keep institutional addresses as other email addresses. It also makes life easier upon relocation.
Email speculation¶
df[df.primary_email.isna() & df.other_email_domains.notna()]
URLs¶
df.n_urls.describe()
# Profile(s) holding the maximum number of URLs. The original used `>`,
# which can never match the maximum itself and always returned an empty
# frame; `==` matches the pattern used for n_ids / n_education / n_employment.
df[df.n_urls == df.n_urls.max()]
df[df.url_domains.notna()].head()
urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
urls_by_orcid
The first three are fake, the fourth isn't. No safe assumption can be made from URL counts alone.
# Top profiles by URL count.
set_top_n(100)
data = [
    go.Bar(
        x=urls_by_orcid[:TOP_N]['orcid'],
        y=urls_by_orcid[:TOP_N]['n_urls']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs with URLs' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Profile count per URL domain, most common first.
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Malformed URLs are left empty
# Count URL entries whose domain is the empty string (malformed URLs upstream).
exploded_url_domains = df[['orcid', 'url_domains']].explode('url_domains')
exploded_url_domains[exploded_url_domains.url_domains == ''].count()
URLs speculation¶
# Heuristic slices: many URLs + some works, narrowed to profiles whose
# works all come from a single source. (.str.len() on a list column
# yields the list length, i.e. the number of URL domains / sources.)
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
# Works whose source string contains the author's given name — likely self-claimed.
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
GRID.ac filtering¶
def extract_domain(link):
    """Return the registered domain (e.g. 'example.co.uk') of *link*."""
    return tldextract.extract(link).registered_domain


# GRID links institutions to their official web domains; URL domains found
# there are treated as institutional (hence trusted).
grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')
grid_df['domain'] = grid_df.link.apply(extract_domain)
grid_df
grid_df.loc['grid.451498.5']
# One row per (orcid, url domain); drop profiles without URLs.
exp = df[['orcid', 'url_domains']].explode('url_domains')
exp = exp[exp.url_domains.notna()]
exp
exp['grid'] = exp.url_domains.isin(grid_df.domain)
# Reuse the flag just computed instead of re-running the isin() membership
# test (the original evaluated the same expression twice).
non_grid_domains = exp[~exp['grid']].groupby('url_domains').count().sort_values('orcid', ascending=False)
# Well-known legitimate services that are not in GRID; drop them before export.
DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',
                     'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']
for dex in DOMAIN_EXCLUSIONS:
    non_grid_domains.drop(non_grid_domains.filter(like=dex, axis=0).index, inplace=True)
non_grid_domains.to_csv('../data/processed/non_grid_urls.csv')
Works source¶
def remove_self_source(lst, given, family):
    """Drop work sources that embed the researcher's own name.

    A source mentioning the given name (or the family name, when present)
    was likely self-asserted; the remaining "external" sources are kept.
    Matching is case-insensitive substring search.
    """
    given_lc = given.lower()
    family_lc = family.lower() if pd.notna(family) else None
    kept = []
    for source in lst:
        source_lc = source.lower()
        if given_lc in source_lc:
            continue
        if family_lc is not None and family_lc in source_lc:
            continue
        kept.append(source)
    return kept
# Keep only work sources that do not embed the author's own name: such
# "external" sources (publishers, aggregators) are harder to fabricate.
df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\
    .apply(lambda x: remove_self_source(x['works_source'], x['given_names'], x['family_name']), axis=1)
# Nullable Int16 keeps NA for profiles without any works source.
df['n_ext_work_source'] = pd.Series(df.ext_works_source.str.len(), dtype=pd.Int16Dtype())
exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\
    .explode('ext_works_source').reset_index(drop=True)
grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\
    .count()\
    .sort_values('orcid', ascending=False)\
    .reset_index()
set_top_n(30)
data = [
    go.Bar(
        x=grouped_ext_sources[:TOP_N].ext_works_source,
        y=grouped_ext_sources[:TOP_N].orcid
    )
]
layout = go.Layout(
    title='Top %s works_source' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# A source is deemed authoritative when it appears on more than two profiles.
authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]
authoritative_sources
exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\
    .isin(authoritative_sources['ext_works_source'])
# A profile counts as authoritative if any of its external sources is.
orcid_authoritative_source = exploded_external_sources\
    .groupby('orcid')['authoritative']\
    .any()\
    .reset_index()[['orcid', 'authoritative']]
df = df.merge(orcid_authoritative_source, on='orcid', how='left')
# Profiles with no external source at all default to non-authoritative.
df.loc[df.authoritative.isna(), 'authoritative'] = False
df.head()
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
df.n_ids.describe()
df[df.n_ids == df.n_ids.max()]
# external_ids entries appear to be (provider, value) pairs; index 0 is the
# provider name. NOTE(review): layout inferred from the x[0] access below —
# confirm against the dataset-building code.
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
ids[ids.provider.notna()].head()
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
data = [
    go.Bar(
        x=top_ids_providers.index,
        y=top_ids_providers['orcid']
    )
]
layout = go.Layout(
    title='IDs provided by providers',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
pd.unique(ids['provider'])
Keywords¶
This field is problematic: users can misbehave and cram multiple keywords into a single entry instead of listing them separately. For example:
# Keyword counts per profile, then the most frequent keyword strings.
keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
keywords_by_orcid
set_top_n(100)
data = [
    go.Bar(
        x=keywords_by_orcid[:TOP_N]['orcid'],
        y=keywords_by_orcid[:TOP_N]['n_keywords']
    )
]
layout = go.Layout(
    title='Keywords provided by ORCiD',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
top_keywords = df[['orcid', 'keywords']]\
    .explode('keywords')\
    .reset_index(drop=True)\
    .groupby('keywords')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_keywords[:TOP_N].index,
        y=top_keywords[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s keywords occurrence' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Education¶
df.n_education.describe()
df[df.n_education == df.n_education.max()]
exploded_education = df[['orcid', 'education']].explode('education').dropna()
exploded_education
# Each education entry is a fixed-size record; spread it over named columns.
exploded_education[['degree', 'role', 'university', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_education.education.tolist(), index=exploded_education.index)
# Empty org ids mean "no disambiguated institution": treat as missing so
# the count() below tallies only valid ids.
exploded_education.id.replace('', pd.NA, inplace=True)
exploded_education.groupby('orcid').id.count().reset_index()
df = df.merge(exploded_education.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_education'}, inplace=True)
# Profiles where some education entries lack a valid institution id.
df[df.n_education != df.n_valid_education]
Employment¶
df.n_employment.describe()
# Profile(s) with the most employment entries.
df[df.n_employment == df.n_employment.max()]
Let's count how many employment entries have a valid assigned organization id per ORCID iD (Ringgold, ISNI, GRID, etc.)
exploded_employment = df[['orcid', 'employment']].explode('employment').dropna()
exploded_employment
# Each employment entry is a fixed-size record; spread it over named columns.
exploded_employment[['role', 'institution', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_employment.employment.tolist(), index=exploded_employment.index)
# Empty org ids → missing, so count() tallies only valid ids.
exploded_employment.id.replace('', pd.NA, inplace=True)
exploded_employment.groupby('orcid').id.count().reset_index()
df = df.merge(exploded_employment.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_employment'}, inplace=True)
# Profiles where some employment entries lack a valid institution id.
df[df.n_employment != df.n_valid_employment]
Biography¶
# Normalize empty biographies to missing values so describe()/notna()
# treat them uniformly. Uses np.nan (lowercase): the np.NaN alias was
# removed in NumPy 2.0 and both are the same float('nan') object before that.
df.biography.replace('', np.nan, inplace=True)
df.biography.describe()
Let's also fabricate a few other features from biographies.
# Simple bio-derived features: raw length, sentence count, word count (NLTK).
df['biography_length'] = df.biography.str.len()
df['biography_n_sentences'] = df[df.biography.notna()].biography.apply(lambda bio: len(sent_tokenize(bio)))
df['biography_n_words'] = df[df.biography.notna()].biography.apply(lambda bio: len(word_tokenize(bio)))
Duplicated bios
df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]
Let's note them down
# Register the "car title loan" spam profiles among the known fakes,
# keyed carloan_0, carloan_1, ... — enumerate() replaces the original's
# hand-rolled counter variable.
carloan_orcids = df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]['orcid']
for i, orcid in enumerate(carloan_orcids):
    FAKE_HEAP['carloan_' + str(i)] = orcid
Let's check deeper into duplicated bios
# All profiles whose biography is shared with at least one other profile.
df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
# Number of profiles per distinct biography text, duplicates only.
dup_bios = df[['orcid', 'biography']].groupby('biography').count().sort_values('orcid', ascending=False)
dup_bios = dup_bios[dup_bios.orcid > 1]
dup_bios
dup_bios.sum()
# dup_bios.to_csv('../data/processed/dup_bios.csv', index=True, columns=[], header=False)
dup_bios.to_csv('../data/processed/dup_bios.csv')
I noticed that some bios can be found on google in other (probably fake) accounts. E.g. "hi, how are you? it is really cool to find an entire community of people interested in the same thing you are." can be found on https://dribbble.com/camrodoabh/about
Dup bios URLs
Let's plot the domains dup bios point to
# Which URL domains do the profiles sharing this bio snippet link to?
BIO_SNIPPET = 'really cool to find an entire community of people'
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)].explode('url_domains').groupby('url_domains')[['orcid']].count().sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=dup_bios_df[:TOP_N].index,
        y=dup_bios_df[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='URL distribution for bio "%s"' % BIO_SNIPPET,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Dup bios and date of activation
# Activation-date histogram for one specific duplicated bio.
BIO_SNIPPET = 'more straightforward way to borrow the money you'
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)]
# .groupby(df.activation_date.dt.month)[['orcid']].count().sort_values('orcid', ascending=False)
data = [
    go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc='count'
    )
]
layout = go.Layout(
    title='Activation distribution for bio "%s"' % BIO_SNIPPET,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
fig.update_traces(xbins_size='D1')  # one histogram bin per day
plotly.offline.iplot(fig)
For all duplicated bios
# Same histogram over ALL duplicated bios. Note: dup_bios_df is reassigned
# here and reused by the URL-domain analysis further below.
dup_bios_df = df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
data = [
    go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc='count'
    )
]
layout = go.Layout(
    title='Activation date distribution for all dup bios',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
fig.update_traces(xbins_size='D1')  # one histogram bin per day
plotly.offline.iplot(fig)
While in general activations follow a seasonal pattern (e.g., dips on weekends and holidays). (Commented out as the generated HTML is huge; will sort this out.)
# YEAR = 2020
# data = [
# go.Histogram(
# x=df[df.activation_date.dt.year == YEAR]['activation_date'],
# y=df[df.activation_date.dt.year == YEAR]['orcid'],
# histfunc='count'
# )
# ]
# layout = go.Layout(
# title='Activation date distribution (general) for %s' % YEAR,
# xaxis=dict(tickangle=45, tickfont=dict(size=12))
# )
# fig = go.Figure(data=data, layout=layout)
# fig.update_traces(xbins_size='D1')
# plotly.offline.iplot(fig)
Dup bios with extended length
Last update date ~ to activation date in duplicated bios
# Duplicated bios whose last update falls on the activation day itself —
# suggests throwaway accounts filled in one shot and never touched again.
df[(df.biography.notna()) &
   (df.biography.duplicated(keep=False)) &
   (df.activation_date.dt.year == df.last_update_date.dt.year) &
   (df.activation_date.dt.month == df.last_update_date.dt.month) &
   (df.activation_date.dt.day == df.last_update_date.dt.day)]
Dup bios URLs
# URL domains linked from profiles with duplicated bios
# (dup_bios_df still holds the all-duplicated-bios frame assigned earlier).
top_urls = dup_bios_df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Recompute domain counts over the whole dataset, then drill into one
# suspicious domain and one specific profile.
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
exp = df[['orcid', 'url_domains']].explode('url_domains')
exp[exp.url_domains == 'lucialpiazzale.com']
df[df.orcid == '0000-0002-3869-9561']
Assign spam score from precanned library
# bios = df[df.biography.notna()][['orcid', 'biography']]
# def score(bio):
# try:
# return antispam.score(bio)
# except: # if len(bio) < 3 the filter doesn't know how to handle that
# return -1
# bios['spam_score'] = bios.biography.apply(lambda bio: score(bio))
# bios[bios.spam_score == -1] # these are artefacts (no scoring possible)
# bios.spam_score.replace(to_replace=-1, value=np.nan, inplace=True)
# bios.spam_score.describe()
# bios[bios.spam_score > 0.99]
Spam goes nowhere.
Search offending words, sexually explicit content, etc.
# bios['profanity_score'] = profanity_check.predict_prob(bios.biography)
# bios[bios.profanity_score > 0.90]
Profanity detection goes nowhere too.
Dates¶
# Profiles never updated after activation, updated "before" activation
# (a data glitch), and updated on the same calendar day.
df[df.activation_date == df.last_update_date]['orcid'].count()
df[df.activation_date > df.last_update_date]['orcid'].count()
df[(df.activation_date.dt.year == df.last_update_date.dt.year) &
   (df.activation_date.dt.month == df.last_update_date.dt.month) &
   (df.activation_date.dt.day == df.last_update_date.dt.day)]['orcid'].count()
# Days elapsed between activation and last update.
df['date_diff'] = (df.last_update_date - df.activation_date) / np.timedelta64(1, 'D')
df.date_diff.describe()
df[df.date_diff == df.date_diff.min()]
# Clamp negative diffs (the glitchy rows above) to zero.
df.loc[df.date_diff < 0, 'date_diff'] = 0
df['ref_year'] = df.activation_date.dt.year
# (plotly violin version kept for reference; seaborn is used instead)
# fig = go.Figure()
# years = range(2013, 2021, 1)
# for year in years:
#     fig.add_trace(go.Violin(x=df[df.ref_year == year].ref_year,
#                             y=df[df.ref_year == year].date_diff,
#                             name=year,
#                             points=False,
#                             box_visible=True,
#                             meanline_visible=True))
# fig.show()
plt.figure(figsize=(16, 6))
ax = sns.violinplot(x='ref_year', y='date_diff', data=df)
# Same distribution, keyed on last-update year instead of activation year.
df['ref_year'] = df.last_update_date.dt.year
plt.figure(figsize=(16, 6))
ax = sns.violinplot(x='ref_year', y='date_diff', data=df)
# Staleness: days since the profile was last touched (relative to UTC now).
tz = pytz.timezone('UTC')
NOW = datetime.now(tz)
df['date_stale'] = (NOW - df.last_update_date) / np.timedelta64(1, 'D')
df.date_stale.describe()
plt.figure(figsize=(16, 6))
ax= sns.violinplot(x='ref_year', y='date_stale', data=df)
df['ref_year'] = df.activation_date.dt.year
plt.figure(figsize=(16, 6))
ax = sns.violinplot(x='ref_year', y='date_stale', data=df)
Todo:
- stale profiles with information initially set are likely to be fake?
- the more info is present
All vs. all correlation¶
# Pairwise correlation heatmaps over boolean/numeric features; NA is mapped
# to -1 so missingness itself can show up as correlation.
fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# Restricted to profiles that have a biography.
fig = px.imshow(df[df.biography.notna()].select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# Restricted to label == True profiles.
fig = px.imshow(df[df.label == True].select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# df[['verified_email',
#     'verified_primary_email',
#     'n_works',
#     'n_doi',
#     'n_arxiv',
#     'n_pmc',
#     'n_other_pids',
#     'n_emails',
#     'n_urls',
#     'n_ids',
#     'n_keywords',
#     'n_employment',
#     'n_education',
#     'label']].to_pickle('../data/processed/features.pkl')
df.info()