fake-orcid-analysis/notebooks/01-Exploration.ipynb

307 KiB
Raw Blame History

Exploratory analysis

TODO:

  • Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
  • Make casistics (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)
  • Temporal dimension of any use?
  • Can we access private info thanks to the OpenAIRE-ORCID agreement?
In [73]:
import pandas as pd
import ast
import tldextract
import numpy

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

init_notebook_mode(connected=True)
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
    global TOP_N, TOP_RANGE
    TOP_N = n
    TOP_RANGE = [-.5, n - 1 + .5]

Notable solid ORCID iDs for explorative purposes:

In [2]:
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'

Anomalies ORCiD profile

In [3]:
JOURNAL = '0000-0003-1815-5732'
NOINFO= '0000-0001-5009-2052'
# todo: find group-shared ORCiD, if possible

Notable fake ORCID iDs for explorative purposes:

In [4]:
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632'      # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311'       # URL > 10 + works 

Load the dataset

In [5]:
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                         names = ['orcid', 'claimed','verified_email', 'verified_primary_email', 
                                  'given_names', 'family_name', 'biography', 'other_names', 'urls', 
                                  'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', 
                                  'employment', 'n_works', 'works_source'])
In [6]:
df[df.duplicated()]
Out[6]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
7552 0000-0001-7831-7567 1 1 1 Vahab Vahdat NaN NaN NaN NaN NaN NaN [["Scopus Author ID", "57193490305"], ["Scopus... [["Industrial Engineering", "PhD", "Northeaste... [["Post-doctorate fellow", "Harvard Medical Sc... 25 ["Vahab Vahdat", "Scopus - Elsevier", "Multidi...
8416 0000-0001-8161-1345 1 1 1 AYFER TEKIN ATACAN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
16498 0000-0002-1133-1505 1 1 1 Xianrong Lai NaN NaN NaN NaN NaN NaN "Scopus Author ID", "15769435500" [["Department of pharmacy", "Bachelor of Tradi... [["Associate Research, Professor", "Chengdu Un... 115 ["Xianrong Lai", "Scopus - Elsevier", "Crossref"]
16830 0000-0002-1257-5536 1 1 1 Alexandra Zimmer NaN NaN NaN NaN NaN NaN NaN NaN [["Research assistent", "Fraunhofer-Institut f... 0 NaN
18835 0000-0002-2026-4156 1 1 1 Fatma Sri Wahyuni NaN ["Ayu"] NaN NaN NaN NaN [["ResearcherID", "C-5194-2015"], ["Scopus Aut... [["Biosains", "PHD", "Universiti Putra Malaysi... [["Lecturer", "Universitas Andalas", "Padang",... 27 ["Publons", "Crossref Metadata Search", "Scopu...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10733293 0000-0002-9887-7788 1 1 1 Markéta Laštůvková NaN NaN NaN NaN NaN NaN NaN NaN [["", "VSB - Technical University of Ostrava",... 0 NaN
10737258 0000-0003-1367-8104 1 1 1 LORENA GUTIÉRREZ GARCÍA NaN NaN [["LinkedIn", "https://www.linkedin.com/in/lor... lorenagg@unex.es NaN ["Agroecolog\u00eda, Bot\u00e1nica, Did\u00e1c... "ResearcherID", "AAE-6316-2021" [["", "M\u00e1ster en Formaci\u00f3n del profe... [["PCI", "Universidad de Extremadura - Campus ... 14 ["Multidisciplinary Digital Publishing Institu...
10738308 0000-0003-1741-3437 1 1 1 Xing Liu NaN NaN NaN NaN NaN NaN "ResearcherID", "S-3053-2017" NaN NaN 0 NaN
10741460 0000-0003-2909-8585 1 1 1 Yusuf Özcan NaN NaN NaN NaN NaN NaN NaN [["\u0130lahiyat Fak\u00fcltesi", "Doktora", "... [["Research Assistant", "\u00c7ukurova Univers... 0 NaN
10745078 0000-0003-4259-5324 1 1 1 P Rama Mohan NaN NaN NaN NaN NaN NaN NaN "Scopus Author ID", "24776757000" [["EEE Department", "Ph.D. (Power Electronics ... [["Associate Professor", "RGM College of Engin... 21 ["Scopus - Elsevier", "P Rama Mohan"]

2418 rows × 17 columns

In [7]:
df.drop_duplicates(inplace=True)

Basic column manipulation (interpret columns as lists when necessary)

In [8]:
df['other_names'] = df[df.other_names.notna()]['other_names'].apply(lambda x: ast.literal_eval(x))
In [9]:
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: ast.literal_eval(x))
In [10]:
df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))
In [11]:
df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))
In [12]:
df['education'] = df[df.education.notna()]['education'].apply(lambda x: ast.literal_eval(x))
In [13]:
df['employment'] = df[df.employment.notna()]['employment'].apply(lambda x: ast.literal_eval(x))
In [14]:
df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(lambda x: ast.literal_eval(x))
In [15]:
df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))
In [16]:
df.head(5)
Out[16]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
0 0000-0001-5000-2053 1 0 0 Jorge Jaramillo Sanchez NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
1 0000-0001-5000-6548 1 0 0 Wiseman Bekelesi NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
2 0000-0001-5000-7962 1 1 1 ALICE INDIMULI NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
3 0000-0001-5000-8586 1 0 0 shim ji yun NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
4 0000-0001-5001-0256 1 0 0 Sandro Caramaschi NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
In [17]:
df[df['orcid'] == AM]
Out[17]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
8840413 0000-0002-5193-7851 1 1 1 Andrea Mannocci NaN NaN [[Personal website, https://andremann.github.i... andrea.mannocci@isti.cnr.it NaN [Data science , science of science, scholarly ... Scopus Author ID, 55233589900 [[Information engineering, Ph.D., Università d... [[Research Associate, Istituto di Scienza e Te... 37 [Scopus - Elsevier, Crossref Metadata Search, ...
In [18]:
df[df['orcid'] == WHATSAPP]
Out[18]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
9517099 0000-0001-6997-9470 1 1 1 other whatsapp NaN NaN [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN NaN [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... NaN NaN NaN 0 NaN
In [19]:
df.count()
Out[19]:
orcid                     10744622
claimed                   10744622
verified_email            10744622
verified_primary_email    10744622
given_names               10716789
family_name               10437094
biography                   333885
other_names                 544550
urls                        688262
primary_email               121476
other_emails                 47470
keywords                    638634
external_ids               1285292
education                  2402440
employment                 2626670
n_works                   10744622
works_source               2671906
dtype: int64
In [20]:
df[df['orcid'] == '0000-0002-5154-6404']
Out[20]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
4595263 0000-0002-5154-6404 1 1 1 Olusola Bamisile NaN NaN NaN NaN NaN NaN NaN [[Energy Systems Engineering , Doctoral, Cypru... [[, University of Electronic Science and Techn... 3 [Multidisciplinary Digital Publishing Institut...
4595264 0000-0002-5154-6404 1 1 1 Olusola Bamisile NaN NaN NaN NaN NaN NaN NaN [[Energy Systems Engineering , Doctoral, Cypru... [[, University of Electronic Science and Techn... 2 [Crossref]
In [21]:
df.drop(index=4595264, inplace=True)
In [22]:
df['orcid'].describe()
Out[22]:
count                10744621
unique               10744621
top       0000-0001-8644-5622
freq                        1
Name: orcid, dtype: object

Primary email

In [23]:
df['primary_email'].describe()
Out[23]:
count                       121476
unique                      121473
top       patrick.davey@monash.edu
freq                             2
Name: primary_email, dtype: object

Dupe emails

In [24]:
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[24]:
7483666             maykin@owasp.org
9068234       opercin@erbakan.edu.tr
10246485    patrick.davey@monash.edu
Name: primary_email, dtype: object
In [25]:
df[df['primary_email'] == 'maykin@owasp.org']
Out[25]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
3776350 0000-0002-0836-2271 1 1 1 Maykin Warasart NaN NaN NaN maykin@owasp.org [maykin@dga.or.th] NaN NaN NaN NaN 0 NaN
7483666 0000-0001-9855-1676 1 1 1 Maykin Warasart NaN NaN NaN maykin@owasp.org [maykin@dga.or.th, maykin@ieee.org] NaN NaN NaN NaN 0 NaN
In [26]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[26]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
3995032 0000-0002-2232-9638 1 1 1 Osman Perçin NaN NaN NaN opercin@erbakan.edu.tr NaN NaN NaN NaN NaN 0 NaN
9068234 0000-0003-0033-0918 1 1 1 Osman PERÇİN NaN NaN NaN opercin@erbakan.edu.tr NaN NaN NaN NaN [[, Necmettin Erbakan University, Konya, , TR,... 0 NaN
In [27]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[27]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
5087745 0000-0002-8774-0030 1 1 1 Patrick Davey NaN NaN NaN patrick.davey@monash.edu NaN NaN NaN NaN [[PhD Student, Monash University, Melbourne, V... 1 [Crossref]
10246485 0000-0002-9158-1757 1 1 1 Patrick Davey NaN NaN NaN patrick.davey@monash.edu NaN [Radiopharmaceuticals, Inorganic Chemistry, Bi... NaN NaN [[PhD Student, Monash University, Melbourne, ,... 0 NaN
In [28]:
df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)
In [29]:
df['primary_email_domain'].describe()
Out[29]:
count        121476
unique        17047
top       gmail.com
freq          25892
Name: primary_email_domain, dtype: object
In [30]:
primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)
primary_emails
Out[30]:
orcid
primary_email_domain
gmail.com 25892
hotmail.com 3674
yahoo.com 2578
163.com 2067
yuhs.ac 1124
... ...
iiap.gob.pe 1
iiap.org.pe 1
iibb.csic.es 1
iic.hokudai.ac.jp 1
zzuli.edu.cn 1

17047 rows × 1 columns

In [65]:
set_top_n(30)
data = [
    go.Bar(
        x=primary_emails[:TOP_N].sort_values(by=['orcid'], ascending=False).index,
        y=primary_emails[:TOP_N].sort_values(by=['orcid'], ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='Top %s email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

Other emails

In [32]:
def extract_email_domains(lst):
    res = []
    for email in lst:
        res.append(email.split('@')[1])
    return res
In [33]:
df['other_email_domains'] = df['other_emails'].apply(lambda x: extract_email_domains(x) if isinstance(x, list) else x)
In [34]:
df[df['other_email_domains'].notna()].head()
Out[34]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains
34 0000-0001-5011-9833 1 1 1 Mark Kilbane NaN NaN NaN mark.kilbane@seh.ox.ac.uk [mark.kilbane@bsg.ox.ac.uk] NaN NaN [[Blavatnik School of Government; St Edmund Ha... NaN 0 NaN seh.ox.ac.uk [bsg.ox.ac.uk]
47 0000-0001-5017-1295 1 1 1 Xinfeng Tang NaN NaN NaN NaN [tang.xinfeng@foxmail.com] NaN Scopus Author ID, 56927186900 [[, , University of Hong Kong, Hong Kong, , HK... NaN 11 [Scopus - Elsevier, Xinfeng Tang] NaN [foxmail.com]
299 0000-0001-5109-3989 1 1 1 colin tysall NaN NaN NaN NaN [colin.tysall@nhs.net] NaN NaN NaN [[Associate Mental Health Act Manager, Coventr... 0 NaN NaN [nhs.net]
868 0000-0001-5320-1277 1 1 1 Gökhan KESKİN NaN NaN NaN 2012001598@stu.adu.edu.tr [gokhankkeskin@gmail.com] NaN NaN NaN [[, Adnan Menderes University, Aydin, , TR, gr... 0 NaN stu.adu.edu.tr [gmail.com]
1176 0000-0001-5434-9994 1 1 1 Elena Borucu NaN NaN NaN lenapasali@gmail.com [epasali@yildiz.edu.tr] NaN NaN NaN NaN 0 NaN gmail.com [yildiz.edu.tr]
In [35]:
df['n_emails'] = df['other_emails'].str.len()
In [36]:
df.sort_values('n_emails', ascending=False)[['orcid', 'n_emails']]
Out[36]:
orcid n_emails
2039718 0000-0003-4171-3835 12.0
57198 0000-0001-6239-2968 9.0
10524509 0000-0003-2290-2817 7.0
7785216 0000-0003-2151-4089 7.0
3556386 0000-0001-9084-3156 6.0
... ... ...
10747035 0000-0003-4998-1551 NaN
10747036 0000-0003-4998-4111 NaN
10747037 0000-0003-4998-6045 NaN
10747038 0000-0003-4998-8868 NaN
10747039 0000-0003-4999-7916 NaN

10744621 rows × 2 columns

In [37]:
grouped_other_emails = df[['orcid', 'other_email_domains']]\
                        .explode('other_email_domains')\
                        .reset_index(drop=True)\
                        .groupby('other_email_domains')\
                        .count()\
                        .sort_values('orcid', ascending=False)
In [74]:
set_top_n(30)
data = [
    go.Bar(
        x=grouped_other_emails[:TOP_N].sort_values(by=['orcid'], ascending=False).index,
        y=grouped_other_emails[:TOP_N].sort_values(by=['orcid'], ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='Top %s other email domains' % TOP_N, 
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

Email speculation

In [39]:
df[df['primary_email'].isna() & df['other_emails'].notna()]
Out[39]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains n_emails
47 0000-0001-5017-1295 1 1 1 Xinfeng Tang NaN NaN NaN NaN [tang.xinfeng@foxmail.com] NaN Scopus Author ID, 56927186900 [[, , University of Hong Kong, Hong Kong, , HK... NaN 11 [Scopus - Elsevier, Xinfeng Tang] NaN [foxmail.com] 1.0
299 0000-0001-5109-3989 1 1 1 colin tysall NaN NaN NaN NaN [colin.tysall@nhs.net] NaN NaN NaN [[Associate Mental Health Act Manager, Coventr... 0 NaN NaN [nhs.net] 1.0
1296 0000-0001-5476-0126 1 1 1 Aura Windy Hernández Cetina NaN NaN NaN NaN [u0902038@unimilitar.edu.co] NaN NaN [[, Profesional en Relaciones Internacionales ... [[Asistente de Investigación, Pontificia Unive... 1 [Aura Windy Hernández Cetina] NaN [unimilitar.edu.co] 1.0
1429 0000-0001-5522-427X 1 1 1 Süleyman Özen NaN NaN [[Academic CV, https://akademik.yok.gov.tr/Aka... NaN [suleyman.ozen@btu.edu.tr] [construction materials, superplasticizers, co... Scopus Author ID, 57188750603 [[Civil Engineering, MSc and PhD, Uludağ Unive... [[Dr., Bursa Technical University, Bursa, , TR... 7 [Scopus - Elsevier, Crossref] NaN [btu.edu.tr] 1.0
1628 0000-0001-5597-3115 1 1 1 Wade Harrison NaN NaN NaN NaN [wade_harrison@unc.edu] NaN NaN [[, MD, Dartmouth College Geisel School of Med... [[Clinical Instructor / Research Fellow, Unive... 7 [Wade Harrison] NaN [unc.edu] 1.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10743658 0000-0003-3740-8352 1 1 1 Rui Zhang NaN NaN NaN NaN [zhang-r15@mails.tsinghua.edu.cn] [Lithium metal batteries, Graphene] ResearcherID, B-3843-2015 [[Department of Chemical Engineering, Ph.D. st... NaN 15 [ResearcherID, Crossref] NaN [mails.tsinghua.edu.cn] 1.0
10744876 0000-0003-4192-6451 1 1 1 Sanjib Raj Pandey NaN NaN [[Personal, https://www.sanjibpandey.wix.com/p... NaN [srpandey@gmail.com] NaN NaN [[Computing and Information System, PhD, Unive... [[Software Developer & Research Associate, Oxl... 11 [BASE - Bielefeld Academic Search Engine, Dr. ... NaN [gmail.com] 1.0
10745274 0000-0003-4333-9728 1 1 1 Mario De la Fuente Lloreda Person in charge to coordinate the scientific ... [M.de la Fuente, De la Fuente, M.] [[researchgate profile, https://www.researchga... NaN [mariofuente@gmail.com] [vineyard management, grapevine, viticulture, ... Scopus Author ID, 47960975000 [[Producción Vegetal, Doctor en Viticultura, U... NaN 3 [Scopus - Elsevier] NaN [gmail.com] 1.0
10745417 0000-0003-4383-4745 1 1 1 Jie Yang NaN NaN NaN NaN [jyang@esat.kuleuven.be] NaN NaN [[faculty of engineering science, Dr., KU Leuv... NaN 0 NaN NaN [esat.kuleuven.be] 1.0
10746702 0000-0003-4878-2737 1 1 1 Aleksey Adamtsevich NaN NaN [[Moscow State University of Civil Engineering... NaN [AdamtsevichAO@mgsu.ru] [concrete, calorimetry, cement, construction, ... [[Scopus Author ID, 56301531000], [ResearcherI... [[, Engineer (Industrial and Civil Engineering... [[Senior Researcher, Moscow State University o... 25 [Scopus - Elsevier, ResearcherID] NaN [mgsu.ru] 1.0

19409 rows × 20 columns

URLs

In [40]:
def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        domain = tldextract.extract(e[1])
        domains.append(domain.registered_domain)
    return domains
In [41]:
df['url_domains'] = df['urls'].apply(lambda x: extract_url_domains(x) if isinstance(x, list) else x)
In [42]:
df[df['url_domains'].notna()].head()
Out[42]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email ... keywords external_ids education employment n_works works_source primary_email_domain other_email_domains n_emails url_domains
5 0000-0001-5001-4994 1 1 1 Siren Rühs I am an oceanographer studying the interannual... [Siren Ruehs] [[ResearchGate, https://www.researchgate.net/p... NaN ... NaN NaN NaN NaN 11 [Siren Rühs] NaN NaN NaN [researchgate.net]
14 0000-0001-5004-7761 1 1 1 scaffolding hire NaN [The first feature that you have to check in t... [[scaffolding hire Wellington, https://www.tig... NaN ... [scaffolding hire Wellington] NaN NaN NaN 0 NaN NaN NaN NaN [tigerscaffolds.co.nz]
15 0000-0001-5005-0557 1 1 1 Sen RT NaN NaN [[Research on Psychology, psychiatry, Genetics... NaN ... NaN NaN NaN NaN 0 NaN NaN NaN NaN [corticalbrain.com]
29 0000-0001-5009-8091 1 1 1 Gabriela Madruga Possui graduação em Medicina Veterinaria pela ... [Gabriela Morais Madruga] [[Curriculo lattes, http://buscatextual.cnpq.b... NaN ... [veterinary ophthalmology] NaN [[Surgery in small animal, PhD, Universidade E... [[PhD , University of Minnesota, Minneapolis, ... 14 [Gabriela Madruga] NaN NaN NaN [cnpq.br]
30 0000-0001-5010-9539 1 1 1 Sangram Keshari Sahu NaN [sk-sahu] Academic webpage, https://sksahu.net NaN ... [Computational Genomics and Bioinformatics] Loop profile, 1098977 [[Centre for Bioinformatics, M.Sc. Bioinformat... [[Bioinformatics Junior Research Fellow, India... 3 [Crossref Metadata Search, Sangram Keshari Sahu] NaN NaN NaN [sksahu.net]

5 rows × 21 columns

In [43]:
df['n_urls'] = df['url_domains'].str.len()
In [44]:
df.sort_values('n_urls', ascending=False)[['orcid', 'n_urls']]
Out[44]:
orcid n_urls
70577 0000-0002-1234-835X 219.0
5164541 0000-0001-7478-4539 174.0
1215225 0000-0002-7392-3792 169.0
10240510 0000-0002-6938-9638 152.0
4004281 0000-0002-5710-4041 114.0
... ... ...
10747035 0000-0003-4998-1551 NaN
10747036 0000-0003-4998-4111 NaN
10747037 0000-0003-4998-6045 NaN
10747038 0000-0003-4998-8868 NaN
10747039 0000-0003-4999-7916 NaN

10744621 rows × 2 columns

In [ ]:
set_top_n(100)
data = [
    go.Bar(
        x=df.sort_values(by=['n_urls'], ascending=False)['orcid'][:TOP_N],
        y=df.sort_values(by=['n_urls'], ascending=False)['n_urls'][:TOP_N]
    )
]

layout = go.Layout(
    title='Top %s ORCID with URLs' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [46]:
grouped_urls = df[['orcid', 'url_domains']]\
                .explode('url_domains')\
                .reset_index(drop=True)\
                .groupby('url_domains')\
                .count()\
                .sort_values('orcid', ascending=False)
In [62]:
set_top_n(10)
data = [
    go.Bar(
        x=grouped_urls[:TOP_N].sort_values(by=['orcid'], ascending=False).index,
        y=grouped_urls[:TOP_N].sort_values(by=['orcid'], ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='Top %s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [48]:
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[48]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email ... external_ids education employment n_works works_source primary_email_domain other_email_domains n_emails url_domains n_urls
482862 0000-0003-4948-9268 1 1 1 Gustavo Duperré Gustavo Norberto Duperré graduated in Arts and... [Gustavo Norberto Duperré, Duperré, G. N.] [[Gis in Cultural Heritage - ICOMOS România, h... gustavo.duperre@usal.edu.ar ... [[Scopus Author ID, 57195936346], [ResearcherI... [[Programme in History, History of Art and Ter... [[Titular Professor, Dirección General de Cult... 13 [Gustavo Duperré, Scopus - Elsevier, Publons, ... usal.edu.ar NaN NaN [icomos.ro, unirioja.es, unirioja.es, unc.edu.... 51.0
554859 0000-0002-1929-6054 1 1 1 Franklin Américo Canaza Choque Docente-Investigador Social. Maestrando en Der... [Franklin Américo Canaza-Choque , Franklin A. ... [[Consejo Nacional de Ciencia, Tecnología e In... Leo_123fa@hotmail.com ... [[ResearcherID, P-8613-2018], [Loop profile, 8... [[Facultad de Ciencias de la Educación , Maest... [[Investigador Social, Universidad Católica de... 38 [ResearcherID, BASE - Bielefeld Academic Searc... hotmail.com [gmail.com, gmail.com, hotmail.com, baldwin.ed... 5.0 [concytec.gob.pe, redalyc.org, redalyc.org, un... 61.0
1381092 0000-0002-9025-8632 1 1 1 buycannabis dispensary We procure and deliver premium cannabis strain... [We procure and deliver premium cannabis strai... [[find your cannabis & marijuana dispensary , ... NaN ... NaN NaN NaN 10 [goowonderland dispensary] NaN NaN NaN [goowonderland.com, goowonderland.com, goowond... 81.0
2679353 0000-0003-2407-3557 1 1 1 Abdul Aziz Abdul Aziz was born on May 25, 1973, in Brebes... [Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,... [[Google Scholar, https://scholar.google.com/c... NaN ... NaN [[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak... [[Assisten Professor/Dr, Institut Agama Islam ... 72 [BASE - Bielefeld Academic Search Engine, Abdu... NaN NaN NaN [google.com, syekhnurjati.ac.id, orcid.org, bl... 59.0
3354430 0000-0002-3920-7389 1 1 1 А. Гусев Surname, Name Gusev Alexander LeonidovichDate... [Alexander L. Gusev , Alexander Leonidovich Gu... [[A.L. Gusev Alternative Energy and Ecology, ... NaN ... [[ResearcherID, F-8048-2014], [Scopus Author I... [[Chemical technology and cryogenic-vacuum tec... [[General Director, Scientific Technical Centr... 472 [Publons, DataCite, Scopus - Elsevier, A.L. Gu... NaN NaN NaN [youtube.com, isjaee.com, researchgate.net, re... 111.0
4004281 0000-0002-5710-4041 1 1 1 Ryszard Romaniuk Professor of Electronics and Communications En... [R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R... [[Scholar Google, http://scholar.google.pl/cit... rrom@ise.pw.edu.pl ... [[ISNI, 0000000071432485], [ResearcherID, B-91... [[Faculty of Electronics and Information Techn... [[Professor, Institute Director, Politechnika ... 5008 [INSPIRE-HEP, ResearcherID, ISNI2ORCID search ... ise.pw.edu.pl [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] 3.0 [google.pl, publons.com, scopus.com, mendeley.... 114.0
4022480 0000-0003-2450-090X 1 1 1 Eduard Babulak Professor Eduard Babulak is accomplished inter... [Professor Eduard Babulak] [[Honorary Chair, Chief Mentor & Senior Adviso... NaN ... [[Scopus Author ID, 6506867432], [ResearcherID... [[Information Technology, Doctor Habilitated (... [[Consultant, Horizon 2020 Framework Programme... 274 [The Lens, BASE - Bielefeld Academic Search En... NaN NaN NaN [worldassessmentcouncil.org, spseke.sk, bcs.or... 114.0
6335357 0000-0003-2593-7134 1 1 1 Aan Jaelani All my papers can be downloaded from portal:Re... [Jaelani, A., Jaelani, Aan] [[Microsoft Academic Research, https://academi... aan_jaelani@syekhnurjati.ac.id ... [[Scopus Author ID, 57195963463], [Loop profil... [[Post Graduate, S3/Dr, Universitas Islam Nege... [[Dr, Institut Agama Islam Negeri Syekh Nurjat... 79 [Publons, Aan Jaelani, Scopus - Elsevier, Dime... syekhnurjati.ac.id [gmail.com] 1.0 [microsoft.com, twitter.com, academia.edu, aca... 67.0
6489838 0000-0002-9965-2425 1 1 1 Jaroslaw Spychala Jaroslaw Spychala has received a doctoral degr... [Jaroslaw Jozef Spychala] [[RESUME, http://www.biowebspin.com/wp-content... NaN ... Scopus Author ID, 7006745874 [[Department of Chemistry, Postdoctoral Associ... [[Assistant Professor, Adam Mickiewicz Univers... 29 [Scopus - Elsevier] NaN NaN NaN [biowebspin.com, biowebspin.com, google.com, l... 73.0
7570584 0000-0003-2183-8112 1 1 1 Pelayo Munhoz Olea Pós-Doutorado em Gestão Ambiental pela Univers... [ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P... [[Currículo Lattes, http://lattes.cnpq.br/6209... NaN ... [[Scopus Author ID, 55175503300], [ResearcherI... [[, Postdoctoral in Environmental Sustainabili... [[Professor, Universidade Federal do Rio Grand... 1105 [The Lens, Pelayo Munhoz Olea, Dimensions, BAS... NaN NaN NaN [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... 61.0
10240510 0000-0002-6938-9638 1 1 1 Adolfo Catral Sanabria My education is in computer science, mathemati... NaN [[ResearchGate Adolfo Catral , https://www.res... NaN ... Loop profile, 747193 [[Education, Capacitación para la enseñanza en... NaN 2023 [BASE - Bielefeld Academic Search Engine, Data... NaN NaN NaN [researchgate.net, youtube.com, linkedin.com, ... 152.0
10448304 0000-0002-4062-3603 1 1 1 JUAN DE DIOS BELTRÁN MANCILLA JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... [[01.- Juan de Dios Beltrán Mancilla. Teoría O... NaN ... NaN [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 [JUAN DE DIOS BELTR´´ÁN MANCILLA] NaN NaN NaN [yumpu.com, ijopm.org, google.com, blogspot.co... 69.0
10663894 0000-0002-3997-5070 1 1 1 Dr. Parameshachari B D Dr. Parameshachari B DACM Distinguished Speake... [Dr. PARAMESHACHARI B D] [[GSSSIETW,MYSURU, http://geethashishu.in/], [... NaN ... [[ResearcherID, F-7045-2018], [Scopus Author I... [[Electronics and Communication Engineering, P... [[ACM Distinguished Speaker (Volunteer), Assoc... 93 [Publons, Multidisciplinary Digital Publishing... NaN NaN NaN [geethashishu.in, geethashishu.in, acm.org, go... 71.0

13 rows × 22 columns

In [49]:
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[49]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email ... external_ids education employment n_works works_source primary_email_domain other_email_domains n_emails url_domains n_urls
45566 0000-0003-1948-3180 1 1 1 Mark Katz Mark N. Katz is a professor of government and ... NaN [[Adjusting to Change: American Foreign Policy... NaN ... Scopus Author ID, 25649901800 [[Political Science, Ph.D., Massachusetts Inst... [[Professor of Government and Politics, George... 58 [Scopus - Elsevier] NaN NaN NaN [wordpress.com, marknkatz.com, gmu.edu, atlant... 16.0
72674 0000-0002-2000-8339 1 1 1 Phòng khám tư nhân Hà Nội NaN NaN NaN [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN ... NaN NaN NaN 4 [Phòng khám tư nhân Hà Nội] NaN NaN NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0
172820 0000-0001-9293-2224 1 1 1 Juan Carlos Garcia Hoyos My name is Juan Carlos García Hoyos. I was bor... [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... [[Air Force Office of Scientific Research (WRI... NaN ... NaN [[Faculty of Philosophy, History - Ph.D., Char... [[responsible for the Project Service Level Ag... 20 [Juan Carlos Garcia Hoyos] NaN NaN NaN [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0
209505 0000-0003-3045-0056 1 1 1 Ananda Majumdar I am Ananda Majumdar, Child Care Educator at B... NaN [[Migration Scholar and Ananda , https://grfdt... NaN ... NaN [[Education , B.Ed. After Degree , University ... [[General Coordinator- University of Alberta C... 43 [Ananda Majumdar] NaN NaN NaN [grfdt.com, linkedin.com, academia.edu, resear... 24.0
259877 0000-0003-1815-5732 1 1 1 JAS (Jurnal Akuntansi Syariah) JAS (Jurnal Akuntansi Syariah) published in pr... NaN [[Website, https://ejournal.stiesyariahbengkal... NaN ... NaN NaN NaN 67 [JAS (Jurnal Akuntansi Syariah)] NaN NaN NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10494820 0000-0002-1324-7171 1 1 1 Vanesa Natalia Rodriguez Nombre y Apellido: Vanesa Natalia Rodriguez. ... [Vanesa Rodriguez, Vanesa N. Rodriguez] [[De rufianes y franchutas Representaciones y ... NaN ... NaN [[, Maestría en Ciencias Sociales con Mención ... [[Profesora, Universidad Nacional de La Matanz... 7 [Vanesa Natalia Rodriguez] NaN NaN NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0
10495806 0000-0002-1700-8311 1 1 1 Fix-IT Rite NaN [Best Heating & Plumbing Company] [[Website, https://fix-itrite.com], [Muckrack,... NaN ... NaN NaN NaN 1 [Fix-It Rite] NaN NaN NaN [fix-itrite.com, muckrack.com, tumblr.com, dri... 11.0
10633545 0000-0003-2676-4431 1 1 1 Benny Soewandi NaN [Benny Soewandi] [[Conservation Efforts as a Result of Theoreti... NaN ... NaN NaN [[Membership, Paguyuban Pelestarian Budaya Ban... 2 [Benny Soewandi] NaN NaN NaN [wordpress.com, wordpress.com, linkedin.com, f... 11.0
10648241 0000-0001-8157-0600 1 1 1 Bijan Yavar Senior Research Assistant and Phd Student in O... [B. Yavar, Yavar Bijan] [[Web of Science (Pub) Researcher ID: A-3544-2... NaN ... Scopus Author ID, 56556873600 NaN NaN 6 [Scopus - Elsevier] NaN NaN NaN [publons.com, articulate.com, zenodo.org, orci... 15.0
10679699 0000-0002-9874-1450 1 1 1 FENGZHI WU NaN NaN [[A Systematic Study on the Dynamic Softening ... NaN ... NaN NaN NaN 3 [FENGZHI WU] NaN NaN NaN [springer.com, sciencedirect.com, sciencedirec... 23.0

139 rows × 22 columns

In [50]:
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
Out[50]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email ... external_ids education employment n_works works_source primary_email_domain other_email_domains n_emails url_domains n_urls
0 0000-0003-1948-3180 1 1 1 Mark Katz Mark N. Katz is a professor of government and ... NaN [[Adjusting to Change: American Foreign Policy... NaN ... Scopus Author ID, 25649901800 [[Political Science, Ph.D., Massachusetts Inst... [[Professor of Government and Politics, George... 58 Scopus - Elsevier NaN NaN NaN [wordpress.com, marknkatz.com, gmu.edu, atlant... 16.0
1 0000-0002-2000-8339 1 1 1 Phòng khám tư nhân Hà Nội NaN NaN NaN [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN ... NaN NaN NaN 4 Phòng khám tư nhân Hà Nội NaN NaN NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0
2 0000-0001-9293-2224 1 1 1 Juan Carlos Garcia Hoyos My name is Juan Carlos García Hoyos. I was bor... [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... [[Air Force Office of Scientific Research (WRI... NaN ... NaN [[Faculty of Philosophy, History - Ph.D., Char... [[responsible for the Project Service Level Ag... 20 Juan Carlos Garcia Hoyos NaN NaN NaN [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0
3 0000-0003-3045-0056 1 1 1 Ananda Majumdar I am Ananda Majumdar, Child Care Educator at B... NaN [[Migration Scholar and Ananda , https://grfdt... NaN ... NaN [[Education , B.Ed. After Degree , University ... [[General Coordinator- University of Alberta C... 43 Ananda Majumdar NaN NaN NaN [grfdt.com, linkedin.com, academia.edu, resear... 24.0
4 0000-0003-1815-5732 1 1 1 JAS (Jurnal Akuntansi Syariah) JAS (Jurnal Akuntansi Syariah) published in pr... NaN [[Website, https://ejournal.stiesyariahbengkal... NaN ... NaN NaN NaN 67 JAS (Jurnal Akuntansi Syariah) NaN NaN NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
134 0000-0002-1324-7171 1 1 1 Vanesa Natalia Rodriguez Nombre y Apellido: Vanesa Natalia Rodriguez. ... [Vanesa Rodriguez, Vanesa N. Rodriguez] [[De rufianes y franchutas Representaciones y ... NaN ... NaN [[, Maestría en Ciencias Sociales con Mención ... [[Profesora, Universidad Nacional de La Matanz... 7 Vanesa Natalia Rodriguez NaN NaN NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0
135 0000-0002-1700-8311 1 1 1 Fix-IT Rite NaN [Best Heating & Plumbing Company] [[Website, https://fix-itrite.com], [Muckrack,... NaN ... NaN NaN NaN 1 Fix-It Rite NaN NaN NaN [fix-itrite.com, muckrack.com, tumblr.com, dri... 11.0
136 0000-0003-2676-4431 1 1 1 Benny Soewandi NaN [Benny Soewandi] [[Conservation Efforts as a Result of Theoreti... NaN ... NaN NaN [[Membership, Paguyuban Pelestarian Budaya Ban... 2 Benny Soewandi NaN NaN NaN [wordpress.com, wordpress.com, linkedin.com, f... 11.0
137 0000-0001-8157-0600 1 1 1 Bijan Yavar Senior Research Assistant and Phd Student in O... [B. Yavar, Yavar Bijan] [[Web of Science (Pub) Researcher ID: A-3544-2... NaN ... Scopus Author ID, 56556873600 NaN NaN 6 Scopus - Elsevier NaN NaN NaN [publons.com, articulate.com, zenodo.org, orci... 15.0
138 0000-0002-9874-1450 1 1 1 FENGZHI WU NaN NaN [[A Systematic Study on the Dynamic Softening ... NaN ... NaN NaN NaN 3 FENGZHI WU NaN NaN NaN [springer.com, sciencedirect.com, sciencedirec... 23.0

139 rows × 22 columns

In [51]:
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[51]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email ... external_ids education employment n_works works_source primary_email_domain other_email_domains n_emails url_domains n_urls
1 0000-0002-2000-8339 1 1 1 Phòng khám tư nhân Hà Nội NaN NaN NaN [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN ... NaN NaN NaN 4 Phòng khám tư nhân Hà Nội NaN NaN NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0
2 0000-0001-9293-2224 1 1 1 Juan Carlos Garcia Hoyos My name is Juan Carlos García Hoyos. I was bor... [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... [[Air Force Office of Scientific Research (WRI... NaN ... NaN [[Faculty of Philosophy, History - Ph.D., Char... [[responsible for the Project Service Level Ag... 20 Juan Carlos Garcia Hoyos NaN NaN NaN [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0
3 0000-0003-3045-0056 1 1 1 Ananda Majumdar I am Ananda Majumdar, Child Care Educator at B... NaN [[Migration Scholar and Ananda , https://grfdt... NaN ... NaN [[Education , B.Ed. After Degree , University ... [[General Coordinator- University of Alberta C... 43 Ananda Majumdar NaN NaN NaN [grfdt.com, linkedin.com, academia.edu, resear... 24.0
4 0000-0003-1815-5732 1 1 1 JAS (Jurnal Akuntansi Syariah) JAS (Jurnal Akuntansi Syariah) published in pr... NaN [[Website, https://ejournal.stiesyariahbengkal... NaN ... NaN NaN NaN 67 JAS (Jurnal Akuntansi Syariah) NaN NaN NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0
5 0000-0002-4379-6454 1 1 1 Caroline Wanjiru Kariuki Caroline holds a PhD in Economics from Curtin ... NaN [[Scopus Profile, https://www.scopus.com/dashb... NaN ... NaN [[Economics, Doctor of Philosophy , Curtin Uni... [[Director, Educational Development, Strathmor... 4 Caroline Wanjiru Kariuki NaN NaN NaN [scopus.com, mendeley.com, publons.com, resear... 13.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
132 0000-0001-6352-7086 1 1 1 Susan Hawthorne Susan is a poet, novelist, publisher and Sansk... [S. Hawthorne, Susan C. C. Hawthorne] [[Spinifex Press, http://www.spinifexpress.com... NaN ... ResearcherID, K-6039-2018 [[School of Asian Studies, Honours Sanskrit, A... [[Adjunct Professor, James Cook University, To... 352 Susan Hawthorne NaN NaN NaN [spinifexpress.com.au, linkedin.com, twitter.c... 12.0
133 0000-0002-4062-3603 1 1 1 JUAN DE DIOS BELTRÁN MANCILLA JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... [[01.- Juan de Dios Beltrán Mancilla. Teoría O... NaN ... NaN [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 JUAN DE DIOS BELTR´´ÁN MANCILLA NaN NaN NaN [yumpu.com, ijopm.org, google.com, blogspot.co... 69.0
134 0000-0002-1324-7171 1 1 1 Vanesa Natalia Rodriguez Nombre y Apellido: Vanesa Natalia Rodriguez. ... [Vanesa Rodriguez, Vanesa N. Rodriguez] [[De rufianes y franchutas Representaciones y ... NaN ... NaN [[, Maestría en Ciencias Sociales con Mención ... [[Profesora, Universidad Nacional de La Matanz... 7 Vanesa Natalia Rodriguez NaN NaN NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0
136 0000-0003-2676-4431 1 1 1 Benny Soewandi NaN [Benny Soewandi] [[Conservation Efforts as a Result of Theoreti... NaN ... NaN NaN [[Membership, Paguyuban Pelestarian Budaya Ban... 2 Benny Soewandi NaN NaN NaN [wordpress.com, wordpress.com, linkedin.com, f... 11.0
138 0000-0002-9874-1450 1 1 1 FENGZHI WU NaN NaN [[A Systematic Study on the Dynamic Softening ... NaN ... NaN NaN NaN 3 FENGZHI WU NaN NaN NaN [springer.com, sciencedirect.com, sciencedirec... 23.0

108 rows × 22 columns

Works source

Paste from Miriam

External IDs

External IDs should come from reliable sources. ORCiD registrants cannot add them freely.

In [52]:
df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()
In [53]:
df.n_ids.describe()
Out[53]:
count    1.285292e+06
mean     1.357162e+00
std      6.607097e-01
min      1.000000e+00
25%      1.000000e+00
50%      1.000000e+00
75%      2.000000e+00
max      8.000000e+01
Name: n_ids, dtype: float64
In [54]:
df[df.n_ids == df.n_ids.max()]
Out[54]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email ... education employment n_works works_source primary_email_domain other_email_domains n_emails url_domains n_urls n_ids
9228793 0000-0002-9554-6633 1 1 1 John A Williams NaN NaN [[Aston University profile page, https://resea... NaN ... NaN [[, Aston University, Birmingham, , GB, 1722, ... 91 [Aston Research Explorer] NaN NaN NaN [aston.ac.uk] 1.0 80.0

1 rows × 23 columns

In [55]:
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
In [56]:
ids[ids.provider.notna()]
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-56-45d68792bcdb> in <module>
----> 1 ids[ids.provider.notna()]

~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py in __getattr__(self, name)
   5463             if self._info_axis._can_hold_identifiers_and_holds_name(name):
   5464                 return self[name]
-> 5465             return object.__getattribute__(self, name)
   5466 
   5467     def __setattr__(self, name: str, value) -> None:

AttributeError: 'DataFrame' object has no attribute 'provider'
In [ ]:
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
In [ ]:
ids[ids.provider.notna()].head()
In [ ]:
data = [
    go.Bar(
        x=ids.groupby('provider').count().sort_values('orcid', ascending=False).index,
        y=ids.groupby('provider').count().sort_values('orcid', ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='IDs provided',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [ ]:
pd.unique(ids['provider'])

Keywords

In [ ]:
df['n_keywords'] = df.keywords.str.len()
In [ ]:
df.sort_values('n_keywords', ascending=False)[['orcid', 'n_keywords']]
In [ ]:
data = [
    go.Bar(
        x=df.sort_values('n_keywords', ascending=False)['orcid'][:100],
        y=df.sort_values('n_keywords', ascending=False)['n_keywords'][:100]
    )
]

layout = go.Layout(
    title='Keywords provided',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

Correlation

In [ ]:
fig = px.imshow(df[df.n_ids > 0].corr())
fig.show()