You cannot select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

351 KiB

Explorative analysis

TODO:

  • Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
  • Make casistics (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)
  • Temporal dimension of any use?
  • Can we access private info thanks to the OpenAIRE-ORCID agreement?
In [1]:
import pandas as pd
import ast
import tldextract
import numpy

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go

init_notebook_mode(connected=True)
TOP_N = 30
TOP_RANGE = [-.5, TOP_N - 1 + .5]

Notable solid ORCID iDs for explorative purposes:

In [2]:
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'

Anomalies ORCiD profile

In [3]:
JOURNAL = '0000-0003-1815-5732'
NOINFO= '0000-0001-5009-2052'
# find group-shared ORCiD

Notable fake ORCID iDs for explorative purposes:

In [4]:
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632'      # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311'       # URL > 10 + works 

Load the dataset

In [7]:
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
                         names = ['orcid', 'claimed','verified_email', 'verified_primary_email', 
                                  'given_names', 'family_name', 'biography', 'other_names', 'urls', 
                                  'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', 
                                  'employment', 'n_works', 'works_source'])
In [8]:
df[df.duplicated()]
Out[8]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
7552 0000-0001-7831-7567 1 1 1 Vahab Vahdat NaN NaN NaN NaN NaN NaN [["Scopus Author ID", "57193490305"], ["Scopus... [["Industrial Engineering", "PhD", "Northeaste... [["Post-doctorate fellow", "Harvard Medical Sc... 25 ["Vahab Vahdat", "Scopus - Elsevier", "Multidi...
8416 0000-0001-8161-1345 1 1 1 AYFER TEKIN ATACAN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
16498 0000-0002-1133-1505 1 1 1 Xianrong Lai NaN NaN NaN NaN NaN NaN "Scopus Author ID", "15769435500" [["Department of pharmacy", "Bachelor of Tradi... [["Associate Research, Professor", "Chengdu Un... 115 ["Xianrong Lai", "Scopus - Elsevier", "Crossref"]
16830 0000-0002-1257-5536 1 1 1 Alexandra Zimmer NaN NaN NaN NaN NaN NaN NaN NaN [["Research assistent", "Fraunhofer-Institut f... 0 NaN
18835 0000-0002-2026-4156 1 1 1 Fatma Sri Wahyuni NaN ["Ayu"] NaN NaN NaN NaN [["ResearcherID", "C-5194-2015"], ["Scopus Aut... [["Biosains", "PHD", "Universiti Putra Malaysi... [["Lecturer", "Universitas Andalas", "Padang",... 27 ["Publons", "Crossref Metadata Search", "Scopu...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10733293 0000-0002-9887-7788 1 1 1 Markéta Laštůvková NaN NaN NaN NaN NaN NaN NaN NaN [["", "VSB - Technical University of Ostrava",... 0 NaN
10737258 0000-0003-1367-8104 1 1 1 LORENA GUTIÉRREZ GARCÍA NaN NaN [["LinkedIn", "https://www.linkedin.com/in/lor... lorenagg@unex.es NaN ["Agroecolog\u00eda, Bot\u00e1nica, Did\u00e1c... "ResearcherID", "AAE-6316-2021" [["", "M\u00e1ster en Formaci\u00f3n del profe... [["PCI", "Universidad de Extremadura - Campus ... 14 ["Multidisciplinary Digital Publishing Institu...
10738308 0000-0003-1741-3437 1 1 1 Xing Liu NaN NaN NaN NaN NaN NaN "ResearcherID", "S-3053-2017" NaN NaN 0 NaN
10741460 0000-0003-2909-8585 1 1 1 Yusuf Özcan NaN NaN NaN NaN NaN NaN NaN [["\u0130lahiyat Fak\u00fcltesi", "Doktora", "... [["Research Assistant", "\u00c7ukurova Univers... 0 NaN
10745078 0000-0003-4259-5324 1 1 1 P Rama Mohan NaN NaN NaN NaN NaN NaN NaN "Scopus Author ID", "24776757000" [["EEE Department", "Ph.D. (Power Electronics ... [["Associate Professor", "RGM College of Engin... 21 ["Scopus - Elsevier", "P Rama Mohan"]

2418 rows × 17 columns

In [9]:
df.drop_duplicates(inplace=True)

Basic column manipulation (interpret columns as lists when necessary)

In [10]:
df.loc[df.other_names.notna(), 'other_names'] = df.loc[df.other_names.notna(), 'other_names'].apply(lambda x: ast.literal_eval(x))
In [11]:
df.loc[df.keywords.notna(), 'keywords'] = df.loc[df.keywords.notna(), 'keywords'].apply(lambda x: ast.literal_eval(x))
In [12]:
df.loc[df.urls.notna(), 'urls'] = df.loc[df.urls.notna(), 'urls'].apply(lambda x: ast.literal_eval(x))
In [13]:
df.loc[df.other_emails.notna(), 'other_emails'] = df.loc[df.other_emails.notna(), 'other_emails'].apply(lambda x: ast.literal_eval(x))
In [14]:
df.loc[df.education.notna(), 'education'] = df.loc[df.education.notna(), 'education'].apply(lambda x: ast.literal_eval(x))
In [15]:
df.loc[df.employment.notna(), 'employment'] = df.loc[df.employment.notna(), 'employment'].apply(lambda x: ast.literal_eval(x))
In [16]:
df.loc[df.external_ids.notna(), 'external_ids'] = df.loc[df.external_ids.notna(), 'external_ids'].apply(lambda x: ast.literal_eval(x))
In [17]:
df.loc[df.works_source.notna(), 'works_source'] = df.loc[df.works_source.notna(), 'works_source'].apply(lambda x: ast.literal_eval(x))
In [18]:
df.head(5)
Out[18]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
0 0000-0001-5000-2053 1 0 0 Jorge Jaramillo Sanchez NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
1 0000-0001-5000-6548 1 0 0 Wiseman Bekelesi NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
2 0000-0001-5000-7962 1 1 1 ALICE INDIMULI NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
3 0000-0001-5000-8586 1 0 0 shim ji yun NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
4 0000-0001-5001-0256 1 0 0 Sandro Caramaschi NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN
In [19]:
df[df['orcid'] == AM]
Out[19]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
8840413 0000-0002-5193-7851 1 1 1 Andrea Mannocci NaN NaN [[Personal website, https://andremann.github.i... andrea.mannocci@isti.cnr.it NaN [Data science , science of science, scholarly ... Scopus Author ID, 55233589900 [[Information engineering, Ph.D., Università d... [[Research Associate, Istituto di Scienza e Te... 37 [Scopus - Elsevier, Crossref Metadata Search, ...
In [20]:
df[df['orcid'] == WHATSAPP]
Out[20]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
9517099 0000-0001-6997-9470 1 1 1 other whatsapp NaN NaN [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN NaN [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... NaN NaN NaN 0 NaN
In [21]:
df.count()
Out[21]:
orcid                     10744622
claimed                   10744622
verified_email            10744622
verified_primary_email    10744622
given_names               10716789
family_name               10437094
biography                   333885
other_names                 544550
urls                        688262
primary_email               121476
other_emails                 47470
keywords                    638634
external_ids               1285292
education                  2402440
employment                 2626670
n_works                   10744622
works_source               2671906
dtype: int64
In [22]:
df[df['orcid'] == '0000-0002-5154-6404']
Out[22]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
4595263 0000-0002-5154-6404 1 1 1 Olusola Bamisile NaN NaN NaN NaN NaN NaN NaN [[Energy Systems Engineering , Doctoral, Cypru... [[, University of Electronic Science and Techn... 3 [Multidisciplinary Digital Publishing Institut...
4595264 0000-0002-5154-6404 1 1 1 Olusola Bamisile NaN NaN NaN NaN NaN NaN NaN [[Energy Systems Engineering , Doctoral, Cypru... [[, University of Electronic Science and Techn... 2 [Crossref]
In [23]:
df.drop(index=4595264, inplace=True)
In [24]:
df['orcid'].describe()
Out[24]:
count                10744621
unique               10744621
top       0000-0002-3376-9946
freq                        1
Name: orcid, dtype: object

Primary email

In [25]:
df['primary_email'].describe()
Out[25]:
count               121476
unique              121473
top       maykin@owasp.org
freq                     2
Name: primary_email, dtype: object

Dupe emails

In [26]:
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[26]:
7483666             maykin@owasp.org
9068234       opercin@erbakan.edu.tr
10246485    patrick.davey@monash.edu
Name: primary_email, dtype: object
In [27]:
df[df['primary_email'] == 'maykin@owasp.org']
Out[27]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
3776350 0000-0002-0836-2271 1 1 1 Maykin Warasart NaN NaN NaN maykin@owasp.org [maykin@dga.or.th] NaN NaN NaN NaN 0 NaN
7483666 0000-0001-9855-1676 1 1 1 Maykin Warasart NaN NaN NaN maykin@owasp.org [maykin@dga.or.th, maykin@ieee.org] NaN NaN NaN NaN 0 NaN
In [28]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[28]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
3995032 0000-0002-2232-9638 1 1 1 Osman Perçin NaN NaN NaN opercin@erbakan.edu.tr NaN NaN NaN NaN NaN 0 NaN
9068234 0000-0003-0033-0918 1 1 1 Osman PERÇİN NaN NaN NaN opercin@erbakan.edu.tr NaN NaN NaN NaN [[, Necmettin Erbakan University, Konya, , TR,... 0 NaN
In [29]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[29]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source
5087745 0000-0002-8774-0030 1 1 1 Patrick Davey NaN NaN NaN patrick.davey@monash.edu NaN NaN NaN NaN [[PhD Student, Monash University, Melbourne, V... 1 [Crossref]
10246485 0000-0002-9158-1757 1 1 1 Patrick Davey NaN NaN NaN patrick.davey@monash.edu NaN [Radiopharmaceuticals, Inorganic Chemistry, Bi... NaN NaN [[PhD Student, Monash University, Melbourne, ,... 0 NaN
In [30]:
df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)
In [31]:
df['primary_email_domain'].describe()
Out[31]:
count        121476
unique        17047
top       gmail.com
freq          25892
Name: primary_email_domain, dtype: object
In [32]:
primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)
primary_emails
Out[32]:
orcid
primary_email_domain
gmail.com 25892
hotmail.com 3674
yahoo.com 2578
163.com 2067
yuhs.ac 1124
... ...
iiap.gob.pe 1
iiap.org.pe 1
iibb.csic.es 1
iic.hokudai.ac.jp 1
zzuli.edu.cn 1

17047 rows × 1 columns

In [33]:
data = [
    go.Bar(
        x=primary_emails[:30].sort_values(by=['orcid'], ascending=False).index,
        y=primary_emails[:30].sort_values(by=['orcid'], ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='Top 30 email domains',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

Other emails

In [34]:
def extract_email_domains(lst):
    res = []
    for email in lst:
        res.append(email.split('@')[1])
    return res
In [35]:
df['other_email_domains'] = df['other_emails'].apply(lambda x: extract_email_domains(x) if isinstance(x, list) else x)
In [36]:
df[df['other_email_domains'].notna()].head()
Out[36]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains
34 0000-0001-5011-9833 1 1 1 Mark Kilbane NaN NaN NaN mark.kilbane@seh.ox.ac.uk [mark.kilbane@bsg.ox.ac.uk] NaN NaN [[Blavatnik School of Government; St Edmund Ha... NaN 0 NaN seh.ox.ac.uk [bsg.ox.ac.uk]
47 0000-0001-5017-1295 1 1 1 Xinfeng Tang NaN NaN NaN NaN [tang.xinfeng@foxmail.com] NaN Scopus Author ID, 56927186900 [[, , University of Hong Kong, Hong Kong, , HK... NaN 11 [Scopus - Elsevier, Xinfeng Tang] NaN [foxmail.com]
299 0000-0001-5109-3989 1 1 1 colin tysall NaN NaN NaN NaN [colin.tysall@nhs.net] NaN NaN NaN [[Associate Mental Health Act Manager, Coventr... 0 NaN NaN [nhs.net]
868 0000-0001-5320-1277 1 1 1 Gökhan KESKİN NaN NaN NaN 2012001598@stu.adu.edu.tr [gokhankkeskin@gmail.com] NaN NaN NaN [[, Adnan Menderes University, Aydin, , TR, gr... 0 NaN stu.adu.edu.tr [gmail.com]
1176 0000-0001-5434-9994 1 1 1 Elena Borucu NaN NaN NaN lenapasali@gmail.com [epasali@yildiz.edu.tr] NaN NaN NaN NaN 0 NaN gmail.com [yildiz.edu.tr]
In [37]:
other_emails = df[['orcid', 'other_email_domains']].explode('other_email_domains').reset_index(drop=True)
In [38]:
grouped_other_emails = other_emails.groupby('other_email_domains').count().sort_values('orcid', ascending=False)
grouped_other_emails
Out[38]:
orcid
other_email_domains
gmail.com 10856
hotmail.com 1521
yahoo.com 1263
163.com 763
qq.com 755
... ...
ifzz.pan.pl 1
ig.ufpa.br 1
ig.ufu.br 1
ig.utexas.edu 1
zzuli.edu.cn 1

12795 rows × 1 columns

In [39]:
data = [
    go.Bar(
        x=grouped_other_emails[:30].sort_values(by=['orcid'], ascending=False).index,
        y=grouped_other_emails[:30].sort_values(by=['orcid'], ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='Top 30 other email domains',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [40]:
other_emails.groupby('orcid').count().sort_values('other_email_domains', ascending=False)
Out[40]:
other_email_domains
orcid
0000-0003-4171-3835 12
0000-0001-6239-2968 9
0000-0003-2290-2817 7
0000-0003-2151-4089 7
0000-0001-9084-3156 6
... ...
0000-0002-1678-0668 0
0000-0002-1678-0684 0
0000-0002-1678-0705 0
0000-0002-1678-0713 0
0000-0003-5000-0001 0

10744621 rows × 1 columns

Email speculation

In [41]:
df[df['primary_email'].isna() & df['other_emails'].notna()]
Out[41]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains
47 0000-0001-5017-1295 1 1 1 Xinfeng Tang NaN NaN NaN NaN [tang.xinfeng@foxmail.com] NaN Scopus Author ID, 56927186900 [[, , University of Hong Kong, Hong Kong, , HK... NaN 11 [Scopus - Elsevier, Xinfeng Tang] NaN [foxmail.com]
299 0000-0001-5109-3989 1 1 1 colin tysall NaN NaN NaN NaN [colin.tysall@nhs.net] NaN NaN NaN [[Associate Mental Health Act Manager, Coventr... 0 NaN NaN [nhs.net]
1296 0000-0001-5476-0126 1 1 1 Aura Windy Hernández Cetina NaN NaN NaN NaN [u0902038@unimilitar.edu.co] NaN NaN [[, Profesional en Relaciones Internacionales ... [[Asistente de Investigación, Pontificia Unive... 1 [Aura Windy Hernández Cetina] NaN [unimilitar.edu.co]
1429 0000-0001-5522-427X 1 1 1 Süleyman Özen NaN NaN [[Academic CV, https://akademik.yok.gov.tr/Aka... NaN [suleyman.ozen@btu.edu.tr] [construction materials, superplasticizers, co... Scopus Author ID, 57188750603 [[Civil Engineering, MSc and PhD, Uludağ Unive... [[Dr., Bursa Technical University, Bursa, , TR... 7 [Scopus - Elsevier, Crossref] NaN [btu.edu.tr]
1628 0000-0001-5597-3115 1 1 1 Wade Harrison NaN NaN NaN NaN [wade_harrison@unc.edu] NaN NaN [[, MD, Dartmouth College Geisel School of Med... [[Clinical Instructor / Research Fellow, Unive... 7 [Wade Harrison] NaN [unc.edu]
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10743658 0000-0003-3740-8352 1 1 1 Rui Zhang NaN NaN NaN NaN [zhang-r15@mails.tsinghua.edu.cn] [Lithium metal batteries, Graphene] ResearcherID, B-3843-2015 [[Department of Chemical Engineering, Ph.D. st... NaN 15 [ResearcherID, Crossref] NaN [mails.tsinghua.edu.cn]
10744876 0000-0003-4192-6451 1 1 1 Sanjib Raj Pandey NaN NaN [[Personal, https://www.sanjibpandey.wix.com/p... NaN [srpandey@gmail.com] NaN NaN [[Computing and Information System, PhD, Unive... [[Software Developer & Research Associate, Oxl... 11 [BASE - Bielefeld Academic Search Engine, Dr. ... NaN [gmail.com]
10745274 0000-0003-4333-9728 1 1 1 Mario De la Fuente Lloreda Person in charge to coordinate the scientific ... [M.de la Fuente, De la Fuente, M.] [[researchgate profile, https://www.researchga... NaN [mariofuente@gmail.com] [vineyard management, grapevine, viticulture, ... Scopus Author ID, 47960975000 [[Producción Vegetal, Doctor en Viticultura, U... NaN 3 [Scopus - Elsevier] NaN [gmail.com]
10745417 0000-0003-4383-4745 1 1 1 Jie Yang NaN NaN NaN NaN [jyang@esat.kuleuven.be] NaN NaN [[faculty of engineering science, Dr., KU Leuv... NaN 0 NaN NaN [esat.kuleuven.be]
10746702 0000-0003-4878-2737 1 1 1 Aleksey Adamtsevich NaN NaN [[Moscow State University of Civil Engineering... NaN [AdamtsevichAO@mgsu.ru] [concrete, calorimetry, cement, construction, ... [[Scopus Author ID, 56301531000], [ResearcherI... [[, Engineer (Industrial and Civil Engineering... [[Senior Researcher, Moscow State University o... 25 [Scopus - Elsevier, ResearcherID] NaN [mgsu.ru]

19409 rows × 19 columns

URLs

In [42]:
def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        domain = tldextract.extract(e[1])
        domains.append(domain.registered_domain)
    return domains
In [43]:
df['url_domains'] = df['urls'].apply(lambda x: extract_url_domains(x) if isinstance(x, list) else x)
In [44]:
df[df['url_domains'].notna()].head()
Out[44]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains url_domains
5 0000-0001-5001-4994 1 1 1 Siren Rühs I am an oceanographer studying the interannual... [Siren Ruehs] [[ResearchGate, https://www.researchgate.net/p... NaN NaN NaN NaN NaN NaN 11 [Siren Rühs] NaN NaN [researchgate.net]
14 0000-0001-5004-7761 1 1 1 scaffolding hire NaN [The first feature that you have to check in t... [[scaffolding hire Wellington, https://www.tig... NaN NaN [scaffolding hire Wellington] NaN NaN NaN 0 NaN NaN NaN [tigerscaffolds.co.nz]
15 0000-0001-5005-0557 1 1 1 Sen RT NaN NaN [[Research on Psychology, psychiatry, Genetics... NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN [corticalbrain.com]
29 0000-0001-5009-8091 1 1 1 Gabriela Madruga Possui graduação em Medicina Veterinaria pela ... [Gabriela Morais Madruga] [[Curriculo lattes, http://buscatextual.cnpq.b... NaN NaN [veterinary ophthalmology] NaN [[Surgery in small animal, PhD, Universidade E... [[PhD , University of Minnesota, Minneapolis, ... 14 [Gabriela Madruga] NaN NaN [cnpq.br]
30 0000-0001-5010-9539 1 1 1 Sangram Keshari Sahu NaN [sk-sahu] Academic webpage, https://sksahu.net NaN NaN [Computational Genomics and Bioinformatics] Loop profile, 1098977 [[Centre for Bioinformatics, M.Sc. Bioinformat... [[Bioinformatics Junior Research Fellow, India... 3 [Crossref Metadata Search, Sangram Keshari Sahu] NaN NaN [sksahu.net]
In [45]:
urls = df[['orcid', 'url_domains']].explode('url_domains').reset_index(drop=True)
In [46]:
grouped_urls = urls.groupby('url_domains').count().sort_values('orcid', ascending=False)
grouped_urls
Out[46]:
orcid
url_domains
linkedin.com 75344
researchgate.net 66267
google.com 43468
cnpq.br 23936
academia.edu 20786
... ...
gerberpumps.com 1
gerbilvis.org 1
gercekmedyumlar.org 1
gerceksiyaset.com 1
политуправление.рф 1

193320 rows × 1 columns

In [47]:
data = [
    go.Bar(
        x=grouped_urls[:30].sort_values(by=['orcid'], ascending=False).index,
        y=grouped_urls[:30].sort_values(by=['orcid'], ascending=False)['orcid']
    )
]

layout = go.Layout(
    title='Top 30 URL domains',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [48]:
grouped_most_domains = urls.groupby('orcid').count().sort_values('url_domains', ascending=False)
grouped_most_domains
Out[48]:
url_domains
orcid
0000-0002-1234-835X 219
0000-0001-7478-4539 174
0000-0002-7392-3792 169
0000-0002-6938-9638 152
0000-0003-2450-090X 114
... ...
0000-0002-1883-0569 0
0000-0002-1883-0577 0
0000-0002-1883-0585 0
0000-0002-1883-0606 0
0000-0003-5000-0001 0

10744621 rows × 1 columns

In [49]:
data = [
    go.Bar(
        x=grouped_most_domains[:100].sort_values(by=['url_domains'], ascending=False).index,
        y=grouped_most_domains[:100].sort_values(by=['url_domains'], ascending=False)['url_domains']
    )
]

layout = go.Layout(
    title='Top 100 ORCID with URLs',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [50]:
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[50]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains url_domains
482862 0000-0003-4948-9268 1 1 1 Gustavo Duperré Gustavo Norberto Duperré graduated in Arts and... [Gustavo Norberto Duperré, Duperré, G. N.] [[Gis in Cultural Heritage - ICOMOS România, h... gustavo.duperre@usal.edu.ar NaN [History of Art, Humanities, International Coo... [[Scopus Author ID, 57195936346], [ResearcherI... [[Programme in History, History of Art and Ter... [[Titular Professor, Dirección General de Cult... 13 [Gustavo Duperré, Scopus - Elsevier, Publons, ... usal.edu.ar NaN [icomos.ro, unirioja.es, unirioja.es, unc.edu....
554859 0000-0002-1929-6054 1 1 1 Franklin Américo Canaza Choque Docente-Investigador Social. Maestrando en Der... [Franklin Américo Canaza-Choque , Franklin A. ... [[Consejo Nacional de Ciencia, Tecnología e In... Leo_123fa@hotmail.com [Leoameric123@gmail.com, Frankmericnazac@gmail... [Justicia Global; Democracia; Derechos Humanos... [[ResearcherID, P-8613-2018], [Loop profile, 8... [[Facultad de Ciencias de la Educación , Maest... [[Investigador Social, Universidad Católica de... 38 [ResearcherID, BASE - Bielefeld Academic Searc... hotmail.com [gmail.com, gmail.com, hotmail.com, baldwin.ed... [concytec.gob.pe, redalyc.org, redalyc.org, un...
1381092 0000-0002-9025-8632 1 1 1 buycannabis dispensary We procure and deliver premium cannabis strain... [We procure and deliver premium cannabis strai... [[find your cannabis & marijuana dispensary , ... NaN NaN [cannabis, cannabis culture, cannabis communit... NaN NaN NaN 10 [goowonderland dispensary] NaN NaN [goowonderland.com, goowonderland.com, goowond...
2679353 0000-0003-2407-3557 1 1 1 Abdul Aziz Abdul Aziz was born on May 25, 1973, in Brebes... [Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,... [[Google Scholar, https://scholar.google.com/c... NaN NaN [Ilmu Ekonomi, Ekonomi Islam, Metodologi Penel... NaN [[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak... [[Assisten Professor/Dr, Institut Agama Islam ... 72 [BASE - Bielefeld Academic Search Engine, Abdu... NaN NaN [google.com, syekhnurjati.ac.id, orcid.org, bl...
3354430 0000-0002-3920-7389 1 1 1 А. Гусев Surname, Name Gusev Alexander LeonidovichDate... [Alexander L. Gusev , Alexander Leonidovich Gu... [[A.L. Gusev Alternative Energy and Ecology, ... NaN NaN [Supercapacitors, Electrochromic, Photochromic... [[ResearcherID, F-8048-2014], [Scopus Author I... [[Chemical technology and cryogenic-vacuum tec... [[General Director, Scientific Technical Centr... 472 [Publons, DataCite, Scopus - Elsevier, A.L. Gu... NaN NaN [youtube.com, isjaee.com, researchgate.net, re...
4004281 0000-0002-5710-4041 1 1 1 Ryszard Romaniuk Professor of Electronics and Communications En... [R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R... [[Scholar Google, http://scholar.google.pl/cit... rrom@ise.pw.edu.pl [R.Romaniuk@ise.pw.edu.pl, R.Romaniuk@elka.pw.... [telecommunications, photonics, measurement sy... [[ISNI, 0000000071432485], [ResearcherID, B-91... [[Faculty of Electronics and Information Techn... [[Professor, Institute Director, Politechnika ... 5008 [INSPIRE-HEP, ResearcherID, ISNI2ORCID search ... ise.pw.edu.pl [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] [google.pl, publons.com, scopus.com, mendeley....
4022480 0000-0003-2450-090X 1 1 1 Eduard Babulak Professor Eduard Babulak is accomplished inter... [Professor Eduard Babulak] [[Honorary Chair, Chief Mentor & Senior Adviso... NaN NaN [Computer Security, Computer Networking, Inter... [[Scopus Author ID, 6506867432], [ResearcherID... [[Information Technology, Doctor Habilitated (... [[Consultant, Horizon 2020 Framework Programme... 274 [The Lens, BASE - Bielefeld Academic Search En... NaN NaN [worldassessmentcouncil.org, spseke.sk, bcs.or...
6335357 0000-0003-2593-7134 1 1 1 Aan Jaelani All my papers can be downloaded from portal:Re... [Jaelani, A., Jaelani, Aan] [[Microsoft Academic Research, https://academi... aan_jaelani@syekhnurjati.ac.id [iainanjal@gmail.com] [Islamic Economics, Tourism Industry, Islamic ... [[Scopus Author ID, 57195963463], [Loop profil... [[Post Graduate, S3/Dr, Universitas Islam Nege... [[Dr, Institut Agama Islam Negeri Syekh Nurjat... 79 [Publons, Aan Jaelani, Scopus - Elsevier, Dime... syekhnurjati.ac.id [gmail.com] [microsoft.com, twitter.com, academia.edu, aca...
6489838 0000-0002-9965-2425 1 1 1 Jaroslaw Spychala Jaroslaw Spychala has received a doctoral degr... [Jaroslaw Jozef Spychala] [[RESUME, http://www.biowebspin.com/wp-content... NaN NaN [organic chemistry, biochemistry, photochemist... Scopus Author ID, 7006745874 [[Department of Chemistry, Postdoctoral Associ... [[Assistant Professor, Adam Mickiewicz Univers... 29 [Scopus - Elsevier] NaN NaN [biowebspin.com, biowebspin.com, google.com, l...
7570584 0000-0003-2183-8112 1 1 1 Pelayo Munhoz Olea Pós-Doutorado em Gestão Ambiental pela Univers... [ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P... [[Currículo Lattes, http://lattes.cnpq.br/6209... NaN NaN [Inovação, Empreendedorismo, Sustentabilidade] [[Scopus Author ID, 55175503300], [ResearcherI... [[, Postdoctoral in Environmental Sustainabili... [[Professor, Universidade Federal do Rio Grand... 1105 [The Lens, Pelayo Munhoz Olea, Dimensions, BAS... NaN NaN [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c...
10240510 0000-0002-6938-9638 1 1 1 Adolfo Catral Sanabria My education is in computer science, mathemati... NaN [[ResearchGate Adolfo Catral , https://www.res... NaN NaN NaN Loop profile, 747193 [[Education, Capacitación para la enseñanza en... NaN 2023 [BASE - Bielefeld Academic Search Engine, Data... NaN NaN [researchgate.net, youtube.com, linkedin.com, ...
10448304 0000-0002-4062-3603 1 1 1 JUAN DE DIOS BELTRÁN MANCILLA JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... [[01.- Juan de Dios Beltrán Mancilla. Teoría O... NaN NaN [FILOSOFIA MEDICINA ARQUITECTURA ECONOMÍA DERE... NaN [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 [JUAN DE DIOS BELTR´´ÁN MANCILLA] NaN NaN [yumpu.com, ijopm.org, google.com, blogspot.co...
10663894 0000-0002-3997-5070 1 1 1 Dr. Parameshachari B D Dr. Parameshachari B DACM Distinguished Speake... [Dr. PARAMESHACHARI B D] [[GSSSIETW,MYSURU, http://geethashishu.in/], [... NaN NaN [Professor & Head |Dept. of TCE| GSSSIET for W... [[ResearcherID, F-7045-2018], [Scopus Author I... [[Electronics and Communication Engineering, P... [[ACM Distinguished Speaker (Volunteer), Assoc... 93 [Publons, Multidisciplinary Digital Publishing... NaN NaN [geethashishu.in, geethashishu.in, acm.org, go...
In [51]:
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[51]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains url_domains
45566 0000-0003-1948-3180 1 1 1 Mark Katz Mark N. Katz is a professor of government and ... NaN [[Adjusting to Change: American Foreign Policy... NaN NaN NaN Scopus Author ID, 25649901800 [[Political Science, Ph.D., Massachusetts Inst... [[Professor of Government and Politics, George... 58 [Scopus - Elsevier] NaN NaN [wordpress.com, marknkatz.com, gmu.edu, atlant...
72674 0000-0002-2000-8339 1 1 1 Phòng khám tư nhân Hà Nội NaN NaN NaN [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN NaN NaN NaN NaN NaN 4 [Phòng khám tư nhân Hà Nội] NaN NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal...
172820 0000-0001-9293-2224 1 1 1 Juan Carlos Garcia Hoyos My name is Juan Carlos García Hoyos. I was bor... [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... [[Air Force Office of Scientific Research (WRI... NaN NaN [Exolinguistics, Ethnoastronomy, Sociology of ... NaN [[Faculty of Philosophy, History - Ph.D., Char... [[responsible for the Project Service Level Ag... 20 [Juan Carlos Garcia Hoyos] NaN NaN [af.mil, gst.com, govtribe.com, sbir.gov, open...
209505 0000-0003-3045-0056 1 1 1 Ananda Majumdar I am Ananda Majumdar, Child Care Educator at B... NaN [[Migration Scholar and Ananda , https://grfdt... NaN NaN NaN NaN [[Education , B.Ed. After Degree , University ... [[General Coordinator- University of Alberta C... 43 [Ananda Majumdar] NaN NaN [grfdt.com, linkedin.com, academia.edu, resear...
259877 0000-0003-1815-5732 1 1 1 JAS (Jurnal Akuntansi Syariah) JAS (Jurnal Akuntansi Syariah) published in pr... NaN [[Website, https://ejournal.stiesyariahbengkal... NaN NaN [Akuntansi, Akuntansi Syariah] NaN NaN NaN 67 [JAS (Jurnal Akuntansi Syariah)] NaN NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10494820 0000-0002-1324-7171 1 1 1 Vanesa Natalia Rodriguez Nombre y Apellido: Vanesa Natalia Rodriguez. ... [Vanesa Rodriguez, Vanesa N. Rodriguez] [[De rufianes y franchutas Representaciones y ... NaN NaN [Historia - Prostitución - Mujeres - Enfermeda... NaN [[, Maestría en Ciencias Sociales con Mención ... [[Profesora, Universidad Nacional de La Matanz... 7 [Vanesa Natalia Rodriguez] NaN NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook...
10495806 0000-0002-1700-8311 1 1 1 Fix-IT Rite NaN [Best Heating & Plumbing Company] [[Website, https://fix-itrite.com], [Muckrack,... NaN NaN [Plumber, Appliance, Refrigerator, Repair , Se... NaN NaN NaN 1 [Fix-It Rite] NaN NaN [fix-itrite.com, muckrack.com, tumblr.com, dri...
10633545 0000-0003-2676-4431 1 1 1 Benny Soewandi NaN [Benny Soewandi] [[Conservation Efforts as a Result of Theoreti... NaN NaN [Researchers-Conservator for the Architectural... NaN NaN [[Membership, Paguyuban Pelestarian Budaya Ban... 2 [Benny Soewandi] NaN NaN [wordpress.com, wordpress.com, linkedin.com, f...
10648241 0000-0001-8157-0600 1 1 1 Bijan Yavar Senior Research Assistant and Phd Student in O... [B. Yavar, Yavar Bijan] [[Web of Science (Pub) Researcher ID: A-3544-2... NaN NaN [Certainty and Uncertainty, Risk Analysis (Qua... Scopus Author ID, 56556873600 NaN NaN 6 [Scopus - Elsevier] NaN NaN [publons.com, articulate.com, zenodo.org, orci...
10679699 0000-0002-9874-1450 1 1 1 FENGZHI WU NaN NaN [[A Systematic Study on the Dynamic Softening ... NaN NaN NaN NaN NaN NaN 3 [FENGZHI WU] NaN NaN [springer.com, sciencedirect.com, sciencedirec...

139 rows × 20 columns

In [52]:
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
Out[52]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains url_domains
0 0000-0003-1948-3180 1 1 1 Mark Katz Mark N. Katz is a professor of government and ... NaN [[Adjusting to Change: American Foreign Policy... NaN NaN NaN Scopus Author ID, 25649901800 [[Political Science, Ph.D., Massachusetts Inst... [[Professor of Government and Politics, George... 58 Scopus - Elsevier NaN NaN [wordpress.com, marknkatz.com, gmu.edu, atlant...
1 0000-0002-2000-8339 1 1 1 Phòng khám tư nhân Hà Nội NaN NaN NaN [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN NaN NaN NaN NaN NaN 4 Phòng khám tư nhân Hà Nội NaN NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal...
2 0000-0001-9293-2224 1 1 1 Juan Carlos Garcia Hoyos My name is Juan Carlos García Hoyos. I was bor... [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... [[Air Force Office of Scientific Research (WRI... NaN NaN [Exolinguistics, Ethnoastronomy, Sociology of ... NaN [[Faculty of Philosophy, History - Ph.D., Char... [[responsible for the Project Service Level Ag... 20 Juan Carlos Garcia Hoyos NaN NaN [af.mil, gst.com, govtribe.com, sbir.gov, open...
3 0000-0003-3045-0056 1 1 1 Ananda Majumdar I am Ananda Majumdar, Child Care Educator at B... NaN [[Migration Scholar and Ananda , https://grfdt... NaN NaN NaN NaN [[Education , B.Ed. After Degree , University ... [[General Coordinator- University of Alberta C... 43 Ananda Majumdar NaN NaN [grfdt.com, linkedin.com, academia.edu, resear...
4 0000-0003-1815-5732 1 1 1 JAS (Jurnal Akuntansi Syariah) JAS (Jurnal Akuntansi Syariah) published in pr... NaN [[Website, https://ejournal.stiesyariahbengkal... NaN NaN [Akuntansi, Akuntansi Syariah] NaN NaN NaN 67 JAS (Jurnal Akuntansi Syariah) NaN NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
134 0000-0002-1324-7171 1 1 1 Vanesa Natalia Rodriguez Nombre y Apellido: Vanesa Natalia Rodriguez. ... [Vanesa Rodriguez, Vanesa N. Rodriguez] [[De rufianes y franchutas Representaciones y ... NaN NaN [Historia - Prostitución - Mujeres - Enfermeda... NaN [[, Maestría en Ciencias Sociales con Mención ... [[Profesora, Universidad Nacional de La Matanz... 7 Vanesa Natalia Rodriguez NaN NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook...
135 0000-0002-1700-8311 1 1 1 Fix-IT Rite NaN [Best Heating & Plumbing Company] [[Website, https://fix-itrite.com], [Muckrack,... NaN NaN [Plumber, Appliance, Refrigerator, Repair , Se... NaN NaN NaN 1 Fix-It Rite NaN NaN [fix-itrite.com, muckrack.com, tumblr.com, dri...
136 0000-0003-2676-4431 1 1 1 Benny Soewandi NaN [Benny Soewandi] [[Conservation Efforts as a Result of Theoreti... NaN NaN [Researchers-Conservator for the Architectural... NaN NaN [[Membership, Paguyuban Pelestarian Budaya Ban... 2 Benny Soewandi NaN NaN [wordpress.com, wordpress.com, linkedin.com, f...
137 0000-0001-8157-0600 1 1 1 Bijan Yavar Senior Research Assistant and Phd Student in O... [B. Yavar, Yavar Bijan] [[Web of Science (Pub) Researcher ID: A-3544-2... NaN NaN [Certainty and Uncertainty, Risk Analysis (Qua... Scopus Author ID, 56556873600 NaN NaN 6 Scopus - Elsevier NaN NaN [publons.com, articulate.com, zenodo.org, orci...
138 0000-0002-9874-1450 1 1 1 FENGZHI WU NaN NaN [[A Systematic Study on the Dynamic Softening ... NaN NaN NaN NaN NaN NaN 3 FENGZHI WU NaN NaN [springer.com, sciencedirect.com, sciencedirec...

139 rows × 20 columns

In [53]:
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[53]:
orcid claimed verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails keywords external_ids education employment n_works works_source primary_email_domain other_email_domains url_domains
1 0000-0002-2000-8339 1 1 1 Phòng khám tư nhân Hà Nội NaN NaN NaN [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN NaN NaN NaN NaN NaN 4 Phòng khám tư nhân Hà Nội NaN NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal...
2 0000-0001-9293-2224 1 1 1 Juan Carlos Garcia Hoyos My name is Juan Carlos García Hoyos. I was bor... [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... [[Air Force Office of Scientific Research (WRI... NaN NaN [Exolinguistics, Ethnoastronomy, Sociology of ... NaN [[Faculty of Philosophy, History - Ph.D., Char... [[responsible for the Project Service Level Ag... 20 Juan Carlos Garcia Hoyos NaN NaN [af.mil, gst.com, govtribe.com, sbir.gov, open...
3 0000-0003-3045-0056 1 1 1 Ananda Majumdar I am Ananda Majumdar, Child Care Educator at B... NaN [[Migration Scholar and Ananda , https://grfdt... NaN NaN NaN NaN [[Education , B.Ed. After Degree , University ... [[General Coordinator- University of Alberta C... 43 Ananda Majumdar NaN NaN [grfdt.com, linkedin.com, academia.edu, resear...
4 0000-0003-1815-5732 1 1 1 JAS (Jurnal Akuntansi Syariah) JAS (Jurnal Akuntansi Syariah) published in pr... NaN [[Website, https://ejournal.stiesyariahbengkal... NaN NaN [Akuntansi, Akuntansi Syariah] NaN NaN NaN 67 JAS (Jurnal Akuntansi Syariah) NaN NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross...
5 0000-0002-4379-6454 1 1 1 Caroline Wanjiru Kariuki Caroline holds a PhD in Economics from Curtin ... NaN [[Scopus Profile, https://www.scopus.com/dashb... NaN NaN [Applied Econometrics, Development Economics, ... NaN [[Economics, Doctor of Philosophy , Curtin Uni... [[Director, Educational Development, Strathmor... 4 Caroline Wanjiru Kariuki NaN NaN [scopus.com, mendeley.com, publons.com, resear...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
132 0000-0001-6352-7086 1 1 1 Susan Hawthorne Susan is a poet, novelist, publisher and Sansk... [S. Hawthorne, Susan C. C. Hawthorne] [[Spinifex Press, http://www.spinifexpress.com... NaN NaN [Womens Studies, Philosophy, Ancient Greek, Sa... ResearcherID, K-6039-2018 [[School of Asian Studies, Honours Sanskrit, A... [[Adjunct Professor, James Cook University, To... 352 Susan Hawthorne NaN NaN [spinifexpress.com.au, linkedin.com, twitter.c...
133 0000-0002-4062-3603 1 1 1 JUAN DE DIOS BELTRÁN MANCILLA JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... [[01.- Juan de Dios Beltrán Mancilla. Teoría O... NaN NaN [FILOSOFIA MEDICINA ARQUITECTURA ECONOMÍA DERE... NaN [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 JUAN DE DIOS BELTR´´ÁN MANCILLA NaN NaN [yumpu.com, ijopm.org, google.com, blogspot.co...
134 0000-0002-1324-7171 1 1 1 Vanesa Natalia Rodriguez Nombre y Apellido: Vanesa Natalia Rodriguez. ... [Vanesa Rodriguez, Vanesa N. Rodriguez] [[De rufianes y franchutas Representaciones y ... NaN NaN [Historia - Prostitución - Mujeres - Enfermeda... NaN [[, Maestría en Ciencias Sociales con Mención ... [[Profesora, Universidad Nacional de La Matanz... 7 Vanesa Natalia Rodriguez NaN NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook...
136 0000-0003-2676-4431 1 1 1 Benny Soewandi NaN [Benny Soewandi] [[Conservation Efforts as a Result of Theoreti... NaN NaN [Researchers-Conservator for the Architectural... NaN NaN [[Membership, Paguyuban Pelestarian Budaya Ban... 2 Benny Soewandi NaN NaN [wordpress.com, wordpress.com, linkedin.com, f...
138 0000-0002-9874-1450 1 1 1 FENGZHI WU NaN NaN [[A Systematic Study on the Dynamic Softening ... NaN NaN NaN NaN NaN NaN 3 FENGZHI WU NaN NaN [springer.com, sciencedirect.com, sciencedirec...

108 rows × 20 columns

Works source

In [54]:
def remove_own_source(lst, own):
    res = []
    if isinstance(lst, list) and pd.notna(own):
        for ws in lst:
            if ws.find(own) == -1:
                res.append(ws)
        return res
    else:
        return np.na()
In [55]:
df['ext_works_source'] = df.apply(lambda x: remove_own_source(x['works_source'], x['given_names']), axis=1)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-55-fb84921b6ce6> in <module>
----> 1 df['ext_works_source'] = df.apply(lambda x: remove_own_source(x['works_source'], x['given_names']), axis=1)

~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/frame.py in apply(self, func, axis, raw, result_type, args, **kwds)
   7766             kwds=kwds,
   7767         )
-> 7768         return op.get_result()
   7769 
   7770     def applymap(self, func, na_action: Optional[str] = None) -> DataFrame:

~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/apply.py in get_result(self)
    183             return self.apply_raw()
    184 
--> 185         return self.apply_standard()
    186 
    187     def apply_empty_result(self):

~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/apply.py in apply_standard(self)
    274 
    275     def apply_standard(self):
--> 276         results, res_index = self.apply_series_generator()
    277 
    278         # wrap results

~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/apply.py in apply_series_generator(self)
    288             for i, v in enumerate(series_gen):
    289                 # ignore SettingWithCopy here in case the user mutates
--> 290                 results[i] = self.f(v)
    291                 if isinstance(results[i], ABCSeries):
    292                     # If we have a view on v, we need to make a copy because

<ipython-input-55-fb84921b6ce6> in <lambda>(x)
----> 1 df['ext_works_source'] = df.apply(lambda x: remove_own_source(x['works_source'], x['given_names']), axis=1)

<ipython-input-54-7e0c4e0b4cf4> in remove_own_source(lst, own)
      7         return res
      8     else:
----> 9         return np.na()

NameError: name 'np' is not defined