449 KiB
449 KiB
Exploratory analysis¶
TODO:
- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Catalogue the possible cases (e.g., authors publishing with an empty ORCID, authors publishing but not on OpenAIRE, etc.)
- Is the temporal dimension of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement?
In [73]:
import pandas as pd
import ast
import tldextract
import numpy
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Render Plotly figures inline in the notebook (offline mode, no plotly.com account needed).
init_notebook_mode(connected=True)
TOP_N = 0
TOP_RANGE = [0, 0]


def set_top_n(n):
    """Set the global top-N cutoff and the matching bar-chart x-axis range.

    TOP_RANGE spans from just before the first bar (-0.5) to just after
    the n-th bar (n - 0.5), so exactly n bars are visible.
    """
    global TOP_N, TOP_RANGE
    TOP_N = n
    TOP_RANGE = [-0.5, n - 0.5]
Notable solid ORCID iDs for explorative purposes:
In [2]:
# Known-legitimate ("solid") ORCID iDs, used as positive reference examples.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Anomalous ORCiD profiles
In [3]:
# Anomalous (but not necessarily fake) profiles.
# NOTE(review): names suggest JOURNAL is a journal-run profile and NOINFO an
# empty/no-information profile — verify against the live records.
JOURNAL = '0000-0003-1815-5732'
NOINFO= '0000-0001-5009-2052'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs for explorative purposes:
In [4]:
# Known fake/spam ORCID iDs; the constant name hints at the spam topic.
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311' # URL > 10 + works
Load the dataset
In [5]:
# Load the raw ORCID dump (tab-separated). header=0 skips the file's own
# header row; the explicit `names` list below replaces it.
df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\t', header=0,
names = ['orcid', 'claimed','verified_email', 'verified_primary_email',
'given_names', 'family_name', 'biography', 'other_names', 'urls',
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
'employment', 'n_works', 'works_source'])
In [6]:
# Inspect fully duplicated rows before removing them.
df.loc[df.duplicated()]
Out[6]:
In [7]:
# Drop exact duplicate rows. Reassignment instead of inplace=True: it is the
# idiomatic, chain-friendly form and has no performance advantage either way.
df = df.drop_duplicates()
Basic column manipulation (interpret columns as lists when necessary)
In [8]:
# Parse the stringified list into a real Python list; missing rows stay NaN.
df['other_names'] = df.loc[df['other_names'].notna(), 'other_names'].apply(ast.literal_eval)
In [9]:
# Same list-parsing treatment for the keywords column.
df['keywords'] = df.loc[df['keywords'].notna(), 'keywords'].apply(ast.literal_eval)
In [10]:
# Parse the stringified URL list; missing rows stay NaN.
df['urls'] = df.loc[df['urls'].notna(), 'urls'].apply(ast.literal_eval)
In [11]:
# Parse the stringified list of secondary email addresses.
df['other_emails'] = df.loc[df['other_emails'].notna(), 'other_emails'].apply(ast.literal_eval)
In [12]:
# Parse the stringified education entries.
df['education'] = df.loc[df['education'].notna(), 'education'].apply(ast.literal_eval)
In [13]:
# Parse the stringified employment entries.
df['employment'] = df.loc[df['employment'].notna(), 'employment'].apply(ast.literal_eval)
In [14]:
# Parse the stringified external identifier list.
df['external_ids'] = df.loc[df['external_ids'].notna(), 'external_ids'].apply(ast.literal_eval)
In [15]:
# Parse the stringified list of work sources.
df['works_source'] = df.loc[df['works_source'].notna(), 'works_source'].apply(ast.literal_eval)
In [16]:
# Preview the first rows after parsing the list-valued columns.
df.head(5)
Out[16]:
In [17]:
# Inspect the record of a known-solid profile.
df.loc[df['orcid'] == AM]
Out[17]:
In [18]:
# Inspect the record of a known-fake profile.
df.loc[df['orcid'] == WHATSAPP]
Out[18]:
In [19]:
# Non-null counts per column — rough completeness overview of the dataset.
df.count()
Out[19]:
In [20]:
# Inspect the rows carrying this specific iD (presumably appears more than once — see next cell).
df.loc[df['orcid'] == '0000-0002-5154-6404']
Out[20]:
In [21]:
# Remove one of the rows inspected in the previous cell, by its index label.
# NOTE(review): the hardcoded label 4595264 is fragile — it breaks if the
# input file changes; consider selecting the row by content instead.
df = df.drop(index=4595264)
In [22]:
# count vs. unique in the summary shows whether any duplicate iDs remain.
df['orcid'].describe()
Out[22]:
Primary email¶
In [23]:
# Availability and uniqueness of the primary email address.
df['primary_email'].describe()
Out[23]:
Duplicate primary emails
In [24]:
# Primary email addresses that appear on more than one profile.
primary_nonnull = df['primary_email'].dropna()
primary_nonnull[primary_nonnull.duplicated()]
Out[24]:
In [25]:
# Profiles sharing this primary address.
df.loc[df['primary_email'] == 'maykin@owasp.org']
Out[25]:
In [26]:
# Profiles sharing this primary address.
df.loc[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[26]:
In [27]:
# Profiles sharing this primary address.
df.loc[df['primary_email'] == 'patrick.davey@monash.edu']
Out[27]:
In [28]:
# Vectorized domain extraction: the text after the '@'. Missing emails stay
# NaN, and (unlike the previous row-wise apply) a malformed address without
# an '@' yields NaN instead of raising IndexError.
df['primary_email_domain'] = df['primary_email'].str.split('@').str[1]
In [29]:
# How many distinct email domains there are, and which one is most frequent.
df['primary_email_domain'].describe()
Out[29]:
In [30]:
# Number of profiles per primary-email domain, most common first.
primary_emails = (
    df[['primary_email_domain', 'orcid']]
    .groupby('primary_email_domain')
    .count()
    .sort_values('orcid', ascending=False)
)
primary_emails
Out[30]:
In [65]:
set_top_n(30)
# `primary_emails` is already sorted by descending count, so a single head
# slice feeds both axes — the original redundantly re-sorted the slice once
# per axis.
top_email_domains = primary_emails[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=top_email_domains.index, y=top_email_domains['orcid'])],
    layout=go.Layout(
        title='Top %s email domains' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE),
    ),
)
plotly.offline.iplot(fig)
Other emails¶
In [32]:
def extract_email_domains(lst):
    """Return the domain part (text after the '@') of every address in `lst`."""
    return [address.split('@')[1] for address in lst]
In [33]:
# Map each list of addresses to its list of domains; NaN rows pass through unchanged.
df['other_email_domains'] = df['other_emails'].apply(
    lambda emails: extract_email_domains(emails) if isinstance(emails, list) else emails
)
In [34]:
# Peek at profiles that do list secondary email addresses.
df.loc[df['other_email_domains'].notna()].head()
Out[34]:
In [35]:
# Number of secondary emails per profile; NaN where none are listed.
df['n_emails'] = df['other_emails'].str.len()
In [36]:
# Profiles ranked by how many secondary emails they expose.
df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
Out[36]:
In [37]:
# One row per (orcid, domain) pair, then count profiles per domain.
exploded_email_domains = df[['orcid', 'other_email_domains']].explode('other_email_domains').reset_index(drop=True)
grouped_other_emails = exploded_email_domains.groupby('other_email_domains').count().sort_values('orcid', ascending=False)
In [74]:
set_top_n(30)
# `grouped_other_emails` is already sorted by descending count, so one head
# slice serves both axes (the original re-sorted the slice twice).
top_other_domains = grouped_other_emails[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=top_other_domains.index, y=top_other_domains['orcid'])],
    layout=go.Layout(
        title='Top %s other email domains' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE),
    ),
)
plotly.offline.iplot(fig)
Email speculation¶
In [39]:
# Profiles with secondary emails but no primary one.
df.loc[df['primary_email'].isna() & df['other_emails'].notna()]
Out[39]:
URLs¶
In [40]:
def extract_url_domains(lst):
    """Return the registered domain of each (description, url) entry in `lst`.

    Each entry is a pair: entry[0] is a free-text description of the URL,
    entry[1] is the URL itself — only the URL is used here.
    """
    return [tldextract.extract(entry[1]).registered_domain for entry in lst]
In [41]:
# Map each URL list to its list of registered domains; NaN rows pass through unchanged.
df['url_domains'] = df['urls'].apply(
    lambda urls: extract_url_domains(urls) if isinstance(urls, list) else urls
)
In [42]:
# Peek at profiles that list URLs.
df.loc[df['url_domains'].notna()].head()
Out[42]:
In [43]:
# Number of URLs per profile; NaN where none are listed.
df['n_urls'] = df['url_domains'].str.len()
In [44]:
# Profiles ranked by how many URLs they expose.
df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
Out[44]:
In [75]:
set_top_n(100)
data = [
go.Bar(
x=df.sort_values(by=['n_urls'], ascending=False)['orcid'][:TOP_N],
y=df.sort_values(by=['n_urls'], ascending=False)['n_urls'][:TOP_N]
)
]
layout = go.Layout(
title='Top %s ORCID with URLs' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [46]:
# One row per (orcid, url domain) pair, then count profiles per domain.
exploded_url_domains = df[['orcid', 'url_domains']].explode('url_domains').reset_index(drop=True)
grouped_urls = exploded_url_domains.groupby('url_domains').count().sort_values('orcid', ascending=False)
In [77]:
set_top_n(30)
data = [
go.Bar(
x=grouped_urls[:TOP_N].sort_values(by=['orcid'], ascending=False).index,
y=grouped_urls[:TOP_N].sort_values(by=['orcid'], ascending=False)['orcid']
)
]
layout = go.Layout(
title='Top %s URL domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [48]:
# Profiles with more than 50 URLs that also claim at least one work.
df.loc[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[48]:
In [49]:
# Profiles with more than 10 URLs, at least one work, and a single work source.
df.loc[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[49]:
In [50]:
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
Out[50]:
In [51]:
# Rows whose work source contains the author's given name — presumably
# self-claimed works; verify this interpretation against the records.
exploded_sources[exploded_sources.apply(lambda row: row['given_names'] in row['works_source'], axis=1)]
Out[51]:
Works source¶
TODO: paste the works-source analysis from Miriam
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
In [52]:
# Number of external IDs per profile. `.str.len()` already returns NaN for
# missing values, so the previous notna() pre-filter was redundant.
df['n_ids'] = df['external_ids'].str.len()
In [53]:
# Distribution of the external-ID counts.
df.n_ids.describe()
Out[53]:
In [54]:
# Profile(s) carrying the largest number of external IDs.
df.loc[df['n_ids'] == df['n_ids'].max()]
Out[54]:
In [55]:
# One row per (orcid, external id) pair.
ids = (
    df[['orcid', 'external_ids']]
    .explode('external_ids')
    .reset_index(drop=True)
)
In [78]:
# First element of each external-id entry is the provider. `.str[0]` indexes
# list-like values and propagates NaN, so the previous notna() pre-filter
# was unnecessary.
ids['provider'] = ids['external_ids'].str[0]
In [79]:
# Peek at the rows that actually carry a provider.
ids.loc[ids['provider'].notna()].head()
Out[79]:
In [80]:
data = [
go.Bar(
x=ids.groupby('provider').count().sort_values('orcid', ascending=False).index,
y=ids.groupby('provider').count().sort_values('orcid', ascending=False)['orcid']
)
]
layout = go.Layout(
title='IDs provided',
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [81]:
# Distinct ID providers, in order of first appearance.
ids['provider'].unique()
Out[81]:
Keywords¶
In [82]:
# Number of keywords per profile; NaN where none are listed.
df['n_keywords'] = df.keywords.str.len()
In [83]:
# Profiles ranked by how many keywords they expose.
df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
Out[83]:
In [84]:
# Sort by keyword count once and reuse the slice for both axes
# (the original sorted the full frame twice).
top_keyword_profiles = df.sort_values('n_keywords', ascending=False)[:100]
fig = go.Figure(
    data=[go.Bar(x=top_keyword_profiles['orcid'], y=top_keyword_profiles['n_keywords'])],
    layout=go.Layout(
        title='Keywords provided',
        xaxis=dict(tickangle=45, tickfont=dict(size=12)),
    ),
)
plotly.offline.iplot(fig)
Correlation¶
In [85]:
# Pairwise correlation of the numeric columns, restricted to profiles that
# expose at least one external ID, shown as a heatmap.
corr_matrix = df[df['n_ids'] > 0].corr()
fig = px.imshow(corr_matrix)
fig.show()