You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
477 KiB
477 KiB
Exploratory analysis¶
TODO:
- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Build a taxonomy of cases (e.g., an author publishing with an empty ORCID profile, an author publishing but not present in OpenAIRE, etc.)
- Is the temporal dimension of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement?
In [1]:
import pandas as pd
import ast
import tldextract
import numpy
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
# Module-level plotting state read by the chart cells; change it only
# through set_top_n().
TOP_N = 0
TOP_RANGE = [0, 0]


def set_top_n(n):
    """Record how many leading bars the following charts should display.

    Rebinds the module-level ``TOP_N`` to ``n`` and ``TOP_RANGE`` to the
    x-axis window ``[-0.5, n - 1 + 0.5]``, i.e. half a unit before position
    0 and half a unit after position ``n - 1``.
    """
    global TOP_N, TOP_RANGE
    lower_bound = -.5
    upper_bound = n - 1 + .5
    TOP_N, TOP_RANGE = n, [lower_bound, upper_bound]
Notable solid ORCID iDs for explorative purposes:
In [2]:
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
In [3]:
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
In [4]:
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311' # URL > 10 + works
Load the dataset
In [6]:
# Load the preprocessed dataset and preview its first five rows.
# NOTE(review): unpickling can execute arbitrary code, so only load this
# file when it comes from a trusted source.
df = pd.read_pickle('../data/processed/dataset.pkl')
df.head(5)
Out[6]:
Notable profiles inspection
In [7]:
df[df['orcid'] == AM]
Out[7]:
In [8]:
df[df['orcid'] == WHATSAPP]
Out[8]:
In [9]:
df.count()
Out[9]:
In [10]:
df['orcid'].describe()
Out[10]:
Primary email¶
In [20]:
df['primary_email'].describe()
Out[20]:
Dupe emails
In [21]:
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[21]:
In [22]:
df[df['primary_email'] == 'maykin@owasp.org']
Out[22]:
In [23]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[23]:
In [24]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[24]:
In [25]:
# Derive the domain of the primary e-mail address. The domain is the text
# after the LAST '@' (a local part may itself contain '@'), hence rsplit.
# Missing addresses (NaN/None) are passed through unchanged.
df['primary_email_domain'] = df['primary_email'].apply(
    lambda x: x.rsplit('@', 1)[1] if pd.notna(x) else x
)
In [26]:
df['primary_email_domain'].describe()
Out[26]:
In [27]:
# Count the non-null ORCID values per primary-email domain, then rank the
# domains from most to least frequent.
domain_counts = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count()
primary_emails = domain_counts.sort_values('orcid', ascending=False)
primary_emails
Out[27]:
In [28]:
# Bar chart of the TOP_N most frequent primary-email domains.
set_top_n(30)
# Rank the domains once and reuse the result for both axes, instead of
# running the same sort separately for x and y.
ranked_primary_domains = primary_emails.sort_values(by=['orcid'], ascending=False)
data = [
    go.Bar(
        x=ranked_primary_domains.index[:TOP_N],
        y=ranked_primary_domains['orcid'][:TOP_N]
    )
]
layout = go.Layout(
    title='Top %s email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
In [29]:
def extract_email_domains(lst):
    """Return the domain part of every e-mail address in ``lst``.

    The domain is taken as the text after the last ``'@'``, so an address
    whose local part itself contains ``'@'`` still yields the real domain.

    Raises ``IndexError`` (as before) when an address contains no ``'@'``.
    """
    return [email.rsplit('@', 1)[1] for email in lst]
In [30]:
# Derive the domains of the secondary e-mail addresses; values that are not
# lists are passed through unchanged.
df['other_email_domains'] = df['other_emails'].apply(lambda x: extract_email_domains(x) if isinstance(x, list) else x)
In [31]:
df[df['other_email_domains'].notna()].head()
Out[31]:
In [32]:
df['n_emails'] = df['other_emails'].str.len()
In [33]:
# Bar chart of the TOP_N ORCID iDs with the highest `n_emails` value.
set_top_n(30)
# Sort once, and only the two columns the chart needs, instead of sorting
# the whole DataFrame twice (once per axis).
ranked_by_emails = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
data = [
    go.Bar(
        x=ranked_by_emails['orcid'][:TOP_N],
        y=ranked_by_emails['n_emails'][:TOP_N]
    )
]
layout = go.Layout(
    title='Top %s ORCiD by email' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [34]:
# One row per (orcid, secondary-email domain) pair, then the number of
# non-null ORCID values per domain, most frequent first.
other_domain_pairs = df[['orcid', 'other_email_domains']].explode('other_email_domains')
other_domain_pairs = other_domain_pairs.reset_index(drop=True)
grouped_other_emails = (
    other_domain_pairs
    .groupby('other_email_domains')
    .count()
    .sort_values('orcid', ascending=False)
)
In [35]:
# Bar chart of the TOP_N most frequent secondary-email domains.
set_top_n(30)
# Rank the domains once and reuse the result for both axes, instead of
# running the same sort separately for x and y.
ranked_other_domains = grouped_other_emails.sort_values(by=['orcid'], ascending=False)
data = [
    go.Bar(
        x=ranked_other_domains.index[:TOP_N],
        y=ranked_other_domains['orcid'][:TOP_N]
    )
]
layout = go.Layout(
    title='Top %s other email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Email speculation¶
In [36]:
df[df['primary_email'].isna() & df['other_emails'].notna()]
Out[36]:
URLs¶
In [37]:
def extract_url_domains(lst):
    """Return the registered domain of every URL entry in ``lst``.

    Entries are indexed positionally: ``entry[0]`` is a string describing
    the URL and ``entry[1]`` is the URL itself; only ``entry[1]`` is used.
    """
    return [tldextract.extract(entry[1]).registered_domain for entry in lst]
In [38]:
# Derive the registered domains of each record's URLs; values that are not
# lists are passed through unchanged.
df['url_domains'] = df['urls'].apply(lambda x: extract_url_domains(x) if isinstance(x, list) else x)
In [39]:
df[df['url_domains'].notna()].head()
Out[39]:
In [40]:
df['n_urls'] = df['url_domains'].str.len()
In [41]:
df.sort_values('n_urls', ascending=False)[['orcid', 'n_urls']]
Out[41]:
In [42]:
# Bar chart of the TOP_N ORCID iDs with the highest `n_urls` value.
set_top_n(100)
# Sort once, and only the two columns the chart needs, instead of sorting
# the whole DataFrame twice (once per axis).
ranked_by_urls = df[['orcid', 'n_urls']].sort_values(by=['n_urls'], ascending=False)
data = [
    go.Bar(
        x=ranked_by_urls['orcid'][:TOP_N],
        y=ranked_by_urls['n_urls'][:TOP_N]
    )
]
layout = go.Layout(
    title='Top %s ORCID with URLs' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [43]:
# One row per (orcid, URL domain) pair, then the number of non-null ORCID
# values per domain, most frequent first.
url_domain_pairs = df[['orcid', 'url_domains']].explode('url_domains')
url_domain_pairs = url_domain_pairs.reset_index(drop=True)
grouped_urls = (
    url_domain_pairs
    .groupby('url_domains')
    .count()
    .sort_values('orcid', ascending=False)
)
In [44]:
# Bar chart of the TOP_N most frequent URL domains.
set_top_n(30)
# Rank the domains once and reuse the result for both axes, instead of
# running the same sort separately for x and y.
ranked_url_domains = grouped_urls.sort_values(by=['orcid'], ascending=False)
data = [
    go.Bar(
        x=ranked_url_domains.index[:TOP_N],
        y=ranked_url_domains['orcid'][:TOP_N]
    )
]
layout = go.Layout(
    title='Top %s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [45]:
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[45]:
In [46]:
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[46]:
In [47]:
# Records with more than 10 URL domains, at least one work, and exactly one
# entry in `works_source`; that single entry is then unpacked by explode().
has_many_urls = df['url_domains'].str.len() > 10
has_works = df['n_works'] > 0
has_single_source = df['works_source'].str.len() == 1
suspicious_records = df[has_many_urls & has_works & has_single_source]
exploded_sources = suspicious_records.explode('works_source').reset_index(drop=True)
exploded_sources
Out[47]:
In [48]:
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[48]:
Works source¶
Paste from Miriam
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
In [49]:
df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()
In [50]:
df.n_ids.describe()
Out[50]:
In [51]:
df[df.n_ids == df.n_ids.max()]
Out[51]:
In [52]:
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
In [53]:
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
In [54]:
ids[ids.provider.notna()].head()
Out[54]:
In [55]:
# Bar chart of how many external IDs each provider contributes.
# Build the per-provider ranking once and reuse it for both axes, instead
# of running the groupby/count/sort pipeline separately for x and y.
provider_counts = ids.groupby('provider').count().sort_values('orcid', ascending=False)
data = [
    go.Bar(
        x=provider_counts.index,
        y=provider_counts['orcid']
    )
]
layout = go.Layout(
    title='IDs provided by providers',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [56]:
pd.unique(ids['provider'])
Out[56]:
Keywords¶
This field is problematic, as users can be nasty and put multiple keywords into a single entry as opposed to entering them as separate keywords. Look at this:
In [57]:
df[df['orcid'] == AM]['keywords'].values[0]
Out[57]:
I did a good job. The following one, instead, is dirty:
In [58]:
df[df['orcid'] == PP]['keywords'].values[0]
Out[58]:
So the keyword field needs some cleaning
In [59]:
def fix_keywords(lst):
    """Split comma-packed keyword entries into individual, trimmed keywords.

    Each entry of ``lst`` is split on ``','``; every piece is stripped of
    surrounding whitespace, empty pieces are dropped and duplicates are
    removed.

    Returns the distinct keywords as a list in order of first appearance,
    so the result is reproducible across interpreter runs (a set-based
    result would come back in arbitrary order).
    """
    seen = set()
    ordered = []
    for entry in lst:
        for piece in entry.split(','):
            keyword = piece.strip()
            # Skip blanks (e.g. from trailing commas) and repeated keywords.
            if keyword and keyword not in seen:
                seen.add(keyword)
                ordered.append(keyword)
    return ordered
In [60]:
df['fixed_keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
In [61]:
df[df['orcid'] == PP]['fixed_keywords'].values[0]
Out[61]:
In [62]:
df['n_keywords'] = df.keywords.str.len()
In [63]:
df.sort_values('n_keywords', ascending=False)[['orcid', 'n_keywords']]
Out[63]:
In [64]:
# Bar chart of the TOP_N ORCID iDs with the highest `n_keywords` value.
set_top_n(100)
# Sort once, and only the two columns the chart needs, instead of sorting
# the whole DataFrame twice (once per axis).
ranked_by_keywords = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
data = [
    go.Bar(
        x=ranked_by_keywords['orcid'][:TOP_N],
        y=ranked_by_keywords['n_keywords'][:TOP_N]
    )
]
layout = go.Layout(
    title='Keywords provided by ORCiD',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [65]:
# One row per (orcid, keyword) pair, then the number of non-null ORCID
# values per keyword, most frequent first.
# NOTE(review): this counts the raw `keywords` column, not the cleaned
# `fixed_keywords` one -- confirm that is intended.
keyword_pairs = df[['orcid', 'keywords']].explode('keywords')
keyword_pairs = keyword_pairs.reset_index(drop=True)
grouped_keywords = (
    keyword_pairs
    .groupby('keywords')
    .count()
    .sort_values('orcid', ascending=False)
)
In [66]:
# Bar chart of the first TOP_N rows of `grouped_keywords`.
set_top_n(50)
keyword_bars = go.Bar(
    x=grouped_keywords.index[:TOP_N],
    y=grouped_keywords['orcid'][:TOP_N],
)
data = [keyword_bars]
layout = go.Layout(
    title='Top-%s keywords occurrence' % TOP_N,
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Correlation¶
In [67]:
# Correlation heatmap over the records that have at least one external ID.
# The frame also holds non-numeric columns (e.g. the ORCID strings), and
# pandas >= 2.0 raises on those in DataFrame.corr() instead of silently
# dropping them, so keep only numeric/boolean columns before correlating.
records_with_ids = df[df.n_ids > 0]
numeric_columns = records_with_ids.select_dtypes(include=['number', 'bool'])
fig = px.imshow(numeric_columns.corr())
fig.show()