Exploratory analysis¶
TODO:
- Understanding the reasons behind fake profiles can bring insight into how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Build a taxonomy of cases (e.g., author publishing with an empty ORCID record, author publishing but not on OpenAIRE, etc.); a first sketch appears right after the dataset is loaded
- Is the temporal dimension of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement?
In [1]:
import pandas as pd
import ast
import tldextract
import numpy
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
    global TOP_N, TOP_RANGE
    TOP_N = n
    TOP_RANGE = [-.5, n - 1 + .5]
Notable legitimate ORCID iDs for exploratory purposes:
In [2]:
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
In [3]:
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
In [4]:
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311' # URL > 10 + works
Load the dataset
In [5]:
df = pd.read_pickle('../data/processed/dataset.pkl')
df.head(5)
Out[5]:
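As a first step towards the case taxonomy in the TODO list, here is a minimal sketch (not executed in this notebook) that builds boolean case flags from columns used later on (primary_email, urls, external_ids, n_works); the flag names are illustrative assumptions.
# Sketch: boolean "case" flags per profile; column names assumed from the rest of this notebook.
cases = pd.DataFrame({'orcid': df['orcid']})
cases['has_email'] = df['primary_email'].notna()
cases['has_urls'] = df['urls'].notna()
cases['has_external_ids'] = df['external_ids'].notna()
cases['has_works'] = df['n_works'] > 0
# e.g., profiles claiming works but exposing no externally asserted identifier
cases[cases['has_works'] & ~cases['has_external_ids']].head()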
Inspection of notable profiles
In [6]:
df[df['orcid'] == AM]
Out[6]:
In [7]:
df[df['orcid'] == WHATSAPP]
Out[7]:
In [8]:
df.count()
Out[8]:
In [9]:
df['orcid'].describe()
Out[9]:
Primary email¶
In [10]:
df['primary_email'].describe()
Out[10]:
Duplicate emails
In [11]:
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[11]:
In [12]:
df[df['primary_email'] == 'maykin@owasp.org']
Out[12]:
In [13]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[13]:
In [14]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[14]:
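The three lookups above can be generalized; a minimal sketch (not executed here) listing every primary email shared by more than one ORCID iD:
# Sketch: primary emails appearing on more than one profile, with counts.
shared_emails = df.dropna(subset=['primary_email'])\
    .groupby('primary_email')['orcid']\
    .count()
shared_emails[shared_emails > 1].sort_values(ascending=False)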
In [15]:
df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])
In [16]:
df['primary_email_domain'].describe()
Out[16]:
In [17]:
top_primary_emails = df[['primary_email_domain', 'orcid']]\
    .groupby('primary_email_domain')\
    .count()\
    .sort_values('orcid', ascending=False)
top_primary_emails
Out[17]:
In [18]:
set_top_n(30)
data = [
    go.Bar(
        x=top_primary_emails[:TOP_N].index,
        y=top_primary_emails[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
In [19]:
def extract_email_domains(lst):
    res = []
    for email in lst:
        res.append(email.split('@')[1])
    return res
In [20]:
df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: extract_email_domains(x))
In [21]:
df[df['other_email_domains'].notna()].head()
Out[21]:
In [22]:
df['n_emails'] = df['other_emails'].str.len()
In [23]:
emails_by_orcid = df.sort_values('n_emails', ascending=False)
In [24]:
set_top_n(30)
data = [
    go.Bar(
        x=emails_by_orcid[:TOP_N]['orcid'],
        y=emails_by_orcid[:TOP_N]['n_emails']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs by email' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [25]:
top_other_emails = df[['orcid', 'other_email_domains']]\
    .explode('other_email_domains')\
    .reset_index(drop=True)\
    .groupby('other_email_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
In [26]:
set_top_n(30)
data = [
    go.Bar(
        x=top_other_emails[:TOP_N].index,
        y=top_other_emails[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top %s other email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Email speculation¶
In [27]:
df[df['primary_email'].isna() & df['other_emails'].notna()]
Out[27]:
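Another possible angle: the share of profiles registered with free webmail providers versus institutional or other domains. The provider list below is a small hand-made assumption for illustration, not an exhaustive blocklist.
# Assumed, non-exhaustive set of free webmail providers (illustrative only).
FREE_MAIL = {'gmail.com', 'yahoo.com', 'hotmail.com', 'outlook.com', 'qq.com', '163.com'}
free_mail = df['primary_email_domain'].isin(FREE_MAIL)
free_mail[df['primary_email_domain'].notna()].value_counts(normalize=True)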
URLs¶
In [28]:
def extract_url_domains(lst):
    domains = []
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url
        domain = tldextract.extract(e[1])
        domains.append(domain.registered_domain)
    return domains
In [29]:
df['url_domains'] = df[df.urls.notna()]['urls'].apply(lambda x: extract_url_domains(x))
In [30]:
df[df['url_domains'].notna()].head()
Out[30]:
In [31]:
df['n_urls'] = df['url_domains'].str.len()
In [32]:
urls_by_orcid = df.sort_values('n_urls', ascending=False)[['orcid', 'n_urls']]
urls_by_orcid
Out[32]:
In [33]:
set_top_n(100)
data = [
    go.Bar(
        x=urls_by_orcid[:TOP_N]['orcid'],
        y=urls_by_orcid[:TOP_N]['n_urls']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs with URLs' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [34]:
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
In [35]:
set_top_n(30)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
URL speculation¶
In [36]:
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[36]:
In [37]:
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[37]:
In [38]:
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
Out[38]:
In [39]:
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[39]:
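Following the TODO note on SEO hacking, a hedged sketch that flags profiles whose URL domains contain commercially loaded terms; the keyword list is a hand-made assumption for illustration, not a curated blocklist.
# Hypothetical spam hints; a real detector would need a curated list.
SPAM_HINTS = ['casino', 'escort', 'loan', 'pills', 'bitcoin', 'fitness', 'plumb']

def looks_spammy(domains):
    # True if any registered domain contains any of the hint substrings
    return any(hint in d for d in domains for hint in SPAM_HINTS)

spam_flag = df['url_domains'].apply(lambda ds: looks_spammy(ds) if isinstance(ds, list) else False)
df[spam_flag][['orcid', 'url_domains', 'n_works']]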
Works source¶
TODO: paste the works-source analysis from Miriam.
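In the meantime, a minimal sketch of what the tabulation could look like, assuming works_source holds a list of source names per profile (as used in the URL speculation above):
# Sketch: how many profiles each works source appears on.
top_works_sources = df[['orcid', 'works_source']]\
    .explode('works_source')\
    .groupby('works_source')\
    .count()\
    .sort_values('orcid', ascending=False)
top_works_sources.head(20)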
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
In [40]:
df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()
In [41]:
df.n_ids.describe()
Out[41]:
In [42]:
df[df.n_ids == df.n_ids.max()]
Out[42]:
In [43]:
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
In [44]:
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
In [45]:
ids[ids.provider.notna()].head()
Out[45]:
In [46]:
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
In [47]:
data = [
    go.Bar(
        x=top_ids_providers.index,
        y=top_ids_providers['orcid']
    )
]
layout = go.Layout(
    title='External IDs per provider',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [48]:
pd.unique(ids['provider'])
Out[48]:
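Since registrants cannot add external IDs themselves, their mere presence can be read as a weak trust signal; a quick sketch of how many profiles carry at least one:
# Share of profiles with at least one externally asserted identifier.
(df['n_ids'].fillna(0) > 0).value_counts(normalize=True)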
Keywords¶
This field is problematic: users can cram multiple keywords into a single entry instead of listing them separately. Look at this one:
In [49]:
df[df['orcid'] == AM]['keywords'].values[0]
Out[49]:
I did a good job here. The following one, instead, is dirty:
In [50]:
df[df['orcid'] == PP]['keywords'].values[0]
Out[50]:
So the keywords field needs some cleaning:
In [51]:
def fix_keywords(lst):
    fixed = set()
    for k in lst:
        tokens = set(k.split(','))
        # tokens.remove('')
        for t in tokens:
            fixed.add(str.strip(t))
    fixed.discard('')
    return list(fixed)
In [52]:
df['fixed_keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
In [53]:
df[df['orcid'] == PP]['fixed_keywords'].values[0]
Out[53]:
In [54]:
df['n_keywords'] = df.keywords.str.len()
In [55]:
keywords_by_orcid = df.sort_values('n_keywords', ascending=False)[['orcid', 'n_keywords']]
keywords_by_orcid
Out[55]:
In [56]:
set_top_n(100)
data = [
    go.Bar(
        x=keywords_by_orcid[:TOP_N]['orcid'],
        y=keywords_by_orcid[:TOP_N]['n_keywords']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs by number of keywords' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [57]:
top_keywords = df[['orcid', 'keywords']]\
    .explode('keywords')\
    .reset_index(drop=True)\
    .groupby('keywords')\
    .count()\
    .sort_values('orcid', ascending=False)
In [58]:
set_top_n(50)
data = [
    go.Bar(
        x=top_keywords[:TOP_N].index,
        y=top_keywords[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s keywords occurrence' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Correlation¶
In [59]:
fig = px.imshow(df[df.n_ids > 0].corr())
fig.show()
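Note that recent pandas releases no longer silently drop non-numeric columns in DataFrame.corr(); a sketch restricting the heatmap explicitly to the numeric features derived above:
# Restrict to numeric columns before correlating; works on both old and new pandas.
numeric_cols = df[df.n_ids > 0].select_dtypes(include='number')
fig = px.imshow(numeric_cols.corr())
fig.show()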