Exploratory analysis¶
TODO:
- Understanding the reasons behind fake profiles can give insight into how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Enumerate the possible cases (e.g., author publishing with an empty ORCID record, author publishing but not present in OpenAIRE, etc.); see the sketch after this list
- Is the temporal dimension of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement?
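The case enumeration above could start from a few boolean masks; a minimal sketch, assuming the count columns used later in this notebook (n_works, n_urls, n_ids). Presence in OpenAIRE is not in the dataset and would need an external lookup, so it is left out:
import pandas as pd

def enumerate_cases(df):
    # Hypothetical case flags built from count columns used later in this notebook.
    cases = pd.DataFrame({'orcid': df['orcid']})
    cases['empty_profile'] = df['n_works'].fillna(0).eq(0) & df['n_urls'].fillna(0).eq(0)
    cases['works_no_external_ids'] = df['n_works'].fillna(0).gt(0) & df['n_ids'].fillna(0).eq(0)
    cases['urls_no_works'] = df['n_urls'].fillna(0).gt(0) & df['n_works'].fillna(0).eq(0)
    return cases

# enumerate_cases(df) could then be cross-tabulated against the label column.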
In [76]:
import glob
import pandas as pd
import ast
import tldextract
import numpy as np
import antispam
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
global TOP_N, TOP_RANGE
TOP_N = n
TOP_RANGE = [-.5, n - 1 + .5]
pd.set_option('display.max_columns', None)
Notable legitimate ORCID iDs for exploratory purposes:
In [77]:
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
In [78]:
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
WORK_MISUSE = '0000-0001-7870-1120'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
In [79]:
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311' # URL > 10 + works
Load the dataset
In [80]:
parts = glob.glob('../data/processed/dataset.pkl.*')
df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))
df.head(5)
Out[80]:
Notable profiles inspection
In [6]:
df[df['orcid'] == AM]
Out[6]:
In [7]:
df[df['orcid'] == WHATSAPP]
Out[7]:
In [8]:
df.count()
Out[8]:
In [9]:
df['orcid'].describe()
Out[9]:
Primary email¶
In [10]:
df['primary_email'].describe()
Out[10]:
Duplicate emails
In [11]:
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[11]:
In [12]:
df[df['primary_email'] == 'maykin@owasp.org']
Out[12]:
In [13]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[13]:
In [14]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[14]:
In [15]:
df['primary_email_domain'].describe()
Out[15]:
In [16]:
top_primary_emails = df[['primary_email_domain', 'orcid']]\
.groupby('primary_email_domain')\
.count()\
.sort_values('orcid', ascending=False)
top_primary_emails
Out[16]:
In [17]:
set_top_n(30)
data = [
go.Bar(
x=top_primary_emails[:TOP_N].index,
y=top_primary_emails[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top-%s email domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
In [18]:
df[df.other_email_domains.notna()].head()
Out[18]:
In [19]:
emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
In [20]:
set_top_n(30)
data = [
go.Bar(
x=emails_by_orcid[:TOP_N]['orcid'],
y=emails_by_orcid[:TOP_N]['n_emails']
)
]
layout = go.Layout(
title='Top %s ORCID iDs by email' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [21]:
top_other_emails = df[['orcid', 'other_email_domains']]\
.explode('other_email_domains')\
.reset_index(drop=True)\
.groupby('other_email_domains')\
.count()\
.sort_values('orcid', ascending=False)
In [22]:
set_top_n(30)
data = [
go.Bar(
x=top_other_emails[:TOP_N].index,
y=top_other_emails[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top %s other email domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
This somehow makes sense: legitimate users may set their Gmail account as the primary address for login purposes and keep institutional addresses as other emails. It also makes life easier upon relocation.
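A minimal sketch to check how common that pattern is, assuming primary_email_domain is a single domain string and other_email_domains is a list of domains (as used in the cells above); the free-mail list is illustrative only:
FREEMAIL = {'gmail.com', 'hotmail.com', 'yahoo.com', 'outlook.com', 'qq.com', '163.com'}  # illustrative, not exhaustive

def has_institutional_other(domains):
    # True when at least one non-free-mail domain appears among the other email domains.
    return any(d not in FREEMAIL for d in domains)

freemail_primary = df['primary_email_domain'].isin(FREEMAIL) & df['other_email_domains'].notna()
# Share of free-mail primaries that also list at least one (presumably institutional) other domain.
df[freemail_primary]['other_email_domains'].apply(has_institutional_other).mean()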
Email speculation¶
In [23]:
df[df.primary_email.isna() & df.other_email_domains.notna()]
Out[23]:
URLs¶
In [24]:
df[df.url_domains.notna()].head()
Out[24]:
In [25]:
urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
urls_by_orcid
Out[25]:
In [26]:
set_top_n(100)
data = [
go.Bar(
x=urls_by_orcid[:TOP_N]['orcid'],
y=urls_by_orcid[:TOP_N]['n_urls']
)
]
layout = go.Layout(
title='Top %s ORCID iDs with URLs' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [27]:
top_urls = df[['orcid', 'url_domains']]\
.explode('url_domains')\
.reset_index(drop=True)\
.groupby('url_domains')\
.count()\
.sort_values('orcid', ascending=False)
In [28]:
set_top_n(50)
data = [
go.Bar(
x=top_urls[:TOP_N].index,
y=top_urls[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top-%s URL domains' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
URLs speculation¶
In [29]:
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[29]:
In [30]:
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[30]:
In [31]:
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
Out[31]:
In [32]:
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[32]:
Works source¶
In [33]:
def remove_own_source(lst, given, family):
res = []
for ws in lst:
if ws.lower().find(given.lower()) == -1:
if pd.notna(family):
if ws.lower().find(family.lower()) == -1:
res.append(ws)
else:
res.append(ws)
return res
In [34]:
df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\
.apply(lambda x: remove_own_source(x['works_source'], x['given_names'], x['family_name']), axis=1)
In [35]:
df['n_ext_work_source'] = df.ext_works_source.str.len()
In [36]:
exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\
.explode('ext_works_source').reset_index(drop=True)
In [37]:
grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\
.count()\
.sort_values('orcid', ascending=False)\
.reset_index()
In [38]:
data = [
go.Bar(
x=grouped_ext_sources[:30].ext_works_source,
y=grouped_ext_sources[:30].orcid
)
]
layout = go.Layout(
title='Top 30 works_source',
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [39]:
authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]
authoritative_sources
Out[39]:
In [40]:
exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\
.isin(authoritative_sources['ext_works_source'])
In [41]:
orcid_authoritative_source = exploded_external_sources\
.groupby('orcid')['authoritative']\
.any()\
.reset_index()[['orcid', 'authoritative']]
In [42]:
df = df.set_index('orcid').join(orcid_authoritative_source.set_index('orcid')).reset_index()
In [43]:
df.loc[df.authoritative.isna(), 'authoritative'] = False
In [44]:
df.head()
Out[44]:
External IDs¶
External IDs should come from reliable sources; ORCID registrants cannot add them freely.
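Since external IDs are asserted by trusted systems rather than by the registrant, one quick check is whether known-fake profiles carry them at all; a minimal sketch, assuming label marks fakes as in the "Label speculation" section below:
# Share of profiles with at least one external ID, split by label.
(df['n_ids'].fillna(0) > 0).groupby(df['label']).mean()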
In [45]:
df.n_ids.describe()
Out[45]:
In [46]:
df[df.n_ids == df.n_ids.max()]
Out[46]:
In [47]:
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
In [48]:
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
In [49]:
ids[ids.provider.notna()].head()
Out[49]:
In [50]:
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
In [51]:
data = [
go.Bar(
x=top_ids_providers.index,
y=top_ids_providers['orcid']
)
]
layout = go.Layout(
title='IDs provided by providers',
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [52]:
pd.unique(ids['provider'])
Out[52]:
Keywords¶
This field is problematic: instead of entering separate keywords, users can cram several keywords into a single entry. Look at the counts below; a splitting sketch follows the plots.
In [53]:
keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
keywords_by_orcid
Out[53]:
In [54]:
set_top_n(100)
data = [
go.Bar(
x=keywords_by_orcid[:TOP_N]['orcid'],
y=keywords_by_orcid[:TOP_N]['n_keywords']
)
]
layout = go.Layout(
title='Keywords provided by ORCiD',
xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [55]:
top_keywords = df[['orcid', 'keywords']]\
.explode('keywords')\
.reset_index(drop=True)\
.groupby('keywords')\
.count()\
.sort_values('orcid', ascending=False)
In [56]:
set_top_n(50)
data = [
go.Bar(
x=top_keywords[:TOP_N].index,
y=top_keywords[:TOP_N]['orcid']
)
]
layout = go.Layout(
title='Top-%s keywords occurrence' % TOP_N,
xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
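Given the compound-keyword problem noted above, a possible normalization is to split each entry on common separators before counting; a minimal sketch, assuming keywords holds a list of strings per profile:
import re

def split_keywords(entries):
    # Split compound entries such as "machine learning; data mining, NLP" on common separators.
    out = []
    for entry in entries:
        out.extend(k.strip() for k in re.split(r'[;,/]', entry) if k.strip())
    return out

# Distribution of keyword counts after splitting, for comparison with n_keywords above.
df[df['keywords'].notna()]['keywords'].apply(split_keywords).str.len().describe()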
Education¶
In [57]:
def extract_education(lst):
educations = []
for e in lst:
# e[0] degree
# e[1] role
# e[2] university
# e[..] city, region, country, id, id_scheme
educations.append(' '.join([e[0], e[1], e[2]]))
return educations
Employment¶
In [58]:
def extract_employment(lst):
res = []
for e in lst:
# e[0] role
# e[1] institute
# e[..] city, region, country, id, id_scheme
res.append(' '.join([e[0], e[1]]))
return res
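Neither helper is applied in the cells shown here; a minimal usage sketch, assuming the raw affiliation lists live in columns named education and employment (hypothetical names) with the tuple layout described in the comments above:
# Hypothetical column names: 'education' and 'employment' are assumed to hold the raw lists.
df['education_text'] = df[df['education'].notna()]['education'].apply(extract_education)
df['employment_text'] = df[df['employment'].notna()]['employment'].apply(extract_employment)
df[['orcid', 'education_text', 'employment_text']].head()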
Biography¶
In [59]:
df['biography'] = df[df.biography.notna()]['biography'].replace('', np.NaN)
In [60]:
df.biography.describe()
Out[60]:
In [61]:
df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]
Out[61]:
In [62]:
def score(bio):
try:
return antispam.score(bio)
    except Exception:  # if len(bio) < 3 the filter doesn't know how to handle it
return -1
In [63]:
df['spam_score'] = df[df.biography.notna()]['biography'].apply(lambda bio: score(bio))
In [64]:
df[df.spam_score == -1][['orcid','biography']]
Out[64]:
In [65]:
df['spam_score'] = df['spam_score'].replace(-1, np.NaN)
In [66]:
df.spam_score.describe()
Out[66]:
In [67]:
df[df.spam_score > 0.9999][['biography', 'spam_score']]
Out[67]:
All-vs-all correlation¶
In [68]:
fig = px.imshow(df.fillna(-1).corr())
fig.show()
In [69]:
df[['verified_email',
'verified_primary_email',
'n_works',
'n_doi',
'n_arxiv',
'n_pmc',
'n_other_pids',
'n_emails',
'n_urls',
'n_ids',
'n_keywords',
'n_employment',
'n_education',
'label']].to_pickle('../data/processed/features.pkl')
Label speculation¶
In [70]:
df[df.label == 1]
Out[70]:
In [105]:
# (df.n_works > 0) & (df.n_ids > 1)
df.info()
In [104]:
df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
In [107]:
pd.Series(['2016-07-27t10:09:13.585z', '2016-07-27t10:09:13.585z', pd.NA, '2016-07-27t10:09:13.585z'])
Out[107]:
In [108]:
pd.to_datetime(df.activation_date)
In [109]:
df['label'] = df['label'].astype('bool')
In [110]:
df.info()