943 KiB
Exploratory analysis¶
TODO:
- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Study different cases (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)
- Temporal dimension; is it of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement? No.
import glob
import ast
import tldextract
import numpy as np
import pandas as pd
import antispam
import profanity_check
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Enable offline plotly rendering inside the notebook.
init_notebook_mode(connected=True)
# Globals that drive how many top entries the bar charts below display;
# always (re)assigned through set_top_n() before each chart.
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
    """Set the global top-N cut-off used by the charts below.

    Updates TOP_N and TOP_RANGE, where TOP_RANGE is the x-axis range that
    frames exactly the first n categorical bars (plotly centers bars on
    integer positions, hence the half-unit padding on both sides).

    Args:
        n: number of top entries the next chart should show.

    Returns:
        The computed TOP_RANGE, so the value can be used or checked inline
        (previously the function returned None).
    """
    global TOP_N, TOP_RANGE
    TOP_N = n
    TOP_RANGE = [-.5, n - 1 + .5]
    return TOP_RANGE

# Show every column when rendering dataframes in the notebook.
pd.set_option('display.max_columns', None)
Notable solid ORCID iDs for explorative purposes:
# ORCID iDs of known-legitimate researchers, used as positive examples.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
# Anomalous (not necessarily fake) profiles; each constant's name suggests
# the kind of anomaly it exemplifies.
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
WORK_MISUSE = '0000-0001-7870-1120'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
# Known fake profiles, keyed by the spam topic they advertise.
# More entries are appended programmatically further down (carloan_*).
FAKE_HEAP = {
    'scaffold': '0000-0001-5004-7761',
    'whatsapp': '0000-0001-6997-9470',
    'penis': '0000-0002-3399-7287',
    'bitcoin': '0000-0002-7518-6845',
    'fitness': '0000-0002-1234-835X', # URL record + employment
    'cannabis': '0000-0002-9025-8632', # URL > 70 + works (now REMOVED)
    'plumber': '0000-0002-1700-8311', # URL > 10 + works
    'furniture': '0000-0001-7478-4539',
    'cleaners': '0000-0002-7392-3792'
}
Load the dataset
# The dataset is pickled in ordered parts; load them in order and concatenate.
part_files = sorted(glob.glob('../data/processed/dataset.pkl.*'))
df = pd.concat(pd.read_pickle(part) for part in part_files)
df.head(5)
Notable profiles inspection
# Sanity check: inspect one known-legitimate profile and one known fake.
df[df['orcid'] == AM]
df[df['orcid'] == FAKE_HEAP['whatsapp']]
# Non-null counts per column and cardinality of the ORCID key.
df.count()
df['orcid'].describe()
Primary email¶
# Summary of primary emails (count/unique/top reveal duplicates right away).
df['primary_email'].describe()
Dupe emails
# Profiles sharing the same primary email address (suspicious: a primary
# email is expected to be personal).
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
# Inspect the duplicated addresses one by one.
df[df['primary_email'] == 'maykin@owasp.org']
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
df[df['primary_email'] == 'patrick.davey@monash.edu']
df['primary_email_domain'].describe()
# Number of profiles per primary-email domain, most frequent first.
top_primary_emails = (
    df[['primary_email_domain', 'orcid']]
    .groupby('primary_email_domain')
    .count()
    .sort_values('orcid', ascending=False)
)
top_primary_emails
# Bar chart of the most common primary-email domains.
set_top_n(30)
head = top_primary_emails[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head.index, y=head['orcid'])],
    layout=go.Layout(
        title='Top-%s email domains' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
Other emails¶
# Profiles declaring additional (non-primary) email addresses.
df[df.other_email_domains.notna()].head()
# Rank profiles by how many email addresses they registered overall.
emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
# Bar chart: profiles with the most registered email addresses.
set_top_n(30)
head = emails_by_orcid[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head['orcid'], y=head['n_emails'])],
    layout=go.Layout(
        title='Top %s ORCID iDs by email' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
# Occurrences of each non-primary email domain across profiles
# (one row per profile/domain pair after the explode).
top_other_emails = (
    df[['orcid', 'other_email_domains']]
    .explode('other_email_domains')
    .reset_index(drop=True)
    .groupby('other_email_domains')
    .count()
    .sort_values('orcid', ascending=False)
)
# Bar chart of the most common non-primary email domains.
set_top_n(30)
head = top_other_emails[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head.index, y=head['orcid'])],
    layout=go.Layout(
        title='Top %s other email domains' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
This somehow makes sense: legitimate users could set their Gmail account as primary for login purposes and keep institutional addresses as other email addresses. It also makes life easier upon relocation.
Email speculation¶
# Profiles exposing other email domains while the primary email is missing.
df[df.primary_email.isna() & df.other_email_domains.notna()]
URLs¶
df.n_urls.describe()
# Inspect the profile(s) holding the maximum number of URLs.
# Fixed: the original used `>`, which always selects nothing; `==` matches
# the analogous n_ids / n_education / n_employment max-inspection cells.
df[df.n_urls == df.n_urls.max()]
df[df.url_domains.notna()].head()
# Rank profiles by the number of URLs they expose.
urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
urls_by_orcid
The first three are fake, the fourth isn't. No safe assumption can be made from the URL count alone.
# Bar chart: profiles with the most URLs in their record.
set_top_n(100)
head = urls_by_orcid[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head['orcid'], y=head['n_urls'])],
    layout=go.Layout(
        title='Top %s ORCID iDs with URLs' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
# Occurrences of each URL domain across profiles
# (one row per profile/domain pair after the explode).
top_urls = (
    df[['orcid', 'url_domains']]
    .explode('url_domains')
    .reset_index(drop=True)
    .groupby('url_domains')
    .count()
    .sort_values('orcid', ascending=False)
)
# Bar chart of the most common URL domains.
set_top_n(50)
head = top_urls[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head.index, y=head['orcid'])],
    layout=go.Layout(
        title='Top-%s URL domains' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
Malformed URLs are left empty
# Count records whose URL failed to parse (malformed URLs were mapped to '').
exploded_url_domains = df[['orcid', 'url_domains']].explode('url_domains')
exploded_url_domains[exploded_url_domains.url_domains == ''].count()
URLs speculation¶
# Heuristic slices: profiles with many URL domains AND works.
# NOTE(review): url_domains appears to hold lists, so .str.len() counts
# domains, not characters — confirm against the preprocessing step.
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
# Same, restricted to profiles whose works all come from a single source.
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
# Profiles whose single works source contains their own given name,
# i.e. works likely self-claimed rather than pushed by a trusted provider.
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Works source¶
def remove_self_source(lst, given, family):
    """Drop works sources that contain the registrant's own name.

    A source whose name embeds the profile's given or family name is most
    likely the registrant self-claiming works rather than an external,
    trusted provider.

    Args:
        lst: list of works-source names (strings).
        given: registrant's given name (caller guarantees it is not NA).
        family: registrant's family name; may be NA, in which case only
            the given name is matched.

    Returns:
        The sources in ``lst`` mentioning neither name (case-insensitive).
    """
    # Hoist the invariant lowercasing out of the loop; use `in` instead of
    # the un-idiomatic `.find(...) == -1` of the original.
    given_lower = given.lower()
    family_lower = family.lower() if pd.notna(family) else None
    external = []
    for source in lst:
        source_lower = source.lower()
        if given_lower in source_lower:
            continue  # mentions the given name: treat as self-source
        if family_lower is not None and family_lower in source_lower:
            continue  # mentions the family name: treat as self-source
        external.append(source)
    return external
# Works sources with the registrant's own name filtered out; NA where the
# profile has no works sources or no given name.
df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\
    .apply(lambda x: remove_self_source(x['works_source'], x['given_names'], x['family_name']), axis=1)
# Nullable int count of the remaining (external) sources per profile.
df['n_ext_work_source'] = pd.Series(df.ext_works_source.str.len(), dtype=pd.Int16Dtype())
# One row per profile/external-source pair.
exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\
    .explode('ext_works_source').reset_index(drop=True)
# Number of profiles each external source claims works for, descending.
grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\
    .count()\
    .sort_values('orcid', ascending=False)\
    .reset_index()
# Bar chart of the most common external works sources.
set_top_n(30)
head = grouped_ext_sources[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head.ext_works_source, y=head.orcid)],
    layout=go.Layout(
        title='Top %s works_source' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
# External sources claiming works for more than two distinct profiles are
# deemed "authoritative".
authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]
authoritative_sources
# Flag each profile/source pair whose source is authoritative.
exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\
    .isin(authoritative_sources['ext_works_source'])
# A profile counts as authoritative if ANY of its external sources is.
orcid_authoritative_source = exploded_external_sources\
    .groupby('orcid')['authoritative']\
    .any()\
    .reset_index()[['orcid', 'authoritative']]
df = df.merge(orcid_authoritative_source, on='orcid', how='left')
# Profiles with no external sources get False rather than NaN.
df.loc[df.authoritative.isna(), 'authoritative'] = False
df.head()
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
df.n_ids.describe()
# Profile(s) holding the maximum number of external IDs.
df[df.n_ids == df.n_ids.max()]
# One row per profile/external-ID pair; each external_ids item appears to be
# a (provider, value) pair, so its first element is the provider.
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
ids[ids.provider.notna()].head()
# Number of external IDs contributed by each provider, descending.
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
fig = go.Figure(
    data=[go.Bar(x=top_ids_providers.index, y=top_ids_providers['orcid'])],
    layout=go.Layout(
        title='IDs provided by providers',
        xaxis=dict(tickangle=45, tickfont=dict(size=12))
    )
)
plotly.offline.iplot(fig)
# The full list of distinct providers.
pd.unique(ids['provider'])
Keywords¶
This field is problematic as users can be sloppy and put multiple keywords in a single entry as opposed to registering separate keywords. Look at this:
# Rank profiles by their declared keyword count.
keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
keywords_by_orcid
# Bar chart: profiles declaring the most keywords.
set_top_n(100)
head = keywords_by_orcid[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head['orcid'], y=head['n_keywords'])],
    layout=go.Layout(
        title='Keywords provided by ORCiD',
        xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
    )
)
plotly.offline.iplot(fig)
# Occurrences of each keyword across profiles
# (one row per profile/keyword pair after the explode).
top_keywords = (
    df[['orcid', 'keywords']]
    .explode('keywords')
    .reset_index(drop=True)
    .groupby('keywords')
    .count()
    .sort_values('orcid', ascending=False)
)
# Bar chart of the most common keywords.
set_top_n(50)
head = top_keywords[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head.index, y=head['orcid'])],
    layout=go.Layout(
        title='Top-%s keywords occurrence' % TOP_N,
        xaxis=dict(tickangle=45, tickfont=dict(size=12))
    )
)
plotly.offline.iplot(fig)
Education¶
df.n_education.describe()
# Profile(s) with the most education entries.
df[df.n_education == df.n_education.max()]
# One row per profile/education entry.
exploded_education = df[['orcid', 'education']].explode('education').dropna()
exploded_education
# Each education entry is a fixed-size sequence; spread it into columns.
exploded_education[['degree', 'role', 'university', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_education.education.tolist(), index=exploded_education.index)
# Treat empty organization ids as missing so count() skips them below.
exploded_education.id.replace('', pd.NA, inplace=True)
exploded_education.groupby('orcid').id.count().reset_index()
# n_valid_education = education entries backed by a resolvable org id.
df = df.merge(exploded_education.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_education'}, inplace=True)
# Profiles whose education entries are not all id-backed.
df[df.n_education != df.n_valid_education]
Employment¶
df.n_employment.describe()
# Profile(s) with the most employment entries.
df[df.n_employment == df.n_employment.max()]
Let's count how many employments have a valid assigned id by orcid (ringgold, isni, grid, etc.)
# One row per profile/employment entry.
exploded_employment = df[['orcid', 'employment']].explode('employment').dropna()
exploded_employment
# Each employment entry is a fixed-size sequence; spread it into columns.
exploded_employment[['role', 'institution', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_employment.employment.tolist(), index=exploded_employment.index)
# Treat empty organization ids as missing so count() skips them below.
exploded_employment.id.replace('', pd.NA, inplace=True)
exploded_employment.groupby('orcid').id.count().reset_index()
# n_valid_employment = employment entries backed by a resolvable org id.
df = df.merge(exploded_employment.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_employment'}, inplace=True)
# Profiles whose employment entries are not all id-backed.
df[df.n_employment != df.n_valid_employment]
Biography¶
TODO:
- temporal correlation
- spikes in (daily) account creation
# Normalize empty biographies to NaN so describe()/notna() treat them as missing.
df.biography.replace('', np.NaN, inplace=True)
df.biography.describe()
Duplicated bios
# Profiles sharing a known spammy car-title-loan biography.
df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]
Let's note them down
# Register every profile sharing the duplicated car-loan bio as a known
# fake, keyed carloan_0, carloan_1, ...
# (enumerate replaces the manual `i = 0; i = i+1` counter of the original.)
carloan_mask = (df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))
for i, orcid in enumerate(df[carloan_mask]['orcid']):
    FAKE_HEAP['carloan_' + str(i)] = orcid
Let's check deeper into duplicated bios
# All profiles whose biography text appears more than once.
df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
# Number of profiles per duplicated biography, descending.
dup_bios = df[['orcid', 'biography']].groupby('biography').count().sort_values('orcid', ascending=False)
dup_bios = dup_bios[dup_bios.orcid > 1]
dup_bios
# Total number of profiles involved in bio duplication.
dup_bios.sum()
# dup_bios.to_csv('../data/processed/dup_bios.csv', index=True, columns=[], header=False)
dup_bios.to_csv('../data/processed/dup_bios.csv')
I noticed that some bios can be found on google in other (probably fake) accounts. E.g. "hi, how are you? it is really cool to find an entire community of people interested in the same thing you are." can be found on https://dribbble.com/camrodoabh/about
Dup bios URLs
Let's plot the domains dup bios point to
# Distribution of URL domains among profiles sharing this bio snippet.
BIO_SNIPPET = 'really cool to find an entire community of people'
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)].explode('url_domains').groupby('url_domains')[['orcid']].count().sort_values('orcid', ascending=False)
# Bar chart of URL domains pointed to by profiles sharing the bio snippet.
set_top_n(50)
head = dup_bios_df[:TOP_N]
fig = go.Figure(
    data=[go.Bar(x=head.index, y=head['orcid'])],
    layout=go.Layout(
        title='URL distribution for bio "%s"' % BIO_SNIPPET,
        xaxis=dict(tickangle=45, tickfont=dict(size=12))
    )
)
plotly.offline.iplot(fig)
Dup bios dates
# Profiles sharing the car-loan bio snippet, for activation-date analysis.
BIO_SNIPPET = 'more straightforward way to borrow the money you'
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)]
# .groupby(df.activation_date.dt.month)[['orcid']].count().sort_values('orcid', ascending=False)
# Histogram of account activation dates for profiles sharing the bio.
set_top_n(50)
fig = go.Figure(
    data=[go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc="count"
    )],
    layout=go.Layout(
        title='Activation distribution for bio "%s"' % BIO_SNIPPET,
        xaxis=dict(tickangle=45, tickfont=dict(size=12))
    )
)
fig.update_traces(xbins_size='D1')  # one-day bins
plotly.offline.iplot(fig)
Dup bios with extended length
# Duplicated bios longer than ten words (short boilerplate greetings are
# less telling than long copied texts).
dup_bios[dup_bios.index.str.split(' ').str.len() > 10]
Assign spam score from precanned library
# bios = df[df.biography.notna()][['orcid', 'biography']]
# def score(bio):
# try:
# return antispam.score(bio)
# except: # if len(bio) < 3 the filter doesn't know how to handle that
# return -1
# bios['spam_score'] = bios.biography.apply(lambda bio: score(bio))
# bios[bios.spam_score == -1] # these are artefacts (no scoring possible)
# bios.spam_score.replace(to_replace=-1, value=np.nan, inplace=True)
# bios.spam_score.describe()
# bios[bios.spam_score > 0.99]
Spam goes nowhere.
Search offending words, sexually explicit content, etc.
# bios['profanity_score'] = profanity_check.predict_prob(bios.biography)
# bios[bios.profanity_score > 0.90]
Profanity detection goes nowhere too.
All VS All correlation¶
# Pairwise correlation over boolean/numeric columns; NaNs encoded as -1
# so corr() keeps every row.
fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# Same correlation restricted to rows where label is True.
# NOTE(review): `== True` (rather than a plain boolean mask) maps NA labels
# to False — presumably intentional; confirm label semantics (fake vs true).
fig = px.imshow(df[df.label == True].select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# df[['verified_email',
# 'verified_primary_email',
# 'n_works',
# 'n_doi',
# 'n_arxiv',
# 'n_pmc',
# 'n_other_pids',
# 'n_emails',
# 'n_urls',
# 'n_ids',
# 'n_keywords',
# 'n_employment',
# 'n_education',
# 'label']].to_pickle('../data/processed/features.pkl')
# Final overview of the assembled dataframe (columns, dtypes, memory).
df.info()