3.9 MiB
Exploratory analysis¶
TODO:
- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Study different cases (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)
- Temporal dimension; is it of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement? No.
import glob
import ast
from datetime import datetime
import pytz
import tldextract
import ssl # needed because nltk.download down here fires an error
# Workaround: on some machines the HTTPS certificate chain is unavailable and
# nltk.download() raises SSLError; fall back to an unverified SSL context.
# Acceptable in an exploratory notebook, not in production code.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Python builds without this private attribute verify certificates as usual.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # tokenizer models used later for biography sentence/word counts
import numpy as np
import pandas as pd
# import antispam
# import profanity_check
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
# Module-level plotting knobs: TOP_N caps bar charts at the first N
# categories; TOP_RANGE is the matching plotly x-axis range.
TOP_N = 0
TOP_RANGE = [0, 0]


def set_top_n(n):
    """Set the global top-N cutoff and the matching x-axis range."""
    global TOP_N, TOP_RANGE
    TOP_N = n
    # Bars sit on integer positions, so pad half a slot on each side.
    TOP_RANGE = [-0.5, n - 0.5]


pd.set_option('display.max_columns', None)
Notable solid ORCID iDs for explorative purposes:
# Known genuine ORCID iDs, used as ground truth when eyeballing records.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
# Anomalous-but-real profiles: not fake, yet structurally unusual.
# (Names suggest the anomaly type — e.g. JOURNAL is presumably a profile
# registered for a journal rather than a person; verify on orcid.org.)
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
WORK_MISUSE = '0000-0001-7870-1120'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
# Hand-collected fake profiles, keyed by the spam topic they advertise.
# More entries (carloan_*) are appended further below.
FAKE_HEAP = {
    'scaffold': '0000-0001-5004-7761',
    'whatsapp': '0000-0001-6997-9470',
    'penis': '0000-0002-3399-7287',
    'bitcoin': '0000-0002-7518-6845',
    'fitness': '0000-0002-1234-835X', # URL record + employment
    'cannabis': '0000-0002-9025-8632', # URL > 70 + works (now REMOVED)
    'plumber': '0000-0002-1700-8311', # URL > 10 + works
    'furniture': '0000-0001-7478-4539',
    'cleaners': '0000-0002-7392-3792'
}
Load the dataset
# The processed dataset is split across several pickle parts; sorted()
# makes the concatenation order deterministic.
parts = glob.glob('../data/processed/dataset.pkl.*')
df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))
df.head(5)
Notable profiles inspection
# Spot-check one genuine and one known-fake profile, then basic counts.
df[df['orcid'] == AM]
df[df['orcid'] == FAKE_HEAP['whatsapp']]
df.count()
df['orcid'].describe()
Primary email¶
df['primary_email'].describe()
Dupe emails
# Primary emails shared by more than one profile — inspect each case by hand.
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
df[df['primary_email'] == 'maykin@owasp.org']
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
df[df['primary_email'] == 'patrick.davey@monash.edu']
df['primary_email_domain'].describe()
# Profile count per primary-email domain, most common first.
top_primary_emails = df[['primary_email_domain', 'orcid']]\
    .groupby('primary_email_domain')\
    .count()\
    .sort_values('orcid', ascending=False)
top_primary_emails
# Bar chart of the most common primary-email domains.
set_top_n(30)
top_slice = top_primary_emails[:TOP_N]
data = [go.Bar(x=top_slice.index, y=top_slice['orcid'])]
layout = go.Layout(
    title='Top-%s email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
# Profiles exposing additional (non-primary) email addresses.
df[df.other_email_domains.notna()].head()
emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=emails_by_orcid[:TOP_N]['orcid'],
        y=emails_by_orcid[:TOP_N]['n_emails']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs by email' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Occurrences of each secondary-email domain across profiles.
top_other_emails = df[['orcid', 'other_email_domains']]\
    .explode('other_email_domains')\
    .reset_index(drop=True)\
    .groupby('other_email_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=top_other_emails[:TOP_N].index,
        y=top_other_emails[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top %s other email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
This somehow makes sense: legitimate users could set their Gmail account as the primary address for login purposes and keep institutional addresses as other email addresses. It also makes life easier upon relocation.
Email speculation¶
df[df.primary_email.isna() & df.other_email_domains.notna()]
URLs¶
df.n_urls.describe()
# Profile(s) holding the maximum number of URLs. The original used `>`,
# which can never match the maximum itself and always returned an empty
# frame; `==` matches the pattern used for n_ids / n_education / n_employment.
df[df.n_urls == df.n_urls.max()]
df[df.url_domains.notna()].head()
urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
urls_by_orcid
The first three are fake, the fourth isn't. No safe assumption can be made from URL counts alone.
# Top profiles by URL count.
set_top_n(100)
data = [
    go.Bar(
        x=urls_by_orcid[:TOP_N]['orcid'],
        y=urls_by_orcid[:TOP_N]['n_urls']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs with URLs' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Profile count per URL domain, most common first.
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Malformed URLs are left empty
# Count URL entries whose domain is the empty string (malformed URLs upstream).
exploded_url_domains = df[['orcid', 'url_domains']].explode('url_domains')
exploded_url_domains[exploded_url_domains.url_domains == ''].count()
URLs speculation¶
# Heuristic slices: many URLs + some works, narrowed to profiles whose
# works all come from a single source. (.str.len() on a list column
# yields the list length, i.e. the number of URL domains / sources.)
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)
exploded_sources
# Works whose source string contains the author's given name — likely self-claimed.
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
GRID.ac filtering¶
def extract_domain(link):
    """Return the registered domain (e.g. 'example.co.uk') of *link*."""
    return tldextract.extract(link).registered_domain


# GRID links institutions to their official web domains; URL domains found
# there are treated as institutional (hence trusted).
grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')
grid_df['domain'] = grid_df.link.apply(extract_domain)
grid_df
grid_df.loc['grid.451498.5']
# One row per (orcid, url domain); drop profiles without URLs.
exp = df[['orcid', 'url_domains']].explode('url_domains')
exp = exp[exp.url_domains.notna()]
exp
exp['grid'] = exp.url_domains.isin(grid_df.domain)
# Reuse the flag just computed instead of re-running the isin() membership
# test (the original evaluated the same expression twice).
non_grid_domains = exp[~exp['grid']].groupby('url_domains').count().sort_values('orcid', ascending=False)
# Well-known legitimate services that are not in GRID; drop them before export.
DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',
                     'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']
for dex in DOMAIN_EXCLUSIONS:
    non_grid_domains.drop(non_grid_domains.filter(like=dex, axis=0).index, inplace=True)
non_grid_domains.to_csv('../data/processed/non_grid_urls.csv')
Works source¶
def remove_self_source(lst, given, family):
    """Drop work sources that embed the researcher's own name.

    A source mentioning the given name (or the family name, when present)
    was likely self-asserted; the remaining "external" sources are kept.
    Matching is case-insensitive substring search.
    """
    given_lc = given.lower()
    family_lc = family.lower() if pd.notna(family) else None
    kept = []
    for source in lst:
        source_lc = source.lower()
        if given_lc in source_lc:
            continue
        if family_lc is not None and family_lc in source_lc:
            continue
        kept.append(source)
    return kept
# Keep only work sources that do not embed the author's own name: such
# "external" sources (publishers, aggregators) are harder to fabricate.
df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\
    .apply(lambda x: remove_self_source(x['works_source'], x['given_names'], x['family_name']), axis=1)
# Nullable Int16 keeps NA for profiles without any works source.
df['n_ext_work_source'] = pd.Series(df.ext_works_source.str.len(), dtype=pd.Int16Dtype())
exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\
    .explode('ext_works_source').reset_index(drop=True)
grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\
    .count()\
    .sort_values('orcid', ascending=False)\
    .reset_index()
set_top_n(30)
data = [
    go.Bar(
        x=grouped_ext_sources[:TOP_N].ext_works_source,
        y=grouped_ext_sources[:TOP_N].orcid
    )
]
layout = go.Layout(
    title='Top %s works_source' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# A source is deemed authoritative when it appears on more than two profiles.
authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]
authoritative_sources
exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\
    .isin(authoritative_sources['ext_works_source'])
# A profile counts as authoritative if any of its external sources is.
orcid_authoritative_source = exploded_external_sources\
    .groupby('orcid')['authoritative']\
    .any()\
    .reset_index()[['orcid', 'authoritative']]
df = df.merge(orcid_authoritative_source, on='orcid', how='left')
# Profiles with no external source at all default to non-authoritative.
df.loc[df.authoritative.isna(), 'authoritative'] = False
df.head()
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
df.n_ids.describe()
df[df.n_ids == df.n_ids.max()]
# external_ids entries appear to be (provider, value) pairs; index 0 is the
# provider name. NOTE(review): layout inferred from the x[0] access below —
# confirm against the dataset-building code.
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
ids[ids.provider.notna()].head()
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
data = [
    go.Bar(
        x=top_ids_providers.index,
        y=top_ids_providers['orcid']
    )
]
layout = go.Layout(
    title='IDs provided by providers',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
pd.unique(ids['provider'])
Keywords¶
This field is problematic: users can misbehave and cram multiple keywords into a single entry instead of listing them separately. For example:
# Keyword counts per profile, then the most frequent keyword strings.
keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
keywords_by_orcid
set_top_n(100)
data = [
    go.Bar(
        x=keywords_by_orcid[:TOP_N]['orcid'],
        y=keywords_by_orcid[:TOP_N]['n_keywords']
    )
]
layout = go.Layout(
    title='Keywords provided by ORCiD',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
top_keywords = df[['orcid', 'keywords']]\
    .explode('keywords')\
    .reset_index(drop=True)\
    .groupby('keywords')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_keywords[:TOP_N].index,
        y=top_keywords[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s keywords occurrence' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Education¶
df.n_education.describe()
df[df.n_education == df.n_education.max()]
exploded_education = df[['orcid', 'education']].explode('education').dropna()
exploded_education
# Each education entry is a fixed-size record; spread it over named columns.
exploded_education[['degree', 'role', 'university', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_education.education.tolist(), index=exploded_education.index)
# Empty org ids mean "no disambiguated institution": treat as missing so
# the count() below tallies only valid ids.
exploded_education.id.replace('', pd.NA, inplace=True)
exploded_education.groupby('orcid').id.count().reset_index()
df = df.merge(exploded_education.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_education'}, inplace=True)
# Profiles where some education entries lack a valid institution id.
df[df.n_education != df.n_valid_education]
Employment¶
df.n_employment.describe()
# Profile(s) with the most employment entries.
df[df.n_employment == df.n_employment.max()]
Let's count how many employment entries have a valid assigned organization id per ORCID iD (Ringgold, ISNI, GRID, etc.)
exploded_employment = df[['orcid', 'employment']].explode('employment').dropna()
exploded_employment
# Each employment entry is a fixed-size record; spread it over named columns.
exploded_employment[['role', 'institution', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_employment.employment.tolist(), index=exploded_employment.index)
# Empty org ids → missing, so count() tallies only valid ids.
exploded_employment.id.replace('', pd.NA, inplace=True)
exploded_employment.groupby('orcid').id.count().reset_index()
df = df.merge(exploded_employment.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_employment'}, inplace=True)
# Profiles where some employment entries lack a valid institution id.
df[df.n_employment != df.n_valid_employment]
Biography¶
# Normalize empty biographies to missing values so describe()/notna()
# treat them uniformly. Uses np.nan (lowercase): the np.NaN alias was
# removed in NumPy 2.0 and both are the same float('nan') object before that.
df.biography.replace('', np.nan, inplace=True)
df.biography.describe()
Let's also fabricate a few other features from biographies.
# Simple bio-derived features: raw length, sentence count, word count (NLTK).
df['biography_length'] = df.biography.str.len()
df['biography_n_sentences'] = df[df.biography.notna()].biography.apply(lambda bio: len(sent_tokenize(bio)))
df['biography_n_words'] = df[df.biography.notna()].biography.apply(lambda bio: len(word_tokenize(bio)))
Duplicated bios
df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]
Let's note them down
# Register the "car title loan" spam profiles among the known fakes,
# keyed carloan_0, carloan_1, ... — enumerate() replaces the original's
# hand-rolled counter variable.
carloan_orcids = df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]['orcid']
for i, orcid in enumerate(carloan_orcids):
    FAKE_HEAP['carloan_' + str(i)] = orcid
Let's check deeper into duplicated bios
# All profiles whose biography is shared with at least one other profile.
df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
# Number of profiles per distinct biography text, duplicates only.
dup_bios = df[['orcid', 'biography']].groupby('biography').count().sort_values('orcid', ascending=False)
dup_bios = dup_bios[dup_bios.orcid > 1]
dup_bios
dup_bios.sum()
# dup_bios.to_csv('../data/processed/dup_bios.csv', index=True, columns=[], header=False)
dup_bios.to_csv('../data/processed/dup_bios.csv')
I noticed that some bios can be found on google in other (probably fake) accounts. E.g. "hi, how are you? it is really cool to find an entire community of people interested in the same thing you are." can be found on https://dribbble.com/camrodoabh/about
Dup bios URLs
Let's plot the domains dup bios point to
# Which URL domains do the profiles sharing this bio snippet link to?
BIO_SNIPPET = 'really cool to find an entire community of people'
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)].explode('url_domains').groupby('url_domains')[['orcid']].count().sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=dup_bios_df[:TOP_N].index,
        y=dup_bios_df[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='URL distribution for bio "%s"' % BIO_SNIPPET,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Dup bios and date of activation
# Activation-date histogram for one specific duplicated bio.
BIO_SNIPPET = 'more straightforward way to borrow the money you'
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)]
# .groupby(df.activation_date.dt.month)[['orcid']].count().sort_values('orcid', ascending=False)
data = [
    go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc='count'
    )
]
layout = go.Layout(
    title='Activation distribution for bio "%s"' % BIO_SNIPPET,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
fig.update_traces(xbins_size='D1')  # one histogram bin per day
plotly.offline.iplot(fig)
For all duplicated bios
# Same histogram over ALL duplicated bios. Note: dup_bios_df is reassigned
# here and reused by the URL-domain analysis further below.
dup_bios_df = df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
data = [
    go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc='count'
    )
]
layout = go.Layout(
    title='Activation date distribution for all dup bios',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
fig.update_traces(xbins_size='D1')  # one histogram bin per day
plotly.offline.iplot(fig)
While in general activations follow a seasonal pattern (e.g., dips on weekends and holidays). (Commented out as the generated HTML is huge; will sort this out.)
# YEAR = 2020
# data = [
# go.Histogram(
# x=df[df.activation_date.dt.year == YEAR]['activation_date'],
# y=df[df.activation_date.dt.year == YEAR]['orcid'],
# histfunc='count'
# )
# ]
# layout = go.Layout(
# title='Activation date distribution (general) for %s' % YEAR,
# xaxis=dict(tickangle=45, tickfont=dict(size=12))
# )
# fig = go.Figure(data=data, layout=layout)
# fig.update_traces(xbins_size='D1')
# plotly.offline.iplot(fig)
Dup bios with extended length
Last update date ~ to activation date in duplicated bios
# Duplicated bios whose last update falls on the activation day itself —
# suggests throwaway accounts filled in one shot and never touched again.
df[(df.biography.notna()) &
   (df.biography.duplicated(keep=False)) &
   (df.activation_date.dt.year == df.last_update_date.dt.year) &
   (df.activation_date.dt.month == df.last_update_date.dt.month) &
   (df.activation_date.dt.day == df.last_update_date.dt.day)]
Dup bios URLs
# URL domains linked from profiles with duplicated bios
# (dup_bios_df still holds the all-duplicated-bios frame assigned earlier).
top_urls = dup_bios_df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Recompute domain counts over the whole dataset, then drill into one
# suspicious domain and one specific profile.
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
exp = df[['orcid', 'url_domains']].explode('url_domains')
exp[exp.url_domains == 'lucialpiazzale.com']
df[df.orcid == '0000-0002-3869-9561']
Assign spam score from precanned library
# bios = df[df.biography.notna()][['orcid', 'biography']]
# def score(bio):
# try:
# return antispam.score(bio)
# except: # if len(bio) < 3 the filter doesn't know how to handle that
# return -1
# bios['spam_score'] = bios.biography.apply(lambda bio: score(bio))
# bios[bios.spam_score == -1] # these are artefacts (no scoring possible)
# bios.spam_score.replace(to_replace=-1, value=np.nan, inplace=True)
# bios.spam_score.describe()
# bios[bios.spam_score > 0.99]
Spam goes nowhere.
Search offending words, sexually explicit content, etc.
# bios['profanity_score'] = profanity_check.predict_prob(bios.biography)
# bios[bios.profanity_score > 0.90]
Profanity detection goes nowhere too.
Dates¶
# Profiles never updated after activation, updated "before" activation
# (a data glitch), and updated on the same calendar day.
df[df.activation_date == df.last_update_date]['orcid'].count()
df[df.activation_date > df.last_update_date]['orcid'].count()
df[(df.activation_date.dt.year == df.last_update_date.dt.year) &
   (df.activation_date.dt.month == df.last_update_date.dt.month) &
   (df.activation_date.dt.day == df.last_update_date.dt.day)]['orcid'].count()
# Days elapsed between activation and last update.
df['date_diff'] = (df.last_update_date - df.activation_date) / np.timedelta64(1, 'D')
df.date_diff.describe()
df[df.date_diff == df.date_diff.min()]
# Clamp negative diffs (the glitchy rows above) to zero.
df.loc[df.date_diff < 0, 'date_diff'] = 0
df['ref_year'] = df.activation_date.dt.year
# (plotly violin version kept for reference; seaborn is used instead)
# fig = go.Figure()
# years = range(2013, 2021, 1)
# for year in years:
#     fig.add_trace(go.Violin(x=df[df.ref_year == year].ref_year,
#                             y=df[df.ref_year == year].date_diff,
#                             name=year,
#                             points=False,
#                             box_visible=True,
#                             meanline_visible=True))
# fig.show()
plt.figure(figsize=(16, 6))
ax = sns.violinplot(x='ref_year', y='date_diff', data=df)
# Same distribution, keyed on last-update year instead of activation year.
df['ref_year'] = df.last_update_date.dt.year
plt.figure(figsize=(16, 6))
ax = sns.violinplot(x='ref_year', y='date_diff', data=df)
# Staleness: days since the profile was last touched (relative to UTC now).
tz = pytz.timezone('UTC')
NOW = datetime.now(tz)
df['date_stale'] = (NOW - df.last_update_date) / np.timedelta64(1, 'D')
df.date_stale.describe()
plt.figure(figsize=(16, 6))
ax= sns.violinplot(x='ref_year', y='date_stale', data=df)
df['ref_year'] = df.activation_date.dt.year
plt.figure(figsize=(16, 6))
ax = sns.violinplot(x='ref_year', y='date_stale', data=df)
Todo:
- stale profiles with information initially set are likely to be fake?
- the more info is present
All vs. all correlation¶
# Pairwise correlation heatmaps over boolean/numeric features; NA is mapped
# to -1 so missingness itself can show up as correlation.
fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# Restricted to profiles that have a biography.
fig = px.imshow(df[df.biography.notna()].select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# Restricted to label == True profiles.
fig = px.imshow(df[df.label == True].select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# df[['verified_email',
#     'verified_primary_email',
#     'n_works',
#     'n_doi',
#     'n_arxiv',
#     'n_pmc',
#     'n_other_pids',
#     'n_emails',
#     'n_urls',
#     'n_ids',
#     'n_keywords',
#     'n_employment',
#     'n_education',
#     'label']].to_pickle('../data/processed/features.pkl')
df.info()