4.0 MiB
Exploratory analysis¶
TODO:
- URLs can be found elsewhere (e.g., biographies, names, etc.)
- Check line feed handling when creating dump
- Always keep an eye to the temporal dimension
- Why are fake ORCID records being created? [Link farming/SEO hacking, anything else?]
- Can we access private info thanks to the OpenAIRE ORCID membership? No.
- Check special cases of worksource as in https://orcid.org/0000-0002-4469-621X where "author name VIA ResearcherID"
import glob
import ast
import re
from datetime import datetime
import pytz
import tldextract
# Workaround: nltk.download() below can fail on SSL certificate verification,
# so HTTPS certificate checking is disabled for the download.
# NOTE(review): this is insecure; acceptable only for a one-off exploratory notebook.
import ssl # needed because nltk.download down here fires an error
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # Python builds without the private helper verify certificates by default.
    pass
else:
    # Monkey-patch the default HTTPS context factory to skip verification.
    ssl._create_default_https_context = _create_unverified_https_context
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # tokenizer models used by the biography features below
import numpy as np
import pandas as pd
# import antispam
# import profanity_check
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
init_notebook_mode(connected=True)
# Shared top-N plotting state, updated via set_top_n() before each bar chart.
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
    """Update the global top-N cutoff used by the bar charts below.

    Sets ``TOP_N`` to ``n`` and ``TOP_RANGE`` to an x-axis window padded by
    half a bar on each side, so exactly the first ``n`` bars are visible.
    """
    global TOP_N, TOP_RANGE
    TOP_N, TOP_RANGE = n, [-.5, n - 1 + .5]
pd.set_option('display.max_columns', None)
Notable solid ORCID iDs for explorative purposes:
# Reference ORCID iDs of presumably genuine profiles, used for spot checks.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Notable anomalies:
# Anomalous records; categories follow the variable names — confirm by
# inspecting each record on orcid.org.
JOURNAL = '0000-0003-1815-5732'
NOINFO = '0000-0001-5009-2052'
VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE
WORK_MISUSE = '0000-0001-7870-1120'
# todo: find group-shared ORCiD, if possible
Notable fake ORCID iDs:
# Known fake ORCID iDs, keyed by the spam topic they advertise.
# More entries ('carloan_*') are appended programmatically further below.
FAKE_HEAP = {
    'scaffold': '0000-0001-5004-7761',
    'whatsapp': '0000-0001-6997-9470',
    'penis': '0000-0002-3399-7287',
    'bitcoin': '0000-0002-7518-6845',
    'fitness': '0000-0002-1234-835X', # URL record + employment
    'cannabis': '0000-0002-9025-8632', # URL > 70 + works (now REMOVED)
    'plumber': '0000-0002-1700-8311', # URL > 10 + works
    'furniture': '0000-0001-7478-4539',
    'cleaners': '0000-0002-7392-3792',
    'toxiburn': '0000-0001-7505-2081', # URLs in bio
    'ultraburst': '0000-0002-7037-3393', # URLs in bio
    'testoryze': '0000-0002-6361-8129', # URL in bio
    'rlmax': '0000-0002-0393-7865',
    'eretrol': '0000-0002-6226-8905',
    'memomax': '0000-0002-2231-4233',
    'keto': '0000-0002-5521-9494',
    'baukredit': '0000-0002-5402-9920',
    'barber': '0000-0002-6766-8254'
}
Load the dataset
# Load the pickled dataset, split across multiple part files; sorted() keeps
# the concatenation order deterministic.
parts = glob.glob('../data/processed/dataset.pkl.*')
df = pd.concat((pd.read_pickle(part) for part in sorted(parts)))
df.head(5)
Notable records inspection
# Inspect one genuine record and one known fake, then basic column stats.
df[df['orcid'] == AM]
df[df['orcid'] == FAKE_HEAP['whatsapp']]
df.count()
df['orcid'].describe()
df['primary_email'].describe()
Dupe emails
# Primary email addresses shared by more than one ORCID record, with a few
# specific duplicates inspected individually.
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
df[df['primary_email'] == 'maykin@owasp.org']
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
df[df['primary_email'] == 'patrick.davey@monash.edu']
df['primary_email_domain'].describe()
# Count records per primary email domain and plot the 30 most frequent ones.
top_primary_emails = df[['primary_email_domain', 'orcid']]\
    .groupby('primary_email_domain')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=top_primary_emails[:TOP_N].index,
        y=top_primary_emails[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
This somehow makes sense: legitimate users may set a Gmail account as primary for login purposes and keep their institutional addresses as other email addresses. It also makes life easier upon relocation.
Other emails¶
df['other_email_domains'].describe()
# Rank ORCID iDs by their total number of email addresses; plot the top 30.
emails_by_orcid = df[['orcid', 'n_emails']].sort_values('n_emails', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=emails_by_orcid[:TOP_N]['orcid'],
        y=emails_by_orcid[:TOP_N]['n_emails']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs by email' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Explode the per-record lists of other email domains and plot the most
# frequent domains.
top_other_emails = df[['orcid', 'other_email_domains']]\
    .explode('other_email_domains')\
    .reset_index(drop=True)\
    .groupby('other_email_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(30)
data = [
    go.Bar(
        x=top_other_emails[:TOP_N].index,
        y=top_other_emails[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top %s other email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
URLs¶
# URL count distribution and records with URLs.
df.n_urls.describe()
# Fixed: was `df.n_urls > df.n_urls.max()`, which can never match and always
# produced an empty frame; `==` shows the record(s) with the most URLs, in
# line with the analogous n_ids/n_education/n_employment cells.
df[df.n_urls == df.n_urls.max()]
df[df.url_domains.notna()].head()
urls_by_orcid = df[['orcid', 'n_urls']].sort_values('n_urls', ascending=False)
urls_by_orcid
The first three are fake, the fourth isn't. No assumption can be made.
# Plot the 100 ORCID iDs with the most URLs.
set_top_n(100)
data = [
    go.Bar(
        x=urls_by_orcid[:TOP_N]['orcid'],
        y=urls_by_orcid[:TOP_N]['n_urls']
    )
]
layout = go.Layout(
    title='Top %s ORCID iDs with URLs' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Most frequent URL domains across all records.
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Malformed URLs are left empty
# Count exploded URL entries whose domain is the empty string (malformed URLs
# were left empty upstream).
exploded_url_domains = df[['orcid', 'url_domains']].explode('url_domains')
exploded_url_domains[exploded_url_domains.url_domains == ''].count()
Academic URL filtering according to GRID.ac¶
def extract_domain(link):
    """Return the registered (pay-level) domain of *link*, e.g. 'example.co.uk'."""
    parsed = tldextract.extract(link)
    return parsed.registered_domain
# GRID (grid.ac) links institutions to their websites; extract registered
# domains so academic URLs in ORCID records can be recognised.
grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')
grid_df['domain'] = grid_df.link.apply(extract_domain)
grid_df
grid_df.loc['grid.451498.5']
exp = df[['orcid', 'url_domains']].explode('url_domains')
exp = exp[exp.url_domains.notna()]
exp
# Flag domains known to GRID; keep the non-academic remainder for review.
exp['grid'] = exp.url_domains.isin(grid_df.domain)
non_grid_domains = exp[~exp.url_domains.isin(grid_df.domain)].groupby('url_domains').count().sort_values('orcid', ascending=False)
# Drop well-known legitimate services / academic TLD patterns before exporting
# the remainder for manual inspection.
DOMAIN_EXCLUSIONS = ['google.', 'youtube.', 'github', 'researchgate', 'academia.edu', 'elsevier.', 'elsevierpure.com',
                     'publons.', 'scopus', 'researcherid', 'ac.uk', '.gov.', '.edu', 'arxiv']
for dex in DOMAIN_EXCLUSIONS:
    non_grid_domains.drop(non_grid_domains.filter(like=dex, axis=0).index, inplace=True)
non_grid_domains.to_csv('../data/processed/non_grid_urls.csv')
URLs present in other parts of the ORCID records¶
# Records with URLs in other parts of the record, and those having ONLY such
# URLs (no researcher-URL section).
df[df.other_urls.str.len() > 0][['orcid', 'urls', 'other_urls']]
df[(df.other_urls.str.len() > 0) & (df.urls.isna())][['orcid', 'urls', 'other_urls']]
Works source¶
def remove_self_source(lst, given, family):
    """Filter out work sources containing the researcher's own name.

    A source string is kept only if it does not contain ``given``
    (case-insensitive) and, when ``family`` is not NA, does not contain
    ``family`` either. Sources matching either name are treated as
    self-claimed works rather than third-party attestations.
    """
    family_known = pd.notna(family)
    kept = []
    for source in lst:
        lowered = source.lower()
        if given.lower() in lowered:
            continue
        if family_known and family.lower() in lowered:
            continue
        kept.append(source)
    return kept
# External (non-self) work sources; only rows with both works_source and
# given_names are considered. A missing family_name is tolerated by
# remove_self_source.
df['ext_works_source'] = df[(df.works_source.notna()) & (df.given_names.notna())]\
    .apply(lambda x: remove_self_source(x['works_source'], x['given_names'], x['family_name']), axis=1)
# Nullable small-int count of external work sources.
df['n_ext_work_source'] = pd.Series(df.ext_works_source.str.len(), dtype=pd.Int16Dtype())
exploded_external_sources = df[df['ext_works_source'].str.len() > 0][['orcid','ext_works_source']]\
    .explode('ext_works_source').reset_index(drop=True)
grouped_ext_sources = exploded_external_sources.groupby('ext_works_source')\
    .count()\
    .sort_values('orcid', ascending=False)\
    .reset_index()
set_top_n(30)
data = [
    go.Bar(
        x=grouped_ext_sources[:TOP_N].ext_works_source,
        y=grouped_ext_sources[:TOP_N].orcid
    )
]
layout = go.Layout(
    title='Top %s works_source' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Sources claiming works for more than 2 distinct ORCID iDs are considered
# authoritative.
authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]
authoritative_sources
exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\
    .isin(authoritative_sources['ext_works_source'])
# A record is 'authoritative' if ANY of its external sources is.
orcid_authoritative_source = exploded_external_sources\
    .groupby('orcid')['authoritative']\
    .any()\
    .reset_index()[['orcid', 'authoritative']]
df = df.merge(orcid_authoritative_source, on='orcid', how='left')
df.loc[df.authoritative.isna(), 'authoritative'] = False  # no external sources at all
df.head()
External IDs¶
External IDs should come from reliable sources. ORCiD registrants cannot add them freely.
df.n_ids.describe()
df[df.n_ids == df.n_ids.max()]
# Explode external IDs; entries look like (provider, ...) sequences, so x[0]
# is taken as the provider name — verify against the dataset schema.
ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)
ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])
ids[ids.provider.notna()].head()
top_ids_providers = ids.groupby('provider').count().sort_values('orcid', ascending=False)
data = [
    go.Bar(
        x=top_ids_providers.index,
        y=top_ids_providers['orcid']
    )
]
layout = go.Layout(
    title='IDs provided by providers',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
pd.unique(ids['provider'])
Keywords¶
This field is problematic, as users can be nasty and cram multiple keywords into a single entry as opposed to providing separate keywords. Look at this:
# ORCID iDs ranked by number of keywords; plot the top 100.
keywords_by_orcid = df[['orcid', 'n_keywords']].sort_values('n_keywords', ascending=False)
keywords_by_orcid
set_top_n(100)
data = [
    go.Bar(
        x=keywords_by_orcid[:TOP_N]['orcid'],
        y=keywords_by_orcid[:TOP_N]['n_keywords']
    )
]
layout = go.Layout(
    title='Keywords provided by ORCiD',
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Most frequent individual keywords across all records.
top_keywords = df[['orcid', 'keywords']]\
    .explode('keywords')\
    .reset_index(drop=True)\
    .groupby('keywords')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_keywords[:TOP_N].index,
        y=top_keywords[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s keywords occurrence' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Education¶
df.n_education.describe()
df[df.n_education == df.n_education.max()]
# One row per education entry; each entry's fields are expanded into named
# columns below.
exploded_education = df[['orcid', 'education']].explode('education').dropna()
exploded_education
exploded_education[['department', 'degree', 'university', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_education.education.tolist(), index=exploded_education.index)
# Empty ids become NA so the count below covers only valid organisation ids.
exploded_education.id.replace('', pd.NA, inplace=True)
# exploded_education.groupby('orcid').id.count().reset_index()
df = df.merge(exploded_education.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_education'}, inplace=True)
# Records where some education entries lack a valid organisation id.
df[df.n_education != df.n_valid_education]
Employment¶
# Employment count distribution and the record(s) with the most entries.
df.n_employment.describe()
df[df.n_employment == df.n_employment.max()]
Let's count how many employments have a valid assigned id per ORCID (Ringgold, ISNI, GRID, etc.)
# One row per employment entry; fields expanded into named columns, mirroring
# the education analysis above.
exploded_employment = df[['orcid', 'employment']].explode('employment').dropna()
exploded_employment
exploded_employment[['role', 'institution', 'city', 'region', 'country', 'id', 'id_scheme']] = pd.DataFrame(exploded_employment.employment.tolist(), index=exploded_employment.index)
# Empty ids become NA so only valid organisation ids are counted.
exploded_employment.id.replace('', pd.NA, inplace=True)
# exploded_employment.groupby('orcid').id.count().reset_index()
df = df.merge(exploded_employment.groupby('orcid').id.count().reset_index(), on='orcid', how='left')
df.rename(columns={'id': 'n_valid_employment'}, inplace=True)
df[df.n_employment != df.n_valid_employment]
Biography¶
# Normalise empty biographies to NaN so describe()/notna() treat them as missing.
df.biography.replace('', np.NaN, inplace=True)
df.biography.describe()
Let's also fabricate a few other features from biographies.
# Simple text features: character length, sentence count and word count
# (NLTK punkt tokenizers downloaded above).
df['biography_length'] = df.biography.str.len()
df['biography_n_sentences'] = df[df.biography.notna()].biography.apply(lambda bio: len(sent_tokenize(bio)))
df['biography_n_words'] = df[df.biography.notna()].biography.apply(lambda bio: len(word_tokenize(bio)))
Duplicated bios
Let's check deeper into duplicated bios
# All records sharing a non-null biography with at least one other record.
df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
dup_bios = df[['orcid', 'biography']].groupby('biography').count().sort_values('orcid', ascending=False)
dup_bios = dup_bios[dup_bios.orcid > 1]  # keep only biographies shared by 2+ records
dup_bios
dup_bios.sum()
# dup_bios.to_csv('../data/processed/dup_bios.csv', index=True, columns=[], header=False)
dup_bios.to_csv('../data/processed/dup_bios.csv')
I noticed that some bios can be found on google in other (probably fake) accounts. E.g. "hi, how are you? it is really cool to find an entire community of people interested in the same thing you are." can be found on https://dribbble.com/camrodoabh/about
df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]
Let's jot them down
# Register the spam records found above in the fake heap as carloan_0..n.
# Idiom fix: enumerate() replaces the manual `i = 0` / `i = i+1` counter.
_carloan_orcids = df[(df.biography.notna()) & (df.biography.str.contains('car title loans are a more straightforward'))]['orcid']
for i, orcid in enumerate(_carloan_orcids):
    FAKE_HEAP['carloan_' + str(i)] = orcid
Dup bios and date of activation
# Activation-date histogram (daily bins) for records sharing one spam bio.
BIO_SNIPPET = 'more straightforward way to borrow the money you'
# Robustness fix: na=False makes str.contains yield False (not NA) for NaN
# biographies, which would otherwise break the boolean indexing; this mirrors
# the notna() guards used in the neighbouring cells.
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET, na=False)]
# .groupby(df.activation_date.dt.month)[['orcid']].count().sort_values('orcid', ascending=False)
data = [
    go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc='count'
    )
]
layout = go.Layout(
    title='Activation distribution for bio "%s"' % BIO_SNIPPET,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
fig.update_traces(xbins_size='D1')  # one-day histogram bins
plotly.offline.iplot(fig)
For all duplicated bios
# Same daily activation histogram, over ALL duplicated biographies.
dup_bios_df = df[(df.biography.notna()) & (df.biography.duplicated(keep=False))]
data = [
    go.Histogram(
        x=dup_bios_df['activation_date'],
        y=dup_bios_df['orcid'],
        histfunc='count'
    )
]
layout = go.Layout(
    title='Activation date distribution for all dup bios',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
fig.update_traces(xbins_size='D1')  # one-day histogram bins
plotly.offline.iplot(fig)
While in general it follows a seasonal pattern (e.g., weekends and holidays). (Commented out, as the generated HTML is huge; will sort this out.)
# Daily activation counts for one year, then a two-year histogram with one
# bin per day (731 days across 2019-2020).
YEAR = 2020
df[df.activation_date.dt.year == YEAR].resample('1D', on='activation_date')['orcid'].count().reset_index()
plt.figure(figsize=(16, 6))
ax = sns.histplot(x='activation_date', data=df[df.activation_date.dt.year.isin([2019,2020])], bins=731)
Last update date ~ to activation date in duplicated bios
# Duplicated-bio records never touched after activation day (activation and
# last update fall on the same calendar day).
df[(df.biography.notna()) &
   (df.biography.duplicated(keep=False)) &
   (df.activation_date.dt.year == df.last_update_date.dt.year) &
   (df.activation_date.dt.month == df.last_update_date.dt.month) &
   (df.activation_date.dt.day == df.last_update_date.dt.day)]
Percent of dup bios accounts left inactive right after their creation
13663 / 19571
Dup bios and URLs
# Dup-bio records that also expose URLs.
df[(df.biography.notna()) & (df.biography.duplicated(keep=False)) & (df.urls.notna())].orcid.count()
# URL domains among duplicated-bio records (dup_bios_df comes from the
# previous histogram cell).
top_urls = dup_bios_df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=top_urls[:TOP_N].index,
        y=top_urls[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='Top-%s URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
# Recompute top_urls over the whole dataset (restores the earlier, global value).
top_urls = df[['orcid', 'url_domains']]\
    .explode('url_domains')\
    .reset_index(drop=True)\
    .groupby('url_domains')\
    .count()\
    .sort_values('orcid', ascending=False)
# Spot-check a suspicious domain and one of the records pointing to it.
exp = df[['orcid', 'url_domains']].explode('url_domains')
exp[exp.url_domains == 'lucialpiazzale.com']
df[df.orcid == '0000-0002-3869-9561']
Let's plot the domains which dup bios point to
# Domains pointed to by records sharing this bio snippet.
BIO_SNIPPET = 'really cool to find an entire community of people'
# Robustness fix: na=False makes str.contains yield False (not NA) for NaN
# biographies, which would otherwise break the boolean indexing; this mirrors
# the notna() guards used in the neighbouring cells.
dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET, na=False)].explode('url_domains').groupby('url_domains')[['orcid']].count().sort_values('orcid', ascending=False)
set_top_n(50)
data = [
    go.Bar(
        x=dup_bios_df[:TOP_N].index,
        y=dup_bios_df[:TOP_N]['orcid']
    )
]
layout = go.Layout(
    title='URL distribution for bio "%s"' % BIO_SNIPPET,
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Assign spam score from precanned library
# bios = df[df.biography.notna()][['orcid', 'biography']]
# def score(bio):
# try:
# return antispam.score(bio)
# except: # if len(bio) < 3 the filter doesn't know how to handle that
# return -1
# bios['spam_score'] = bios.biography.apply(lambda bio: score(bio))
# bios[bios.spam_score == -1] # these are artefacts (no scoring possible)
# bios.spam_score.replace(to_replace=-1, value=np.nan, inplace=True)
# bios.spam_score.describe()
# bios[bios.spam_score > 0.99]
Spam goes nowhere.
Search offending words, sexually explicit content, etc.
# bios['profanity_score'] = profanity_check.predict_prob(bios.biography)
# bios[bios.profanity_score > 0.90]
Profanity detection goes nowhere too.
Dates¶
# Date sanity checks: never-updated records, impossible orderings, and
# same-calendar-day updates.
df[df.activation_date == df.last_update_date]['orcid'].count()
df[df.activation_date > df.last_update_date]['orcid'].count()
df[(df.activation_date.dt.year == df.last_update_date.dt.year) &
   (df.activation_date.dt.month == df.last_update_date.dt.month) &
   (df.activation_date.dt.day == df.last_update_date.dt.day)]['orcid'].count()
# Days elapsed between activation and last update; negative diffs (data
# glitches, see the min() inspection) are clamped to 0.
df['date_diff'] = (df.last_update_date - df.activation_date) / np.timedelta64(1, 'D')
df.date_diff.describe()
df[df.date_diff == df.date_diff.min()]
df.loc[df.date_diff < 0, 'date_diff'] = 0
# df['ref_year'] = df.activation_date.dt.year
# fig = go.Figure()
# years = range(2013, 2021, 1)
# for year in years:
# fig.add_trace(go.Violin(x=df[df.ref_year == year].ref_year,
# y=df[df.ref_year == year].date_diff,
# name=year,
# points=False,
# box_visible=True,
# meanline_visible=True))
# fig.show()
# plt.figure(figsize=(16, 6))
# ax = sns.violinplot(x='ref_year', y='date_diff', data=df)
# df['ref_year'] = df.last_update_date.dt.year
# plt.figure(figsize=(16, 6))
# ax = sns.violinplot(x='ref_year', y='date_diff', data=df)
# tz = pytz.timezone('UTC')
# NOW = datetime.now(tz)
# df['date_stale'] = (NOW - df.last_update_date) / np.timedelta64(1, 'D')
# df.date_stale.describe()
# plt.figure(figsize=(16, 6))
# ax= sns.violinplot(x='ref_year', y='date_stale', data=df)
# df['ref_year'] = df.activation_date.dt.year
# plt.figure(figsize=(16, 6))
# ax = sns.violinplot(x='ref_year', y='date_stale', data=df)
Todo:
- stale profiles with information initially set are likely to be fake?
- the more info is visible, the higher the chance the profile is fake if not updated (for long)?
All VS all correlation¶
# Correlation heatmap across all boolean/numeric features; NaNs are encoded
# as -1 so rows are not dropped pairwise before correlating.
fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())
fig.show()
# df.select_dtypes(include=['bool','number']).to_pickle('../data/processed/features.pkl')
df.info()
# df[df.orcid.isin(FAKE_HEAP.values())].to_csv('../data/processed/fake_heap_index.csv', index=True, index_label='index')
Filtering¶
df[(df.url_domains.notna()) | (df.other_url_domains.notna())][['orcid', 'given_names', 'family_name', 'other_names', 'url_domains', 'other_url_domains', 'activation_date', 'last_update_date']]