351 KiB
351 KiB
Exploratory analysis¶
TODO:
- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)
- Catalogue the possible cases (e.g., author publishing with an empty ORCID record, author publishing but not on OpenAIRE, etc.)
- Is the temporal dimension of any use?
- Can we access private info thanks to the OpenAIRE-ORCID agreement?
In [1]:
import pandas as pd
import ast
import tldextract
import numpy
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
init_notebook_mode(connected=True)
# Number of items shown in the "top N" bar charts below.
TOP_N = 30
# X-axis range with half-bar padding so the first/last bars are not clipped.
TOP_RANGE = [-.5, TOP_N - 1 + .5]
Notable legitimate ORCID iDs for exploratory purposes:
In [2]:
# ORCID iDs of known legitimate ("solid") researchers, used as reference examples.
AM = '0000-0002-5193-7851'
PP = '0000-0002-8588-4196'
Anomalous ORCID profiles
In [3]:
# ORCID iDs of anomalous (but not necessarily fake) profiles.
JOURNAL = '0000-0003-1815-5732'  # presumably a journal rather than a person -- verify
NOINFO = '0000-0001-5009-2052'   # profile carrying essentially no information
# find group-shared ORCiD
Notable fake ORCID iDs for exploratory purposes:
In [4]:
# ORCID iDs of known fake/spam profiles, named after their promoted content.
SCAFFOLD = '0000-0001-5004-7761'
WHATSAPP = '0000-0001-6997-9470'
PENIS = '0000-0002-3399-7287'
BITCOIN = '0000-0002-7518-6845'
FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment
CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)
PLUMBER = '0000-0002-1700-8311' # URL > 10 + works
Load the dataset
In [7]:
# Column names for the raw ORCID dump; the TSV's own header row is overridden.
COLUMNS = [
    'orcid', 'claimed', 'verified_email', 'verified_primary_email',
    'given_names', 'family_name', 'biography', 'other_names', 'urls',
    'primary_email', 'other_emails', 'keywords', 'external_ids',
    'education', 'employment', 'n_works', 'works_source',
]
df = pd.read_csv('../data/raw/initial_info_whole.tsv',
                 sep='\t', header=0, names=COLUMNS)
In [8]:
# Show fully duplicated rows before removing them.
df[df.duplicated()]
Out[8]:
In [9]:
# Drop the exact duplicate rows found above.
df.drop_duplicates(inplace=True)
Basic column manipulation (interpret columns as lists when necessary)
In [10]:
# These columns hold Python literals (lists) serialized as strings in the TSV.
# Parse them back into Python objects in a single pass instead of one
# copy-pasted cell per column (same columns, same order as before).
LIST_COLUMNS = ['other_names', 'keywords', 'urls', 'other_emails',
                'education', 'employment', 'external_ids', 'works_source']
for col in LIST_COLUMNS:
    mask = df[col].notna()
    df.loc[mask, col] = df.loc[mask, col].apply(ast.literal_eval)
In [18]:
# Sanity-check the parsed frame.
df.head(5)
Out[18]:
In [19]:
# Inspect a known legitimate profile.
df[df['orcid'] == AM]
Out[19]:
In [20]:
# Inspect a known fake profile.
df[df['orcid'] == WHATSAPP]
Out[20]:
In [21]:
# Non-null counts per column.
df.count()
Out[21]:
In [22]:
# NOTE(review): this ORCID still appears twice after drop_duplicates;
# inspect both rows before dropping one of them below.
df[df['orcid'] == '0000-0002-5154-6404']
Out[22]:
In [23]:
# Drop one copy by its index label (taken from the cell above).
# NOTE(review): hard-coded index is fragile -- re-verify if the input file changes.
df.drop(index=4595264, inplace=True)
In [24]:
# After dedup, 'orcid' should be unique (count == unique in describe()).
df['orcid'].describe()
Out[24]:
Primary email¶
In [25]:
# Summary statistics of the primary_email column.
df['primary_email'].describe()
Out[25]:
Dupe emails
In [26]:
# Primary emails shared by more than one profile.
df['primary_email'].dropna().loc[df['primary_email'].duplicated()]
Out[26]:
In [27]:
# Inspect the profiles behind each duplicated primary email.
df[df['primary_email'] == 'maykin@owasp.org']
Out[27]:
In [28]:
df[df['primary_email'] == 'opercin@erbakan.edu.tr']
Out[28]:
In [29]:
df[df['primary_email'] == 'patrick.davey@monash.edu']
Out[29]:
In [30]:
def _email_domain(email):
    """Return the domain part of an email address; NaN passes through unchanged."""
    return email.split('@')[1] if pd.notna(email) else email

df['primary_email_domain'] = df['primary_email'].apply(_email_domain)
In [31]:
df['primary_email_domain'].describe()
Out[31]:
In [32]:
# Profiles per primary-email domain, most common first.
primary_emails = (
    df[['primary_email_domain', 'orcid']]
    .groupby('primary_email_domain')
    .count()
    .sort_values('orcid', ascending=False)
)
primary_emails
Out[32]:
In [33]:
# `primary_emails` is already sorted by descending count, so slicing the head
# is enough -- the previous re-sort of the slice was redundant.
top_primary = primary_emails.head(TOP_N)
data = [
    go.Bar(
        x=top_primary.index,
        y=top_primary['orcid']
    )
]
layout = go.Layout(
    # Reuse the TOP_N constant instead of a hard-coded 30.
    title='Top %d email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
Other emails¶
In [34]:
def extract_email_domains(lst):
    """Return the domain (text after '@') of every address in lst."""
    return [address.split('@')[1] for address in lst]
In [35]:
# Map each list of other emails to its list of domains; NaN passes through.
df['other_email_domains'] = df['other_emails'].apply(
    lambda emails: extract_email_domains(emails) if isinstance(emails, list) else emails
)
In [36]:
df[df['other_email_domains'].notna()].head()
Out[36]:
In [37]:
# One row per (orcid, other-email domain) pair.
other_emails = (
    df[['orcid', 'other_email_domains']]
    .explode('other_email_domains')
    .reset_index(drop=True)
)
In [38]:
# Profiles per other-email domain, most common first.
grouped_other_emails = (
    other_emails
    .groupby('other_email_domains')
    .count()
    .sort_values('orcid', ascending=False)
)
grouped_other_emails
Out[38]:
In [39]:
# `grouped_other_emails` is already sorted descending, so slicing the head
# is enough -- the previous re-sort of the slice was redundant.
top_other = grouped_other_emails.head(TOP_N)
data = [
    go.Bar(
        x=top_other.index,
        y=top_other['orcid']
    )
]
layout = go.Layout(
    # Reuse the TOP_N constant instead of a hard-coded 30.
    title='Top %d other email domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [40]:
# Which profiles list the most other-email domains?
other_emails.groupby('orcid').count().sort_values('other_email_domains', ascending=False)
Out[40]:
Email speculation¶
In [41]:
# Profiles with no primary email but at least one other email on record.
df[df['primary_email'].isna() & df['other_emails'].notna()]
Out[41]:
URLs¶
In [42]:
def extract_url_domains(lst):
    """Return the registered domain of every (label, url) pair in lst.

    Each entry is a pair: pair[0] is a free-text description of the URL,
    pair[1] is the URL itself.
    """
    return [tldextract.extract(pair[1]).registered_domain for pair in lst]
In [43]:
# Map each list of (label, url) pairs to its list of domains; NaN passes through.
df['url_domains'] = df['urls'].apply(
    lambda pairs: extract_url_domains(pairs) if isinstance(pairs, list) else pairs
)
In [44]:
df[df['url_domains'].notna()].head()
Out[44]:
In [45]:
# One row per (orcid, URL domain) pair.
urls = (
    df[['orcid', 'url_domains']]
    .explode('url_domains')
    .reset_index(drop=True)
)
In [46]:
# Profiles per URL domain, most common first.
grouped_urls = (
    urls
    .groupby('url_domains')
    .count()
    .sort_values('orcid', ascending=False)
)
grouped_urls
Out[46]:
In [47]:
# `grouped_urls` is already sorted descending, so slicing the head is
# enough -- the previous re-sort of the slice was redundant.
top_urls = grouped_urls.head(TOP_N)
data = [
    go.Bar(
        x=top_urls.index,
        y=top_urls['orcid']
    )
]
layout = go.Layout(
    # Reuse the TOP_N constant instead of a hard-coded 30.
    title='Top %d URL domains' % TOP_N,
    xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [48]:
# URL-domain count per profile, highest first.
grouped_most_domains = (
    urls
    .groupby('orcid')
    .count()
    .sort_values('url_domains', ascending=False)
)
grouped_most_domains
Out[48]:
In [49]:
# `grouped_most_domains` is already sorted descending, so slicing the head
# is enough -- the previous re-sort of the slice was redundant.
top_url_owners = grouped_most_domains.head(100)
data = [
    go.Bar(
        x=top_url_owners.index,
        y=top_url_owners['url_domains']
    )
]
layout = go.Layout(
    title='Top 100 ORCID with URLs',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [50]:
# Profiles listing more than 50 URLs that also claim works.
df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]
Out[50]:
In [51]:
# Profiles with more than 10 URLs, at least one work, and a single works source.
df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]
Out[51]:
In [52]:
# Same filter as above, with the single works_source unpacked to a scalar.
suspicious = (
    (df['url_domains'].str.len() > 10)
    & (df['n_works'] > 0)
    & (df['works_source'].str.len() == 1)
)
exploded_sources = df[suspicious].explode('works_source').reset_index(drop=True)
exploded_sources
Out[52]:
In [53]:
# Rows whose works source contains the author's given name
# (i.e., the works were likely self-claimed by the profile owner).
exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]
Out[53]:
Works source¶
In [54]:
def remove_own_source(lst, own):
    """Drop from `lst` every works-source string that contains `own`.

    Parameters
    ----------
    lst : list of str or NaN
        The `works_source` entries of a profile.
    own : str or NaN
        The author's given name(s).

    Returns
    -------
    list or float
        The filtered list, or NaN when either input is missing.
    """
    if isinstance(lst, list) and pd.notna(own):
        return [ws for ws in lst if ws.find(own) == -1]
    # BUG FIX: the original returned `np.na()` -- `np` was never imported
    # (numpy is imported as `numpy`) and `na` is not a numpy attribute,
    # so this branch always raised instead of yielding a missing value.
    return numpy.nan
In [55]:
# Works sources that do not mention the author's own given name.
df['ext_works_source'] = df.apply(
    lambda row: remove_own_source(row['works_source'], row['given_names']),
    axis=1,
)