99 KiB
99 KiB
In [1]:
import ast
import glob
import os

import numpy
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objs as go
import tldextract
from plotly.offline import iplot, init_notebook_mode
# Set up plotly's offline notebook rendering once, ahead of the plotting cells.
init_notebook_mode(connected=True)
# Notebook-wide plotting configuration; both values are rebound by set_top_n().
TOP_N, TOP_RANGE = 0, [0, 0]


def set_top_n(n):
    """Store how many top entries to plot and the matching axis range.

    Rebinds the module-level ``TOP_N`` to ``n`` and ``TOP_RANGE`` to
    ``[-0.5, n - 1 + 0.5]``, i.e. positions ``0 .. n - 1`` padded by half a
    unit on each side.
    """
    global TOP_N, TOP_RANGE
    lower_bound = -.5
    upper_bound = n - 1 + .5
    TOP_N, TOP_RANGE = n, [lower_bound, upper_bound]
In [2]:
# Directory holding the pickled dataset parts. It can be overridden through the
# FAKE_ORCID_DATA_DIR environment variable so the notebook is not tied to one
# machine; the default is the original hard-coded location.
DATA_DIR = os.environ.get(
    'FAKE_ORCID_DATA_DIR',
    '/Users/miriam.baglioni/Develop/Gitea/fake-orcid-analysis-v2/fake-orcid-analysis/data/processed',
)
# glob.glob() returns matches in arbitrary order; sorting makes the order in
# which the parts are later concatenated reproducible from run to run.
parts = sorted(glob.glob(os.path.join(DATA_DIR, 'dataset.pkl.*')))
In [3]:
# Read every pickled part and stack them into a single frame.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
df = pd.concat(map(pd.read_pickle, parts))
df.head(5)
Out[3]:
In [4]:
def remove_own_source(lst, given, family):
    """Return the entries of ``lst`` that do not mention the person's own name.

    A source is dropped when its lower-cased text contains the lower-cased
    given name, or the lower-cased family name when one is available.

    Parameters
    ----------
    lst : list
        Work sources. Anything that is not a ``list`` (e.g. NaN) yields ``[]``.
        Entries that are not strings are skipped.
    given : str
        Given name. When it is missing (``pd.isna``) the result is ``[]``.
    family : str
        Family name. When it is missing or blank, only ``given`` is checked.

    Returns
    -------
    list
        The retained sources, in their original order.
    """
    if not isinstance(lst, list) or pd.isna(given):
        return []

    given_key = given.lower()
    # An empty needle is "found" in every string, so a blank family name has to
    # be treated like a missing one instead of discarding every source.
    has_family = isinstance(family, str) and family.strip() != ''
    family_key = family.lower() if has_family else ''

    kept = []
    for ws in lst:
        if not isinstance(ws, str):
            # Non-text entries (e.g. None) cannot be compared against a name.
            continue
        source_key = ws.lower()
        if given_key in source_key:
            continue
        if has_family and family_key in source_key:
            continue
        kept.append(ws)
    return kept
In [5]:
# For every record keep only the work sources that do not mention the
# record's own given/family name.
df['ext_works_source'] = [
    remove_own_source(sources, given, family)
    for sources, given, family in zip(
        df['works_source'], df['given_names'], df['family_name']
    )
]
In [6]:
# Number of external sources kept for each record.
df['n_ext_work_source'] = df['ext_works_source'].str.len()
In [7]:
# One row per (orcid, external source) pair, restricted to records that have
# at least one external source.
has_external_source = df['ext_works_source'].str.len() > 0
exploded_external_sources = (
    df.loc[has_external_source, ['orcid', 'ext_works_source']]
    .explode('ext_works_source')
    .reset_index(drop=True)
)
In [8]:
# Count the rows recorded for each external source, most frequent first.
grouped_ext_sources = (
    exploded_external_sources
    .groupby('ext_works_source')
    .count()
    .sort_values('orcid', ascending=False)
    .reset_index()
)
In [44]:
# Number of bars to draw. set_top_n() stores it in the notebook-wide TOP_N so
# the slice size and the title are driven by a single value.
set_top_n(30)
top_sources = grouped_ext_sources[:TOP_N]

data = [
    go.Bar(
        x=top_sources.ext_works_source,
        y=top_sources.orcid
    )
]
layout = go.Layout(
    title='Top %d works_source' % TOP_N,
    # Axis titles so the figure can be read on its own.
    xaxis=dict(title='works_source', tickangle=45, tickfont=dict(size=12)),
    yaxis=dict(title='orcid count')
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [9]:
# Keep the sources whose 'orcid' count is greater than 2.
is_frequent_source = grouped_ext_sources['orcid'].gt(2)
authoritative_sources = grouped_ext_sources.loc[is_frequent_source]
authoritative_sources
Out[9]:
In [30]:
# Flag every (orcid, source) row whose source is in the authoritative set.
authoritative_names = authoritative_sources['ext_works_source']
exploded_external_sources['authoritative'] = (
    exploded_external_sources['ext_works_source'].isin(authoritative_names)
)
In [57]:
# An ORCID is marked authoritative when at least one of its rows is flagged.
any_flag_per_orcid = exploded_external_sources.groupby('orcid')['authoritative'].any()
orcid_authoritative_source = any_flag_per_orcid.reset_index()[['orcid', 'authoritative']]
In [64]:
# Attach the per-ORCID 'authoritative' flag to the main frame (left join on
# orcid). Any 'authoritative' column left over from a previous run of this
# cell is dropped first; otherwise the join fails on the overlapping column
# name and the cell cannot be re-run.
df = (
    df.drop(columns=['authoritative'], errors='ignore')
    .set_index('orcid')
    .join(orcid_authoritative_source.set_index('orcid'))
    .reset_index()
)
In [65]:
# Missing flags must count as "not authoritative". Comparing against True maps
# both NaN and False to False and gives the column a real bool dtype, whereas
# filling the NaN rows in place leaves the column's dtype unchanged.
df['authoritative'] = df['authoritative'].eq(True)
In [66]:
# Display the frame, which now carries the 'authoritative' column, for a visual check.
df
Out[66]:
In [67]:
# Correlation heatmap of the numeric/boolean features, with missing values
# counted as 0. The numeric columns are selected explicitly because
# DataFrame.corr() no longer drops non-numeric columns silently on recent
# pandas versions and raises on them instead. infer_objects() lets object
# columns that hold only booleans/numbers take part as well.
corr_input = (
    df.fillna(0)
    .infer_objects()
    .select_dtypes(include=['number', 'bool'])
)
fig = px.imshow(corr_input.corr())
fig.show()