You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

99 KiB

In [1]:
import glob

import pandas as pd
import ast
import tldextract
import numpy

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Enable plotly rendering inside the notebook.
# NOTE(review): per plotly's offline docs, connected=True loads plotly.js from
# a CDN instead of embedding it — confirm this is wanted for offline use.
init_notebook_mode(connected=True)
# Module-level "top N" settings; both start zeroed and are overwritten
# together by set_top_n().
TOP_N = 0
TOP_RANGE = [0, 0]
def set_top_n(n):
    """Update the module-level TOP_N and its companion TOP_RANGE.

    TOP_N is set to ``n`` and TOP_RANGE to ``[-0.5, n - 1 + 0.5]``, i.e. the
    span covering positions ``0 .. n - 1`` with half a unit of padding on
    each side.

    NOTE(review): presumably TOP_RANGE is meant as an axis range for charts
    of the first ``n`` categories — no usage is visible in this section.
    """
    global TOP_N, TOP_RANGE
    lower_bound = -.5
    upper_bound = n - 1 + .5
    TOP_N, TOP_RANGE = n, [lower_bound, upper_bound]
In [2]:
# Collect every shard of the pickled dataset. glob returns matches in
# arbitrary, filesystem-dependent order, so the shards are sorted to make the
# row order of the concatenated frame reproducible across runs and machines.
# NOTE(review): this absolute, user-specific path only resolves on the original
# author's machine — consider deriving it from a configurable data directory.
parts = sorted(glob.glob('/Users/miriam.baglioni/Develop/Gitea/fake-orcid-analysis-v2/fake-orcid-analysis/data/processed/dataset.pkl.*'))
In [3]:
# Lazily read each shard and stack them into a single DataFrame; the shards'
# own index values are kept (no ignore_index).
# NOTE(review): unpickling can execute arbitrary code — only load shards that
# come from a trusted source.
shard_frames = (pd.read_pickle(shard_path) for shard_path in parts)
df = pd.concat(shard_frames)
df.head(5)
Out[3]:
orcid verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails ... employment n_works works_source activation_date last_update_date n_doi n_arxiv n_pmc n_other_pids label
10000000 0000-0002-7790-0483 1 0 abel elias NaN NaN NaN NaN NaN ... NaN 0 NaN 2020-09-16t16:51:54.155z 2020-09-16t17:00:08.451z 0 0 0 0 0
10000001 0000-0001-6368-0531 0 0 abelardo ramirez NaN NaN NaN NaN NaN ... NaN 0 NaN 2017-05-10t19:28:13.217z 2017-05-10t19:28:17.315z 0 0 0 0 0
10000002 0000-0001-8149-4900 1 1 abelardo mancinas NaN NaN NaN NaN NaN ... [[profesor investigador, instituto tecnológico... 0 NaN 2018-10-15t21:46:52.162z 2020-01-13t03:33:47.645z 0 0 0 0 0
10000003 0000-0002-8684-2422 0 0 abera nigussie NaN NaN NaN NaN NaN ... NaN 0 NaN 2020-09-23t08:36:17.451z 2020-09-23t08:36:17.450z 0 0 0 0 0
10000004 0000-0003-4814-7872 1 1 abhijeet singh NaN NaN NaN NaN NaN ... NaN 0 NaN 2018-05-01t22:43:17.407z 2018-10-06t22:21:54.024z 0 0 0 0 0

5 rows × 23 columns

In [4]:
def remove_own_source(lst, given, family):
    """Return the entries of ``lst`` that do not mention the person's own name.

    An entry is dropped when it contains ``given`` or ``family`` as a
    case-insensitive substring; the remaining entries are returned in their
    original order.

    Parameters
    ----------
    lst : list of str, or anything else
        Source names to filter. Any non-list value (e.g. NaN for a profile
        with no sources) yields an empty list.
    given, family : str or missing
        Name parts to match against. A part that is missing (NaN/None) or
        blank is ignored instead of discarding every source: a blank string
        is a substring of everything, and a missing given name previously
        emptied the result even when the family name was usable.

    Returns
    -------
    list of str
        A new list; ``lst`` itself is never modified.
    """
    if not isinstance(lst, list):
        return []

    # Lower-case the usable name parts once instead of once per source.
    own_names = [
        name.lower()
        for name in (given, family)
        if isinstance(name, str) and name.strip()
    ]

    return [
        ws
        for ws in lst
        if not any(name in ws.lower() for name in own_names)
    ]
In [5]:
# For every profile keep only the work sources that do not mention the
# owner's own name. The three columns are walked in lockstep rather than via
# DataFrame.apply(axis=1), which builds a Series object for every row and is
# needlessly slow on a frame with millions of rows; the resulting column is
# the same list-per-row object column.
df['ext_works_source'] = [
    remove_own_source(sources, given, family)
    for sources, given, family in zip(
        df['works_source'], df['given_names'], df['family_name']
    )
]
In [6]:
# Number of external (non-self) sources kept for each profile.
df['n_ext_work_source'] = df['ext_works_source'].str.len()
In [7]:
# Long format: one row per (orcid, external source) pair, restricted to the
# profiles that have at least one external source.
has_external_source = df['ext_works_source'].str.len() > 0
exploded_external_sources = (
    df.loc[has_external_source, ['orcid', 'ext_works_source']]
    .explode('ext_works_source')
    .reset_index(drop=True)
)
In [8]:
# Occurrences of each external source, most frequent first. After count() the
# 'orcid' column holds the number of non-null orcid rows per source.
rows_per_source = exploded_external_sources.groupby('ext_works_source').count()
grouped_ext_sources = (
    rows_per_source
    .sort_values('orcid', ascending=False)
    .reset_index()
)
In [44]:
# Bar chart of the 30 most frequent external sources (grouped_ext_sources is
# already sorted by descending count).
top_sources = grouped_ext_sources.head(30)

data = [go.Bar(x=top_sources.ext_works_source, y=top_sources.orcid)]
layout = go.Layout(
    title='Top 30 works_source',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)

fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [9]:
# Treat a source as "authoritative" when it occurs more than twice overall.
occurs_more_than_twice = grouped_ext_sources['orcid'] > 2
authoritative_sources = grouped_ext_sources.loc[occurs_more_than_twice]
authoritative_sources
Out[9]:
ext_works_source orcid
0 crossref 1460841
1 scopus - elsevier 902231
2 crossref metadata search 297684
3 multidisciplinary digital publishing institute 281664
4 europe pubmed central 181605
... ... ...
337 uta - oa journal global insight 3
338 francis crick institute 3
339 anna 3
340 santos 3
341 universitäts- und stadtbibliothek köln 3

342 rows × 2 columns

In [30]:
# Flag each (orcid, source) row whose source is in the authoritative list.
authoritative_names = authoritative_sources['ext_works_source']
exploded_external_sources['authoritative'] = (
    exploded_external_sources['ext_works_source'].isin(authoritative_names)
)
In [57]:
# Collapse back to one row per profile: True when at least one of the
# profile's external sources is authoritative.
any_authoritative = exploded_external_sources.groupby('orcid')['authoritative'].any()
orcid_authoritative_source = any_authoritative.reset_index()[['orcid', 'authoritative']]
In [64]:
# Attach the per-profile 'authoritative' flag to the main frame (left join on
# orcid; profiles without external sources get NaN here). Any 'authoritative'
# column left over from a previous run of this cell is dropped first —
# otherwise re-running the cell raises on the overlapping column name.
df = (
    df.drop(columns='authoritative', errors='ignore')
    .set_index('orcid')
    .join(orcid_authoritative_source.set_index('orcid'))
    .reset_index()
)
In [65]:
# Profiles with no external source have NaN after the join; treat them as not
# authoritative. Patching the NaNs via .loc would leave an object-dtype column,
# so the column is converted to a real bool dtype instead: go through the
# nullable 'boolean' dtype (NaN -> <NA>), fill, then cast to plain bool. This
# keeps downstream numeric operations (e.g. correlations) from depending on
# version-specific dtype inference.
df['authoritative'] = (
    df['authoritative']
    .astype('boolean')
    .fillna(False)
    .astype(bool)
)
In [66]:
# Display the frame after the join to check the newly added columns
# (ext_works_source, n_ext_work_source, authoritative).
df
Out[66]:
orcid verified_email verified_primary_email given_names family_name biography other_names urls primary_email other_emails ... activation_date last_update_date n_doi n_arxiv n_pmc n_other_pids label ext_works_source n_ext_work_source authoritative
0 0000-0002-7790-0483 1 0 abel elias NaN NaN NaN NaN NaN ... 2020-09-16t16:51:54.155z 2020-09-16t17:00:08.451z 0 0 0 0 0 [] 0 False
1 0000-0001-6368-0531 0 0 abelardo ramirez NaN NaN NaN NaN NaN ... 2017-05-10t19:28:13.217z 2017-05-10t19:28:17.315z 0 0 0 0 0 [] 0 False
2 0000-0001-8149-4900 1 1 abelardo mancinas NaN NaN NaN NaN NaN ... 2018-10-15t21:46:52.162z 2020-01-13t03:33:47.645z 0 0 0 0 0 [] 0 False
3 0000-0002-8684-2422 0 0 abera nigussie NaN NaN NaN NaN NaN ... 2020-09-23t08:36:17.451z 2020-09-23t08:36:17.450z 0 0 0 0 0 [] 0 False
4 0000-0003-4814-7872 1 1 abhijeet singh NaN NaN NaN NaN NaN ... 2018-05-01t22:43:17.407z 2018-10-06t22:21:54.024z 0 0 0 0 0 [] 0 False
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10989644 0000-0001-7468-9881 1 1 abeer elbaroudi NaN NaN NaN NaN NaN ... 2020-02-06t15:04:42.485z 2020-02-06t15:16:45.537z 0 0 0 0 0 [] 0 False
10989645 0000-0003-0081-4285 1 1 abeer sohrab NaN NaN NaN NaN NaN ... 2020-05-12t22:39:26.356z 2020-05-12t22:41:45.239z 0 0 0 0 0 [] 0 False
10989646 0000-0003-2004-3457 0 0 abeer abdelmaksoud NaN NaN NaN NaN NaN ... 2019-12-19t23:09:12.579z 2019-12-19t23:09:12.798z 0 0 0 0 0 [] 0 False
10989647 0000-0003-2841-9754 1 1 abeer al-ghazali NaN NaN NaN NaN NaN ... 2019-06-02t18:35:32.973z 2019-08-05t14:54:41.796z 2 0 0 2 1 [crossref metadata search] 1 True
10989648 0000-0002-3675-6876 0 0 abegail palos-simbre NaN [gail] NaN NaN NaN ... 2017-02-10t16:38:52.988z 2019-12-11t01:37:15.405z 0 0 0 0 0 [] 0 False

10989649 rows × 26 columns

In [67]:
# Correlation heatmap of the numeric/boolean features. The frame also holds
# string columns (orcid, names, timestamps, ...); calling corr() on them
# raises on pandas >= 2.0, where non-numeric columns are no longer silently
# dropped, so the numeric columns are selected explicitly. Missing values are
# treated as 0 before correlating.
numeric_features = df.select_dtypes(include=['number', 'bool'])
fig = px.imshow(numeric_features.fillna(0).corr())
fig.show()