registries_analysis/notebooks/01-Explorative.ipynb

3.7 MiB
Raw Blame History

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

FAIRsharing

In [2]:
# Read the pipe-delimited FAIRsharing summary export, replacing the file's own
# header row with short column names, and turn the comma-separated `subjects`
# and `countries` strings into Python lists.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],
).assign(
    subjects=lambda frame: frame['subjects'].str.split(pat=','),
    countries=lambda frame: frame['countries'].str.split(pat=','),
)
fairsharing_df.head()
Out[2]:
full_name short_name fs_url url countries subjects
0 GenBank GenBank https://fairsharing.org/10.25504/FAIRsharing.9... https://www.ncbi.nlm.nih.gov/genbank/ [European Union, Japan, United States] [Bioinformatics, Data Management, Data Submiss...
1 GlycoNAVI GlycoNAVI https://fairsharing.org/10.25504/FAIRsharing.w... https://glyconavi.org/ [Japan] [Chemistry, Glycomics, Life Science, Organic C...
2 ADHDgene ADHDgene https://fairsharing.org/10.25504/FAIRsharing.m... http://adhd.psych.ac.cn/ [China] [Biomedical Science, Genetics]
3 Allele frequency resource for research and tea... ALFRED https://fairsharing.org/10.25504/FAIRsharing.y... http://alfred.med.yale.edu [United States] [Life Science]
4 Animal Transcription Factor Database AnimalTFDB https://fairsharing.org/10.25504/FAIRsharing.e... http://bioinfo.life.hust.edu.cn/AnimalTFDB/ [China] [Life Science]
In [3]:
# Summary of the FAIRsharing frame: every column is non-numeric here, so
# describe() reports count / unique / top / freq per column.
fairsharing_df.describe()
Out[3]:
full_name short_name fs_url url countries subjects
count 1752 1752 1752 1752 1749 1690
unique 1752 1741 1752 1752 178 834
top The Cardiovascular Research Grid CGD https://fairsharing.org/bsg-d001750 http://www.bmrb.wisc.edu/ [United States] [Life Science]
freq 1 3 1 1 588 367
In [4]:
# Number of FAIRsharing records per subject (one row per record/subject pair
# after explode), ordered from most to least frequent.
fairsharing_subjects = (
    fairsharing_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

# Bar chart of the per-subject counts; tick labels are tilted for readability.
fig = go.Figure(
    data=[go.Bar(x=fairsharing_subjects.index, y=fairsharing_subjects['url'])],
    layout=go.Layout(
        title='Fairsharing subject coverage',
        xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
    ),
)
iplot(fig)
In [5]:
# Number of FAIRsharing records per country (a record listing several
# countries is counted once for each), ordered from most to least frequent.
fairsharing_countries = (
    fairsharing_df
    .explode('countries')
    .groupby('countries')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

# Bar chart of the per-country counts.
fig = go.Figure(
    data=[go.Bar(x=fairsharing_countries.index, y=fairsharing_countries['url'])],
    layout=go.Layout(
        title='Fairsharing country coverage',
        xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
    ),
)
iplot(fig)

re3data

In [6]:
# Load the combined re3data/OpenDOAR export and keep only the rows whose `id`
# contains 're3data'. The original row labels of the combined file are kept.
# `.copy()` makes the filtered frame independent of the full one, so the
# column assignments made on it in later cells do not raise
# SettingWithCopyWarning or leave it ambiguous which object is modified.
re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
re3data_df = re3data_df[re3data_df.id.str.contains('re3data')].copy()
re3data_df.head()
Out[6]:
id url official_name english_name description latitude longitude subjects
4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d http://planttfdb.cbi.pku.edu.cn/ Plant Transcription Factor Database PlantTFDB NaN 0.0 0.0 ['Life Sciences', 'Basic Biological and Medica...
7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc https://spdf.gsfc.nasa.gov/ Space Physics Data Facility NASA's Space Physics Data Facility SPDF NaN 0.0 0.0 ['Natural Sciences', 'Astrophysics and Astrono...
13 10|re3data_____::59521daca59ac29b811343cc4cd370cf http://card.westgis.ac.cn/ Cold and Arid Regions Science Data Center at L... CARD WDC for Glaciology and Geocryology World ... NaN 0.0 0.0 ['Natural Sciences', 'Geosciences (including G...
14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 https://www.psycharchives.org/ Psycharchives NaN NaN 0.0 0.0 ['Humanities and Social Sciences', 'Psychology...
19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 https://www.ihfc-iugg.org/products/global-heat... The Global Heat Flow Database of the Internati... International Heat-flow Database NaN 0.0 0.0 ['Natural Sciences', 'Geology and Palaeontolog...
In [7]:
# Rows located at exactly (0.0, 0.0) are treated as having no coordinates:
# blank out both fields so they do not count as real positions.
re3data_df.loc[
    re3data_df.latitude.eq(0.0) & re3data_df.longitude.eq(0.0),
    ['latitude', 'longitude']
] = np.nan
In [8]:
# Peek at the raw `subjects` column: each value is still the textual form of a
# Python list (it is parsed into a real list in the next step).
re3data_df.subjects
Out[8]:
4       ['Life Sciences', 'Basic Biological and Medica...
7       ['Natural Sciences', 'Astrophysics and Astrono...
13      ['Natural Sciences', 'Geosciences (including G...
14      ['Humanities and Social Sciences', 'Psychology...
19      ['Natural Sciences', 'Geology and Palaeontolog...
                              ...                        
8693    ['Life Sciences', 'Basic Biological and Medica...
8695    ['Natural Sciences', 'Atmospheric Science and ...
8697    ['Natural Sciences', 'Atmospheric Science and ...
8699    ['Natural Sciences', 'Atmospheric Science and ...
8705    ['Life Sciences', 'Plant Sciences', 'Plant Gen...
Name: subjects, Length: 2693, dtype: object
In [9]:
# Parse each stringified list into an actual Python list. literal_eval only
# accepts Python literals, so no arbitrary code from the CSV is executed.
re3data_df['subjects'] = re3data_df['subjects'].apply(ast.literal_eval)
In [10]:
def merge_lists(lists):
    """Flatten one level of nesting: concatenate the given lists into one list.

    Args:
        lists: An iterable (e.g. a list or a pandas Series) whose elements are
            themselves iterables of items.

    Returns:
        A new list holding every item of every inner sequence, in order.
        The inputs are not modified. An empty input yields ``[]``.
    """
    # Single pass instead of repeated `res = res + l`, which re-copies the
    # accumulated result on every step (quadratic in the total item count).
    return [item for sub in lists for item in sub]

# Break compound subject labels (joined by ',' or ' and ') into individual
# terms and collect them into one flat list per repository, keyed by the
# repository's original row label.
#
# Each term is stripped and empty terms are dropped: splitting on ',' alone
# leaves the space that follows the comma attached to the next term, so the
# same subject would otherwise show up as two categories (' X' and 'X').
re3data_cleaned_subjects = (
    re3data_df.explode('subjects')                 # one row per (repository, label)
    .subjects.str.split(',| and ', expand=True)    # one column per term, NaN-padded
    .apply(lambda row: [term.strip() for term in row.dropna() if term.strip()], axis=1)
    .reset_index()                                 # row label -> 'index', term lists -> column 0
    .groupby('index')[0].apply(merge_lists)        # concatenate the lists of each repository
)
In [11]:
# Inspect the cleaned subject lists: one list of terms per repository, indexed
# by the repository's row label in re3data_df.
re3data_cleaned_subjects
Out[11]:
index
4       [Life Sciences, Basic Biological, Medical Rese...
7       [Natural Sciences, Astrophysics, Astronomy, Ph...
13      [Natural Sciences, Geosciences (including Geog...
14      [Humanities, Social Sciences, Psychology, Soci...
19      [Natural Sciences, Geology, Palaeontology, Geo...
                              ...                        
8693    [Life Sciences, Basic Biological, Medical Rese...
8695    [Natural Sciences, Atmospheric Science, Oceano...
8697    [Natural Sciences, Atmospheric Science, Oceano...
8699    [Natural Sciences, Atmospheric Science, Oceano...
8705    [Life Sciences, Plant Sciences, Plant Genetics...
Name: 0, Length: 2693, dtype: object
In [12]:
# Attach the cleaned subject lists by row label. The Series is named 0, so it
# arrives as a column called 0 next to the original `subjects` column.
re3data_df = re3data_df.join(re3data_cleaned_subjects, how='left')
In [13]:
# Swap the raw `subjects` column for the cleaned one (joined in as column 0).
re3data_df = (
    re3data_df
    .drop(columns=['subjects'])
    .rename(columns={0: 'subjects'})
)
In [14]:
# Summary of every column, numeric and non-numeric alike (include='all').
re3data_df.describe(include='all')
Out[14]:
id url official_name english_name description latitude longitude subjects
count 2693 2673 2693 2034 38 5.000000 5.000000 2693
unique 2693 2661 2668 2010 38 NaN NaN 1427
top 10|re3data_____::e59f89142e8d47d32523c53a9137f07b http://iubio.bio.indiana.edu/ IUBio-Archive Research Data Repository IUBio Archive is an archive of biology data an... NaN NaN [Humanities, Social Sciences, Life Sciences, N...
freq 1 2 2 2 1 NaN NaN 209
mean NaN NaN NaN NaN NaN 61.668113 36.623678 NaN
std NaN NaN NaN NaN NaN 96.984457 48.547521 NaN
min NaN NaN NaN NaN NaN 12.123000 12.123000 NaN
25% NaN NaN NaN NaN NaN 12.123000 12.123400 NaN
50% NaN NaN NaN NaN NaN 12.123400 12.123400 NaN
75% NaN NaN NaN NaN NaN 37.971163 23.748590 NaN
max NaN NaN NaN NaN NaN 234.000000 123.000000 NaN
In [15]:
# Number of re3data repositories per subject term, most frequent first.
#
# The count is taken on `id`, which is present for every record, instead of
# `url`: count() skips missing values and some re3data records have no URL,
# so counting `url` silently left those repositories out of the tally.
# The result column keeps the name 'url' so that existing references to
# re3data_subjects['url'] continue to work.
re3data_subjects = (
    re3data_df
    .explode('subjects')
    .groupby('subjects')[['id']]
    .count()
    .rename(columns={'id': 'url'})
    .sort_values('url', ascending=False)
)

data = [
    go.Bar(
        x=re3data_subjects.index,
        y=re3data_subjects['url']
    )
]

layout = go.Layout(
    title='re3data subject coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)

OpenDOAR

In [82]:
# Load the combined re3data/OpenDOAR export and keep only the rows whose `id`
# contains 'opendoar'. The original row labels of the combined file are kept,
# so the resulting index has gaps where re3data rows were removed.
# `.copy()` makes the filtered frame independent of the full one, so the
# column assignments made on it in later cells do not raise
# SettingWithCopyWarning or leave it ambiguous which object is modified.
opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')].copy()
opendoar_df.head()
Out[82]:
id url official_name english_name description latitude longitude subjects
0 10|opendoar____::e833e042f509c996b1b25324d56659fb http://www.bilbao.net/bld BLD - Bilboko Liburutegi Digitala BLD - Bilboko Liburutegi Digitala BLD is a repository of digital documents, desi... 43.256699 -2.924100 []
1 10|opendoar____::f621585df244e9596dc70a39b579efb1 https://researchdirect.westernsydney.edu.au/ Western Sydney ResearchDirect Western Sydney ResearchDirect NaN 0.000000 0.000000 []
2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 http://redress.lancs.ac.uk/Learning_Space/ Learning Space Catalogue NaN This repository is a Social Science e-Science ... 54.010760 -2.784990 ['Social Sciences General', 'Science General',...
3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e http://digitallibrary.usc.edu/search/controlle... USC Digital Library USC Digital Library This is an institutional repository providing ... 34.052200 -118.242996 []
5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 http://www.ufgd.edu.br:8080/jspui/ Repositório de Divulgação das Produções Cientí... Repositório de Divulgação das Produções Cientí... This site provides access to the research outp... -22.221800 -54.806400 []
In [84]:
# Peek at the raw `subjects` column: each value is still the textual form of a
# Python list (many of them empty); it is parsed into a real list next.
opendoar_df.subjects
Out[84]:
0                                                      []
1                                                      []
2       ['Social Sciences General', 'Science General',...
3                                                      []
5                                                      []
                              ...                        
8701                                ['Multidisciplinary']
8702                                                   []
8703                           ['Business and Economics']
8704    ['Earth and Planetary Sciences', 'Ecology and ...
8706                                                   []
Name: subjects, Length: 6014, dtype: object
In [85]:
# Parse each stringified list into an actual Python list. literal_eval only
# accepts Python literals, so no arbitrary code from the CSV is executed.
opendoar_df['subjects'] = opendoar_df['subjects'].apply(ast.literal_eval)
In [86]:
# Break compound subject labels (joined by ',' or ' and ') into individual
# terms and collect them into one flat list per repository, keyed by the
# repository's original row label. Repositories without subjects get [].
#
# Each term is stripped and empty terms are dropped: splitting on ',' alone
# leaves the space that follows the comma attached to the next term, so the
# same subject would otherwise show up as two categories (' X' and 'X').
opendoar_cleaned_subjects = (
    opendoar_df.explode('subjects')                # one row per (repository, label)
    .subjects.str.split(',| and ', expand=True)    # one column per term, NaN-padded
    .apply(lambda row: [term.strip() for term in row.dropna() if term.strip()], axis=1)
    .reset_index()                                 # row label -> 'index', term lists -> column 0
    .groupby('index')[0].apply(merge_lists)        # concatenate the lists of each repository
)
In [87]:
# Inspect the cleaned subject lists: one list of terms per repository, indexed
# by the repository's row label in opendoar_df.
opendoar_cleaned_subjects
Out[87]:
index
0                                                      []
1                                                      []
2       [Social Sciences General, Science General, Com...
3                                                      []
5                                                      []
                              ...                        
8701                                  [Multidisciplinary]
8702                                                   []
8703                                [Business, Economics]
8704    [Earth, Planetary Sciences, Ecology, Environme...
8706                                                   []
Name: 0, Length: 6014, dtype: object
In [88]:
# Attach the cleaned subject lists by row label. The Series is named 0, so it
# arrives as a column called 0 next to the original `subjects` column.
opendoar_df = opendoar_df.join(opendoar_cleaned_subjects, how='left')
In [89]:
# Swap the raw `subjects` column for the cleaned one (joined in as column 0).
opendoar_df = (
    opendoar_df
    .drop(columns=['subjects'])
    .rename(columns={0: 'subjects'})
)
In [90]:
# Summary of every column, numeric and non-numeric alike (include='all').
opendoar_df.describe(include='all')
Out[90]:
id url official_name english_name description latitude longitude subjects
count 6014 6013 6014 5500 5776 6014.000000 6014.000000 6014
unique 6014 5953 5946 5413 4920 NaN NaN 201
top 10|opendoar____::a2557a7b2e94197ff767970b67041697 http://harp.lib.hiroshima-u.ac.jp/ Hiroshima Associated Repository Portal AURA This site provides access to the research outp... NaN NaN []
freq 1 3 3 4 98 NaN NaN 5273
mean NaN NaN NaN NaN NaN 38.649393 7.810948 NaN
std NaN NaN NaN NaN NaN 788.406173 71.689788 NaN
min NaN NaN NaN NaN NaN -79.029999 -683.103027 NaN
25% NaN NaN NaN NaN NaN 4.644632 -49.273300 NaN
50% NaN NaN NaN NaN NaN 37.930449 4.788870 NaN
75% NaN NaN NaN NaN NaN 47.294400 30.685501 NaN
max NaN NaN NaN NaN NaN 61138.800781 178.438995 NaN
In [91]:
# Number of OpenDOAR repositories per subject term, most frequent first.
#
# The count is taken on `id`, which is present for every record, instead of
# `url`: count() skips missing values, so a repository without a URL would
# silently be left out of the tally.
# The result column keeps the name 'url' so that existing references to
# opendoar_subjects['url'] continue to work.
opendoar_subjects = (
    opendoar_df
    .explode('subjects')
    .groupby('subjects')[['id']]
    .count()
    .rename(columns={'id': 'url'})
    .sort_values('url', ascending=False)
)

data = [
    go.Bar(
        x=opendoar_subjects.index,
        y=opendoar_subjects['url']
    )
]

layout = go.Layout(
    title='OpenDOAR subject coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig)
In [92]:
# Reverse-geocode every OpenDOAR (latitude, longitude) pair. The result frame
# has one row per input pair and a fresh positional index; the matched place's
# `lat`/`lon` are converted to floats.
reverse_geocoding = pd.DataFrame(
    rg.search(
        list(opendoar_df[['latitude', 'longitude']].itertuples(index=False, name=None))
    )
).astype({'lat': 'float', 'lon': 'float'})
reverse_geocoding
Out[92]:
lat lon name admin1 admin2 cc
0 43.26271 -2.92528 Bilbao Basque Country Bizkaia ES
1 4.88447 -1.75536 Takoradi Western GH
2 53.98333 -2.78333 Galgate England Lancashire GB
3 34.05223 -118.24368 Los Angeles California Los Angeles County US
4 -22.22111 -54.80556 Dourados Mato Grosso do Sul Dourados BR
... ... ... ... ... ... ...
6009 40.85631 14.24641 Napoli Campania Provincia di Napoli IT
6010 38.19394 15.55256 Messina Sicily Messina IT
6011 54.32133 10.13489 Kiel Schleswig-Holstein DE
6012 43.40785 -73.25955 Granville New York Washington County US
6013 33.96095 -83.37794 Athens Georgia Clarke County US

6014 rows × 6 columns

In [94]:
# Attach the country code of each repository.
#
# reverse_geocoding holds one result per OpenDOAR row, in the same order, but
# carries a fresh 0..n-1 index. opendoar_df still has the row labels of the
# combined CSV, with gaps where re3data rows were filtered out. An index-based
# join would therefore pair rows with the country code of a different
# repository and leave every label >= n without a code. Assign by position.
assert len(reverse_geocoding) == len(opendoar_df), \
    'expected exactly one reverse-geocoding result per OpenDOAR row'
opendoar_df = opendoar_df.assign(cc=reverse_geocoding['cc'].to_numpy())
In [96]:
# Rows located at exactly (0.0, 0.0) are treated as having no coordinates:
# blank out the position and the country code that was derived from it.
opendoar_df.loc[
    opendoar_df.latitude.eq(0.0) & opendoar_df.longitude.eq(0.0),
    ['latitude', 'longitude', 'cc']
] = np.nan
In [103]:
# Number of OpenDOAR repositories per country code, most frequent first.
# Rows without a country code are left out by groupby.
opendoar_countries = (
    opendoar_df
    .groupby('cc')[['id']]
    .count()
    .sort_values('id', ascending=False)
)

# Bar chart of the per-country counts.
fig = go.Figure(
    data=[go.Bar(x=opendoar_countries.index, y=opendoar_countries['id'])],
    layout=go.Layout(
        title='OpenDOAR country coverage',
        xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
    ),
)
iplot(fig)