registries_analysis/notebooks/01-Explorative.ipynb

3.6 MiB
Raw Blame History

In [40]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
In [2]:
def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha2(country)
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(country_code)
        except:
            return np.nan

Loading datasets

FAIRsharing

In [3]:
fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', 
                             delimiter='|', header=0,
                             names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])
fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')
fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')
fairsharing_df.head()
Out[3]:
full_name short_name fs_url url countries subjects
0 GenBank GenBank https://fairsharing.org/10.25504/FAIRsharing.9... https://www.ncbi.nlm.nih.gov/genbank/ [European Union, Japan, United States] [Bioinformatics, Data Management, Data Submiss...
1 GlycoNAVI GlycoNAVI https://fairsharing.org/10.25504/FAIRsharing.w... https://glyconavi.org/ [Japan] [Chemistry, Glycomics, Life Science, Organic C...
2 ADHDgene ADHDgene https://fairsharing.org/10.25504/FAIRsharing.m... http://adhd.psych.ac.cn/ [China] [Biomedical Science, Genetics]
3 Allele frequency resource for research and tea... ALFRED https://fairsharing.org/10.25504/FAIRsharing.y... http://alfred.med.yale.edu [United States] [Life Science]
4 Animal Transcription Factor Database AnimalTFDB https://fairsharing.org/10.25504/FAIRsharing.e... http://bioinfo.life.hust.edu.cn/AnimalTFDB/ [China] [Life Science]
In [4]:
fairsharing_df.describe()
Out[4]:
full_name short_name fs_url url countries subjects
count 1752 1752 1752 1752 1749 1690
unique 1752 1741 1752 1752 178 834
top FunTree: A Resource For Exploring The Function... CGD https://fairsharing.org/10.25504/FAIRsharing.5... https://idn.ceos.org [United States] [Life Science]
freq 1 3 1 1 588 367

re3data

In [5]:
re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
re3data_df = re3data_df[re3data_df.id.str.contains('re3data')].reset_index()
re3data_df.head()
Out[5]:
index id url official_name english_name description latitude longitude subjects
0 4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d http://planttfdb.cbi.pku.edu.cn/ Plant Transcription Factor Database PlantTFDB NaN 0.0 0.0 ['Life Sciences', 'Basic Biological and Medica...
1 7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc https://spdf.gsfc.nasa.gov/ Space Physics Data Facility NASA's Space Physics Data Facility SPDF NaN 0.0 0.0 ['Natural Sciences', 'Astrophysics and Astrono...
2 13 10|re3data_____::59521daca59ac29b811343cc4cd370cf http://card.westgis.ac.cn/ Cold and Arid Regions Science Data Center at L... CARD WDC for Glaciology and Geocryology World ... NaN 0.0 0.0 ['Natural Sciences', 'Geosciences (including G...
3 14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 https://www.psycharchives.org/ Psycharchives NaN NaN 0.0 0.0 ['Humanities and Social Sciences', 'Psychology...
4 19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 https://www.ihfc-iugg.org/products/global-heat... The Global Heat Flow Database of the Internati... International Heat-flow Database NaN 0.0 0.0 ['Natural Sciences', 'Geology and Palaeontolog...
In [6]:
re3data_df.describe(include='all')
Out[6]:
index id url official_name english_name description latitude longitude subjects
count 2693.000000 2693 2673 2693 2034 38 2693.000000 2693.000000 2693
unique NaN 2693 2661 2668 2010 38 NaN NaN 1427
top NaN 10|re3data_____::fc8141eebc533cb225498718479f4e66 http://wdcpc.org/ European Climate Assessment & Dataset project ECA&D The Atmospheric Science Data Center (ASDC) at ... NaN NaN ['Humanities and Social Sciences', 'Life Scien...
freq NaN 1 2 2 2 1 NaN NaN 209
mean 4443.650947 NaN NaN NaN NaN NaN 0.114497 0.067998 NaN
std 2518.294468 NaN NaN NaN NaN NaN 4.585469 2.447173 NaN
min 4.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 NaN
25% 2266.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 NaN
50% 4506.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 NaN
75% 6660.000000 NaN NaN NaN NaN NaN 0.000000 0.000000 NaN
max 8705.000000 NaN NaN NaN NaN NaN 234.000000 123.000000 NaN

OpenDOAR

In [7]:
opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')].reset_index()
opendoar_df.head()
Out[7]:
index id url official_name english_name description latitude longitude subjects
0 0 10|opendoar____::e833e042f509c996b1b25324d56659fb http://www.bilbao.net/bld BLD - Bilboko Liburutegi Digitala BLD - Bilboko Liburutegi Digitala BLD is a repository of digital documents, desi... 43.256699 -2.924100 []
1 1 10|opendoar____::f621585df244e9596dc70a39b579efb1 https://researchdirect.westernsydney.edu.au/ Western Sydney ResearchDirect Western Sydney ResearchDirect NaN 0.000000 0.000000 []
2 2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 http://redress.lancs.ac.uk/Learning_Space/ Learning Space Catalogue NaN This repository is a Social Science e-Science ... 54.010760 -2.784990 ['Social Sciences General', 'Science General',...
3 3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e http://digitallibrary.usc.edu/search/controlle... USC Digital Library USC Digital Library This is an institutional repository providing ... 34.052200 -118.242996 []
4 5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 http://www.ufgd.edu.br:8080/jspui/ Repositório de Divulgação das Produções Cientí... Repositório de Divulgação das Produções Cientí... This site provides access to the research outp... -22.221800 -54.806400 []
In [8]:
opendoar_df.describe(include='all')
Out[8]:
index id url official_name english_name description latitude longitude subjects
count 6014.000000 6014 6013 6014 5500 5776 6014.000000 6014.000000 6014
unique NaN 6014 5953 5946 5413 4920 NaN NaN 201
top NaN 10|opendoar____::17256f049f1e3fede17c7a313f7657f4 http://harp.lib.hiroshima-u.ac.jp/ Hiroshima Associated Repository Portal AURA This site provides access to the research outp... NaN NaN []
freq NaN 1 3 3 4 98 NaN NaN 5273
mean 4312.407549 NaN NaN NaN NaN NaN 38.649393 7.810948 NaN
std 2510.699848 NaN NaN NaN NaN NaN 788.406173 71.689788 NaN
min 0.000000 NaN NaN NaN NaN NaN -79.029999 -683.103027 NaN
25% 2129.250000 NaN NaN NaN NaN NaN 4.644632 -49.273300 NaN
50% 4297.000000 NaN NaN NaN NaN NaN 37.930449 4.788870 NaN
75% 6476.750000 NaN NaN NaN NaN NaN 47.294400 30.685501 NaN
max 8706.000000 NaN NaN NaN NaN NaN 61138.800781 178.438995 NaN

Basic cleaning

re3data

In [9]:
re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude']] = [np.nan, np.nan]
In [10]:
re3data_df.subjects
Out[10]:
0       ['Life Sciences', 'Basic Biological and Medica...
1       ['Natural Sciences', 'Astrophysics and Astrono...
2       ['Natural Sciences', 'Geosciences (including G...
3       ['Humanities and Social Sciences', 'Psychology...
4       ['Natural Sciences', 'Geology and Palaeontolog...
                              ...                        
2688    ['Life Sciences', 'Basic Biological and Medica...
2689    ['Natural Sciences', 'Atmospheric Science and ...
2690    ['Natural Sciences', 'Atmospheric Science and ...
2691    ['Natural Sciences', 'Atmospheric Science and ...
2692    ['Life Sciences', 'Plant Sciences', 'Plant Gen...
Name: subjects, Length: 2693, dtype: object
In [11]:
re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))
In [12]:
def merge_lists(lists):
    res = []
    for l in lists:
        res = res + l
    return res

re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\
                                .apply(lambda row: row.dropna().tolist(), axis=1)\
                                .reset_index()\
                                .groupby('index')[0].apply(lambda x: merge_lists(x))
In [13]:
re3data_cleaned_subjects
Out[13]:
index
0       [Life Sciences, Basic Biological, Medical Rese...
1       [Natural Sciences, Astrophysics, Astronomy, Ph...
2       [Natural Sciences, Geosciences (including Geog...
3       [Humanities, Social Sciences, Psychology, Soci...
4       [Natural Sciences, Geology, Palaeontology, Geo...
                              ...                        
2688    [Life Sciences, Basic Biological, Medical Rese...
2689    [Natural Sciences, Atmospheric Science, Oceano...
2690    [Natural Sciences, Atmospheric Science, Oceano...
2691    [Natural Sciences, Atmospheric Science, Oceano...
2692    [Life Sciences, Plant Sciences, Plant Genetics...
Name: 0, Length: 2693, dtype: object
In [14]:
re3data_df = re3data_df.join(re3data_cleaned_subjects)
In [15]:
re3data_df.drop(columns=['subjects'], inplace=True)
re3data_df.rename(columns={0:'subjects'}, inplace=True)

OpenDOAR

In [16]:
opendoar_df.subjects
Out[16]:
0                                                      []
1                                                      []
2       ['Social Sciences General', 'Science General',...
3                                                      []
4                                                      []
                              ...                        
6009                                ['Multidisciplinary']
6010                                                   []
6011                           ['Business and Economics']
6012    ['Earth and Planetary Sciences', 'Ecology and ...
6013                                                   []
Name: subjects, Length: 6014, dtype: object
In [17]:
opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))
In [18]:
opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\
                                .apply(lambda row: row.dropna().tolist(), axis=1)\
                                .reset_index()\
                                .groupby('index')[0].apply(lambda x: merge_lists(x))
In [19]:
opendoar_cleaned_subjects
Out[19]:
index
0                                                      []
1                                                      []
2       [Social Sciences General, Science General, Com...
3                                                      []
4                                                      []
                              ...                        
6009                                  [Multidisciplinary]
6010                                                   []
6011                                [Business, Economics]
6012    [Earth, Planetary Sciences, Ecology, Environme...
6013                                                   []
Name: 0, Length: 6014, dtype: object
In [20]:
opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)
In [21]:
opendoar_df.drop(columns=['subjects'], inplace=True)
opendoar_df.rename(columns={0: 'subjects'}, inplace=True)

Subjects analysis

In [22]:
fairsharing_subjects = fairsharing_df.explode('subjects')
re3data_subjects = re3data_df.explode('subjects')
opendoar_subjects = opendoar_df.explode('subjects')
In [23]:
data1 = fairsharing_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data2 = re3data_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data3 = opendoar_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['url'],
        name='re3data',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['url'],
        name='OpenDOAR',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Subject coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()
In [24]:
len(fairsharing_subjects.subjects.unique())
Out[24]:
311
In [25]:
len(re3data_subjects.subjects.unique())
Out[25]:
414
In [26]:
len(opendoar_subjects.subjects.unique())
Out[26]:
64
In [27]:
opendoar_subjects.subjects.unique()
Out[27]:
array([nan, 'Social Sciences General', 'Science General', 'Computers',
       'IT', 'Physics', 'Astronomy', 'Multidisciplinary', 'Arts',
       'Humanities General', 'Philosophy', 'Religion', 'Business',
       'Economics', 'Law', 'Politics', 'Psychology', 'Health', 'Medicine',
       'History', 'Archaeology', 'Education', 'Technology General',
       'Library', 'Information Science', 'Earth', 'Planetary Sciences',
       'Geography', 'Regional Studies', 'Architecture', 'Ecology',
       'Environment', 'Electrical', 'Electronic Engineering', 'Biology',
       'Biochemistry', 'Mathematics', 'Statistics', 'Civil Engineering',
       'Agriculture', ' Food', 'Veterinary', 'Language', 'Literature',
       'Chemistry', 'Chemical Technology', 'Mechanical Engineering',
       'Materials', 'Fine', 'Performing Arts', 'Management', 'Planning',
       ' Language', ' Health', 'Veterinary ', ' Technology General',
       'Medicine ', ' History', 'IT ', ' Law', 'Social Sciences General ',
       ' Science General', ' Philosophy', 'Performing Arts '],
      dtype=object)

Geographic analysis

In [28]:
fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(lambda c: country_to_countrycode(c))
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(lambda cc: countrycode_to_continent(cc))
In [29]:
fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()
Out[29]:
array(['European Union', 'Republic of Ireland', 'Worldwide', nan],
      dtype=object)
In [30]:
fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()
Out[30]:
array(['European Union', 'Republic of Ireland', 'Worldwide', 'Antarctica',
       nan], dtype=object)

Manually fixing exceptions

In [31]:
fairsharing_countries.loc[fairsharing_countries.countries == 'Republic of Ireland', ['countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries.countries == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries.loc[fairsharing_countries.countries == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']
In [32]:
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[32]:
full_name short_name fs_url url countries subjects countrycode continent
915 Antabif IPT - AntOBIS IPT - GBIF Belgium Antabif IPT - AntOBIS IPT - GBIF Belgium https://fairsharing.org/10.25504/FAIRsharing.e... http://ipt.biodiversity.aq/ Antarctica [Biodiversity, Life Science] AQ NaN

For re3data

In [65]:
re3data_df[re3data_df.latitude.notna()].count()
Out[65]:
index            5
id               5
url              5
official_name    5
english_name     5
description      5
latitude         5
longitude        5
subjects         5
dtype: int64

Location is basically absent in re3data

For OpenDOAR

In [34]:
reverse_geocoding = pd.DataFrame(rg.search(opendoar_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist()))
reverse_geocoding['lat'] = reverse_geocoding['lat'].astype('float')
reverse_geocoding['lon'] = reverse_geocoding['lon'].astype('float')
reverse_geocoding['continent'] = reverse_geocoding.cc.map(countrycode_to_continent)
reverse_geocoding
Loading formatted geocoded file...
Out[34]:
lat lon name admin1 admin2 cc continent
0 43.26271 -2.92528 Bilbao Basque Country Bizkaia ES EU
1 4.88447 -1.75536 Takoradi Western GH AF
2 53.98333 -2.78333 Galgate England Lancashire GB EU
3 34.05223 -118.24368 Los Angeles California Los Angeles County US NA
4 -22.22111 -54.80556 Dourados Mato Grosso do Sul Dourados BR SA
... ... ... ... ... ... ... ...
6009 40.85631 14.24641 Napoli Campania Provincia di Napoli IT EU
6010 38.19394 15.55256 Messina Sicily Messina IT EU
6011 54.32133 10.13489 Kiel Schleswig-Holstein DE EU
6012 43.40785 -73.25955 Granville New York Washington County US NA
6013 33.96095 -83.37794 Athens Georgia Clarke County US NA

6014 rows × 7 columns

In [35]:
opendoar_df = opendoar_df.join(reverse_geocoding)

Manual fix of null lat/lon

In [36]:
opendoar_df.loc[(opendoar_df.latitude == 0.0) & (opendoar_df.longitude == 0.0), ['latitude', 'longitude', 'cc', 'continent']] = [np.nan, np.nan, np.nan, np.nan]

Country intersection

In [63]:
venn2([set(fairsharing_countries.countrycode.dropna()), set(opendoar_df.cc.dropna())], set_labels = ('FAIRsharing', 'OpenDOAR'))
plt.show()

Country coverage

In [37]:
data1 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)
data2 = opendoar_df.groupby('cc')[['id']].count().sort_values('id', ascending=False)


plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['id'],
        name='OpenDOAR'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage

In [38]:
data1 = fairsharing_countries.groupby('continent')[['url']].count()
data2 = opendoar_df.groupby('continent')[['url']].count()

plot = [
    go.Scatterpolar(
        r=data1.url,
        theta=data1.index,
        fill='toself',
        name='FAIRsharing'),
    go.Scatterpolar(
        r=data2.url,
        theta=data2.index,
        fill='toself',
        name='OpenDOAR')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()