registries_analysis/01-Explorative.ipynb at c052601c9050d97e7eccfa96149e3faf79b264f2

In [40]:

import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

In [2]:

def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha2(country)
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(country_code)
        except:
            return np.nan

Loading datasets¶

FAIRsharing

In [3]:

fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', 
                             delimiter='|', header=0,
                             names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])
fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')
fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')
fairsharing_df.head()

Out[3]:

	full_name	short_name	fs_url	url	countries	subjects
0	GenBank	GenBank	https://fairsharing.org/10.25504/FAIRsharing.9...	https://www.ncbi.nlm.nih.gov/genbank/	[European Union, Japan, United States]	[Bioinformatics, Data Management, Data Submiss...
1	GlycoNAVI	GlycoNAVI	https://fairsharing.org/10.25504/FAIRsharing.w...	https://glyconavi.org/	[Japan]	[Chemistry, Glycomics, Life Science, Organic C...
2	ADHDgene	ADHDgene	https://fairsharing.org/10.25504/FAIRsharing.m...	http://adhd.psych.ac.cn/	[China]	[Biomedical Science, Genetics]
3	Allele frequency resource for research and tea...	ALFRED	https://fairsharing.org/10.25504/FAIRsharing.y...	http://alfred.med.yale.edu	[United States]	[Life Science]
4	Animal Transcription Factor Database	AnimalTFDB	https://fairsharing.org/10.25504/FAIRsharing.e...	http://bioinfo.life.hust.edu.cn/AnimalTFDB/	[China]	[Life Science]

In [4]:

fairsharing_df.describe()

Out[4]:

	full_name	short_name	fs_url	url	countries	subjects
count	1752	1752	1752	1752	1749	1690
unique	1752	1741	1752	1752	178	834
top	FunTree: A Resource For Exploring The Function...	CGD	https://fairsharing.org/10.25504/FAIRsharing.5...	https://idn.ceos.org	[United States]	[Life Science]
freq	1	3	1	1	588	367

re3data

In [5]:

re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
re3data_df = re3data_df[re3data_df.id.str.contains('re3data')].reset_index()
re3data_df.head()

Out[5]:

	index	id	url	official_name	english_name	description	subjects
0	4	10\|re3data_____::3f2e20af26ead0432f5470d8b739638d	http://planttfdb.cbi.pku.edu.cn/	Plant Transcription Factor Database	PlantTFDB	NaN	['Life Sciences', 'Basic Biological and Medica...
1	7	10\|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc	https://spdf.gsfc.nasa.gov/	Space Physics Data Facility	NASA's Space Physics Data Facility SPDF	NaN	['Natural Sciences', 'Astrophysics and Astrono...
2	13	10\|re3data_____::59521daca59ac29b811343cc4cd370cf	http://card.westgis.ac.cn/	Cold and Arid Regions Science Data Center at L...	CARD WDC for Glaciology and Geocryology World ...	NaN	['Natural Sciences', 'Geosciences (including G...
3	14	10\|re3data_____::ec1ba1674c852466c266acb64c618d15	https://www.psycharchives.org/	Psycharchives	NaN	NaN	['Humanities and Social Sciences', 'Psychology...
4	19	10\|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76	https://www.ihfc-iugg.org/products/global-heat...	The Global Heat Flow Database of the Internati...	International Heat-flow Database	NaN	['Natural Sciences', 'Geology and Palaeontolog...

In [6]:

re3data_df.describe(include='all')

Out[6]:

	index	id	url	official_name	english_name	description	latitude	longitude	subjects
count	2693.000000	2693	2673	2693	2034	38	2693.000000	2693.000000	2693
unique	NaN	2693	2661	2668	2010	38	NaN	NaN	1427
top	NaN	10\|re3data_____::fc8141eebc533cb225498718479f4e66	http://wdcpc.org/	European Climate Assessment & Dataset project	ECA&D	The Atmospheric Science Data Center (ASDC) at ...	NaN	NaN	['Humanities and Social Sciences', 'Life Scien...
freq	NaN	1	2	2	2	1	NaN	NaN	209
mean	4443.650947	NaN	NaN	NaN	NaN	NaN	0.114497	0.067998	NaN
std	2518.294468	NaN	NaN	NaN	NaN	NaN	4.585469	2.447173	NaN
min	4.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN
25%	2266.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN
50%	4506.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN
75%	6660.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN
max	8705.000000	NaN	NaN	NaN	NaN	NaN	234.000000	123.000000	NaN

OpenDOAR

In [7]:

opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')].reset_index()
opendoar_df.head()

Out[7]:

	index	id	url	official_name	english_name	description	latitude	longitude	subjects
0	0	10\|opendoar____::e833e042f509c996b1b25324d56659fb	http://www.bilbao.net/bld	BLD - Bilboko Liburutegi Digitala	BLD - Bilboko Liburutegi Digitala	BLD is a repository of digital documents, desi...	43.256699	-2.924100	[]
1	1	10\|opendoar____::f621585df244e9596dc70a39b579efb1	https://researchdirect.westernsydney.edu.au/	Western Sydney ResearchDirect	Western Sydney ResearchDirect	NaN	0.000000	0.000000	[]
2	2	10\|opendoar____::437d7d1d97917cd627a34a6a0fb41136	http://redress.lancs.ac.uk/Learning_Space/	Learning Space Catalogue	NaN	This repository is a Social Science e-Science ...	54.010760	-2.784990	['Social Sciences General', 'Science General',...
3	3	10\|opendoar____::d840cc5d906c3e9c84374c8919d2074e	http://digitallibrary.usc.edu/search/controlle...	USC Digital Library	USC Digital Library	This is an institutional repository providing ...	34.052200	-118.242996	[]
4	5	10\|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010	http://www.ufgd.edu.br:8080/jspui/	Repositório de Divulgação das Produções Cientí...	Repositório de Divulgação das Produções Cientí...	This site provides access to the research outp...	-22.221800	-54.806400	[]

In [8]:

opendoar_df.describe(include='all')

Out[8]:

	index	id	url	official_name	english_name	description	latitude	longitude	subjects
count	6014.000000	6014	6013	6014	5500	5776	6014.000000	6014.000000	6014
unique	NaN	6014	5953	5946	5413	4920	NaN	NaN	201
top	NaN	10\|opendoar____::17256f049f1e3fede17c7a313f7657f4	http://harp.lib.hiroshima-u.ac.jp/	Hiroshima Associated Repository Portal	AURA	This site provides access to the research outp...	NaN	NaN	[]
freq	NaN	1	3	3	4	98	NaN	NaN	5273
mean	4312.407549	NaN	NaN	NaN	NaN	NaN	38.649393	7.810948	NaN
std	2510.699848	NaN	NaN	NaN	NaN	NaN	788.406173	71.689788	NaN
min	0.000000	NaN	NaN	NaN	NaN	NaN	-79.029999	-683.103027	NaN
25%	2129.250000	NaN	NaN	NaN	NaN	NaN	4.644632	-49.273300	NaN
50%	4297.000000	NaN	NaN	NaN	NaN	NaN	37.930449	4.788870	NaN
75%	6476.750000	NaN	NaN	NaN	NaN	NaN	47.294400	30.685501	NaN
max	8706.000000	NaN	NaN	NaN	NaN	NaN	61138.800781	178.438995	NaN

Basic cleaning¶

re3data

In [9]:

re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude']] = [np.nan, np.nan]

In [10]:

re3data_df.subjects

Out[10]:

0       ['Life Sciences', 'Basic Biological and Medica...
1       ['Natural Sciences', 'Astrophysics and Astrono...
2       ['Natural Sciences', 'Geosciences (including G...
3       ['Humanities and Social Sciences', 'Psychology...
4       ['Natural Sciences', 'Geology and Palaeontolog...
                              ...                        
2688    ['Life Sciences', 'Basic Biological and Medica...
2689    ['Natural Sciences', 'Atmospheric Science and ...
2690    ['Natural Sciences', 'Atmospheric Science and ...
2691    ['Natural Sciences', 'Atmospheric Science and ...
2692    ['Life Sciences', 'Plant Sciences', 'Plant Gen...
Name: subjects, Length: 2693, dtype: object

In [11]:

re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))

In [12]:

def merge_lists(lists):
    res = []
    for l in lists:
        res = res + l
    return res

re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\
                                .apply(lambda row: row.dropna().tolist(), axis=1)\
                                .reset_index()\
                                .groupby('index')[0].apply(lambda x: merge_lists(x))

In [13]:

re3data_cleaned_subjects

Out[13]:

index
0       [Life Sciences, Basic Biological, Medical Rese...
1       [Natural Sciences, Astrophysics, Astronomy, Ph...
2       [Natural Sciences, Geosciences (including Geog...
3       [Humanities, Social Sciences, Psychology, Soci...
4       [Natural Sciences, Geology, Palaeontology, Geo...
                              ...                        
2688    [Life Sciences, Basic Biological, Medical Rese...
2689    [Natural Sciences, Atmospheric Science, Oceano...
2690    [Natural Sciences, Atmospheric Science, Oceano...
2691    [Natural Sciences, Atmospheric Science, Oceano...
2692    [Life Sciences, Plant Sciences, Plant Genetics...
Name: 0, Length: 2693, dtype: object

In [14]:

re3data_df = re3data_df.join(re3data_cleaned_subjects)

In [15]:

re3data_df.drop(columns=['subjects'], inplace=True)
re3data_df.rename(columns={0:'subjects'}, inplace=True)

OpenDOAR

In [16]:

opendoar_df.subjects

Out[16]:

0                                                      []
1                                                      []
2       ['Social Sciences General', 'Science General',...
3                                                      []
4                                                      []
                              ...                        
6009                                ['Multidisciplinary']
6010                                                   []
6011                           ['Business and Economics']
6012    ['Earth and Planetary Sciences', 'Ecology and ...
6013                                                   []
Name: subjects, Length: 6014, dtype: object

In [17]:

opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))

In [18]:

opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\
                                .apply(lambda row: row.dropna().tolist(), axis=1)\
                                .reset_index()\
                                .groupby('index')[0].apply(lambda x: merge_lists(x))

In [19]:

opendoar_cleaned_subjects

Out[19]:

index
0                                                      []
1                                                      []
2       [Social Sciences General, Science General, Com...
3                                                      []
4                                                      []
                              ...                        
6009                                  [Multidisciplinary]
6010                                                   []
6011                                [Business, Economics]
6012    [Earth, Planetary Sciences, Ecology, Environme...
6013                                                   []
Name: 0, Length: 6014, dtype: object

In [20]:

opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)

In [21]:

opendoar_df.drop(columns=['subjects'], inplace=True)
opendoar_df.rename(columns={0: 'subjects'}, inplace=True)

Subjects analysis¶

In [22]:

fairsharing_subjects = fairsharing_df.explode('subjects')
re3data_subjects = re3data_df.explode('subjects')
opendoar_subjects = opendoar_df.explode('subjects')

In [23]:

data1 = fairsharing_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data2 = re3data_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data3 = opendoar_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['url'],
        name='re3data',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['url'],
        name='OpenDOAR',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Subject coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

In [24]:

len(fairsharing_subjects.subjects.unique())

Out[24]:

In [25]:

len(re3data_subjects.subjects.unique())

Out[25]:

In [26]:

len(opendoar_subjects.subjects.unique())

Out[26]:

In [27]:

opendoar_subjects.subjects.unique()

Out[27]:

array([nan, 'Social Sciences General', 'Science General', 'Computers',
       'IT', 'Physics', 'Astronomy', 'Multidisciplinary', 'Arts',
       'Humanities General', 'Philosophy', 'Religion', 'Business',
       'Economics', 'Law', 'Politics', 'Psychology', 'Health', 'Medicine',
       'History', 'Archaeology', 'Education', 'Technology General',
       'Library', 'Information Science', 'Earth', 'Planetary Sciences',
       'Geography', 'Regional Studies', 'Architecture', 'Ecology',
       'Environment', 'Electrical', 'Electronic Engineering', 'Biology',
       'Biochemistry', 'Mathematics', 'Statistics', 'Civil Engineering',
       'Agriculture', ' Food', 'Veterinary', 'Language', 'Literature',
       'Chemistry', 'Chemical Technology', 'Mechanical Engineering',
       'Materials', 'Fine', 'Performing Arts', 'Management', 'Planning',
       ' Language', ' Health', 'Veterinary ', ' Technology General',
       'Medicine ', ' History', 'IT ', ' Law', 'Social Sciences General ',
       ' Science General', ' Philosophy', 'Performing Arts '],
      dtype=object)

Geographic analysis¶

In [28]:

fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(lambda c: country_to_countrycode(c))
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(lambda cc: countrycode_to_continent(cc))

In [29]:

fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()

Out[29]:

array(['European Union', 'Republic of Ireland', 'Worldwide', nan],
      dtype=object)

In [30]:

fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()

Out[30]:

array(['European Union', 'Republic of Ireland', 'Worldwide', 'Antarctica',
       nan], dtype=object)

Manually fixing exceptions

In [31]:

fairsharing_countries.loc[fairsharing_countries.countries == 'Republic of Ireland', ['countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries.countries == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries.loc[fairsharing_countries.countries == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

In [32]:

fairsharing_countries[fairsharing_countries.countrycode == 'AQ']

Out[32]:

	full_name	short_name	fs_url	url	countries	subjects	countrycode	continent
915	Antabif IPT - AntOBIS IPT - GBIF Belgium	Antabif IPT - AntOBIS IPT - GBIF Belgium	https://fairsharing.org/10.25504/FAIRsharing.e...	http://ipt.biodiversity.aq/	Antarctica	[Biodiversity, Life Science]	AQ	NaN

For re3data

In [65]:

re3data_df[re3data_df.latitude.notna()].count()

Out[65]:

index            5
id               5
url              5
official_name    5
english_name     5
description      5
latitude         5
longitude        5
subjects         5
dtype: int64

Location is basically absent in re3data

For OpenDOAR

In [34]:

reverse_geocoding = pd.DataFrame(rg.search(opendoar_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist()))
reverse_geocoding['lat'] = reverse_geocoding['lat'].astype('float')
reverse_geocoding['lon'] = reverse_geocoding['lon'].astype('float')
reverse_geocoding['continent'] = reverse_geocoding.cc.map(countrycode_to_continent)
reverse_geocoding

Loading formatted geocoded file...

Out[34]:

	lat	lon	name	admin1	admin2	cc	continent
0	43.26271	-2.92528	Bilbao	Basque Country	Bizkaia	ES	EU
1	4.88447	-1.75536	Takoradi	Western		GH	AF
2	53.98333	-2.78333	Galgate	England	Lancashire	GB	EU
3	34.05223	-118.24368	Los Angeles	California	Los Angeles County	US	NA
4	-22.22111	-54.80556	Dourados	Mato Grosso do Sul	Dourados	BR	SA
...	...	...	...	...	...	...	...
6009	40.85631	14.24641	Napoli	Campania	Provincia di Napoli	IT	EU
6010	38.19394	15.55256	Messina	Sicily	Messina	IT	EU
6011	54.32133	10.13489	Kiel	Schleswig-Holstein		DE	EU
6012	43.40785	-73.25955	Granville	New York	Washington County	US	NA
6013	33.96095	-83.37794	Athens	Georgia	Clarke County	US	NA

6014 rows × 7 columns

In [35]:

opendoar_df = opendoar_df.join(reverse_geocoding)

Manual fix of null lat/lon

In [36]:

opendoar_df.loc[(opendoar_df.latitude == 0.0) & (opendoar_df.longitude == 0.0), ['latitude', 'longitude', 'cc', 'continent']] = [np.nan, np.nan, np.nan, np.nan]

Country intersection

In [63]:

venn2([set(fairsharing_countries.countrycode.dropna()), set(opendoar_df.cc.dropna())], set_labels = ('FAIRsharing', 'OpenDOAR'))
plt.show()

Country coverage

In [37]:

data1 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)
data2 = opendoar_df.groupby('cc')[['id']].count().sort_values('id', ascending=False)


plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['id'],
        name='OpenDOAR'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage

In [38]:

data1 = fairsharing_countries.groupby('continent')[['url']].count()
data2 = opendoar_df.groupby('continent')[['url']].count()

plot = [
    go.Scatterpolar(
        r=data1.url,
        theta=data1.index,
        fill='toself',
        name='FAIRsharing'),
    go.Scatterpolar(
        r=data2.url,
        theta=data2.index,
        fill='toself',
        name='OpenDOAR')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()

3.6 MiB Raw Blame History Unescape Escape

Loading datasets¶

Basic cleaning¶

Subjects analysis¶

Geographic analysis¶

3.6 MiB

Raw Blame History