registries_analysis/[deprecated]-Explorative.ipynb at 84b32e5d3331e45fd099923a8b3e3bf66f218a31

In [1]:

import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

In [1]:

def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha2(country)
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(country_code)
        except:
            return np.nan

Loading datasets¶

FAIRsharing

In [3]:

fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', 
                             delimiter='|', header=0,
                             names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])
fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')
fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')
fairsharing_df.head()

Out[3]:

	full_name	short_name	fs_url	url	countries	subjects
0	GenBank	GenBank	https://fairsharing.org/10.25504/FAIRsharing.9...	https://www.ncbi.nlm.nih.gov/genbank/	[European Union, Japan, United States]	[Bioinformatics, Data Management, Data Submiss...
1	GlycoNAVI	GlycoNAVI	https://fairsharing.org/10.25504/FAIRsharing.w...	https://glyconavi.org/	[Japan]	[Chemistry, Glycomics, Life Science, Organic C...
2	ADHDgene	ADHDgene	https://fairsharing.org/10.25504/FAIRsharing.m...	http://adhd.psych.ac.cn/	[China]	[Biomedical Science, Genetics]
3	Allele frequency resource for research and tea...	ALFRED	https://fairsharing.org/10.25504/FAIRsharing.y...	http://alfred.med.yale.edu	[United States]	[Life Science]
4	Animal Transcription Factor Database	AnimalTFDB	https://fairsharing.org/10.25504/FAIRsharing.e...	http://bioinfo.life.hust.edu.cn/AnimalTFDB/	[China]	[Life Science]

In [4]:

fairsharing_df.describe()

Out[4]:

	full_name	short_name	fs_url	url	countries	subjects
count	1752	1752	1752	1752	1749	1690
unique	1752	1741	1752	1752	178	834
top	SoyBase	CGD	https://fairsharing.org/bsg-d001065	https://ada.edu.au	[United States]	[Life Science]
freq	1	3	1	1	588	367

re3data

In [42]:

re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
re3data_df = re3data_df[(re3data_df.id.str.contains('re3data'))].reset_index()
re3data_df.head()

Out[42]:

	index	id	url	official_name	english_name	description	subjects	type	dataprovider
0	4	10\|re3data_____::3f2e20af26ead0432f5470d8b739638d	http://planttfdb.cbi.pku.edu.cn/	Plant Transcription Factor Database	PlantTFDB	NaN	['Life Sciences', 'Basic Biological and Medica...	datarepository::unknown	True
1	7	10\|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc	https://spdf.gsfc.nasa.gov/	Space Physics Data Facility	NASA's Space Physics Data Facility SPDF	NaN	['Natural Sciences', 'Astrophysics and Astrono...	datarepository::unknown	True
2	13	10\|re3data_____::59521daca59ac29b811343cc4cd370cf	http://card.westgis.ac.cn/	Cold and Arid Regions Science Data Center at L...	CARD WDC for Glaciology and Geocryology World ...	NaN	['Natural Sciences', 'Geosciences (including G...	datarepository::unknown	True
3	14	10\|re3data_____::ec1ba1674c852466c266acb64c618d15	https://www.psycharchives.org/	Psycharchives	NaN	NaN	['Humanities and Social Sciences', 'Psychology...	datarepository::unknown	True
4	19	10\|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76	https://www.ihfc-iugg.org/products/global-heat...	The Global Heat Flow Database of the Internati...	International Heat-flow Database	NaN	['Natural Sciences', 'Geology and Palaeontolog...	datarepository::unknown	True

In [6]:

re3data_df.describe(include='all')

Out[6]:

	index	id	url	official_name	english_name	description	latitude	longitude	subjects	type	dataprovider
count	2693.000000	2693	2673	2693	2034	38	2693.000000	2693.000000	2693	2693	2693
unique	NaN	2693	2661	2668	2010	38	NaN	NaN	1427	2	2
top	NaN	10\|re3data_____::11b64a5229ae3a1ba4da3c9e1743a808	http://figshare.com/	Mansfeld's World Database of Agriculture and H...	GCMD	JEDI is an educational data archive service th...	NaN	NaN	['Humanities and Social Sciences', 'Life Scien...	datarepository::unknown	True
freq	NaN	1	2	2	2	1	NaN	NaN	209	2692	2428
mean	4443.650947	NaN	NaN	NaN	NaN	NaN	0.114497	0.067998	NaN	NaN	NaN
std	2518.294468	NaN	NaN	NaN	NaN	NaN	4.585469	2.447173	NaN	NaN	NaN
min	4.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN	NaN	NaN
25%	2266.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN	NaN	NaN
50%	4506.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN	NaN	NaN
75%	6660.000000	NaN	NaN	NaN	NaN	NaN	0.000000	0.000000	NaN	NaN	NaN
max	8705.000000	NaN	NaN	NaN	NaN	NaN	234.000000	123.000000	NaN	NaN	NaN

OpenDOAR

In [7]:

opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
opendoar_df = opendoar_df[(opendoar_df.id.str.contains('opendoar'))].reset_index()
opendoar_df.head()

Out[7]:

	index	id	url	official_name	english_name	description	latitude	longitude	subjects	type	dataprovider
0	0	10\|opendoar____::e833e042f509c996b1b25324d56659fb	http://www.bilbao.net/bld	BLD - Bilboko Liburutegi Digitala	BLD - Bilboko Liburutegi Digitala	BLD is a repository of digital documents, desi...	43.256699	-2.924100	[]	pubsrepository::institutional	False
1	1	10\|opendoar____::f621585df244e9596dc70a39b579efb1	https://researchdirect.westernsydney.edu.au/	Western Sydney ResearchDirect	Western Sydney ResearchDirect	NaN	0.000000	0.000000	[]	pubsrepository::institutional	False
2	2	10\|opendoar____::437d7d1d97917cd627a34a6a0fb41136	http://redress.lancs.ac.uk/Learning_Space/	Learning Space Catalogue	NaN	This repository is a Social Science e-Science ...	54.010760	-2.784990	['Social Sciences General', 'Science General',...	pubsrepository::unknown	False
3	3	10\|opendoar____::d840cc5d906c3e9c84374c8919d2074e	http://digitallibrary.usc.edu/search/controlle...	USC Digital Library	USC Digital Library	This is an institutional repository providing ...	34.052200	-118.242996	[]	pubsrepository::institutional	False
4	5	10\|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010	http://www.ufgd.edu.br:8080/jspui/	Repositório de Divulgação das Produções Cientí...	Repositório de Divulgação das Produções Cientí...	This site provides access to the research outp...	-22.221800	-54.806400	[]	pubsrepository::institutional	False

In [8]:

opendoar_df.describe(include='all')

Out[8]:

	index	id	url	official_name	english_name	description	latitude	longitude	subjects	type	dataprovider
count	6014.000000	6014	6013	6014	5500	5776	6014.000000	6014.000000	6014	6014	6014
unique	NaN	6014	5953	5946	5413	4920	NaN	NaN	201	5	1
top	NaN	10\|opendoar____::6e3197aae95c2ff8fcab35cb730f6a86	http://harp.lib.hiroshima-u.ac.jp/	OpenKnowledge Ecology Repository	AURA	This site provides access to the research outp...	NaN	NaN	[]	pubsrepository::institutional	False
freq	NaN	1	3	3	4	98	NaN	NaN	5273	5368	6014
mean	4312.407549	NaN	NaN	NaN	NaN	NaN	38.649393	7.810948	NaN	NaN	NaN
std	2510.699848	NaN	NaN	NaN	NaN	NaN	788.406173	71.689788	NaN	NaN	NaN
min	0.000000	NaN	NaN	NaN	NaN	NaN	-79.029999	-683.103027	NaN	NaN	NaN
25%	2129.250000	NaN	NaN	NaN	NaN	NaN	4.644632	-49.273300	NaN	NaN	NaN
50%	4297.000000	NaN	NaN	NaN	NaN	NaN	37.930449	4.788870	NaN	NaN	NaN
75%	6476.750000	NaN	NaN	NaN	NaN	NaN	47.294400	30.685501	NaN	NaN	NaN
max	8706.000000	NaN	NaN	NaN	NaN	NaN	61138.800781	178.438995	NaN	NaN	NaN

Basic cleaning¶

re3data

In [9]:

re3data_df.subjects

Out[9]:

0       ['Life Sciences', 'Basic Biological and Medica...
1       ['Natural Sciences', 'Astrophysics and Astrono...
2       ['Natural Sciences', 'Geosciences (including G...
3       ['Humanities and Social Sciences', 'Psychology...
4       ['Natural Sciences', 'Geology and Palaeontolog...
                              ...                        
2688    ['Life Sciences', 'Basic Biological and Medica...
2689    ['Natural Sciences', 'Atmospheric Science and ...
2690    ['Natural Sciences', 'Atmospheric Science and ...
2691    ['Natural Sciences', 'Atmospheric Science and ...
2692    ['Life Sciences', 'Plant Sciences', 'Plant Gen...
Name: subjects, Length: 2693, dtype: object

In [10]:

re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))

In [11]:

def merge_lists(lists):
    res = []
    for l in lists:
        res = res + l
    return res

re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\
                                .apply(lambda row: row.dropna().tolist(), axis=1)\
                                .reset_index()\
                                .groupby('index')[0].apply(lambda x: merge_lists(x))

In [12]:

re3data_cleaned_subjects

Out[12]:

index
0       [Life Sciences, Basic Biological, Medical Rese...
1       [Natural Sciences, Astrophysics, Astronomy, Ph...
2       [Natural Sciences, Geosciences (including Geog...
3       [Humanities, Social Sciences, Psychology, Soci...
4       [Natural Sciences, Geology, Palaeontology, Geo...
                              ...                        
2688    [Life Sciences, Basic Biological, Medical Rese...
2689    [Natural Sciences, Atmospheric Science, Oceano...
2690    [Natural Sciences, Atmospheric Science, Oceano...
2691    [Natural Sciences, Atmospheric Science, Oceano...
2692    [Life Sciences, Plant Sciences, Plant Genetics...
Name: 0, Length: 2693, dtype: object

In [13]:

re3data_df = re3data_df.join(re3data_cleaned_subjects)

In [14]:

re3data_df.drop(columns=['subjects'], inplace=True)
re3data_df.rename(columns={0:'subjects'}, inplace=True)

OpenDOAR

In [15]:

opendoar_df.subjects

Out[15]:

0                                                      []
1                                                      []
2       ['Social Sciences General', 'Science General',...
3                                                      []
4                                                      []
                              ...                        
6009                                ['Multidisciplinary']
6010                                                   []
6011                           ['Business and Economics']
6012    ['Earth and Planetary Sciences', 'Ecology and ...
6013                                                   []
Name: subjects, Length: 6014, dtype: object

In [16]:

opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))

In [17]:

opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\
                                .apply(lambda row: row.dropna().tolist(), axis=1)\
                                .reset_index()\
                                .groupby('index')[0].apply(lambda x: merge_lists(x))

In [18]:

opendoar_cleaned_subjects

Out[18]:

index
0                                                      []
1                                                      []
2       [Social Sciences General, Science General, Com...
3                                                      []
4                                                      []
                              ...                        
6009                                  [Multidisciplinary]
6010                                                   []
6011                                [Business, Economics]
6012    [Earth, Planetary Sciences, Ecology, Environme...
6013                                                   []
Name: 0, Length: 6014, dtype: object

In [19]:

opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)

In [20]:

opendoar_df.drop(columns=['subjects'], inplace=True)
opendoar_df.rename(columns={0: 'subjects'}, inplace=True)

Subjects analysis¶

In [21]:

fairsharing_subjects = fairsharing_df.explode('subjects')
re3data_subjects = re3data_df.explode('subjects')
opendoar_subjects = opendoar_df.explode('subjects')

In [22]:

data1 = fairsharing_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data2 = re3data_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data3 = opendoar_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['url'],
        name='re3data',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['url'],
        name='OpenDOAR',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Subject coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

In [23]:

len(fairsharing_subjects.subjects.unique())

Out[23]:

In [24]:

len(re3data_subjects.subjects.unique())

Out[24]:

In [25]:

len(opendoar_subjects.subjects.unique())

Out[25]:

In [26]:

opendoar_subjects.subjects.unique()

Out[26]:

array([nan, 'Social Sciences General', 'Science General', 'Computers',
       'IT', 'Physics', 'Astronomy', 'Multidisciplinary', 'Arts',
       'Humanities General', 'Philosophy', 'Religion', 'Business',
       'Economics', 'Law', 'Politics', 'Psychology', 'Health', 'Medicine',
       'History', 'Archaeology', 'Education', 'Technology General',
       'Library', 'Information Science', 'Earth', 'Planetary Sciences',
       'Geography', 'Regional Studies', 'Architecture', 'Ecology',
       'Environment', 'Electrical', 'Electronic Engineering', 'Biology',
       'Biochemistry', 'Mathematics', 'Statistics', 'Civil Engineering',
       'Agriculture', ' Food', 'Veterinary', 'Language', 'Literature',
       'Chemistry', 'Chemical Technology', 'Mechanical Engineering',
       'Materials', 'Fine', 'Performing Arts', 'Management', 'Planning',
       ' Language', ' Health', 'Veterinary ', ' Technology General',
       'Medicine ', ' History', 'IT ', ' Law', 'Social Sciences General ',
       ' Science General', ' Philosophy', 'Performing Arts '],
      dtype=object)

Geographic analysis¶

FAIRsharing

In [2]:

fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(lambda c: country_to_countrycode(c))
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(lambda cc: countrycode_to_continent(cc))

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-2-cc1be8e03668> in <module>
----> 1 fairsharing_countries = fairsharing_df.explode('countries')
      2 fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(lambda c: country_to_countrycode(c))
      3 fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(lambda cc: countrycode_to_continent(cc))

NameError: name 'fairsharing_df' is not defined

In [3]:

fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-19e7f1ee3008> in <module>
----> 1 fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()

NameError: name 'fairsharing_countries' is not defined

In [29]:

fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()

Out[29]:

array(['European Union', 'Republic of Ireland', 'Worldwide', 'Antarctica',
       nan], dtype=object)

Manually fixing exceptions

In [30]:

fairsharing_countries.loc[fairsharing_countries.countries == 'Republic of Ireland', ['countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries.countries == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries.loc[fairsharing_countries.countries == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

In [31]:

fairsharing_countries[fairsharing_countries.countrycode == 'AQ']

Out[31]:

	full_name	short_name	fs_url	url	countries	subjects	countrycode	continent
915	Antabif IPT - AntOBIS IPT - GBIF Belgium	Antabif IPT - AntOBIS IPT - GBIF Belgium	https://fairsharing.org/10.25504/FAIRsharing.e...	http://ipt.biodiversity.aq/	Antarctica	[Biodiversity, Life Science]	AQ	NaN

re3data

In [32]:

re3data_df[re3data_df.latitude.notna()].count()

Out[32]:

index            2693
id               2693
url              2673
official_name    2693
english_name     2034
description        38
latitude         2693
longitude        2693
type             2693
dataprovider     2693
subjects         2693
dtype: int64

Location is basically absent in re3data

In [33]:

reverse_geocoding = pd.DataFrame(rg.search(re3data_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist()))
reverse_geocoding['lat'] = reverse_geocoding['lat'].astype('float')
reverse_geocoding['lon'] = reverse_geocoding['lon'].astype('float')
reverse_geocoding['continent'] = reverse_geocoding.cc.map(countrycode_to_continent)
reverse_geocoding

Loading formatted geocoded file...

Out[33]:

	lat	lon	name	admin1	admin2	cc	continent
0	4.88447	-1.75536	Takoradi	Western		GH	AF
1	4.88447	-1.75536	Takoradi	Western		GH	AF
2	4.88447	-1.75536	Takoradi	Western		GH	AF
3	4.88447	-1.75536	Takoradi	Western		GH	AF
4	4.88447	-1.75536	Takoradi	Western		GH	AF
...	...	...	...	...	...	...	...
2688	4.88447	-1.75536	Takoradi	Western		GH	AF
2689	4.88447	-1.75536	Takoradi	Western		GH	AF
2690	4.88447	-1.75536	Takoradi	Western		GH	AF
2691	4.88447	-1.75536	Takoradi	Western		GH	AF
2692	4.88447	-1.75536	Takoradi	Western		GH	AF

2693 rows × 7 columns

In [34]:

re3data_df = re3data_df.join(reverse_geocoding)

Manual fix of null lat/lon

In [35]:

re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude', 'cc', 'continent']] = [np.nan, np.nan, np.nan, np.nan]

OpenDOAR

In [36]:

reverse_geocoding = pd.DataFrame(rg.search(opendoar_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist()))
reverse_geocoding['lat'] = reverse_geocoding['lat'].astype('float')
reverse_geocoding['lon'] = reverse_geocoding['lon'].astype('float')
reverse_geocoding['continent'] = reverse_geocoding.cc.map(countrycode_to_continent)
reverse_geocoding

Out[36]:

	lat	lon	name	admin1	admin2	cc	continent
0	43.26271	-2.92528	Bilbao	Basque Country	Bizkaia	ES	EU
1	4.88447	-1.75536	Takoradi	Western		GH	AF
2	53.98333	-2.78333	Galgate	England	Lancashire	GB	EU
3	34.05223	-118.24368	Los Angeles	California	Los Angeles County	US	NA
4	-22.22111	-54.80556	Dourados	Mato Grosso do Sul	Dourados	BR	SA
...	...	...	...	...	...	...	...
6009	40.85631	14.24641	Napoli	Campania	Provincia di Napoli	IT	EU
6010	38.19394	15.55256	Messina	Sicily	Messina	IT	EU
6011	54.32133	10.13489	Kiel	Schleswig-Holstein		DE	EU
6012	43.40785	-73.25955	Granville	New York	Washington County	US	NA
6013	33.96095	-83.37794	Athens	Georgia	Clarke County	US	NA

6014 rows × 7 columns

In [37]:

opendoar_df = opendoar_df.join(reverse_geocoding)

Manual fix of null lat/lon

In [38]:

opendoar_df.loc[(opendoar_df.latitude == 0.0) & (opendoar_df.longitude == 0.0), ['latitude', 'longitude', 'cc', 'continent']] = [np.nan, np.nan, np.nan, np.nan]

Country intersection

In [39]:

venn2([set(fairsharing_countries.countrycode.dropna()), set(opendoar_df.cc.dropna())], set_labels = ('FAIRsharing', 'OpenDOAR'))
plt.show()

Country coverage

In [40]:

data1 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)
data2 = opendoar_df.groupby('cc')[['id']].count().sort_values('id', ascending=False)


plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['id'],
        name='OpenDOAR'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage

In [41]:

data1 = fairsharing_countries.groupby('continent')[['url']].count()
data2 = opendoar_df.groupby('continent')[['url']].count()

plot = [
    go.Scatterpolar(
        r=data1.url,
        theta=data1.index,
        fill='toself',
        name='FAIRsharing'),
    go.Scatterpolar(
        r=data2.url,
        theta=data2.index,
        fill='toself',
        name='OpenDOAR')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()

3.6 MiB Raw Blame History Unescape Escape

Loading datasets¶

Basic cleaning¶

Subjects analysis¶

Geographic analysis¶

3.6 MiB

Raw Blame History