In [1]:

import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

FAIRsharing¶

In [2]:

# Read the pipe-delimited FAIRsharing export; the file's own header row is
# replaced by the column names listed here.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],
)
# Turn the comma-separated text columns into Python lists so they can be exploded later.
fairsharing_df['subjects'] = fairsharing_df['subjects'].str.split(',')
fairsharing_df['countries'] = fairsharing_df['countries'].str.split(',')
fairsharing_df.head()

Out[2]:

	full_name	short_name	fs_url	url	countries	subjects
0	GenBank	GenBank	https://fairsharing.org/10.25504/FAIRsharing.9...	https://www.ncbi.nlm.nih.gov/genbank/	[European Union, Japan, United States]	[Bioinformatics, Data Management, Data Submiss...
1	GlycoNAVI	GlycoNAVI	https://fairsharing.org/10.25504/FAIRsharing.w...	https://glyconavi.org/	[Japan]	[Chemistry, Glycomics, Life Science, Organic C...
2	ADHDgene	ADHDgene	https://fairsharing.org/10.25504/FAIRsharing.m...	http://adhd.psych.ac.cn/	[China]	[Biomedical Science, Genetics]
3	Allele frequency resource for research and tea...	ALFRED	https://fairsharing.org/10.25504/FAIRsharing.y...	http://alfred.med.yale.edu	[United States]	[Life Science]
4	Animal Transcription Factor Database	AnimalTFDB	https://fairsharing.org/10.25504/FAIRsharing.e...	http://bioinfo.life.hust.edu.cn/AnimalTFDB/	[China]	[Life Science]

In [3]:

# Summary statistics (count / unique / top / freq) for the FAIRsharing columns.
fairsharing_df.describe()

Out[3]:

	full_name	short_name	fs_url	url	countries	subjects
count	1752	1752	1752	1752	1749	1690
unique	1752	1741	1752	1752	178	834
top	The Cardiovascular Research Grid	CGD	https://fairsharing.org/bsg-d001750	http://www.bmrb.wisc.edu/	[United States]	[Life Science]
freq	1	3	1	1	588	367

In [4]:

# One row per (record, subject) pair, then count the non-null urls per subject,
# most frequent subject first.
fairsharing_subjects = (
    fairsharing_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

data = [go.Bar(x=fairsharing_subjects.index, y=fairsharing_subjects['url'])]
layout = go.Layout(
    title='Fairsharing subject coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [5]:

# One row per (record, country) pair, then count the non-null urls per country,
# most frequent country first.
fairsharing_countries = (
    fairsharing_df
    .explode('countries')
    .groupby('countries')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

data = [go.Bar(x=fairsharing_countries.index, y=fairsharing_countries['url'])]
layout = go.Layout(
    title='Fairsharing country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

re3data¶

In [6]:

# Load the combined re3data/OpenDOAR export and keep only the rows whose id
# contains 're3data'.
# The explicit .copy() makes the filtered frame an independent object, so the
# column assignments made in later cells do not trigger SettingWithCopyWarning.
re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
re3data_df = re3data_df[re3data_df.id.str.contains('re3data')].copy()
re3data_df.head()

Out[6]:

	id	url	official_name	english_name	description	subjects
4	10\|re3data_____::3f2e20af26ead0432f5470d8b739638d	http://planttfdb.cbi.pku.edu.cn/	Plant Transcription Factor Database	PlantTFDB	NaN	['Life Sciences', 'Basic Biological and Medica...
7	10\|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc	https://spdf.gsfc.nasa.gov/	Space Physics Data Facility	NASA's Space Physics Data Facility SPDF	NaN	['Natural Sciences', 'Astrophysics and Astrono...
13	10\|re3data_____::59521daca59ac29b811343cc4cd370cf	http://card.westgis.ac.cn/	Cold and Arid Regions Science Data Center at L...	CARD WDC for Glaciology and Geocryology World ...	NaN	['Natural Sciences', 'Geosciences (including G...
14	10\|re3data_____::ec1ba1674c852466c266acb64c618d15	https://www.psycharchives.org/	Psycharchives	NaN	NaN	['Humanities and Social Sciences', 'Psychology...
19	10\|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76	https://www.ihfc-iugg.org/products/global-heat...	The Global Heat Flow Database of the Internati...	International Heat-flow Database	NaN	['Natural Sciences', 'Geology and Palaeontolog...

In [7]:

# Rows located at exactly (0.0, 0.0) get their coordinates replaced with NaN
# (presumably 0/0 marks a missing location -- confirm with the data source).
re3data_df.loc[
    re3data_df['latitude'].eq(0.0) & re3data_df['longitude'].eq(0.0),
    ['latitude', 'longitude'],
] = np.nan

In [8]:

# Inspect the raw subjects column; at this point each value is still a string
# that looks like a Python list literal.
re3data_df.subjects

Out[8]:

4       ['Life Sciences', 'Basic Biological and Medica...
7       ['Natural Sciences', 'Astrophysics and Astrono...
13      ['Natural Sciences', 'Geosciences (including G...
14      ['Humanities and Social Sciences', 'Psychology...
19      ['Natural Sciences', 'Geology and Palaeontolog...
                              ...                        
8693    ['Life Sciences', 'Basic Biological and Medica...
8695    ['Natural Sciences', 'Atmospheric Science and ...
8697    ['Natural Sciences', 'Atmospheric Science and ...
8699    ['Natural Sciences', 'Atmospheric Science and ...
8705    ['Life Sciences', 'Plant Sciences', 'Plant Gen...
Name: subjects, Length: 2693, dtype: object

In [9]:

# Parse the string-encoded subject lists (e.g. "['A', 'B']") into real Python lists.
# Only str values are parsed: ast.literal_eval raises on anything else, which used
# to break this cell when it was re-run (values already lists) or hit a missing value.
re3data_df['subjects'] = re3data_df['subjects'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [10]:

def merge_lists(lists):
    """Concatenate several lists into one new flat list.

    Parameters
    ----------
    lists : iterable of iterables
        The sequences to concatenate, in order. A pandas Series of lists works
        too, since iterating it yields the individual lists.

    Returns
    -------
    list
        A new list holding the items of every input sequence, in order.
        The inputs themselves are left unmodified.
    """
    res = []
    for chunk in lists:
        # extend() appends in place; the previous `res = res + chunk` rebuilt the
        # whole accumulator on every iteration (quadratic in the total length).
        res.extend(chunk)
    return res

# Flatten compound subject labels into their parts, one list per repository:
#   1. explode -> one row per original subject string (index labels repeat),
#   2. split each string on ',' or ' and ' into columns,
#   3. collapse every row back into a list of its non-null pieces,
#   4. regroup by the original index label and concatenate the pieces.
# The result is a Series named 0 whose index is named 'index'.
re3data_cleaned_subjects = (
    re3data_df
    .explode('subjects')['subjects']
    .str.split(',| and ', expand=True)
    .apply(lambda pieces: pieces.dropna().tolist(), axis=1)
    .reset_index()
    .groupby('index')[0]
    .apply(merge_lists)
)

In [11]:

# Inspect the flattened per-repository subject lists.
re3data_cleaned_subjects

Out[11]:

index
4       [Life Sciences, Basic Biological, Medical Rese...
7       [Natural Sciences, Astrophysics, Astronomy, Ph...
13      [Natural Sciences, Geosciences (including Geog...
14      [Humanities, Social Sciences, Psychology, Soci...
19      [Natural Sciences, Geology, Palaeontology, Geo...
                              ...                        
8693    [Life Sciences, Basic Biological, Medical Rese...
8695    [Natural Sciences, Atmospheric Science, Oceano...
8697    [Natural Sciences, Atmospheric Science, Oceano...
8699    [Natural Sciences, Atmospheric Science, Oceano...
8705    [Life Sciences, Plant Sciences, Plant Genetics...
Name: 0, Length: 2693, dtype: object

In [12]:

# Attach the cleaned subject lists as a new column (labelled 0, the Series' name).
# join() aligns on the row index, which the cleaned Series inherited from re3data_df.
re3data_df = re3data_df.join(re3data_cleaned_subjects)

In [13]:

# Swap the raw subjects column for the cleaned one: the joined Series arrived under
# the integer label 0, so drop the old column and give the new one its name.
re3data_df = re3data_df.drop(columns=['subjects']).rename(columns={0: 'subjects'})

In [14]:

# Summarise every column, numeric and non-numeric alike.
re3data_df.describe(include='all')

Out[14]:

	id	url	official_name	english_name	description	latitude	longitude	subjects
count	2693	2673	2693	2034	38	5.000000	5.000000	2693
unique	2693	2661	2668	2010	38	NaN	NaN	1427
top	10\|re3data_____::e59f89142e8d47d32523c53a9137f07b	http://iubio.bio.indiana.edu/	IUBio-Archive	Research Data Repository	IUBio Archive is an archive of biology data an...	NaN	NaN	[Humanities, Social Sciences, Life Sciences, N...
freq	1	2	2	2	1	NaN	NaN	209
mean	NaN	NaN	NaN	NaN	NaN	61.668113	36.623678	NaN
std	NaN	NaN	NaN	NaN	NaN	96.984457	48.547521	NaN
min	NaN	NaN	NaN	NaN	NaN	12.123000	12.123000	NaN
25%	NaN	NaN	NaN	NaN	NaN	12.123000	12.123400	NaN
50%	NaN	NaN	NaN	NaN	NaN	12.123400	12.123400	NaN
75%	NaN	NaN	NaN	NaN	NaN	37.971163	23.748590	NaN
max	NaN	NaN	NaN	NaN	NaN	234.000000	123.000000	NaN

In [15]:

# One row per (repository, subject) pair, then count the non-null urls per subject,
# most frequent subject first.
re3data_subjects = (
    re3data_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

data = [go.Bar(x=re3data_subjects.index, y=re3data_subjects['url'])]
layout = go.Layout(
    title='re3data subject coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

OpenDOAR¶

In [82]:

# Load the combined re3data/OpenDOAR export and keep only the rows whose id
# contains 'opendoar'.
# The explicit .copy() makes the filtered frame an independent object, so the
# column assignments made in later cells do not trigger SettingWithCopyWarning.
# Note that the filtered frame keeps the row labels of the combined file, so its
# index is not contiguous.
opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')].copy()
opendoar_df.head()

Out[82]:

	id	url	official_name	english_name	description	latitude	longitude	subjects
0	10\|opendoar____::e833e042f509c996b1b25324d56659fb	http://www.bilbao.net/bld	BLD - Bilboko Liburutegi Digitala	BLD - Bilboko Liburutegi Digitala	BLD is a repository of digital documents, desi...	43.256699	-2.924100	[]
1	10\|opendoar____::f621585df244e9596dc70a39b579efb1	https://researchdirect.westernsydney.edu.au/	Western Sydney ResearchDirect	Western Sydney ResearchDirect	NaN	0.000000	0.000000	[]
2	10\|opendoar____::437d7d1d97917cd627a34a6a0fb41136	http://redress.lancs.ac.uk/Learning_Space/	Learning Space Catalogue	NaN	This repository is a Social Science e-Science ...	54.010760	-2.784990	['Social Sciences General', 'Science General',...
3	10\|opendoar____::d840cc5d906c3e9c84374c8919d2074e	http://digitallibrary.usc.edu/search/controlle...	USC Digital Library	USC Digital Library	This is an institutional repository providing ...	34.052200	-118.242996	[]
5	10\|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010	http://www.ufgd.edu.br:8080/jspui/	Repositório de Divulgação das Produções Cientí...	Repositório de Divulgação das Produções Cientí...	This site provides access to the research outp...	-22.221800	-54.806400	[]

In [84]:

# Inspect the raw subjects column; each value is still a string that looks like
# a Python list literal (many of them the empty list '[]').
opendoar_df.subjects

Out[84]:

0                                                      []
1                                                      []
2       ['Social Sciences General', 'Science General',...
3                                                      []
5                                                      []
                              ...                        
8701                                ['Multidisciplinary']
8702                                                   []
8703                           ['Business and Economics']
8704    ['Earth and Planetary Sciences', 'Ecology and ...
8706                                                   []
Name: subjects, Length: 6014, dtype: object

In [85]:

# Parse the string-encoded subject lists (e.g. "['A', 'B']") into real Python lists.
# Only str values are parsed: ast.literal_eval raises on anything else, which used
# to break this cell when it was re-run (values already lists) or hit a missing value.
opendoar_df['subjects'] = opendoar_df['subjects'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [86]:

# Flatten compound subject labels into their parts, one list per repository:
#   1. explode -> one row per original subject string (index labels repeat),
#   2. split each string on ',' or ' and ' into columns,
#   3. collapse every row back into a list of its non-null pieces,
#   4. regroup by the original index label and concatenate the pieces.
# The result is a Series named 0 whose index is named 'index'.
opendoar_cleaned_subjects = (
    opendoar_df
    .explode('subjects')['subjects']
    .str.split(',| and ', expand=True)
    .apply(lambda pieces: pieces.dropna().tolist(), axis=1)
    .reset_index()
    .groupby('index')[0]
    .apply(merge_lists)
)

In [87]:

# Inspect the flattened per-repository subject lists.
opendoar_cleaned_subjects

Out[87]:

index
0                                                      []
1                                                      []
2       [Social Sciences General, Science General, Com...
3                                                      []
5                                                      []
                              ...                        
8701                                  [Multidisciplinary]
8702                                                   []
8703                                [Business, Economics]
8704    [Earth, Planetary Sciences, Ecology, Environme...
8706                                                   []
Name: 0, Length: 6014, dtype: object

In [88]:

# Attach the cleaned subject lists as a new column (labelled 0, the Series' name).
# join() aligns on the row index, which the cleaned Series inherited from opendoar_df.
opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)

In [89]:

# Swap the raw subjects column for the cleaned one: the joined Series arrived under
# the integer label 0, so drop the old column and give the new one its name.
opendoar_df = opendoar_df.drop(columns=['subjects']).rename(columns={0: 'subjects'})

In [90]:

# Summarise every column, numeric and non-numeric alike.
opendoar_df.describe(include='all')

Out[90]:

	id	url	official_name	english_name	description	latitude	longitude	subjects
count	6014	6013	6014	5500	5776	6014.000000	6014.000000	6014
unique	6014	5953	5946	5413	4920	NaN	NaN	201
top	10\|opendoar____::a2557a7b2e94197ff767970b67041697	http://harp.lib.hiroshima-u.ac.jp/	Hiroshima Associated Repository Portal	AURA	This site provides access to the research outp...	NaN	NaN	[]
freq	1	3	3	4	98	NaN	NaN	5273
mean	NaN	NaN	NaN	NaN	NaN	38.649393	7.810948	NaN
std	NaN	NaN	NaN	NaN	NaN	788.406173	71.689788	NaN
min	NaN	NaN	NaN	NaN	NaN	-79.029999	-683.103027	NaN
25%	NaN	NaN	NaN	NaN	NaN	4.644632	-49.273300	NaN
50%	NaN	NaN	NaN	NaN	NaN	37.930449	4.788870	NaN
75%	NaN	NaN	NaN	NaN	NaN	47.294400	30.685501	NaN
max	NaN	NaN	NaN	NaN	NaN	61138.800781	178.438995	NaN

In [91]:

# One row per (repository, subject) pair, then count the non-null urls per subject,
# most frequent subject first.
opendoar_subjects = (
    opendoar_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

data = [go.Bar(x=opendoar_subjects.index, y=opendoar_subjects['url'])]
layout = go.Layout(
    title='OpenDOAR subject coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

In [92]:

# Reverse-geocode every (latitude, longitude) pair and tabulate the results.
# The lat/lon columns of the result are cast to float.
reverse_geocoding = pd.DataFrame(
    rg.search(list(zip(opendoar_df['latitude'], opendoar_df['longitude'])))
).astype({'lat': 'float', 'lon': 'float'})
reverse_geocoding

Out[92]:

	lat	lon	name	admin1	admin2	cc
0	43.26271	-2.92528	Bilbao	Basque Country	Bizkaia	ES
1	4.88447	-1.75536	Takoradi	Western		GH
2	53.98333	-2.78333	Galgate	England	Lancashire	GB
3	34.05223	-118.24368	Los Angeles	California	Los Angeles County	US
4	-22.22111	-54.80556	Dourados	Mato Grosso do Sul	Dourados	BR
...	...	...	...	...	...	...
6009	40.85631	14.24641	Napoli	Campania	Provincia di Napoli	IT
6010	38.19394	15.55256	Messina	Sicily	Messina	IT
6011	54.32133	10.13489	Kiel	Schleswig-Holstein		DE
6012	43.40785	-73.25955	Granville	New York	Washington County	US
6013	33.96095	-83.37794	Athens	Georgia	Clarke County	US

6014 rows × 6 columns

In [94]:

# Attach the two-letter country code (`cc`) found by the reverse geocoder.
# reverse_geocoding carries a fresh 0..n-1 index (one row per input coordinate, in
# input order), whereas opendoar_df keeps the non-contiguous row labels of the
# combined CSV. An index-based join therefore paired repositories with the wrong
# country (or NaN for labels >= n), so the codes are assigned by position instead.
assert len(reverse_geocoding) == len(opendoar_df), 'expected one geocoding result per repository'
opendoar_df = opendoar_df.assign(cc=reverse_geocoding['cc'].to_numpy())

In [96]:

# Rows located at exactly (0.0, 0.0) get their coordinates and the country code
# derived from them replaced with NaN (presumably 0/0 marks a missing location --
# confirm with the data source).
opendoar_df.loc[
    opendoar_df['latitude'].eq(0.0) & opendoar_df['longitude'].eq(0.0),
    ['latitude', 'longitude', 'cc'],
] = np.nan

In [103]:

# Count the repositories (non-null ids) per country code, most frequent first.
opendoar_countries = (
    opendoar_df
    .groupby('cc')[['id']]
    .count()
    .sort_values('id', ascending=False)
)

data = [go.Bar(x=opendoar_countries.index, y=opendoar_countries['id'])]
layout = go.Layout(
    title='OpenDOAR country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)

3.7 MiB Raw Blame History Unescape Escape

FAIRsharing¶

re3data¶

OpenDOAR¶

3.7 MiB

Raw Blame History