registries_analysis/01.1-explorative.ipynb at c6d01322c3ae6019250bce79bb1a46fff7ac874c

In [1]:

import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)

In [2]:

def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(country)
        except:
            return np.nan
        
def countrycode_iso2_to_countrycode_iso3(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(pycountry_convert.country_alpha2_to_country_name(country))
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(pycountry_convert.country_alpha3_to_country_alpha2(country_code))
        except:
            return np.nan

Loading datasets¶

re3data

In [3]:

re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t', 
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additional_name': ast.literal_eval,
                                    'repository_id': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'content_type': ast.literal_eval,
                                    'provider_type': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    },
                        usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])
re3data_df.head()

Out[3]:

	re3data_id	repository_name	type	subject	provider_type	keyword	institution
0	r3d100000001	Odum Institute Archive Dataverse	[disciplinary]	[1 Humanities and Social Sciences, 111 Social ...	[dataProvider]	[FAIR, Middle East, crime, demography, economy...	[[Odum Institute for Research in Social Scienc...
1	r3d100000002	Access to Archival Databases	[disciplinary]	[1 Humanities and Social Sciences, 102 History...	[dataProvider]	[US History]	[[The U.S. National Archives and Records Admin...
2	r3d100000004	Datenbank Gesprochenes Deutsch	[disciplinary]	[1 Humanities and Social Sciences, 104 Linguis...	[dataProvider, serviceProvider]	[Australian German, FOLK, German dialects, Pfe...	[[Institut für Deutsche Sprache, Archiv für Ge...
3	r3d100000005	UNC Dataverse	[institutional]	[1 Humanities and Social Sciences, 111 Social ...	[dataProvider, serviceProvider]	[FAIR, census, demographic survey, demography,...	[[Odum Institute for Research in Social Scienc...
4	r3d100000006	Archaeology Data Service	[disciplinary]	[1 Humanities and Social Sciences, 101 Ancient...	[dataProvider, serviceProvider]	[FAIR, archaeology, cultural heritage, prehist...	[[Arts and Humanities Research Council, [AHRC]...

HERE I AM FILTERING SERVICE PROVIDERS OUT!!

In [4]:

re3data_df = re3data_df.explode('provider_type')
re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']

In [5]:

re3data_df.describe(include='all')

Out[5]:

	re3data_id	repository_name	type	subject	provider_type	keyword	institution
count	2467	2467	2467	2467	2459	2467	2467
unique	2466	2463	9	1282	1	2248	2447
top	r3d100011987	Landmap	[disciplinary]	[1 Humanities and Social Sciences, 2 Life Scie...	dataProvider	[multidisciplinary]	[[National Center for Biotechnology Informatio...
freq	2	2	1573	200	2459	181	6

openDOAR

In [6]:

opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'subject': ast.literal_eval,
                                    'additional_name': ast.literal_eval,
                                    'opendoar_id': ast.literal_eval,
                                    'content_type': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    },
                        usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])
opendoar_df.head()

Out[6]:

	opendoar_id	repository_name	type	subject	institution
0	101	utrecht university repository	institutional	[multidisciplinary]	[[university of utrecht, [universiteit utrecht...
1	115	dspace at indian institute of management kozhi...	institutional	[ecology and environment, social sciences gene...	[[indian institute of management kozhikode, [i...
2	41	caltech engineering and science online	institutional	[biology and biochemistry, chemistry and chemi...	[[california institute of technology, [caltech...
3	119	dcu online research access service	institutional	[multidisciplinary]	[[dublin city university, [dcu], ie, [], , htt...
4	129	earth-prints repository	disciplinary	[earth and planetary sciences]	[[istituto nazionale di geofisica e vulcanolog...

In [7]:

opendoar_df.describe(include='all')

Out[7]:

	opendoar_id	repository_name	type	subject	institution
count	5707.000000	5707	5707	5707	5707
unique	NaN	5670	4	820	5098
top	NaN	arch	institutional	[multidisciplinary]	[[rijksuniversiteit groningen, [rug], nl, [], ...
freq	NaN	3	5067	3212	26
mean	4008.118801	NaN	NaN	NaN	NaN
std	2869.948770	NaN	NaN	NaN	NaN
min	2.000000	NaN	NaN	NaN	NaN
25%	1823.000000	NaN	NaN	NaN	NaN
50%	3361.000000	NaN	NaN	NaN	NaN
75%	5095.000000	NaN	NaN	NaN	NaN
max	10175.000000	NaN	NaN	NaN	NaN

ROAR

In [45]:

roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',
                     usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])
roar_df.head()

Out[45]:

	eprintid	home_page	title	location_country	subjects
0	921	http://alcme.oclc.org/ndltd/index.html	Networked Digital Library of Theses and Disser...	us	NaN
1	1489	http://prensahistorica.mcu.es/prensahistorica/...	Virtual Library of Historical Press	es	NaN
2	606	http://hal.archives-ouvertes.fr/	HAL: Hyper Article en Ligne	fr	NaN
3	606	NaN	NaN	NaN	NaN
4	606	NaN	NaN	NaN	NaN

In [43]:

# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)

In [47]:

roar_df[roar_df.eprintid == 2303]

Out[47]:

	eprintid	home_page	title	location_country	subjects
106	2303	NaN	Faculty Scholarship at The Claremont Colleges	us	AS
107	2303	NaN	NaN	NaN	BF
108	2303	NaN	NaN	NaN	BL
109	2303	NaN	NaN	NaN	CC
110	2303	NaN	NaN	NaN	GN
111	2303	NaN	NaN	NaN	H1
112	2303	NaN	NaN	NaN	HB
113	2303	NaN	NaN	NaN	JA
114	2303	NaN	NaN	NaN	LB
115	2303	NaN	NaN	NaN	NX
116	2303	NaN	NaN	NaN	PQ
117	2303	NaN	NaN	NaN	QA

In [44]:

roar_df.describe(include='all')

Out[44]:

	eprintid	home_page	title	location_country	subjects
count	5314.000000	5263	5268	5024	1225
unique	NaN	5156	5027	134	123
top	NaN	http://ir.lib.isu.edu.tw/	Repositorio Institucional	us	H1
freq	NaN	3	7	877	147
mean	6389.464434	NaN	NaN	NaN	NaN
std	5159.573937	NaN	NaN	NaN	NaN
min	1.000000	NaN	NaN	NaN	NaN
25%	1490.250000	NaN	NaN	NaN	NaN
50%	4990.500000	NaN	NaN	NaN	NaN
75%	10452.750000	NaN	NaN	NaN	NaN
max	17302.000000	NaN	NaN	NaN	NaN

FAIRsharing

In [11]:

fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', 
                             delimiter='|', header=0,
                             names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])
fairsharing_df.head()

Out[11]:

	full_name	short_name	fs_url	url	countries	subjects
0	GenBank	GenBank	https://fairsharing.org/10.25504/FAIRsharing.9...	https://www.ncbi.nlm.nih.gov/genbank/	European Union,Japan,United States	Bioinformatics,Data Management,Data Submission...
1	GlycoNAVI	GlycoNAVI	https://fairsharing.org/10.25504/FAIRsharing.w...	https://glyconavi.org/	Japan	Chemistry,Glycomics,Life Science,Organic Chemi...
2	ADHDgene	ADHDgene	https://fairsharing.org/10.25504/FAIRsharing.m...	http://adhd.psych.ac.cn/	China	Biomedical Science,Genetics
3	Allele frequency resource for research and tea...	ALFRED	https://fairsharing.org/10.25504/FAIRsharing.y...	http://alfred.med.yale.edu	United States	Life Science
4	Animal Transcription Factor Database	AnimalTFDB	https://fairsharing.org/10.25504/FAIRsharing.e...	http://bioinfo.life.hust.edu.cn/AnimalTFDB/	China	Life Science

In [12]:

fairsharing_df.describe(include='all')

Out[12]:

	full_name	short_name	fs_url	url	countries	subjects
count	1752	1752	1752	1752	1749	1690
unique	1752	1741	1752	1752	178	834
top	CiteAb	CGD	https://fairsharing.org/10.25504/FAIRsharing.1...	http://www.plexdb.org/	United States	Life Science
freq	1	3	1	1	588	367

Subjects analysis¶

re3data

In [13]:

re3data_subjects = re3data_df.explode('subject')

In [37]:

data = re3data_subjects.groupby('subject')[['re3data_id']].count().sort_values('re3data_id', ascending=False)
plot = [
    go.Bar(
        x=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)].index,
        y=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)]['re3data_id'],
        name='re3data tier %s-digits' % tier
    ) for tier in [1,2,3,5]
] 

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

OpenDOAR

In [15]:

opendoar_subjects = opendoar_df.explode('subject')

In [16]:

data = opendoar_subjects.groupby('subject')[['opendoar_id']].count().sort_values('opendoar_id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['opendoar_id'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

ROAR

In [17]:

roar_df.subjects.unique()

Out[17]:

array([nan, 'DP', 'Q1', 'AS', 'AC', 'AM', 'H1', 'L1', 'AI', 'GC', 'HD28',
       'F1201', 'R1', 'QM', 'BF', 'Z665', 'B1', 'DS', 'NX', 'K1', 'G1',
       'HA', 'D051', 'T1', 'QA', 'E151', 'D901', 'BP', 'QK', 'D1', 'HB',
       'GE', 'QE', 'QA76', 'QC', 'LB', 'RT', 'N1', 'BQ', 'QD', 'RA0421',
       'S1', 'LG', 'SH', 'ZA', 'LE', 'HV', 'DAW', 'C1', 'JA', 'AZ', 'QR',
       'F001', 'QH301', 'QB', 'QL', 'TJ', 'RZ', 'RS', 'GV', 'LB2300',
       'RF', 'RG', 'LA', 'Z004', 'HM', 'GA', 'M1', 'PI', 'BJ', 'BH', 'BC',
       'JL', 'J1', 'RK', 'TP', 'PE', 'TL', 'P1', 'HD61', 'BL', 'RM', 'RB',
       'QA75', 'SD', 'E11', 'HF5601', 'TA', 'D839', 'LF', 'CB', 'PG',
       'QH', 'HC', 'Z719', 'HE', 'TK', 'HT', 'CC', 'PL', 'HF', 'ZA4050',
       'NB', 'CT', 'HJ', 'HG', 'PD', 'JN', 'LD', 'DE', 'DF', 'DK', 'GN',
       'JX', 'QP', 'LC', 'TC'], dtype=object)

In [39]:

roar_df[roar_df.subjects == 'HD28']

Out[39]:

	eprintid	home_page	title	location_country	subjects	continent
352	6412	https://dspace.cvut.cz/	Digital Library of the Czech Technical Univers...	CZE	HD28	EU
8164	14592	http://repository.stie-aub.ac.id	Welcome to Repository STIE AUB Surakarta - Rep...	IDN	HD28	AS
9402	13418	http://repositorio.gerens.edu.pe/	Repositorio de la Escuela de Postgrado Gerens	PER	HD28	SA
10929	11743	http://publikaciotar.repozitorium.uni-bge.hu/	BORY: Budapest Business School Repository	HUN	HD28	EU
11428	11134	https://eduq.info	ÉDUQ: Archive ouverte du réseau collégial du Q...	CAN	HD28	NA

FAIRsharing

In [19]:

fairsharing_subjects = fairsharing_df.explode('subjects')

In [20]:

data = fairsharing_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['url'],
        name='FAIRsharing'
    )
]

layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

Geographic analysis¶

re3data

In [21]:

re3data_institutions = re3data_df.explode('institution')[['re3data_id', 'institution']]
re3data_institutions = re3data_institutions[~re3data_institutions.institution.isna()].reset_index(drop=True)
re3data_institutions = re3data_institutions.join(pd.DataFrame(re3data_institutions.institution.to_list(), columns=['org_name', 'org_other_names', 
                                                                                              'org_country', 'org_3', 'org_noprofit', 
                                                                                              'org_url', 'org_ids', 'org_date1', 
                                                                                              'org_date2', 'org_contact']))
re3data_institutions.head()

Out[21]:

	re3data_id	institution	org_name	org_other_names	org_country	org_3	org_noprofit	org_url	org_ids	org_date1	org_contact
0	r3d100000001	[Odum Institute for Research in Social Science...	Odum Institute for Research in Social Science	[]	USA	[general]	non-profit	https://odum.unc.edu/archive/	[]		[]
1	r3d100000002	[The U.S. National Archives and Records Admini...	The U.S. National Archives and Records Adminis...	[NARA, National Archives]	USA	[general]	non-profit	http://www.archives.gov/	[]		[http://www.archives.gov/contact/]
2	r3d100000002	[The USA.gov, [], USA, [general], non-profit, ...	The USA.gov	[]	USA	[general]	non-profit	http://www.usa.gov/	[]		[http://www.usa.gov/Contact.shtml]
3	r3d100000004	[Institut für Deutsche Sprache, Archiv für Ges...	Institut für Deutsche Sprache, Archiv für Gesp...	[AGD]	DEU	[funding, general]	non-profit	http://agd.ids-mannheim.de/index.shtml	[]	2004	[agd@ids-mannheim.de]
4	r3d100000005	[Odum Institute for Research in Social Science...	Odum Institute for Research in Social Science	[]	USA	[technical]	non-profit	https://odum.unc.edu/	[]		[https://odum.unc.edu/contact/contact-form/, o...

In [22]:

re3data_institutions['org_continent'] = re3data_institutions.org_country.map(countrycode_to_continent)

In [23]:

re3data_institutions[re3data_institutions.org_continent.isna()].org_country.unique()

Out[23]:

array(['AAA', 'EEC'], dtype=object)

AAA is used for international collaborations; we skip this. EEC is used for the EU commission; we fix the continent manually.

In [24]:

re3data_institutions.loc[re3data_institutions.org_country == 'EEC', 'org_continent'] = 'EU'

OpenDOAR

In [25]:

opendoar_institutions = opendoar_df.explode('institution')[['opendoar_id', 'institution']]
opendoar_institutions = opendoar_institutions[~opendoar_institutions.institution.isna()].reset_index(drop=True)
opendoar_institutions = opendoar_institutions.join(pd.DataFrame(opendoar_institutions.institution.to_list(), columns=['org_name', 'org_other_names', 
                                                                                              'org_country', 'org_3', 'org_noprofit', 
                                                                                              'org_url', 'org_ids', 'org_date1', 
                                                                                              'org_date2', 'org_contact']))
opendoar_institutions['org_country'] = opendoar_institutions.org_country.map(str.upper, na_action='ignore')
opendoar_institutions['org_country'] = opendoar_institutions.org_country.map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
opendoar_institutions.head()

Out[25]:

	opendoar_id	institution	org_name	org_other_names	org_country	org_3	org_url	org_ids	org_contact
0	101	[university of utrecht, [universiteit utrecht]...	university of utrecht	[universiteit utrecht]	NLD	[]	https://www.uu.nl	[https://ror.org/04pp8hn57]	[]
1	115	[indian institute of management kozhikode, [ii...	indian institute of management kozhikode	[iimk]	IND	[]	http://www.iimk.ac.in/	[https://ror.org/03m1xdc36]	[]
2	41	[california institute of technology, [caltech]...	california institute of technology	[caltech]	USA	[]	http://www.caltech.edu/	[https://ror.org/05dxps055]	[]
3	119	[dublin city university, [dcu], ie, [], , http...	dublin city university	[dcu]	IRL	[]	http://www.dcu.ie/	[https://ror.org/04a1a1e81]	[]
4	129	[istituto nazionale di geofisica e vulcanologi...	istituto nazionale di geofisica e vulcanologia	[ingv]	ITA	[]	http://www.ingv.it	[https://ror.org/00qps9a02]	[]

In [26]:

opendoar_institutions['org_continent'] = opendoar_institutions.org_country.map(countrycode_to_continent)

In [27]:

opendoar_institutions[opendoar_institutions.org_continent.isna()].org_country.unique()

Out[27]:

array([nan, 'UMI'], dtype=object)

In [28]:

opendoar_institutions.loc[opendoar_institutions.org_country == 'UMI', 'org_continent'] = 'NA'
opendoar_institutions[opendoar_institutions.org_country == 'UMI']

Out[28]:

	opendoar_id	institution	org_name	org_other_names	org_country	org_3	org_noprofit	org_url	org_ids	org_date1	org_date2	org_contact	org_continent
4349	5379	[kettering university, [], um, [], , https://w...	kettering university	[]	UMI	[]		https://www.kettering.edu	[https://ror.org/03rcspa57]			[]	NA

ROAR

In [29]:

roar_df['location_country'] = roar_df.location_country.map(str.upper, na_action='ignore')
roar_df['location_country'] = roar_df.location_country.map(countrycode_iso2_to_countrycode_iso3)
roar_df['continent'] = roar_df.location_country.map(countrycode_to_continent)

FAIRsharing

In [30]:

fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')
fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')

fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)

In [31]:

fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()

Out[31]:

array(['European Union', 'Republic of Ireland', 'Worldwide', nan],
      dtype=object)

In [32]:

fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()

Out[32]:

array(['European Union', 'Republic of Ireland', 'Worldwide', 'Antarctica',
       nan], dtype=object)

Fix manually some rows

In [33]:

fairsharing_countries.loc[fairsharing_countries.countries == 'Republic of Ireland', ['countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries.countries == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

Make Antactica disappear (only one repo)

In [34]:

fairsharing_countries.loc[fairsharing_countries.countries == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']

Out[34]:

	full_name	short_name	fs_url	url	countries	subjects	countrycode	continent
915	Antabif IPT - AntOBIS IPT - GBIF Belgium	Antabif IPT - AntOBIS IPT - GBIF Belgium	https://fairsharing.org/10.25504/FAIRsharing.e...	http://ipt.biodiversity.aq/	Antarctica	[Biodiversity, Life Science]	AQ	NaN

Country coverage¶

In [35]:

data1 = re3data_institutions.groupby('org_country')[['re3data_id']].count().sort_values('re3data_id', ascending=False)
data2 = opendoar_institutions.groupby('org_country')[['opendoar_id']].count().sort_values('opendoar_id', ascending=False)
data3 = roar_df.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['re3data_id'],
        name='re3data'
    ),
    go.Bar(
        x=data2.index,
        y=data2['opendoar_id'],
        name='openDOAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['eprintid'],
        name='ROAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data4.index,
        y=data4['url'],
        name='FAIRsharing',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage¶

In [36]:

data1 = re3data_institutions.groupby('org_continent')[['re3data_id']].count()
data2 = opendoar_institutions.groupby('org_continent')[['opendoar_id']].count()
data3 = roar_df.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['url']].count()

plot = [
    go.Scatterpolar(
        r=data1.re3data_id,
        theta=data1.index,
        fill='toself',
        name='re3data'),
    go.Scatterpolar(
        r=data2.opendoar_id,
        theta=data2.index,
        fill='toself',
        name='OpenDOAR'),
    go.Scatterpolar(
        r=data3.eprintid,
        theta=data3.index,
        fill='toself',
        name='ROAR'),
    go.Scatterpolar(
        r=data4.url,
        theta=data4.index,
        fill='toself',
        name='FAIRsharing')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()

401 KiB Raw Blame History

Loading datasets¶

Subjects analysis¶

Geographic analysis¶

Country coverage¶

Continental coverage¶

401 KiB

Raw Blame History