registries_analysis/notebooks/02-subjects&geographic.ipynb

401 KiB

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)
In [2]:
def country_to_countrycode(country):
    """Convert a country name to its ISO 3166-1 alpha-3 code.

    Parameters
    ----------
    country : str or missing value
        Country name, looked up with
        ``pycountry_convert.country_name_to_country_alpha3``.

    Returns
    -------
    str or float
        The alpha-3 code, or ``np.nan`` when ``country`` is missing or the
        lookup raises an error.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha3(country)
    except Exception:
        # Best-effort lookup: unknown names become NaN. Catch Exception rather
        # than using a bare except so KeyboardInterrupt/SystemExit still propagate.
        return np.nan
        
def countrycode_iso2_to_countrycode_iso3(country):
    """Convert an ISO 3166-1 alpha-2 country code to its alpha-3 code.

    The conversion goes through the country name (alpha-2 -> name -> alpha-3)
    using ``pycountry_convert``.

    Parameters
    ----------
    country : str or missing value
        Alpha-2 country code.

    Returns
    -------
    str or float
        The alpha-3 code, or ``np.nan`` when ``country`` is missing or either
        lookup raises an error.
    """
    if pd.isna(country):
        return np.nan
    try:
        country_name = pycountry_convert.country_alpha2_to_country_name(country)
        return pycountry_convert.country_name_to_country_alpha3(country_name)
    except Exception:
        # Best-effort lookup: unknown codes become NaN. Catch Exception rather
        # than using a bare except so KeyboardInterrupt/SystemExit still propagate.
        return np.nan

def countrycode_to_continent(country_code):
    """Map an ISO 3166-1 alpha-3 country code to a continent code.

    The code is first converted to alpha-2, which is what
    ``pycountry_convert.country_alpha2_to_continent_code`` expects.

    Parameters
    ----------
    country_code : str or missing value
        Alpha-3 country code.

    Returns
    -------
    str or float
        The continent code, or ``np.nan`` when ``country_code`` is missing or
        either lookup raises an error.
    """
    if pd.isna(country_code):
        return np.nan
    try:
        alpha2 = pycountry_convert.country_alpha3_to_country_alpha2(country_code)
        return pycountry_convert.country_alpha2_to_continent_code(alpha2)
    except Exception:
        # Best-effort lookup: codes without a continent become NaN. Catch
        # Exception rather than using a bare except so KeyboardInterrupt/SystemExit
        # still propagate.
        return np.nan

Loading datasets

re3data

In [3]:
# Read the tab-separated re3data export, keeping only the columns used in this
# notebook. The listed converter columns are parsed with ast.literal_eval
# (the preview below shows them as lists).
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    usecols=['re3data_id', 'repository_name', 'subject', 'keyword',
             'type', 'provider_type', 'institution'],
    converters=dict.fromkeys(
        ['subject', 'keyword', 'additional_name', 'repository_id',
         'type', 'content_type', 'provider_type', 'institution'],
        ast.literal_eval,
    ),
)
re3data_df.head()
Out[3]:
re3data_id repository_name type subject provider_type keyword institution
0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] [1 Humanities and Social Sciences, 111 Social ... [dataProvider] [FAIR, Middle East, crime, demography, economy... [[Odum Institute for Research in Social Scienc...
1 r3d100000002 Access to Archival Databases [disciplinary] [1 Humanities and Social Sciences, 102 History... [dataProvider] [US History] [[The U.S. National Archives and Records Admin...
2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] [1 Humanities and Social Sciences, 104 Linguis... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [[Institut für Deutsche Sprache, Archiv für Ge...
3 r3d100000005 UNC Dataverse [institutional] [1 Humanities and Social Sciences, 111 Social ... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [[Odum Institute for Research in Social Scienc...
4 r3d100000006 Archaeology Data Service [disciplinary] [1 Humanities and Social Sciences, 101 Ancient... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [[Arts and Humanities Research Council, [AHRC]...

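Note: service-provider entries are filtered out here. Repositories that are only service providers are dropped; repositories that are also data providers are kept.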

In [4]:
# Give every provider type its own row, then keep only the rows whose provider
# type is not 'serviceProvider' (rows with a missing provider type are kept too).
re3data_df = (
    re3data_df
    .explode('provider_type')
    .loc[lambda frame: frame['provider_type'] != 'serviceProvider']
)
In [5]:
# Summary statistics for every column of the filtered re3data frame.
re3data_df.describe(include='all')
Out[5]:
re3data_id repository_name type subject provider_type keyword institution
count 2467 2467 2467 2467 2459 2467 2467
unique 2466 2463 9 1282 1 2248 2447
top r3d100011987 Landmap [disciplinary] [1 Humanities and Social Sciences, 2 Life Scie... dataProvider [multidisciplinary] [[National Center for Biotechnology Informatio...
freq 2 2 1573 200 2459 181 6

OpenDOAR

In [6]:
# Read the tab-separated OpenDOAR export, keeping only the columns used in this
# notebook. The listed converter columns are parsed with ast.literal_eval.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],
    converters=dict.fromkeys(
        ['subject', 'additional_name', 'opendoar_id', 'content_type', 'institution'],
        ast.literal_eval,
    ),
)
opendoar_df.head()
Out[6]:
opendoar_id repository_name type subject institution
0 101 utrecht university repository institutional [multidisciplinary] [[university of utrecht, [universiteit utrecht...
1 115 dspace at indian institute of management kozhi... institutional [ecology and environment, social sciences gene... [[indian institute of management kozhikode, [i...
2 41 caltech engineering and science online institutional [biology and biochemistry, chemistry and chemi... [[california institute of technology, [caltech...
3 119 dcu online research access service institutional [multidisciplinary] [[dublin city university, [dcu], ie, [], , htt...
4 129 earth-prints repository disciplinary [earth and planetary sciences] [[istituto nazionale di geofisica e vulcanolog...
In [7]:
# Summary statistics for every column of the OpenDOAR frame.
opendoar_df.describe(include='all')
Out[7]:
opendoar_id repository_name type subject institution
count 5707.000000 5707 5707 5707 5707
unique NaN 5670 4 820 5098
top NaN arch institutional [multidisciplinary] [[rijksuniversiteit groningen, [rug], nl, [], ...
freq NaN 3 5067 3212 26
mean 4008.118801 NaN NaN NaN NaN
std 2869.948770 NaN NaN NaN NaN
min 2.000000 NaN NaN NaN NaN
25% 1823.000000 NaN NaN NaN NaN
50% 3361.000000 NaN NaN NaN NaN
75% 5095.000000 NaN NaN NaN NaN
max 10175.000000 NaN NaN NaN NaN

ROAR

In [45]:
# Read the ROAR CSV export, keeping only the columns used in this notebook.
# The preview below shows that one eprintid can span several rows.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    usecols=[
        'eprintid',
        'home_page',
        'title',
        'location_country',
        'subjects',
    ],
)
roar_df.head()
Out[45]:
eprintid home_page title location_country subjects
0 921 http://alcme.oclc.org/ndltd/index.html Networked Digital Library of Theses and Disser... us NaN
1 1489 http://prensahistorica.mcu.es/prensahistorica/... Virtual Library of Historical Press es NaN
2 606 http://hal.archives-ouvertes.fr/ HAL: Hyper Article en Ligne fr NaN
3 606 NaN NaN NaN NaN
4 606 NaN NaN NaN NaN
In [43]:
# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)
In [47]:
# Inspect one repository that is spread over several rows (one per subject code).
roar_df.loc[roar_df['eprintid'] == 2303]
Out[47]:
eprintid home_page title location_country subjects
106 2303 NaN Faculty Scholarship at The Claremont Colleges us AS
107 2303 NaN NaN NaN BF
108 2303 NaN NaN NaN BL
109 2303 NaN NaN NaN CC
110 2303 NaN NaN NaN GN
111 2303 NaN NaN NaN H1
112 2303 NaN NaN NaN HB
113 2303 NaN NaN NaN JA
114 2303 NaN NaN NaN LB
115 2303 NaN NaN NaN NX
116 2303 NaN NaN NaN PQ
117 2303 NaN NaN NaN QA
In [44]:
# Summary statistics for every column of the ROAR frame.
roar_df.describe(include='all')
Out[44]:
eprintid home_page title location_country subjects
count 5314.000000 5263 5268 5024 1225
unique NaN 5156 5027 134 123
top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional us H1
freq NaN 3 7 877 147
mean 6389.464434 NaN NaN NaN NaN
std 5159.573937 NaN NaN NaN NaN
min 1.000000 NaN NaN NaN NaN
25% 1490.250000 NaN NaN NaN NaN
50% 4990.500000 NaN NaN NaN NaN
75% 10452.750000 NaN NaN NaN NaN
max 17302.000000 NaN NaN NaN NaN

FAIRsharing

In [11]:
# Read the pipe-separated FAIRsharing summary. The file's own header row
# (header=0) is replaced by the shorter column names given here.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=[
        'full_name',
        'short_name',
        'fs_url',
        'url',
        'countries',
        'subjects',
    ],
)
fairsharing_df.head()
Out[11]:
full_name short_name fs_url url countries subjects
0 GenBank GenBank https://fairsharing.org/10.25504/FAIRsharing.9... https://www.ncbi.nlm.nih.gov/genbank/ European Union,Japan,United States Bioinformatics,Data Management,Data Submission...
1 GlycoNAVI GlycoNAVI https://fairsharing.org/10.25504/FAIRsharing.w... https://glyconavi.org/ Japan Chemistry,Glycomics,Life Science,Organic Chemi...
2 ADHDgene ADHDgene https://fairsharing.org/10.25504/FAIRsharing.m... http://adhd.psych.ac.cn/ China Biomedical Science,Genetics
3 Allele frequency resource for research and tea... ALFRED https://fairsharing.org/10.25504/FAIRsharing.y... http://alfred.med.yale.edu United States Life Science
4 Animal Transcription Factor Database AnimalTFDB https://fairsharing.org/10.25504/FAIRsharing.e... http://bioinfo.life.hust.edu.cn/AnimalTFDB/ China Life Science
In [12]:
# Summary statistics for every column of the FAIRsharing frame.
fairsharing_df.describe(include='all')
Out[12]:
full_name short_name fs_url url countries subjects
count 1752 1752 1752 1752 1749 1690
unique 1752 1741 1752 1752 178 834
top CiteAb CGD https://fairsharing.org/10.25504/FAIRsharing.1... http://www.plexdb.org/ United States Life Science
freq 1 3 1 1 588 367

Subjects analysis

re3data

In [13]:
# One row per (repository, subject): the list-valued `subject` column is exploded.
re3data_subjects = re3data_df.explode('subject')
In [37]:
# Count rows per subject label, most frequent first.
data = (
    re3data_subjects
    .groupby('subject')[['re3data_id']]
    .count()
    .sort_values('re3data_id', ascending=False)
)

# Subject labels start with a numeric code (e.g. '1 Humanities and Social
# Sciences'). Select, once per tier, the labels that begin with exactly that
# many digits followed by whitespace. The pattern is a raw string so that
# `\d` and `\s` are not treated as (invalid) string escapes.
tier_rows = {
    tier: data[data.index.str.contains(r'^\d{%s}\s' % tier, regex=True)]
    for tier in [1, 2, 3, 5]
}

# One bar trace per tier.
plot = [
    go.Bar(
        x=rows.index,
        y=rows['re3data_id'],
        name='re3data tier %s-digits' % tier
    ) for tier, rows in tier_rows.items()
]

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

# Keep the figure object in `fig`; `.show()` returns None, so chaining it into
# the assignment would discard the figure.
fig = go.Figure(plot, layout)
fig.show()

OpenDOAR

In [15]:
# One row per (repository, subject): the list-valued `subject` column is exploded.
opendoar_subjects = opendoar_df.explode('subject')
In [16]:
# Count rows per subject, most frequent first.
data = (
    opendoar_subjects
    .groupby('subject')[['opendoar_id']]
    .count()
    .sort_values('opendoar_id', ascending=False)
)

plot = [
    go.Bar(
        x=data.index,
        y=data['opendoar_id'],
    )
]

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

# Keep the figure object in `fig`; `.show()` returns None, so chaining it into
# the assignment would discard the figure.
fig = go.Figure(plot, layout)
fig.show()

ROAR

In [17]:
# List the distinct subject codes present in the ROAR export (NaN included).
roar_df['subjects'].unique()
Out[17]:
array([nan, 'DP', 'Q1', 'AS', 'AC', 'AM', 'H1', 'L1', 'AI', 'GC', 'HD28',
       'F1201', 'R1', 'QM', 'BF', 'Z665', 'B1', 'DS', 'NX', 'K1', 'G1',
       'HA', 'D051', 'T1', 'QA', 'E151', 'D901', 'BP', 'QK', 'D1', 'HB',
       'GE', 'QE', 'QA76', 'QC', 'LB', 'RT', 'N1', 'BQ', 'QD', 'RA0421',
       'S1', 'LG', 'SH', 'ZA', 'LE', 'HV', 'DAW', 'C1', 'JA', 'AZ', 'QR',
       'F001', 'QH301', 'QB', 'QL', 'TJ', 'RZ', 'RS', 'GV', 'LB2300',
       'RF', 'RG', 'LA', 'Z004', 'HM', 'GA', 'M1', 'PI', 'BJ', 'BH', 'BC',
       'JL', 'J1', 'RK', 'TP', 'PE', 'TL', 'P1', 'HD61', 'BL', 'RM', 'RB',
       'QA75', 'SD', 'E11', 'HF5601', 'TA', 'D839', 'LF', 'CB', 'PG',
       'QH', 'HC', 'Z719', 'HE', 'TK', 'HT', 'CC', 'PL', 'HF', 'ZA4050',
       'NB', 'CT', 'HJ', 'HG', 'PD', 'JN', 'LD', 'DE', 'DF', 'DK', 'GN',
       'JX', 'QP', 'LC', 'TC'], dtype=object)
In [39]:
# Inspect the repositories tagged with subject code 'HD28'.
# NOTE(review): the saved output shows a `continent` column and alpha-3 country
# codes, which are only created in the ROAR cell of the geographic section
# further down, so this output came from out-of-order execution.
roar_df.loc[roar_df['subjects'] == 'HD28']
Out[39]:
eprintid home_page title location_country subjects continent
352 6412 https://dspace.cvut.cz/ Digital Library of the Czech Technical Univers... CZE HD28 EU
8164 14592 http://repository.stie-aub.ac.id Welcome to Repository STIE AUB Surakarta - Rep... IDN HD28 AS
9402 13418 http://repositorio.gerens.edu.pe/ Repositorio de la Escuela de Postgrado Gerens PER HD28 SA
10929 11743 http://publikaciotar.repozitorium.uni-bge.hu/ BORY: Budapest Business School Repository HUN HD28 EU
11428 11134 https://eduq.info ÉDUQ: Archive ouverte du réseau collégial du Q... CAN HD28 NA

FAIRsharing

In [19]:
# `subjects` is loaded as one comma-separated string per record, and exploding
# a string column leaves the rows unchanged. The in-place split only happens in
# a later cell, so split here (without modifying `fairsharing_df`) to make this
# cell correct on a fresh top-to-bottom run. Values that are already lists, or
# missing, are passed through untouched.
fairsharing_subjects = (
    fairsharing_df
    .assign(
        subjects=fairsharing_df['subjects'].map(
            lambda value: value.split(',') if isinstance(value, str) else value
        )
    )
    .explode('subjects')
)
In [20]:
# Count rows per subject, most frequent first.
data = (
    fairsharing_subjects
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

plot = [
    go.Bar(
        x=data.index,
        y=data['url'],
        name='FAIRsharing'
    )
]

layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

# Keep the figure object in `fig`; `.show()` returns None, so chaining it into
# the assignment would discard the figure.
fig = go.Figure(plot, layout)
fig.show()

Geographic analysis

re3data

In [21]:
# One row per (repository, institution); rows without an institution are dropped
# and the index is renumbered so the join below lines up row by row.
re3data_institutions = (
    re3data_df
    .explode('institution')[['re3data_id', 'institution']]
    .loc[lambda frame: ~frame['institution'].isna()]
    .reset_index(drop=True)
)

# Spread each institution record (a 10-item list) over named `org_*` columns.
re3data_institutions = re3data_institutions.join(
    pd.DataFrame(
        re3data_institutions['institution'].to_list(),
        columns=[
            'org_name', 'org_other_names', 'org_country', 'org_3', 'org_noprofit',
            'org_url', 'org_ids', 'org_date1', 'org_date2', 'org_contact',
        ],
    )
)
re3data_institutions.head()
Out[21]:
re3data_id institution org_name org_other_names org_country org_3 org_noprofit org_url org_ids org_date1 org_date2 org_contact
0 r3d100000001 [Odum Institute for Research in Social Science... Odum Institute for Research in Social Science [] USA [general] non-profit https://odum.unc.edu/archive/ [] []
1 r3d100000002 [The U.S. National Archives and Records Admini... The U.S. National Archives and Records Adminis... [NARA, National Archives] USA [general] non-profit http://www.archives.gov/ [] [http://www.archives.gov/contact/]
2 r3d100000002 [The USA.gov, [], USA, [general], non-profit, ... The USA.gov [] USA [general] non-profit http://www.usa.gov/ [] [http://www.usa.gov/Contact.shtml]
3 r3d100000004 [Institut für Deutsche Sprache, Archiv für Ges... Institut für Deutsche Sprache, Archiv für Gesp... [AGD] DEU [funding, general] non-profit http://agd.ids-mannheim.de/index.shtml [] 2004 [agd@ids-mannheim.de]
4 r3d100000005 [Odum Institute for Research in Social Science... Odum Institute for Research in Social Science [] USA [technical] non-profit https://odum.unc.edu/ [] [https://odum.unc.edu/contact/contact-form/, o...
In [22]:
# Derive the continent code from each institution's country code.
re3data_institutions['org_continent'] = (
    re3data_institutions['org_country'].map(countrycode_to_continent)
)
In [23]:
# Country codes for which no continent could be derived.
re3data_institutions.loc[
    re3data_institutions['org_continent'].isna(), 'org_country'
].unique()
Out[23]:
array(['AAA', 'EEC'], dtype=object)

`AAA` is used for international collaborations, so we leave it without a continent. `EEC` is used for the European Commission; we set its continent to `EU` manually.

In [24]:
# Assign the continent by hand for the 'EEC' country code.
re3data_institutions.loc[
    re3data_institutions['org_country'].eq('EEC'), 'org_continent'
] = 'EU'

OpenDOAR

In [25]:
# One row per (repository, institution); rows without an institution are dropped
# and the index is renumbered so the join below lines up row by row.
opendoar_institutions = (
    opendoar_df
    .explode('institution')[['opendoar_id', 'institution']]
    .loc[lambda frame: ~frame['institution'].isna()]
    .reset_index(drop=True)
)

# Spread each institution record (a 10-item list) over named `org_*` columns.
opendoar_institutions = opendoar_institutions.join(
    pd.DataFrame(
        opendoar_institutions['institution'].to_list(),
        columns=[
            'org_name', 'org_other_names', 'org_country', 'org_3', 'org_noprofit',
            'org_url', 'org_ids', 'org_date1', 'org_date2', 'org_contact',
        ],
    )
)

# Country codes are upper-cased first, then converted from alpha-2 to alpha-3;
# missing values are skipped in both steps.
opendoar_institutions['org_country'] = (
    opendoar_institutions['org_country'].map(str.upper, na_action='ignore')
)
opendoar_institutions['org_country'] = (
    opendoar_institutions['org_country']
    .map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
)
opendoar_institutions.head()
Out[25]:
opendoar_id institution org_name org_other_names org_country org_3 org_noprofit org_url org_ids org_date1 org_date2 org_contact
0 101 [university of utrecht, [universiteit utrecht]... university of utrecht [universiteit utrecht] NLD [] https://www.uu.nl [https://ror.org/04pp8hn57] []
1 115 [indian institute of management kozhikode, [ii... indian institute of management kozhikode [iimk] IND [] http://www.iimk.ac.in/ [https://ror.org/03m1xdc36] []
2 41 [california institute of technology, [caltech]... california institute of technology [caltech] USA [] http://www.caltech.edu/ [https://ror.org/05dxps055] []
3 119 [dublin city university, [dcu], ie, [], , http... dublin city university [dcu] IRL [] http://www.dcu.ie/ [https://ror.org/04a1a1e81] []
4 129 [istituto nazionale di geofisica e vulcanologi... istituto nazionale di geofisica e vulcanologia [ingv] ITA [] http://www.ingv.it [https://ror.org/00qps9a02] []
In [26]:
# Derive the continent code from each institution's country code.
opendoar_institutions['org_continent'] = (
    opendoar_institutions['org_country'].map(countrycode_to_continent)
)
In [27]:
# Country codes for which no continent could be derived.
opendoar_institutions.loc[
    opendoar_institutions['org_continent'].isna(), 'org_country'
].unique()
Out[27]:
array([nan, 'UMI'], dtype=object)
In [28]:
# 'UMI' gets no continent from the lookup; assign 'NA' by hand, then show the
# affected rows.
opendoar_institutions.loc[
    opendoar_institutions['org_country'].eq('UMI'), 'org_continent'
] = 'NA'
opendoar_institutions.loc[opendoar_institutions['org_country'].eq('UMI')]
Out[28]:
opendoar_id institution org_name org_other_names org_country org_3 org_noprofit org_url org_ids org_date1 org_date2 org_contact org_continent
4349 5379 [kettering university, [], um, [], , https://w... kettering university [] UMI [] https://www.kettering.edu [https://ror.org/03rcspa57] [] NA

ROAR

In [29]:
def _roar_country_to_iso3(code):
    """Convert an upper-cased alpha-2 code to alpha-3, keeping 3-letter codes as-is.

    This cell overwrites `location_country` in place. Without the 3-letter
    guard, running the cell a second time would feed the alpha-3 codes back
    into the alpha-2 lookup and turn every country into NaN.
    """
    if isinstance(code, str) and len(code) == 3:
        return code
    return countrycode_iso2_to_countrycode_iso3(code)


# Upper-case the country codes (missing values are skipped), convert them to
# alpha-3, then derive the continent code from the result.
roar_df['location_country'] = (
    roar_df['location_country']
    .map(str.upper, na_action='ignore')
    .map(_roar_country_to_iso3)
)
roar_df['continent'] = roar_df['location_country'].map(countrycode_to_continent)

FAIRsharing

In [30]:
def _split_commas(value):
    """Split a comma-separated string into a list; return anything else unchanged."""
    return value.split(',') if isinstance(value, str) else value


# Split the comma-separated columns in place. Unlike `.str.split`, this leaves
# already-split lists (and missing values) untouched, so re-running the cell
# does not wipe the columns to NaN.
fairsharing_df['subjects'] = fairsharing_df['subjects'].map(_split_commas)
fairsharing_df['countries'] = fairsharing_df['countries'].map(_split_commas)

# One row per (record, country), with the alpha-3 country code and the
# continent code derived from the country name.
fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries['countries'].map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries['countrycode'].map(countrycode_to_continent)
In [31]:
# Country names for which no country code could be derived.
fairsharing_countries.loc[
    fairsharing_countries['countrycode'].isna(), 'countries'
].unique()
Out[31]:
array(['European Union', 'Republic of Ireland', 'Worldwide', nan],
      dtype=object)
In [32]:
# Country names for which no continent could be derived.
fairsharing_countries.loc[
    fairsharing_countries['continent'].isna(), 'countries'
].unique()
Out[32]:
array(['European Union', 'Republic of Ireland', 'Worldwide', 'Antarctica',
       nan], dtype=object)

Manually fix some rows

In [33]:
# 'Republic of Ireland' is not resolved by the name lookup: rename it and fill
# in the codes by hand. `countrycode` holds alpha-3 codes (it is produced by
# country_name_to_country_alpha3), so Ireland must be 'IRL', not the alpha-2 'IE'.
fairsharing_countries.loc[
    fairsharing_countries.countries == 'Republic of Ireland',
    ['countries', 'countrycode', 'continent']
] = ['Ireland', 'IRL', 'EU']

# 'European Union' is not a country; 'EU' is used as a placeholder code.
# NOTE(review): the re3data section uses 'EEC' for the same entity. Consider
# aligning the two if the country plots are meant to be compared.
fairsharing_countries.loc[
    fairsharing_countries.countries == 'European Union',
    ['countrycode', 'continent']
] = ['EU', 'EU']

Make Antarctica disappear (only one repo)

In [34]:
# Antarctica: set the country code to 'AQ' and clear the continent so the row
# is left out of the per-continent counts, then show the affected row.
# NOTE(review): 'AQ' is an alpha-2 code while the rest of `countrycode` holds
# alpha-3 codes. Confirm whether that is intended.
fairsharing_countries.loc[
    fairsharing_countries['countries'].eq('Antarctica'),
    ['countrycode', 'continent']
] = ['AQ', np.nan]
fairsharing_countries.loc[fairsharing_countries['countrycode'].eq('AQ')]
Out[34]:
full_name short_name fs_url url countries subjects countrycode continent
915 Antabif IPT - AntOBIS IPT - GBIF Belgium Antabif IPT - AntOBIS IPT - GBIF Belgium https://fairsharing.org/10.25504/FAIRsharing.e... http://ipt.biodiversity.aq/ Antarctica [Biodiversity, Life Science] AQ NaN

Country coverage

In [35]:
# Rows per country code in each registry, largest first.
data1 = (
    re3data_institutions.groupby('org_country')[['re3data_id']]
    .count()
    .sort_values(by='re3data_id', ascending=False)
)
data2 = (
    opendoar_institutions.groupby('org_country')[['opendoar_id']]
    .count()
    .sort_values(by='opendoar_id', ascending=False)
)
data3 = (
    roar_df.groupby('location_country')[['eprintid']]
    .count()
    .sort_values(by='eprintid', ascending=False)
)
data4 = (
    fairsharing_countries.groupby('countrycode')[['url']]
    .count()
    .sort_values(by='url', ascending=False)
)

# re3data is drawn by default; the other registries start hidden and can be
# switched on from the legend.
plot = [
    go.Bar(x=data1.index, y=data1['re3data_id'], name='re3data'),
    go.Bar(x=data2.index, y=data2['opendoar_id'], name='openDOAR', visible='legendonly'),
    go.Bar(x=data3.index, y=data3['eprintid'], name='ROAR', visible='legendonly'),
    go.Bar(x=data4.index, y=data4['url'], name='FAIRsharing', visible='legendonly'),
]

layout = go.Layout(
    title='Country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)

go.Figure(data=plot, layout=layout).show()

Continental coverage

In [36]:
# Rows per continent code in each registry (rows without a continent are not counted).
data1 = re3data_institutions.groupby('org_continent')[['re3data_id']].count()
data2 = opendoar_institutions.groupby('org_continent')[['opendoar_id']].count()
data3 = roar_df.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['url']].count()

# One filled polar trace per registry, with the continents as angular categories.
plot = [
    go.Scatterpolar(r=data1['re3data_id'], theta=data1.index, fill='toself', name='re3data'),
    go.Scatterpolar(r=data2['opendoar_id'], theta=data2.index, fill='toself', name='OpenDOAR'),
    go.Scatterpolar(r=data3['eprintid'], theta=data3.index, fill='toself', name='ROAR'),
    go.Scatterpolar(r=data4['url'], theta=data4.index, fill='toself', name='FAIRsharing'),
]

layout = go.Layout(polar={'radialaxis': {'visible': True}})

go.Figure(data=plot, layout=layout).show()