In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)

In [2]:
def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(country)
        except:
            return np.nan
        
def countrycode_iso2_to_countrycode_iso3(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(pycountry_convert.country_alpha2_to_country_name(country))
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(pycountry_convert.country_alpha3_to_country_alpha2(country_code))
        except:
            return np.nan

## Loading datasets

**re3data**

In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additionalName': ast.literal_eval,
                                    'repositoryIdentifier': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'contentType': ast.literal_eval,
                                    'providerType': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    })

re3data_df.head()

Unnamed: 0,orgIdentifier,repositoryName,repositoryName.language,additionalName,repositoryURL,repositoryIdentifier,repositoryContact,description,description.language,type,size,startDate,endDate,repositoryLanguage,subject,missionStatementURL,contentType,providerType,keyword,institution,policy,databaseAccess,databaseLicense,dataAccess,dataLicense,dataUploadType,dataUploadLicense,software,versioning,api,pidSystem,citationGuidelineURL,aidSystem,enhancedPublication,qualityManagement,certificate,metadataStandard,syndication,remarks,entryDate,lastUpdate
0,r3d100000001,Odum Institute Archive Dataverse,eng,[],https://dataverse.unc.edu/dataverse/odum,[],"[""https://dataverse.unc.edu/dataverse/odum#"", ...",The Odum Institute Archive Dataverse contains ...,eng,[disciplinary],"{""size"": ""13 dataverses; 3.050 datasets"", ""upd...",,,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",,"[{'name': 'Databases', 'scheme': 'parse'}, {'n...",[dataProvider],"[FAIR, Middle East, crime, demography, economy...",[{'institutionName': 'Odum Institute for Resea...,"[{""policyName"": ""Collection Development Policy...","{""databaseAccessType"": ""open"", ""databaseAcces...","[{""databaseLicenseName"": ""CC0"", ""databaseLicen...","[{""dataAccessType"": ""embargoed"", ""dataAccessRe...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,[],"[""DataVerse""]",,{},"[""DOI""]",,[],unknown,yes,"[""other""]","[{""metadataStandardName"": ""DDI - Data Document...",{},Odum Dataverse is covered by Thomson Reuters D...,2013-06-10,2021-07-06
1,r3d100000002,Access to Archival Databases,eng,"[{'additionalName': 'AAD', 'additionalNameLang...",https://aad.archives.gov/aad/,"[RRID:SCR_010479, RRID:nlx_157752]","[""https://www.archives.gov/contact""]",You will find in the Access to Archival Databa...,eng,[disciplinary],"{""size"": """", ""updatedp"": """"}",1985,,"[""eng"", ""spa""]","[{'name': '1 Humanities and Social Sciences', ...",https://www.archives.gov/publications/general-...,"[{'name': 'Images', 'scheme': 'parse'}, {'name...",[dataProvider],[US History],[{'institutionName': 'The U.S. National Archiv...,"[{""policyName"": ""Contribution Policy"", ""policy...","{""databaseAccessType"": ""open"", ""databaseAcces...",[],"[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""Copyrights"", ""dataLicens...",restricted,[],"[""unknown""]",no,"{""api"": ""https://www.archives.gov/developer#to...","[""none""]",https://aad.archives.gov/aad/help/getting-star...,[],unknown,unknown,[],[],"{""syndication"": ""http://www.archives.gov/socia...",,2012-07-04,2021-05-25
2,r3d100000004,Datenbank Gesprochenes Deutsch,deu,"[{'additionalName': 'DGD', 'additionalNameLang...",https://dgd.ids-mannheim.de/,[],"[""dgd@ids-mannheim.de""]","The ""Database for Spoken German (DGD)"" is a co...",eng,[disciplinary],"{""size"": ""34 corpora"", ""updatedp"": ""2020-02-03""}",2012,,"[""deu""]","[{'name': '1 Humanities and Social Sciences', ...",https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext...,"[{'name': 'Audiovisual data', 'scheme': 'parse...","[dataProvider, serviceProvider]","[Australian German, FOLK, German dialects, Pfe...",[{'institutionName': 'Institut für Deutsche Sp...,"[{""policyName"": ""Erfurter Aufruf zur Sicherung...","{""databaseAccessType"": ""restricted"", ""databas...",[],"[{""dataAccessType"": ""restricted"", ""dataAccessR...","[{""dataLicenseName"": ""other"", ""dataLicenseURL""...",restricted,[],"[""other""]",yes,{},"[""none""]",http://agd.ids-mannheim.de/konditionen.shtml,[],unknown,unknown,"[""RatSWD""]",[],{},,2012-07-20,2020-08-27
3,r3d100000005,UNC Dataverse,eng,[{'additionalName': 'University of North Carol...,https://dataverse.unc.edu/,[FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c],"[""https://dataverse.unc.edu/"", ""odumarchive@un...",UNC Dataverse is an open-source repository sof...,eng,[institutional],"{""size"": ""186 dataverses; 25.272 studies; 229....",2011,,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",https://odum.unc.edu/about/mission-vision/,"[{'name': 'Archived data', 'scheme': 'parse'},...","[dataProvider, serviceProvider]","[FAIR, census, demographic survey, demography,...",[{'institutionName': 'Odum Institute for Resea...,"[{""policyName"": ""Collection Development Policy...","{""databaseAccessType"": ""open"", ""databaseAcces...",[],"[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,"[{""dataUploadLicenseName"": ""Data Deposit Form""...","[""DataVerse""]",yes,"{""api"": ""https://guides.dataverse.org/en/lates...","[""ARK"", ""DOI"", ""PURL"", ""URN"", ""hdl""]",https://dataverse.org/best-practices/data-cita...,[],unknown,yes,[],"[{""metadataStandardName"": ""DDI - Data Document...",{},UNC Dataverse is covered by Clarivate Data Cit...,2012-07-23,2021-10-25
4,r3d100000006,Archaeology Data Service,eng,"[{'additionalName': 'ADS', 'additionalNameLang...",https://archaeologydataservice.ac.uk/,[FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg],"[""help@archaeologydataservice.ac.uk"", ""https:/...",The ADS is an accredited digital repository fo...,eng,[disciplinary],"{""size"": ""1837 results"", ""updatedp"": ""2020-05-...",1996-10-01,,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",https://archaeologydataservice.ac.uk/about/our...,"[{'name': 'Archived data', 'scheme': 'parse'},...","[dataProvider, serviceProvider]","[FAIR, archaeology, cultural heritage, prehist...",[{'institutionName': 'Arts and Humanities Rese...,"[{""policyName"": ""ADS Guides to good practice"",...","{""databaseAccessType"": ""open"", ""databaseAcces...","[{""databaseLicenseName"": ""CC"", ""databaseLicens...","[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,"[{""dataUploadLicenseName"": ""Guidelines for Dep...","[""other""]",yes,"{""api"": ""https://archaeologydataservice.ac.uk/...","[""DOI""]",https://archaeologydataservice.ac.uk/advice/te...,[],unknown,yes,"[""other""]","[{""metadataStandardName"": ""DataCite Metadata S...","{""syndication"": ""https://archaeologydataservic...",ADS is covered by Clarivate Data Citation Inde...,2012-07-23,2021-09-02


In [4]:
re3data_df.describe(include='all')

Unnamed: 0,orgIdentifier,repositoryName,repositoryName.language,additionalName,repositoryURL,repositoryIdentifier,repositoryContact,description,description.language,type,size,startDate,endDate,repositoryLanguage,subject,missionStatementURL,contentType,providerType,keyword,institution,policy,databaseAccess,databaseLicense,dataAccess,dataLicense,dataUploadType,dataUploadLicense,software,versioning,api,pidSystem,citationGuidelineURL,aidSystem,enhancedPublication,qualityManagement,certificate,metadataStandard,syndication,remarks,entryDate,lastUpdate
count,2793,2793,2793,2793,2769,2793,2793,2793,2793,2793,2793,1800,172,2793,2793,2373,2793,2793,2793,2793,2793,2793,2793,2793,2793,2778,2793,2793,1339,2793,2793,1532,2793,2793,2793,2793,2793,2793,1694,2793,2793
unique,2793,2791,19,2197,2766,1024,2532,2792,6,9,1321,362,86,110,1418,2304,1351,6,2544,2773,2366,12,377,146,2294,3,695,23,2,1170,29,1337,13,3,3,16,175,544,1673,1316,722
top,r3d100000001,EarthChem Library,eng,[],http://icgem.gfz-potsdam.de/home,[],[],The National Archives and Records Administrati...,eng,[disciplinary],"{""size"": """", ""updatedp"": """"}",2008,2015,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",https://learn.scholarsportal.info/all-guides/d...,"[{'name': 'Standard office documents', 'scheme...",[dataProvider],[multidisciplinary],[{'institutionName': 'National Center for Biot...,[][],"{""databaseAccessType"": ""open"", ""databaseAcces...",[],"[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,[],"[""unknown""]",yes,{},"[""none""]",https://dataverse.org/best-practices/data-cita...,[],unknown,yes,[],[],{},is covered by Elsevier.,2018-08-10,2021-09-03
freq,1,2,2596,587,2,1769,170,2,2776,1768,1472,93,12,2088,240,14,29,1806,205,7,319,2624,2201,1292,71,1851,2054,1216,1131,1526,1359,76,2199,1643,1569,2557,1693,2235,17,20,104


**openDOAR**

In [5]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'repository_metadata.content_subjects': ast.literal_eval,
                                    'repository_metadata.alternativename': ast.literal_eval,
                                    'repository_metadata.content_types': ast.literal_eval,
                                    'organization': ast.literal_eval
                                    },
                         dtype={'system_metadata.id': str})

opendoar_df.head()

Unnamed: 0,system_metadata.id,repository_metadata.name,repository_metadata.alternativename,repository_metadata.url,repository_metadata.description,repository_metadata.type,repository_metadata.content_languages,system_metadata.date_modified,system_metadata.date_created,repository_metadata.content_subjects,repository_metadata.content_types,organization,policy_urls,repository_metadata.software,repository_metadata.oai_url,system_metadata.publicly_visible,repository_metadata.repository_status,repository_metadata.fulltext_record_count,repository_metadata.metadata_record_count
0,134,"{""name"": ""eldorado - repository of the tu dort...",[{'name': 'eldorado - ressourcen aus und für l...,https://eldorado.tu-dortmund.de,,institutional,[],2022-01-12 15:34:54,2005-12-19 14:57:52,"[arts, humanities, science, mathematics, socia...","[journal_articles, conference_and_workshop_pap...","[{'name': 'technische universität dortmund', '...",[],"{""name"": ""dspace"", ""version"": """"}",https://eldorado.tu-dortmund.de/oai/request,yes,,9629.0,20963.0
1,58,"{""name"": ""archive ouverte en sciences de linfo...",[{'acronym': '@rchivesic'}],https://archivesic.ccsd.cnrs.fr,,institutional,[],2022-01-12 15:34:53,2006-01-13 12:48:32,"[arts, science, technology, engineering, mathe...","[journal_articles, conference_and_workshop_pap...",[{'name': 'centre pour la communication scient...,[],"{""name"": ""hal"", ""version"": """"}",https://api.archives-ouvertes.fr/oai/archivesic,yes,,55492.0,1137498.0
2,93,"{""name"": ""digitalcommons@the texas medical cen...",[],http://digitalcommons.library.tmc.edu/,,institutional,[],2022-01-12 15:34:53,2006-02-14 11:16:12,[health and medicine],"[journal_articles, theses_and_dissertations]","[{'name': 'texas medical center', 'alternative...",[],"{""name"": ""other"", ""version"": """"}",http://digitalcommons.library.tmc.edu/do/oai/,yes,,2658.0,7268.0
3,68,"{""name"": ""cognitive sciences eprint archive"", ...",[{'acronym': 'cogprints'}],http://cogprints.org/,,disciplinary,[],2022-01-12 15:34:53,2006-01-04 15:01:23,"[humanities, health and medicine, science, soc...","[journal_articles, conference_and_workshop_pap...","[{'name': 'university of southampton', 'altern...",[],"{""name"": ""eprints"", ""version"": """"}",http://cogprints.org/cgi/oai2,yes,,2895.0,4277.0
4,84,"{""name"": ""digital commons@carleton college"", ""...",[],http://digitalcommons.carleton.edu/,,institutional,[],2022-01-12 15:34:53,2006-01-04 16:07:58,"[humanities, science, social sciences]","[journal_articles, unpub_reports_and_working_p...","[{'name': 'carleton college', 'alternativeName...",[],"{""name"": ""other"", ""version"": """"}",,yes,,,42.0


In [6]:
opendoar_df.describe(include='all')

Unnamed: 0,system_metadata.id,repository_metadata.name,repository_metadata.alternativename,repository_metadata.url,repository_metadata.description,repository_metadata.type,repository_metadata.content_languages,system_metadata.date_modified,system_metadata.date_created,repository_metadata.content_subjects,repository_metadata.content_types,organization,policy_urls,repository_metadata.software,repository_metadata.oai_url,system_metadata.publicly_visible,repository_metadata.repository_status,repository_metadata.fulltext_record_count,repository_metadata.metadata_record_count
count,5811.0,5811,5811,5810,0.0,5810,5811,5811,5811,5811,5811,5811,5811,5811,4447,5811,0.0,2292.0,4184.0
unique,5811.0,5780,2116,5772,,4,1,171,5643,237,477,5212,678,32,4415,1,,,
top,134.0,"{""name"": ""arch"", ""language"": ""en""}",[],http://harp.lib.hiroshima-u.ac.jp/,,institutional,[],2022-01-12 15:35:47,2020-09-18 12:53:48,"[science, technology, engineering, mathematics...",[theses_and_dissertations],"[{'name': 'rijksuniversiteit groningen', 'alte...",[],"{""name"": ""dspace"", ""version"": """"}",https://api.figshare.com/v2/oai,yes,,,
freq,1.0,3,3656,3,,5161,5811,73,81,3321,469,26,5131,2273,3,5811,,,
mean,,,,,,,,,,,,,,,,,,5022.89,176555.6
std,,,,,,,,,,,,,,,,,,42126.48,6611068.0
min,,,,,,,,,,,,,,,,,,0.0,0.0
25%,,,,,,,,,,,,,,,,,,0.0,893.75
50%,,,,,,,,,,,,,,,,,,422.5,4012.5
75%,,,,,,,,,,,,,,,,,,2931.5,16293.5


**ROAR**

In [7]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)

def value_or_list(cell_set):
    copy = set(cell_set)
    copy.discard(np.nan) 
    if len(copy) == 0:
        return np.nan
    if len(copy) == 1:
        return copy.pop()
    return list(copy)
        
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)

roar_df.head()

Unnamed: 0,eprintid,rev_number,eprint_status,userid,importid,source,dir,datestamp,lastmod,status_changed,type,succeeds,commentary,metadata_visibility,latitude,longitude,relation_type,relation_uri,item_issues_id,item_issues_type,item_issues_description,item_issues_timestamp,item_issues_status,item_issues_reported_by,item_issues_resolved_by,item_issues_comment,item_issues_count,sword_depositor,sword_slug,exemplar,home_page,title,oai_pmh,sword_endpoint,rss_feed,twitter_feed,description,fulltext,open_access,mandate,organisation_title,organisation_home_page,location_country,location_city,location_latitude,location_longitude,software,geoname,version,subjects,date,note,suggestions,activity_low,activity_medium,activity_high,recordcount,recordhistory,fulltexts_total,fulltexts_docs,fulltexts_rtotal,fulltexts_rdocs,registry_name,registry_id,submit_to,submitted_to_name,submitted_to_done,webometrics_rank,webometrics_size,webometrics_visibility,webometrics_rich_files,webometrics_scholar,monthly_deposits,total_deposits,association
0,1,633,archive,1,,,disk0/00/00/00/01,2010-01-06 13:43:48,2011-07-18 05:40:07,2010-01-06 13:43:48,subject,,,show,,,,,,,,,,,,,0.0,,,,http://archivesic.ccsd.cnrs.fr/,@RCHIVESIC,http://archivesic.ccsd.cnrs.fr/oai/oai.php,,,,,,,,,,fr,,,,hal,geoname_2_FR,other,,2002-05-17 19:24:41,,,0.0,0.0,0.0,25.0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",,,,,"[opendoar, celestial]","[669, 58]",,,,,,,,,,,
1,10,511,archive,1,,,disk0/00/00/00/10,2010-01-06 13:43:48,2011-07-18 05:40:13,2010-01-06 13:43:48,institutional,,,show,,,,,,,,,,,,,0.0,,,,http://www.diva-portal.org/mdh/,Academic Archive On-line (Mälardalen Universit...,http://www.diva-portal.org/oai/mdh/OAI,,,,,True,True,,,,se,Uppsala,59.8667,17.6333,diva,geoname_2_SE,other,,2005-12-08 13:15:22,,,0.0,0.0,0.0,100.0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100...",,,,,"[opendoar, celestial]","[258, 526]",,,,,,,,,,,
2,1000,274,archive,1,,,disk0/00/00/10/00,2010-01-06 13:45:01,2011-07-06 08:21:21,2010-01-06 13:45:01,subject,,,show,,,,,,,,,,,,,0.0,,,,http://pam.pisharp.org/,PAM - Portuguese Archive of Mathematics,,,,,,True,True,,,,pt,"Bellevue, WA",47.6034,-122.155,dspace,geoname_2_PT,other,,2006-05-04 10:48:14,,,,,,,,,,,,,,,,,,,,,,,,
3,10001,20,archive,91,,,disk0/00/01/00/01,2015-08-08 14:52:11,2016-03-21 19:44:01,2015-08-08 14:52:11,subject,,,show,,,,,,,,,,,,,,,,,http://edoc.sub.uni-hamburg.de/klimawandel/,Klimawandel Dokumentenserver,http://edoc.sub.uni-hamburg.de/klimawandel/oai,,,,"The ""Documentenserver Klimawandel"" (Repository...",True,True,True,"[Helmholtz-Zentrum Geesthacht, Climate Service...","[http://www.climateservicecenter.de/, http://w...",de,Hamburg,53.5511,9.9937,opus,geoname_2_DE,other,"[GE, S1, GF, HD, G1]",2015-07-02 08:08:31,,,,,,,,,,,,"[opendoar, celestial]","[3408, 5881]",,,,,,,,,,,
4,10008,11,archive,404,,,disk0/00/01/00/08,2015-08-08 14:52:26,2016-03-21 19:43:51,2015-08-08 14:52:26,institutional,,,show,,,,,,,,,,,,,,,,,http://creativematter.skidmore.edu/,Creative Matter | Skidmore College Research,http://creativematter.skidmore.edu/do/oai/,,http://creativematter.skidmore.edu/recent.rss,,"Welcome to Creative Matter, a repository for t...",True,False,False,Skidmore College,http://www.skidmore.edu/,us,Saratoga Springs,43.0961,-73.7818,bepress,geoname_2_US,other,,2015-07-06 17:35:50,,,,,,,,,,,,celestial,5882,,,,,,,,,,,


In [8]:
roar_df.describe(include='all')

Unnamed: 0,eprintid,rev_number,eprint_status,userid,importid,source,dir,datestamp,lastmod,status_changed,type,succeeds,commentary,metadata_visibility,latitude,longitude,relation_type,relation_uri,item_issues_id,item_issues_type,item_issues_description,item_issues_timestamp,item_issues_status,item_issues_reported_by,item_issues_resolved_by,item_issues_comment,item_issues_count,sword_depositor,sword_slug,exemplar,home_page,title,oai_pmh,sword_endpoint,rss_feed,twitter_feed,description,fulltext,open_access,mandate,organisation_title,organisation_home_page,location_country,location_city,location_latitude,location_longitude,software,geoname,version,subjects,date,note,suggestions,activity_low,activity_medium,activity_high,recordcount,recordhistory,fulltexts_total,fulltexts_docs,fulltexts_rtotal,fulltexts_rdocs,registry_name,registry_id,submit_to,submitted_to_name,submitted_to_done,webometrics_rank,webometrics_size,webometrics_visibility,webometrics_rich_files,webometrics_scholar,monthly_deposits,total_deposits,association
count,5444.0,5444.0,5444,5444.0,0.0,0.0,5444,5444,5444,5444,5444,108.0,0.0,5444,0.0,0.0,0.0,0.0,63,63,63,63,63,0.0,0.0,0.0,2242.0,0.0,0.0,268,5437,5442,4332,178,1538,116,3837,4197,4197,3746,4460,4286,5138,3714,3725.0,3708.0,4700,4730,5444,1289,5429,218,189,2288.0,2288.0,2288.0,2290.0,2288,270.0,258.0,270.0,258.0,4605,4580.0,375,205,205,148.0,148.0,148.0,148.0,148.0,756,756.0,223
unique,5444.0,660.0,1,2189.0,,,5444,4198,4043,4230,12,108.0,,2,,,,,48,5,62,4,3,,,,4.0,,,2,5271,5143,4059,172,1485,112,3359,2,2,2,3858,3831,144,1884,2923.0,2953.0,31,126,53,938,4898,210,173,72.0,54.0,16.0,741.0,1702,135.0,118.0,134.0,117.0,7,4261.0,7,1,1,148.0,148.0,148.0,146.0,143.0,346,342.0,3
top,1.0,11.0,archive,1.0,,,disk0/00/00/00/01,2010-01-06 13:43:48,2011-07-06 08:24:53,2010-01-06 13:43:48,institutional,10164.0,,show,,,,,bad_oai_pmh_url_0,duplicate_title,"Duplicate title to <xhtml:table xmlns:xhtml=""h...",2010-01-13 10:44:49,discovered,,,,0.0,,,FALSE,http://eprints.upnjatim.ac.id/,Repositorio Institucional,http://kce.docressources.info/ws/PMBWs_2,http://producao.usp.br/sword/servicedocument,http://eprints.upnjatim.ac.id/cgi/latest_tool?...,http://my.indexcopernicus.com/fredemoreno,info:other:archives.eprints.org:import,TRUE,TRUE,FALSE,Chinese Academy of Science (中国科学院),http://www.cas.cn/,us,Lima,34.1607,-118.139,dspace,geoname_2_US,other,K1,2006-05-04 10:48:14,¿Quién puede depositar documentos en el reposi...,This repository is hosted by the Texas Digital...,0.0,0.0,0.0,100.0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",0.0,0.0,0.0,0.0,"[opendoar, celestial]",2479.0,"[opendoar, roarmap, celestial]",opendoar,2021-01-25,24.0,46.0,20.0,824.0,806.0,"0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...",0.0,russell_group
freq,1.0,333.0,5444,1330.0,,,1,16,8,16,3853,1.0,,5402,,,,,15,33,2,45,38,,,,2201.0,,,261,4,7,4,2,5,2,112,2805,2696,2748,9,9,891,74,25.0,25.0,2341,845,4841,53,99,2,9,2012.0,2074.0,2210.0,730.0,95,113.0,114.0,113.0,114.0,2106,4.0,119,205,205,1.0,1.0,1.0,3.0,5.0,387,387.0,130
mean,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


**FAIRsharing**

In [9]:
with open('../data/raw/fairsharing_dump_api_02_2022.json') as f:
    lines = f.read().splitlines()
    
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))

fairsharing_df.head()

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.abbreviation,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.metadata.cross-references,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.url-for-logo,attributes.metadata.citations,attributes.metadata.associated-tools,attributes.metadata.deprecation-reason,attributes.metadata.data-access-condition.type,attributes.metadata.data-contact-information,attributes.metadata.data-deposition-condition.url,attributes.metadata.data-deposition-condition.type,attributes.metadata.deprecation-date,attributes.metadata.access-points,attributes.metadata.data-access-condition.url,attributes.metadata.resource-sustainability.url,attributes.metadata.resource-sustainability.name,attributes.metadata.data-preservation-policy.url,attributes.metadata.data-preservation-policy.name,attributes.metadata.data-access-for-pre-publication-review,attributes.metadata.data-versioning,attributes.metadata.data-curation.type,attributes.metadata.data-curation.url,attributes.metadata.citation-to-related-publications,attributes.metadata.tombstone
0,3226,fairsharing-records,2020-12-09T11:53:44.000Z,2022-02-08T10:42:36.452Z,10.25504/FAIRsharing.d6423b,WDC Sunspot Index and Long-term Solar Observat...,ready,"[{'contact-name': 'Frédéric Clette', 'contact-...",http://sidc.be/silso/home,3226,The WDC-SILSO is an activity of the Operationa...,WDC-SILSO,[{'url': 'http://www.sidc.be/silso/taxonomy/te...,2013.0,"[{'url': 'http://www.sidc.be/silso/datafiles',...",[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-001740, bsg-d001740]",Database,repository,"[Electromagnetism, Astrophysics and Astronomy,...","[Climate, Observation design]",[Not applicable],"[Climate change, earth observation, Electromag...",[Belgium],FAIRsharing record for: WDC Sunspot Index and ...,WDC-SILSO,https://fairsharing.org/10.25504/FAIRsharing.d...,10.25504/FAIRsharing.d6423b,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The WDC-SIL...,[],"[{'licence-name': 'SILSO legal notices', 'lice...",,,,,,,,,,,,,,,,,,,,,
1,2114,fairsharing-records,2014-11-04T15:23:40.000Z,2022-01-21T14:39:02.195Z,10.25504/FAIRsharing.p06nme,Biological Magnetic Resonance Data Bank,ready,"[{'contact-name': 'Helpdesk', 'contact-email':...",https://bmrb.io/,2114,"BMRB collects, annotates, archives, and dissem...",BMRB,"[{'url': 'https://bmrb.io/bmrb/news/', 'name':...",1988.0,[{'url': 'https://bmrb.io/data_library/rsync.s...,[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-000584, bsg-d000584]",Database,repository,[Structural Biology],"[Molecular structure, Protein structure, Pepti...",[All],[],[United States],FAIRsharing record for: Biological Magnetic Re...,BMRB,https://fairsharing.org/10.25504/FAIRsharing.p...,10.25504/FAIRsharing.p06nme,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: BMRB collec...,"[{'id': 552, 'pubmed_id': 18288446, 'title': '...",[{'licence-name': 'wwPDB Privacy and Usage Pol...,,"[{'doi': '10.1093/nar/gkm957', 'pubmed-id': 17...","[{'url': 'https://bmrb.io/validate/', 'name': ...",,open,yes,https://bmrb.io/deposit/,open,,,,,,,,,,,,,
2,3022,fairsharing-records,2020-06-17T10:25:30.000Z,2022-02-08T10:41:04.073Z,10.25504/FAIRsharing.8b7a2f,Fisheries and Oceans Canada Pacific Region Dat...,ready,"[{'contact-name': 'Peter Chandler', 'contact-e...",http://www.pac.dfo-mpo.gc.ca/science/oceans/da...,3022,The Institute of Ocean Sciences (IOS)/Ocean Sc...,,[{'url': 'DFO.PAC.SCI.IOSData-DonneesISO.SCI.P...,,[{'name': 'Users must contact the Senior Analy...,[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-001530, bsg-d001530]",Database,repository,"[Environmental Science, Meteorology, Earth Sci...",[Climate],[Not applicable],"[Salinity, Temperature]",[Canada],FAIRsharing record for: Fisheries and Oceans C...,,https://fairsharing.org/10.25504/FAIRsharing.8...,10.25504/FAIRsharing.8b7a2f,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The Institu...,[],[{'licence-name': 'Fisheries and Oceans Canada...,,,,,,,,,,,,,,,,,,,,,
3,2998,fairsharing-records,2020-05-21T07:42:30.000Z,2022-02-08T10:40:19.531Z,10.25504/FAIRsharing.e08886,Climate Prediction Center,ready,"[{'contact-name': 'Jon Hoopingarner', 'contact...",https://www.cpc.ncep.noaa.gov/,2998,The Climate Prediction Center (CPC) produces o...,CPC,[{'url': 'https://www.cpc.ncep.noaa.gov/commen...,1970.0,"[{'url': 'https://www.cpc.ncep.noaa.gov/', 'na...",[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-001504, bsg-d001504]",Database,repository,"[Hydrogeology, Geography, Meteorology, Geodesy...",[Climate],[Not applicable],"[Forecasting, weather]",[United States],FAIRsharing record for: Climate Prediction Center,CPC,https://fairsharing.org/10.25504/FAIRsharing.e...,10.25504/FAIRsharing.e08886,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The Climate...,[],[{'licence-name': 'National Weather Service Di...,,,,,,,,,,,,,,,,,,,,,
4,2301,fairsharing-records,2016-06-03T14:54:08.000Z,2021-11-24T13:17:51.201Z,10.25504/FAIRsharing.meh9wz,Acytostelium Gene Database,deprecated,[{'contact-name': 'Acytostelium genome consort...,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,2301,Genome and transcriptome database of Acytostel...,,,2008.0,,,"[biodbcore-000775, bsg-d000775]",Database,repository,"[Genomics, Life Science, Transcriptomics]","[DNA sequence data, Gene model annotation]",[Acytostelium subglobosum],[],"[United Kingdom, Japan]",FAIRsharing record for: Acytostelium Gene Data...,,https://fairsharing.org/10.25504/FAIRsharing.m...,10.25504/FAIRsharing.meh9wz,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: Genome and ...,"[{'id': 1139, 'pubmed_id': 25758444, 'title': ...",[],,,,This resource is no longer available at the st...,,,,,2021-9-17,,,,,,,,,,,,


In [10]:
fairsharing_df.describe(include='all')

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.abbreviation,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.metadata.cross-references,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.url-for-logo,attributes.metadata.citations,attributes.metadata.associated-tools,attributes.metadata.deprecation-reason,attributes.metadata.data-access-condition.type,attributes.metadata.data-contact-information,attributes.metadata.data-deposition-condition.url,attributes.metadata.data-deposition-condition.type,attributes.metadata.deprecation-date,attributes.metadata.access-points,attributes.metadata.data-access-condition.url,attributes.metadata.resource-sustainability.url,attributes.metadata.resource-sustainability.name,attributes.metadata.data-preservation-policy.url,attributes.metadata.data-preservation-policy.name,attributes.metadata.data-access-for-pre-publication-review,attributes.metadata.data-versioning,attributes.metadata.data-curation.type,attributes.metadata.data-curation.url,attributes.metadata.citation-to-related-publications,attributes.metadata.tombstone
count,1853.0,1853,1853,1853,1601,1853,1853,1764,1853,1853.0,1853,1671,1663,1541.0,1626,790,1853,1853,1853,1853,1853,1853,1853,1853,1853,1671,1853,1601,1853,1853,1853,1853,18,621,632,363.0,42,47,22,33,238,465,19,2,2,3,3,10,17,22,8,35,1
unique,1853.0,1,1218,1853,1601,1851,4,1623,1853,,1853,1655,1646,,1625,790,1799,1,3,935,1205,385,395,194,1851,1655,1853,1601,1,1853,1135,1119,18,331,627,104.0,2,2,22,2,71,460,19,2,2,3,3,2,2,4,8,2,1
top,3226.0,fairsharing-records,2014-11-04T15:23:40.000Z,2022-02-08T10:42:36.452Z,10.25504/FAIRsharing.d6423b,iDog,ready,[],http://sidc.be/silso/home,,The WDC-SILSO is an activity of the Operationa...,CGD,[{'url': 'https://github.com/gbif/ipt/wiki/IPT...,,[{'url': 'https://site.uit.no/dataverseno/abou...,[{'url': 'https://www.re3data.org/repository/r...,[],Database,repository,[Life Science],[],[All],[],[United States],FAIRsharing record for: iDog,CGD,https://fairsharing.org/10.25504/FAIRsharing.d...,10.25504/FAIRsharing.d6423b,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The WDC-SIL...,[],[],/rails/active_storage/blobs/redirect/eyJfcmFpb...,[],[],,open,yes,https://bmrb.io/deposit/,controlled,2021-9-17,[{'url': 'https://heidata.uni-heidelberg.de/oa...,https://arch.library.northwestern.edu/about?lo...,https://www.library.northwestern.edu/about/adm...,Commitment to Sustainability: Level 1,http://www.library.northwestern.edu/about/admi...,Digital Preservation Policy: Level 1,yes,yes,manual,https://www.gbif.org/tools/data-validator/about,yes,True
freq,1.0,1853,636,1,1,2,1564,40,1,,1,3,6,,2,1,55,1853,954,345,276,528,1258,607,2,3,1,1,1853,1,690,735,1,285,3,125.0,38,45,1,21,81,3,1,1,1,1,1,9,16,11,1,34,1
mean,,,,,,,,,,2481.862925,,,,2007.894873,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,554.072492,,,,10.933713,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,1120.0,,,,1894.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,2009.0,,,,2004.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,2473.0,,,,2010.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,2938.0,,,,2015.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


## Subjects analysis

**re3data**

In [11]:
re3data_subjects = re3data_df[['orgIdentifier', 'subject']].explode('subject')
re3data_subjects['subject'] = re3data_subjects['subject'].apply(lambda x: x['name'] if x is not np.nan else np.nan)
re3data_subjects

Unnamed: 0,orgIdentifier,subject
0,r3d100000001,1 Humanities and Social Sciences
0,r3d100000001,111 Social Sciences
0,r3d100000001,11104 Political Science
0,r3d100000001,112 Economics
0,r3d100000001,12 Social and Behavioural Sciences
...,...,...
2791,r3d100013733,4 Engineering Sciences
2792,r3d100013735,2 Life Sciences
2792,r3d100013735,"204 Microbiology, Virology and Immunology"
2792,r3d100013735,21 Biology


In [12]:
data = re3data_subjects.groupby('subject')[['orgIdentifier']].count().sort_values('subject', ascending=False)
data
plot = [
    go.Bar(
        x=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)].index,
        y=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)]['orgIdentifier'],
        name='re3data tier %s-digits' % tier
    ) for tier in [1,2,3,5]
] 

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

**OpenDOAR**

In [13]:
opendoar_subjects = opendoar_df.explode('repository_metadata.content_subjects')

In [14]:
data = opendoar_subjects.groupby('repository_metadata.content_subjects')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['system_metadata.id'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

**ROAR**

In [15]:
roar_subjects = roar_df.explode('subjects')

In [16]:
data = roar_subjects.groupby('subjects')[['eprintid']].count().sort_values('eprintid', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['eprintid'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

**FAIRsharing**

In [17]:
fairsharing_subjects = fairsharing_df.explode('attributes.subjects')

In [18]:
data = fairsharing_subjects.groupby('attributes.subjects')[['id']].count().sort_values('id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['id'],
        name='FAIRsharing'
    )
]

layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

## Geographic analysis

**re3data**

In [19]:
re3data_institutions = re3data_df.explode('institution')[['orgIdentifier', 'institution']]
re3data_institutions = re3data_institutions[~re3data_institutions.institution.isna()].reset_index(drop=True)
re3data_institutions = re3data_institutions.join(pd.json_normalize(re3data_institutions.institution))
re3data_institutions.head()

Unnamed: 0,orgIdentifier,institution,institutionName,institutionAdditionalName,institutionCountry,responsabilityType,institutionType,institutionURL,institutionIdentifier,responsibilityStartDate,responsibilityEndDate,institutionContact
0,r3d100000001,{'institutionName': 'Odum Institute for Resear...,Odum Institute for Research in Social Science,[],USA,[general],non-profit,https://odum.unc.edu/archive/,[],,,[]
1,r3d100000002,{'institutionName': 'The U.S. National Archive...,The U.S. National Archives and Records Adminis...,"[NARA, National Archives]",USA,[general],non-profit,http://www.archives.gov/,[],,,[http://www.archives.gov/contact/]
2,r3d100000002,"{'institutionName': 'The USA.gov', 'institutio...",The USA.gov,[],USA,[general],non-profit,http://www.usa.gov/,[],,,[http://www.usa.gov/Contact.shtml]
3,r3d100000004,{'institutionName': 'Institut für Deutsche Spr...,"Institut für Deutsche Sprache, Archiv für Gesp...",[AGD],DEU,"[funding, general]",non-profit,http://agd.ids-mannheim.de/index.shtml,[],2004.0,,[agd@ids-mannheim.de]
4,r3d100000005,{'institutionName': 'Odum Institute for Resear...,Odum Institute for Research in Social Science,[],USA,[technical],non-profit,https://odum.unc.edu/,[],,,"[https://odum.unc.edu/contact/contact-form/, o..."


In [20]:
re3data_institutions['org_continent'] = re3data_institutions.institutionCountry.map(countrycode_to_continent)

In [21]:
re3data_institutions[re3data_institutions.org_continent.isna()].institutionCountry.unique()

array(['AAA', 'EEC'], dtype=object)

AAA is used for international collaborations; we skip this.
EEC is used for the EU commission; we fix the continent manually.

In [22]:
re3data_institutions.loc[re3data_institutions.institutionCountry == 'EEC', 'org_continent'] = 'EU'

**OpenDOAR**

In [23]:
opendoar_institutions = opendoar_df.explode('organization')[['system_metadata.id', 'organization']]
opendoar_institutions = opendoar_institutions[~opendoar_institutions.organization.isna()].reset_index(drop=True)
opendoar_institutions = opendoar_institutions.join(pd.json_normalize(opendoar_institutions.organization))
opendoar_institutions['country'] = opendoar_institutions.country.map(str.upper, na_action='ignore')
opendoar_institutions['country'] = opendoar_institutions.country.map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
opendoar_institutions.head()

Unnamed: 0,system_metadata.id,organization,name,alternativeName,country,url,identifier,location.latitude,location.longiture
0,134,"{'name': 'technische universität dortmund', 'a...",technische universität dortmund,tu dortmund,DEU,https://www.tu-dortmund.de,"[{'identifier': 'https://ror.org/01k97gp34', '...",,
1,58,{'name': 'centre pour la communication scienti...,centre pour la communication scientifique directe,ccsd,FRA,https://www.ccsd.cnrs.fr,[],,
2,93,"{'name': 'texas medical center', 'alternativeN...",texas medical center,tmc,USA,https://www.tmc.edu,"[{'identifier': 'https://ror.org/00dqsbj20', '...",,
3,68,"{'name': 'university of southampton', 'alterna...",university of southampton,,GBR,https://www.southampton.ac.uk/,"[{'identifier': 'https://ror.org/01ryk1543', '...",,
4,84,"{'name': 'carleton college', 'alternativeName'...",carleton college,,USA,https://www.carleton.edu,"[{'identifier': 'https://ror.org/03jep7677', '...",,


In [24]:
opendoar_institutions['org_continent'] = opendoar_institutions.country.map(countrycode_to_continent)

In [25]:
opendoar_institutions[opendoar_institutions.org_continent.isna()].country.unique()

array([nan, 'UMI'], dtype=object)

In [26]:
opendoar_institutions.loc[opendoar_institutions.country == 'UMI', 'org_continent'] = 'NA'
opendoar_institutions[opendoar_institutions.country == 'UMI']

Unnamed: 0,system_metadata.id,organization,name,alternativeName,country,url,identifier,location.latitude,location.longiture,org_continent
4233,5379,"{'name': 'kettering university', 'alternativeN...",kettering university,,UMI,https://www.kettering.edu,"[{'identifier': 'https://ror.org/03rcspa57', '...",,,


**ROAR**

In [27]:
roar_institutions = roar_df.explode('location_country')
roar_institutions['location_country'] = roar_institutions.location_country.map(str.upper, na_action='ignore')
roar_institutions['location_country'] = roar_institutions.location_country.map(countrycode_iso2_to_countrycode_iso3)
roar_institutions['continent'] = roar_institutions.location_country.map(countrycode_to_continent)

**FAIRsharing**

In [28]:
fairsharing_countries = fairsharing_df.explode('attributes.countries')
fairsharing_countries['countrycode'] = fairsharing_countries['attributes.countries'].map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)

In [29]:
fairsharing_countries[fairsharing_countries.countrycode.isna()]['attributes.countries'].unique()

array(['European Union', 'Worldwide', nan], dtype=object)

In [30]:
fairsharing_countries[fairsharing_countries.continent.isna()]['attributes.countries'].unique()

array(['European Union', 'Worldwide', nan, 'Antarctica'], dtype=object)

Fix manually some rows

In [31]:
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Republic of Ireland', ['attributes.countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

Make Antactica disappear (only one repo)

In [32]:
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.abbreviation,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.metadata.cross-references,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.url-for-logo,attributes.metadata.citations,attributes.metadata.associated-tools,attributes.metadata.deprecation-reason,attributes.metadata.data-access-condition.type,attributes.metadata.data-contact-information,attributes.metadata.data-deposition-condition.url,attributes.metadata.data-deposition-condition.type,attributes.metadata.deprecation-date,attributes.metadata.access-points,attributes.metadata.data-access-condition.url,attributes.metadata.resource-sustainability.url,attributes.metadata.resource-sustainability.name,attributes.metadata.data-preservation-policy.url,attributes.metadata.data-preservation-policy.name,attributes.metadata.data-access-for-pre-publication-review,attributes.metadata.data-versioning,attributes.metadata.data-curation.type,attributes.metadata.data-curation.url,attributes.metadata.citation-to-related-publications,attributes.metadata.tombstone,countrycode,continent
325,2462,fairsharing-records,2017-06-27T13:30:19.000Z,2021-12-02T18:05:26.741Z,10.25504/FAIRsharing.ewyejx,Antabif IPT - AntOBIS IPT - GBIF Belgium,ready,"[{'contact-name': 'Anton Van de Putte', 'conta...",http://ipt.biodiversity.aq/,2462,The Belgium Biodiversity Platform hosts this d...,,"[{'url': 'a.heughebaert@biodiversity.be', 'nam...",,,,"[biodbcore-000944, bsg-d000944]",Database,repository,"[Biodiversity, Life Science]",[Taxonomic classification],[All],[],Antarctica,FAIRsharing record for: Antabif IPT - AntOBIS ...,,https://fairsharing.org/10.25504/FAIRsharing.e...,10.25504/FAIRsharing.ewyejx,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The Belgium...,[],"[{'licence-name': 'Apache License 2.0', 'licen...",,[],,,,,,,,,,,,,,,,,,,,AQ,
1094,3654,fairsharing-records,2021-12-02T09:58:02.958Z,2021-12-07T14:13:56.118Z,,SCAR Antarctic Biodiversity Portal,ready,"[{'contact-name': 'Anton Van de Putte', 'conta...",https://www.biodiversity.aq/,3654,Antarctic marine and terrestrial biodiversity ...,,[{'url': 'https://www.biodiversity.aq/how-to/w...,2005.0,[{'url': 'https://www.biodiversity.aq/find-dat...,[{'url': 'https://www.re3data.org/repository/r...,[],Database,knowledgebase,"[Zoology, Taxonomy, Ecology, Biodiversity, Oce...",[],[All],[],Antarctica,FAIRsharing record for: SCAR Antarctic Biodive...,,https://fairsharing.org/fairsharing_records/3654,,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: Antarctic m...,[],[{'licence-name': 'SCAR Antarctic Biodiversity...,,[],[{'url': 'https://www.biodiversity.aq/tools/r-...,,,,,,,[{'url': 'https://data.biodiversity.aq/api/v1....,,,,,,,,,,,,AQ,


### Country coverage

In [33]:
data1 = re3data_institutions.groupby('institutionCountry')[['orgIdentifier']].count().sort_values('orgIdentifier', ascending=False)
data2 = opendoar_institutions.groupby('country')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
data3 = roar_institutions.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['id']].count().sort_values('id', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['orgIdentifier'],
        name='re3data'
    ),
    go.Bar(
        x=data2.index,
        y=data2['system_metadata.id'],
        name='openDOAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['eprintid'],
        name='ROAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data4.index,
        y=data4['id'],
        name='FAIRsharing',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

### Continental coverage

In [34]:
data1 = re3data_institutions.groupby('org_continent')[['orgIdentifier']].count()
data2 = opendoar_institutions.groupby('org_continent')[['system_metadata.id']].count()
data3 = roar_institutions.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['id']].count()

plot = [
    go.Scatterpolar(
        r=data1.orgIdentifier,
        theta=data1.index,
        fill='toself',
        name='re3data'),
    go.Scatterpolar(
        r=data2['system_metadata.id'],
        theta=data2.index,
        fill='toself',
        name='OpenDOAR'),
    go.Scatterpolar(
        r=data3.eprintid,
        theta=data3.index,
        fill='toself',
        name='ROAR'),
    go.Scatterpolar(
        r=data4.id,
        theta=data4.index,
        fill='toself',
        name='FAIRsharing')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()