registries_analysis/notebooks/02-subjects&geographic.ipynb

4.1 MiB
Raw Blame History

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)
In [2]:
def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(country)
        except:
            return np.nan
        
def countrycode_iso2_to_countrycode_iso3(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(pycountry_convert.country_alpha2_to_country_name(country))
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(pycountry_convert.country_alpha3_to_country_alpha2(country_code))
        except:
            return np.nan

Loading datasets

re3data

In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additionalName': ast.literal_eval,
                                    'repositoryIdentifier': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'contentType': ast.literal_eval,
                                    'providerType': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    })

re3data_df.head()
Out[3]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... NaN [{'name': 'Databases', 'scheme': 'parse'}, {'n... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN {} ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [{'name': '1 Humanities and Social Sciences', ... https://www.archives.gov/publications/general-... [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no {"api": "https://www.archives.gov/developer#to... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [{'name': '1 Humanities and Social Sciences', ... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [{'name': 'Audiovisual data', 'scheme': 'parse... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes {} ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://odum.unc.edu/about/mission-vision/ [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes {"api": "https://guides.dataverse.org/en/lates... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-10-25
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://archaeologydataservice.ac.uk/about/our... [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes {"api": "https://archaeologydataservice.ac.uk/... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02
In [4]:
re3data_df.describe(include='all')
Out[4]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
count 2793 2793 2793 2793 2769 2793 2793 2793 2793 2793 2793 1800 172 2793 2793 2373 2793 2793 2793 2793 2793 2793 2793 2793 2793 2778 2793 2793 1339 2793 2793 1532 2793 2793 2793 2793 2793 2793 1694 2793 2793
unique 2793 2791 19 2197 2766 1024 2532 2792 6 9 1321 362 86 110 1418 2304 1351 6 2544 2773 2366 12 377 146 2294 3 695 23 2 1170 29 1337 13 3 3 16 175 544 1673 1316 722
top r3d100000001 EarthChem Library eng [] http://icgem.gfz-potsdam.de/home [] [] The National Archives and Records Administrati... eng [disciplinary] {"size": "", "updatedp": ""} 2008 2015 ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://learn.scholarsportal.info/all-guides/d... [{'name': 'Standard office documents', 'scheme... [dataProvider] [multidisciplinary] [{'institutionName': 'National Center for Biot... [][] {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["unknown"] yes {} ["none"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [] {} is covered by Elsevier. 2018-08-10 2021-09-03
freq 1 2 2596 587 2 1769 170 2 2776 1768 1472 93 12 2088 240 14 29 1806 205 7 319 2624 2201 1292 71 1851 2054 1216 1131 1526 1359 76 2199 1643 1569 2557 1693 2235 17 20 104

openDOAR

In [5]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'repository_metadata.content_subjects': ast.literal_eval,
                                    'repository_metadata.alternativename': ast.literal_eval,
                                    'repository_metadata.content_types': ast.literal_eval,
                                    'organization': ast.literal_eval
                                    },
                         dtype={'system_metadata.id': str})

opendoar_df.head()
Out[5]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
0 134 {"name": "eldorado - repository of the tu dort... [{'name': 'eldorado - ressourcen aus und für l... https://eldorado.tu-dortmund.de NaN institutional [] 2022-01-12 15:34:54 2005-12-19 14:57:52 [arts, humanities, science, mathematics, socia... [journal_articles, conference_and_workshop_pap... [{'name': 'technische universität dortmund', '... [] {"name": "dspace", "version": ""} https://eldorado.tu-dortmund.de/oai/request yes NaN 9629.0 20963.0
1 58 {"name": "archive ouverte en sciences de linfo... [{'acronym': '@rchivesic'}] https://archivesic.ccsd.cnrs.fr NaN institutional [] 2022-01-12 15:34:53 2006-01-13 12:48:32 [arts, science, technology, engineering, mathe... [journal_articles, conference_and_workshop_pap... [{'name': 'centre pour la communication scient... [] {"name": "hal", "version": ""} https://api.archives-ouvertes.fr/oai/archivesic yes NaN 55492.0 1137498.0
2 93 {"name": "digitalcommons@the texas medical cen... [] http://digitalcommons.library.tmc.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-02-14 11:16:12 [health and medicine] [journal_articles, theses_and_dissertations] [{'name': 'texas medical center', 'alternative... [] {"name": "other", "version": ""} http://digitalcommons.library.tmc.edu/do/oai/ yes NaN 2658.0 7268.0
3 68 {"name": "cognitive sciences eprint archive", ... [{'acronym': 'cogprints'}] http://cogprints.org/ NaN disciplinary [] 2022-01-12 15:34:53 2006-01-04 15:01:23 [humanities, health and medicine, science, soc... [journal_articles, conference_and_workshop_pap... [{'name': 'university of southampton', 'altern... [] {"name": "eprints", "version": ""} http://cogprints.org/cgi/oai2 yes NaN 2895.0 4277.0
4 84 {"name": "digital commons@carleton college", "... [] http://digitalcommons.carleton.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-01-04 16:07:58 [humanities, science, social sciences] [journal_articles, unpub_reports_and_working_p... [{'name': 'carleton college', 'alternativeName... [] {"name": "other", "version": ""} NaN yes NaN NaN 42.0
In [6]:
opendoar_df.describe(include='all')
Out[6]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
count 5811 5811 5811 5810 0.0 5810 5811 5811 5811 5811 5811 5811 5811 5811 4447 5811 0.0 2.292000e+03 4.184000e+03
unique 5811 5780 2116 5772 NaN 4 1 171 5643 237 477 5212 678 32 4415 1 NaN NaN NaN
top 134 {"name": "arch", "language": "en"} [] http://harp.lib.hiroshima-u.ac.jp/ NaN institutional [] 2022-01-12 15:35:47 2020-09-18 12:53:48 [science, technology, engineering, mathematics... [theses_and_dissertations] [{'name': 'rijksuniversiteit groningen', 'alte... [] {"name": "dspace", "version": ""} https://api.figshare.com/v2/oai yes NaN NaN NaN
freq 1 3 3656 3 NaN 5161 5811 73 81 3321 469 26 5131 2273 3 5811 NaN NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.022890e+03 1.765556e+05
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.212648e+04 6.611068e+06
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 0.000000e+00
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 8.937500e+02
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.225000e+02 4.012500e+03
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.931500e+03 1.629350e+04
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.817531e+06 4.200000e+08

ROAR

In [7]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)

def value_or_list(cell_set):
    copy = set(cell_set)
    copy.discard(np.nan) 
    if len(copy) == 0:
        return np.nan
    if len(copy) == 1:
        return copy.pop()
    return list(copy)
        
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)

roar_df.head()
Out[7]:
eprintid rev_number eprint_status userid importid source dir datestamp lastmod status_changed type succeeds commentary metadata_visibility latitude longitude relation_type relation_uri item_issues_id item_issues_type item_issues_description item_issues_timestamp item_issues_status item_issues_reported_by item_issues_resolved_by item_issues_comment item_issues_count sword_depositor sword_slug exemplar home_page title oai_pmh sword_endpoint rss_feed twitter_feed description fulltext open_access mandate organisation_title organisation_home_page location_country location_city location_latitude location_longitude software geoname version subjects date note suggestions activity_low activity_medium activity_high recordcount recordhistory fulltexts_total fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id submit_to submitted_to_name submitted_to_done webometrics_rank webometrics_size webometrics_visibility webometrics_rich_files webometrics_scholar monthly_deposits total_deposits association
0 1 633 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-18 05:40:07 2010-01-06 13:43:48 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://archivesic.ccsd.cnrs.fr/ @RCHIVESIC http://archivesic.ccsd.cnrs.fr/oai/oai.php NaN NaN NaN NaN NaN NaN NaN NaN NaN fr NaN NaN NaN hal geoname_2_FR other NaN 2002-05-17 19:24:41 NaN NaN 0 0 0 25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... NaN NaN NaN NaN [opendoar, celestial] [669, 58] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 10 511 archive 1 NaN NaN disk0/00/00/00/10 2010-01-06 13:43:48 2011-07-18 05:40:13 2010-01-06 13:43:48 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://www.diva-portal.org/mdh/ Academic Archive On-line (Mälardalen Universit... http://www.diva-portal.org/oai/mdh/OAI NaN NaN NaN NaN TRUE TRUE NaN NaN NaN se Uppsala 59.8667 17.6333 diva geoname_2_SE other NaN 2005-12-08 13:15:22 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... NaN NaN NaN NaN [opendoar, celestial] [258, 526] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1000 274 archive 1 NaN NaN disk0/00/00/10/00 2010-01-06 13:45:01 2011-07-06 08:21:21 2010-01-06 13:45:01 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://pam.pisharp.org/ PAM - Portuguese Archive of Mathematics NaN NaN NaN NaN NaN TRUE TRUE NaN NaN NaN pt Bellevue, WA 47.6034 -122.155 dspace geoname_2_PT other NaN 2006-05-04 10:48:14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 10001 20 archive 91 NaN NaN disk0/00/01/00/01 2015-08-08 14:52:11 2016-03-21 19:44:01 2015-08-08 14:52:11 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://edoc.sub.uni-hamburg.de/klimawandel/ Klimawandel Dokumentenserver http://edoc.sub.uni-hamburg.de/klimawandel/oai NaN NaN NaN The "Documentenserver Klimawandel" (Repository... TRUE TRUE TRUE [Helmholtz-Zentrum Geesthacht, Climate Service... [http://www.climateservicecenter.de/, http://w... de Hamburg 53.5511 9.9937 opus geoname_2_DE other [GE, S1, GF, HD, G1] 2015-07-02 08:08:31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [opendoar, celestial] [3408, 5881] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 10008 11 archive 404 NaN NaN disk0/00/01/00/08 2015-08-08 14:52:26 2016-03-21 19:43:51 2015-08-08 14:52:26 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://creativematter.skidmore.edu/ Creative Matter | Skidmore College Research http://creativematter.skidmore.edu/do/oai/ NaN http://creativematter.skidmore.edu/recent.rss NaN Welcome to Creative Matter, a repository for t... TRUE FALSE FALSE Skidmore College http://www.skidmore.edu/ us Saratoga Springs 43.0961 -73.7818 bepress geoname_2_US other NaN 2015-07-06 17:35:50 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN celestial 5882 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [8]:
roar_df.describe(include='all')
Out[8]:
eprintid rev_number eprint_status userid importid source dir datestamp lastmod status_changed type succeeds commentary metadata_visibility latitude longitude relation_type relation_uri item_issues_id item_issues_type item_issues_description item_issues_timestamp item_issues_status item_issues_reported_by item_issues_resolved_by item_issues_comment item_issues_count sword_depositor sword_slug exemplar home_page title oai_pmh sword_endpoint rss_feed twitter_feed description fulltext open_access mandate organisation_title organisation_home_page location_country location_city location_latitude location_longitude software geoname version subjects date note suggestions activity_low activity_medium activity_high recordcount recordhistory fulltexts_total fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id submit_to submitted_to_name submitted_to_done webometrics_rank webometrics_size webometrics_visibility webometrics_rich_files webometrics_scholar monthly_deposits total_deposits association
count 5444 5444 5444 5444 0.0 0.0 5444 5444 5444 5444 5444 108 0.0 5444 0.0 0.0 0.0 0.0 63 63 63 63 63 0.0 0.0 0.0 2242 0.0 0.0 268 5437 5442 4332 178 1538 116 3837 4197 4197 3746 4460 4286 5138 3714 3725 3708 4700 4730 5444 1289 5429 218 189 2288 2288 2288 2290 2288 270 258 270 258 4605 4580 375 205 205 148 148 148 148 148 756 756 223
unique 5444 660 1 2189 NaN NaN 5444 4198 4043 4230 12 108 NaN 2 NaN NaN NaN NaN 48 5 62 4 3 NaN NaN NaN 4 NaN NaN 2 5271 5143 4059 172 1485 112 3359 2 2 2 3858 3831 144 1884 2923 2953 31 126 53 938 4898 210 173 72 54 16 741 1702 135 118 134 117 7 4261 7 1 1 148 148 148 146 143 346 342 3
top 1 11 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-06 08:24:53 2010-01-06 13:43:48 institutional 10164 NaN show NaN NaN NaN NaN bad_oai_pmh_url_0 duplicate_title Duplicate title to <xhtml:table xmlns:xhtml="h... 2010-01-13 10:44:49 discovered NaN NaN NaN 0 NaN NaN FALSE http://eprints.upnjatim.ac.id/ Repositorio Institucional http://kce.docressources.info/ws/PMBWs_2 http://producao.usp.br/sword/servicedocument http://eprints.upnjatim.ac.id/cgi/latest_tool?... http://my.indexcopernicus.com/fredemoreno info:other:archives.eprints.org:import TRUE TRUE FALSE Chinese Academy of Science (中国科学院) http://www.cas.cn/ us Lima 34.1607 -118.139 dspace geoname_2_US other K1 2006-05-04 10:48:14 ¿Quién puede depositar documentos en el reposi... This repository is hosted by the Texas Digital... 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 0 0 0 0 [opendoar, celestial] 2479 [opendoar, roarmap, celestial] opendoar 2021-01-25 24 46 20 824 806 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 0 russell_group
freq 1 333 5444 1330 NaN NaN 1 16 8 16 3853 1 NaN 5402 NaN NaN NaN NaN 15 33 2 45 38 NaN NaN NaN 2201 NaN NaN 261 4 7 4 2 5 2 112 2805 2696 2748 9 9 891 74 25 25 2341 845 4841 53 99 2 9 2012 2074 2210 730 95 113 114 113 114 2106 4 119 205 205 1 1 1 3 5 387 387 130
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

FAIRsharing

In [9]:
with open('../data/raw/fairsharing_dump_api_02_2022.json') as f:
    lines = f.read().splitlines()
    
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))

fairsharing_df.head()
Out[9]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.abbreviation attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.metadata.cross-references attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.url-for-logo attributes.metadata.citations attributes.metadata.associated-tools attributes.metadata.deprecation-reason attributes.metadata.data-access-condition.type attributes.metadata.data-contact-information attributes.metadata.data-deposition-condition.url attributes.metadata.data-deposition-condition.type attributes.metadata.deprecation-date attributes.metadata.access-points attributes.metadata.data-access-condition.url attributes.metadata.resource-sustainability.url attributes.metadata.resource-sustainability.name attributes.metadata.data-preservation-policy.url attributes.metadata.data-preservation-policy.name attributes.metadata.data-access-for-pre-publication-review attributes.metadata.data-versioning attributes.metadata.data-curation.type attributes.metadata.data-curation.url attributes.metadata.citation-to-related-publications attributes.metadata.tombstone
0 3226 fairsharing-records 2020-12-09T11:53:44.000Z 2022-02-08T10:42:36.452Z 10.25504/FAIRsharing.d6423b WDC Sunspot Index and Long-term Solar Observat... ready [{'contact-name': 'Frédéric Clette', 'contact-... http://sidc.be/silso/home 3226 The WDC-SILSO is an activity of the Operationa... WDC-SILSO [{'url': 'http://www.sidc.be/silso/taxonomy/te... 2013.0 [{'url': 'http://www.sidc.be/silso/datafiles',... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-001740, bsg-d001740] Database repository [Electromagnetism, Astrophysics and Astronomy,... [Climate, Observation design] [Not applicable] [Climate change, earth observation, Electromag... [Belgium] FAIRsharing record for: WDC Sunspot Index and ... WDC-SILSO https://fairsharing.org/10.25504/FAIRsharing.d... 10.25504/FAIRsharing.d6423b https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WDC-SIL... [] [{'licence-name': 'SILSO legal notices', 'lice... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 2114 fairsharing-records 2014-11-04T15:23:40.000Z 2022-01-21T14:39:02.195Z 10.25504/FAIRsharing.p06nme Biological Magnetic Resonance Data Bank ready [{'contact-name': 'Helpdesk', 'contact-email':... https://bmrb.io/ 2114 BMRB collects, annotates, archives, and dissem... BMRB [{'url': 'https://bmrb.io/bmrb/news/', 'name':... 1988.0 [{'url': 'https://bmrb.io/data_library/rsync.s... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-000584, bsg-d000584] Database repository [Structural Biology] [Molecular structure, Protein structure, Pepti... [All] [] [United States] FAIRsharing record for: Biological Magnetic Re... BMRB https://fairsharing.org/10.25504/FAIRsharing.p... 10.25504/FAIRsharing.p06nme https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: BMRB collec... [{'id': 552, 'pubmed_id': 18288446, 'title': '... [{'licence-name': 'wwPDB Privacy and Usage Pol... None [{'doi': '10.1093/nar/gkm957', 'pubmed-id': 17... [{'url': 'https://bmrb.io/validate/', 'name': ... open yes https://bmrb.io/deposit/ open NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 3022 fairsharing-records 2020-06-17T10:25:30.000Z 2022-02-08T10:41:04.073Z 10.25504/FAIRsharing.8b7a2f Fisheries and Oceans Canada Pacific Region Dat... ready [{'contact-name': 'Peter Chandler', 'contact-e... http://www.pac.dfo-mpo.gc.ca/science/oceans/da... 3022 The Institute of Ocean Sciences (IOS)/Ocean Sc... None [{'url': 'DFO.PAC.SCI.IOSData-DonneesISO.SCI.P... NaN [{'name': 'Users must contact the Senior Analy... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-001530, bsg-d001530] Database repository [Environmental Science, Meteorology, Earth Sci... [Climate] [Not applicable] [Salinity, Temperature] [Canada] FAIRsharing record for: Fisheries and Oceans C... None https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8b7a2f https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The Institu... [] [{'licence-name': 'Fisheries and Oceans Canada... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 2998 fairsharing-records 2020-05-21T07:42:30.000Z 2022-02-08T10:40:19.531Z 10.25504/FAIRsharing.e08886 Climate Prediction Center ready [{'contact-name': 'Jon Hoopingarner', 'contact... https://www.cpc.ncep.noaa.gov/ 2998 The Climate Prediction Center (CPC) produces o... CPC [{'url': 'https://www.cpc.ncep.noaa.gov/commen... 1970.0 [{'url': 'https://www.cpc.ncep.noaa.gov/', 'na... [{'url': 'https://www.re3data.org/repository/r... [biodbcore-001504, bsg-d001504] Database repository [Hydrogeology, Geography, Meteorology, Geodesy... [Climate] [Not applicable] [Forecasting, weather] [United States] FAIRsharing record for: Climate Prediction Center CPC https://fairsharing.org/10.25504/FAIRsharing.e... 10.25504/FAIRsharing.e08886 https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The Climate... [] [{'licence-name': 'National Weather Service Di... None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 2301 fairsharing-records 2016-06-03T14:54:08.000Z 2021-11-24T13:17:51.201Z 10.25504/FAIRsharing.meh9wz Acytostelium Gene Database deprecated [{'contact-name': 'Acytostelium genome consort... http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b... 2301 Genome and transcriptome database of Acytostel... NaN NaN 2008.0 NaN NaN [biodbcore-000775, bsg-d000775] Database repository [Genomics, Life Science, Transcriptomics] [DNA sequence data, Gene model annotation] [Acytostelium subglobosum] [] [United Kingdom, Japan] FAIRsharing record for: Acytostelium Gene Data... None https://fairsharing.org/10.25504/FAIRsharing.m... 10.25504/FAIRsharing.meh9wz https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: Genome and ... [{'id': 1139, 'pubmed_id': 25758444, 'title': ... [] None NaN NaN This resource is no longer available at the st... NaN NaN NaN NaN 2021-9-17 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [10]:
fairsharing_df.describe(include='all')
Out[10]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.abbreviation attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.metadata.cross-references attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.url-for-logo attributes.metadata.citations attributes.metadata.associated-tools attributes.metadata.deprecation-reason attributes.metadata.data-access-condition.type attributes.metadata.data-contact-information attributes.metadata.data-deposition-condition.url attributes.metadata.data-deposition-condition.type attributes.metadata.deprecation-date attributes.metadata.access-points attributes.metadata.data-access-condition.url attributes.metadata.resource-sustainability.url attributes.metadata.resource-sustainability.name attributes.metadata.data-preservation-policy.url attributes.metadata.data-preservation-policy.name attributes.metadata.data-access-for-pre-publication-review attributes.metadata.data-versioning attributes.metadata.data-curation.type attributes.metadata.data-curation.url attributes.metadata.citation-to-related-publications attributes.metadata.tombstone
count 1853 1853 1853 1853 1601 1853 1853 1764 1853 1853.000000 1853 1671 1663 1541.000000 1626 790 1853 1853 1853 1853 1853 1853 1853 1853 1853 1671 1853 1601 1853 1853 1853 1853 18 621 632 363 42 47 22 33 238 465 19 2 2 3 3 10 17 22 8 35 1
unique 1853 1 1218 1853 1601 1851 4 1623 1853 NaN 1853 1655 1646 NaN 1625 790 1799 1 3 935 1205 385 395 194 1851 1655 1853 1601 1 1853 1135 1119 18 331 627 104 2 2 22 2 71 460 19 2 2 3 3 2 2 4 8 2 1
top 3226 fairsharing-records 2014-11-04T15:23:40.000Z 2022-02-08T10:42:36.452Z 10.25504/FAIRsharing.d6423b iDog ready [] http://sidc.be/silso/home NaN The WDC-SILSO is an activity of the Operationa... CGD [{'url': 'https://github.com/gbif/ipt/wiki/IPT... NaN [{'url': 'https://site.uit.no/dataverseno/abou... [{'url': 'https://www.re3data.org/repository/r... [] Database repository [Life Science] [] [All] [] [United States] FAIRsharing record for: iDog CGD https://fairsharing.org/10.25504/FAIRsharing.d... 10.25504/FAIRsharing.d6423b https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WDC-SIL... [] [] /rails/active_storage/blobs/redirect/eyJfcmFpb... [] [] open yes https://bmrb.io/deposit/ controlled 2021-9-17 [{'url': 'https://heidata.uni-heidelberg.de/oa... https://arch.library.northwestern.edu/about?lo... https://www.library.northwestern.edu/about/adm... Commitment to Sustainability: Level 1 http://www.library.northwestern.edu/about/admi... Digital Preservation Policy: Level 1 yes yes manual https://www.gbif.org/tools/data-validator/about yes True
freq 1 1853 636 1 1 2 1564 40 1 NaN 1 3 6 NaN 2 1 55 1853 954 345 276 528 1258 607 2 3 1 1 1853 1 690 735 1 285 3 125 38 45 1 21 81 3 1 1 1 1 1 9 16 11 1 34 1
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN 2481.862925 NaN NaN NaN 2007.894873 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN 554.072492 NaN NaN NaN 10.933713 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN 1120.000000 NaN NaN NaN 1894.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2009.000000 NaN NaN NaN 2004.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2473.000000 NaN NaN NaN 2010.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2938.000000 NaN NaN NaN 2015.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN 3827.000000 NaN NaN NaN 2022.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Subjects analysis

re3data

In [11]:
re3data_subjects = re3data_df[['orgIdentifier', 'subject']].explode('subject')
re3data_subjects['subject'] = re3data_subjects['subject'].apply(lambda x: x['name'] if x is not np.nan else np.nan)
re3data_subjects
Out[11]:
orgIdentifier subject
0 r3d100000001 1 Humanities and Social Sciences
0 r3d100000001 111 Social Sciences
0 r3d100000001 11104 Political Science
0 r3d100000001 112 Economics
0 r3d100000001 12 Social and Behavioural Sciences
... ... ...
2791 r3d100013733 4 Engineering Sciences
2792 r3d100013735 2 Life Sciences
2792 r3d100013735 204 Microbiology, Virology and Immunology
2792 r3d100013735 21 Biology
2792 r3d100013735 22 Medicine

17032 rows × 2 columns

In [12]:
data = re3data_subjects.groupby('subject')[['orgIdentifier']].count().sort_values('subject', ascending=False)
data
plot = [
    go.Bar(
        x=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)].index,
        y=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)]['orgIdentifier'],
        name='re3data tier %s-digits' % tier
    ) for tier in [1,2,3,5]
] 

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

OpenDOAR

In [13]:
opendoar_subjects = opendoar_df.explode('repository_metadata.content_subjects')
In [14]:
data = opendoar_subjects.groupby('repository_metadata.content_subjects')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['system_metadata.id'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

ROAR

In [15]:
roar_subjects = roar_df.explode('subjects')
In [16]:
data = roar_subjects.groupby('subjects')[['eprintid']].count().sort_values('eprintid', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['eprintid'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

FAIRsharing

In [17]:
fairsharing_subjects = fairsharing_df.explode('attributes.subjects')
In [18]:
data = fairsharing_subjects.groupby('attributes.subjects')[['id']].count().sort_values('id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['id'],
        name='FAIRsharing'
    )
]

layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

Geographic analysis

re3data

In [19]:
re3data_institutions = re3data_df.explode('institution')[['orgIdentifier', 'institution']]
re3data_institutions = re3data_institutions[~re3data_institutions.institution.isna()].reset_index(drop=True)
re3data_institutions = re3data_institutions.join(pd.json_normalize(re3data_institutions.institution))
re3data_institutions.head()
Out[19]:
orgIdentifier institution institutionName institutionAdditionalName institutionCountry responsabilityType institutionType institutionURL institutionIdentifier responsibilityStartDate responsibilityEndDate institutionContact
0 r3d100000001 {'institutionName': 'Odum Institute for Resear... Odum Institute for Research in Social Science [] USA [general] non-profit https://odum.unc.edu/archive/ [] []
1 r3d100000002 {'institutionName': 'The U.S. National Archive... The U.S. National Archives and Records Adminis... [NARA, National Archives] USA [general] non-profit http://www.archives.gov/ [] [http://www.archives.gov/contact/]
2 r3d100000002 {'institutionName': 'The USA.gov', 'institutio... The USA.gov [] USA [general] non-profit http://www.usa.gov/ [] [http://www.usa.gov/Contact.shtml]
3 r3d100000004 {'institutionName': 'Institut für Deutsche Spr... Institut für Deutsche Sprache, Archiv für Gesp... [AGD] DEU [funding, general] non-profit http://agd.ids-mannheim.de/index.shtml [] 2004 [agd@ids-mannheim.de]
4 r3d100000005 {'institutionName': 'Odum Institute for Resear... Odum Institute for Research in Social Science [] USA [technical] non-profit https://odum.unc.edu/ [] [https://odum.unc.edu/contact/contact-form/, o...
In [20]:
re3data_institutions['org_continent'] = re3data_institutions.institutionCountry.map(countrycode_to_continent)
In [21]:
re3data_institutions[re3data_institutions.org_continent.isna()].institutionCountry.unique()
Out[21]:
array(['AAA', 'EEC'], dtype=object)

AAA is used for international collaborations; we skip this. EEC is used for the EU commission; we fix the continent manually.

In [22]:
re3data_institutions.loc[re3data_institutions.institutionCountry == 'EEC', 'org_continent'] = 'EU'

OpenDOAR

In [23]:
opendoar_institutions = opendoar_df.explode('organization')[['system_metadata.id', 'organization']]
opendoar_institutions = opendoar_institutions[~opendoar_institutions.organization.isna()].reset_index(drop=True)
opendoar_institutions = opendoar_institutions.join(pd.json_normalize(opendoar_institutions.organization))
opendoar_institutions['country'] = opendoar_institutions.country.map(str.upper, na_action='ignore')
opendoar_institutions['country'] = opendoar_institutions.country.map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
opendoar_institutions.head()
Out[23]:
system_metadata.id organization name alternativeName country url identifier location.latitude location.longiture
0 134 {'name': 'technische universität dortmund', 'a... technische universität dortmund tu dortmund DEU https://www.tu-dortmund.de [{'identifier': 'https://ror.org/01k97gp34', '...
1 58 {'name': 'centre pour la communication scienti... centre pour la communication scientifique directe ccsd FRA https://www.ccsd.cnrs.fr []
2 93 {'name': 'texas medical center', 'alternativeN... texas medical center tmc USA https://www.tmc.edu [{'identifier': 'https://ror.org/00dqsbj20', '...
3 68 {'name': 'university of southampton', 'alterna... university of southampton GBR https://www.southampton.ac.uk/ [{'identifier': 'https://ror.org/01ryk1543', '...
4 84 {'name': 'carleton college', 'alternativeName'... carleton college USA https://www.carleton.edu [{'identifier': 'https://ror.org/03jep7677', '...
In [24]:
opendoar_institutions['org_continent'] = opendoar_institutions.country.map(countrycode_to_continent)
In [25]:
opendoar_institutions[opendoar_institutions.org_continent.isna()].country.unique()
Out[25]:
array([nan, 'UMI'], dtype=object)
In [26]:
opendoar_institutions.loc[opendoar_institutions.country == 'UMI', 'org_continent'] = 'NA'
opendoar_institutions[opendoar_institutions.country == 'UMI']
Out[26]:
system_metadata.id organization name alternativeName country url identifier location.latitude location.longiture org_continent
4233 5379 {'name': 'kettering university', 'alternativeN... kettering university UMI https://www.kettering.edu [{'identifier': 'https://ror.org/03rcspa57', '... NA

ROAR

In [27]:
roar_institutions = roar_df.explode('location_country')
roar_institutions['location_country'] = roar_institutions.location_country.map(str.upper, na_action='ignore')
roar_institutions['location_country'] = roar_institutions.location_country.map(countrycode_iso2_to_countrycode_iso3)
roar_institutions['continent'] = roar_institutions.location_country.map(countrycode_to_continent)

FAIRsharing

In [28]:
fairsharing_countries = fairsharing_df.explode('attributes.countries')
fairsharing_countries['countrycode'] = fairsharing_countries['attributes.countries'].map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)
In [29]:
fairsharing_countries[fairsharing_countries.countrycode.isna()]['attributes.countries'].unique()
Out[29]:
array(['European Union', 'Worldwide', nan], dtype=object)
In [30]:
fairsharing_countries[fairsharing_countries.continent.isna()]['attributes.countries'].unique()
Out[30]:
array(['European Union', 'Worldwide', nan, 'Antarctica'], dtype=object)

Fix manually some rows

In [31]:
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Republic of Ireland', ['attributes.countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

Make Antactica disappear (only one repo)

In [32]:
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[32]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.abbreviation attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.metadata.cross-references attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.url-for-logo attributes.metadata.citations attributes.metadata.associated-tools attributes.metadata.deprecation-reason attributes.metadata.data-access-condition.type attributes.metadata.data-contact-information attributes.metadata.data-deposition-condition.url attributes.metadata.data-deposition-condition.type attributes.metadata.deprecation-date attributes.metadata.access-points attributes.metadata.data-access-condition.url attributes.metadata.resource-sustainability.url attributes.metadata.resource-sustainability.name attributes.metadata.data-preservation-policy.url attributes.metadata.data-preservation-policy.name attributes.metadata.data-access-for-pre-publication-review attributes.metadata.data-versioning attributes.metadata.data-curation.type attributes.metadata.data-curation.url attributes.metadata.citation-to-related-publications attributes.metadata.tombstone countrycode continent
325 2462 fairsharing-records 2017-06-27T13:30:19.000Z 2021-12-02T18:05:26.741Z 10.25504/FAIRsharing.ewyejx Antabif IPT - AntOBIS IPT - GBIF Belgium ready [{'contact-name': 'Anton Van de Putte', 'conta... http://ipt.biodiversity.aq/ 2462 The Belgium Biodiversity Platform hosts this d... NaN [{'url': 'a.heughebaert@biodiversity.be', 'nam... NaN NaN NaN [biodbcore-000944, bsg-d000944] Database repository [Biodiversity, Life Science] [Taxonomic classification] [All] [] Antarctica FAIRsharing record for: Antabif IPT - AntOBIS ... None https://fairsharing.org/10.25504/FAIRsharing.e... 10.25504/FAIRsharing.ewyejx https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The Belgium... [] [{'licence-name': 'Apache License 2.0', 'licen... None [] NaN None NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN AQ NaN
1094 3654 fairsharing-records 2021-12-02T09:58:02.958Z 2021-12-07T14:13:56.118Z NaN SCAR Antarctic Biodiversity Portal ready [{'contact-name': 'Anton Van de Putte', 'conta... https://www.biodiversity.aq/ 3654 Antarctic marine and terrestrial biodiversity ... None [{'url': 'https://www.biodiversity.aq/how-to/w... 2005.0 [{'url': 'https://www.biodiversity.aq/find-dat... [{'url': 'https://www.re3data.org/repository/r... [] Database knowledgebase [Zoology, Taxonomy, Ecology, Biodiversity, Oce... [] [All] [] Antarctica FAIRsharing record for: SCAR Antarctic Biodive... None https://fairsharing.org/fairsharing_records/3654 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: Antarctic m... [] [{'licence-name': 'SCAR Antarctic Biodiversity... None [] [{'url': 'https://www.biodiversity.aq/tools/r-... NaN NaN NaN NaN NaN [{'url': 'https://data.biodiversity.aq/api/v1.... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN AQ NaN

Country coverage

In [33]:
data1 = re3data_institutions.groupby('institutionCountry')[['orgIdentifier']].count().sort_values('orgIdentifier', ascending=False)
data2 = opendoar_institutions.groupby('country')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
data3 = roar_institutions.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['id']].count().sort_values('id', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['orgIdentifier'],
        name='re3data'
    ),
    go.Bar(
        x=data2.index,
        y=data2['system_metadata.id'],
        name='openDOAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['eprintid'],
        name='ROAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data4.index,
        y=data4['id'],
        name='FAIRsharing',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage

In [34]:
data1 = re3data_institutions.groupby('org_continent')[['orgIdentifier']].count()
data2 = opendoar_institutions.groupby('org_continent')[['system_metadata.id']].count()
data3 = roar_institutions.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['id']].count()

plot = [
    go.Scatterpolar(
        r=data1.orgIdentifier,
        theta=data1.index,
        fill='toself',
        name='re3data'),
    go.Scatterpolar(
        r=data2['system_metadata.id'],
        theta=data2.index,
        fill='toself',
        name='OpenDOAR'),
    go.Scatterpolar(
        r=data3.eprintid,
        theta=data3.index,
        fill='toself',
        name='ROAR'),
    go.Scatterpolar(
        r=data4.id,
        theta=data4.index,
        fill='toself',
        name='FAIRsharing')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()