registries_analysis/notebooks/02-subjects&geographic.ipynb

4.1 MiB
Raw Blame History

In [13]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)
In [14]:
def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(country)
        except:
            return np.nan
        
def countrycode_iso2_to_countrycode_iso3(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(pycountry_convert.country_alpha2_to_country_name(country))
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(pycountry_convert.country_alpha3_to_country_alpha2(country_code))
        except:
            return np.nan

Loading datasets

re3data

In [15]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additionalName': ast.literal_eval,
                                    'repositoryIdentifier': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'contentType': ast.literal_eval,
                                    'providerType': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    })

re3data_df.head()
Out[15]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... NaN [{'name': 'Databases', 'scheme': 'parse'}, {'n... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN {} ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [{'name': '1 Humanities and Social Sciences', ... https://www.archives.gov/publications/general-... [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no {"api": "https://www.archives.gov/developer#to... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [{'name': '1 Humanities and Social Sciences', ... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [{'name': 'Audiovisual data', 'scheme': 'parse... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes {} ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://odum.unc.edu/about/mission-vision/ [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes {"api": "https://guides.dataverse.org/en/lates... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-10-25
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://archaeologydataservice.ac.uk/about/our... [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes {"api": "https://archaeologydataservice.ac.uk/... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02
In [16]:
re3data_df.describe(include='all')
Out[16]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
count 2793 2793 2793 2793 2769 2793 2793 2793 2793 2793 2793 1800 172 2793 2793 2373 2793 2793 2793 2793 2793 2793 2793 2793 2793 2778 2793 2793 1339 2793 2793 1532 2793 2793 2793 2793 2793 2793 1694 2793 2793
unique 2793 2791 19 2197 2766 1024 2532 2792 6 9 1321 362 86 110 1418 2304 1351 6 2544 2773 2366 12 377 146 2294 3 695 23 2 1170 29 1337 13 3 3 16 175 544 1673 1316 722
top r3d100000001 EarthChem Library eng [] http://icgem.gfz-potsdam.de/home [] [] The National Archives and Records Administrati... eng [disciplinary] {"size": "", "updatedp": ""} 2008 2015 ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://learn.scholarsportal.info/all-guides/d... [{'name': 'Standard office documents', 'scheme... [dataProvider] [multidisciplinary] [{'institutionName': 'National Center for Biot... [][] {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["unknown"] yes {} ["none"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [] {} is covered by Elsevier. 2018-08-10 2021-09-03
freq 1 2 2596 587 2 1769 170 2 2776 1768 1472 93 12 2088 240 14 29 1806 205 7 319 2624 2201 1292 71 1851 2054 1216 1131 1526 1359 76 2199 1643 1569 2557 1693 2235 17 20 104

openDOAR

In [17]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'repository_metadata.content_subjects': ast.literal_eval,
                                    'repository_metadata.alternativename': ast.literal_eval,
                                    'repository_metadata.content_types': ast.literal_eval,
                                    'organization': ast.literal_eval
                                    },
                         dtype={'system_metadata.id': str})

opendoar_df.head()
Out[17]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
0 134 {"name": "eldorado - repository of the tu dort... [{'name': 'eldorado - ressourcen aus und für l... https://eldorado.tu-dortmund.de NaN institutional [] 2022-01-12 15:34:54 2005-12-19 14:57:52 [arts, humanities, science, mathematics, socia... [journal_articles, conference_and_workshop_pap... [{'name': 'technische universität dortmund', '... [] {"name": "dspace", "version": ""} https://eldorado.tu-dortmund.de/oai/request yes NaN 9629.0 20963.0
1 58 {"name": "archive ouverte en sciences de linfo... [{'acronym': '@rchivesic'}] https://archivesic.ccsd.cnrs.fr NaN institutional [] 2022-01-12 15:34:53 2006-01-13 12:48:32 [arts, science, technology, engineering, mathe... [journal_articles, conference_and_workshop_pap... [{'name': 'centre pour la communication scient... [] {"name": "hal", "version": ""} https://api.archives-ouvertes.fr/oai/archivesic yes NaN 55492.0 1137498.0
2 93 {"name": "digitalcommons@the texas medical cen... [] http://digitalcommons.library.tmc.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-02-14 11:16:12 [health and medicine] [journal_articles, theses_and_dissertations] [{'name': 'texas medical center', 'alternative... [] {"name": "other", "version": ""} http://digitalcommons.library.tmc.edu/do/oai/ yes NaN 2658.0 7268.0
3 68 {"name": "cognitive sciences eprint archive", ... [{'acronym': 'cogprints'}] http://cogprints.org/ NaN disciplinary [] 2022-01-12 15:34:53 2006-01-04 15:01:23 [humanities, health and medicine, science, soc... [journal_articles, conference_and_workshop_pap... [{'name': 'university of southampton', 'altern... [] {"name": "eprints", "version": ""} http://cogprints.org/cgi/oai2 yes NaN 2895.0 4277.0
4 84 {"name": "digital commons@carleton college", "... [] http://digitalcommons.carleton.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-01-04 16:07:58 [humanities, science, social sciences] [journal_articles, unpub_reports_and_working_p... [{'name': 'carleton college', 'alternativeName... [] {"name": "other", "version": ""} NaN yes NaN NaN 42.0
In [18]:
opendoar_df.describe(include='all')
Out[18]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
count 5811 5811 5811 5810 0.0 5810 5811 5811 5811 5811 5811 5811 5811 5811 4447 5811 0.0 2.292000e+03 4.184000e+03
unique 5811 5780 2116 5772 NaN 4 1 171 5643 237 477 5212 678 32 4415 1 NaN NaN NaN
top 134 {"name": "arch", "language": "en"} [] http://harp.lib.hiroshima-u.ac.jp/ NaN institutional [] 2022-01-12 15:35:47 2020-09-18 12:53:48 [science, technology, engineering, mathematics... [theses_and_dissertations] [{'name': 'rijksuniversiteit groningen', 'alte... [] {"name": "dspace", "version": ""} https://api.figshare.com/v2/oai yes NaN NaN NaN
freq 1 3 3656 3 NaN 5161 5811 73 81 3321 469 26 5131 2273 3 5811 NaN NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.022890e+03 1.765556e+05
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.212648e+04 6.611068e+06
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 0.000000e+00
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 8.937500e+02
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.225000e+02 4.012500e+03
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.931500e+03 1.629350e+04
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.817531e+06 4.200000e+08

ROAR

In [19]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)

def value_or_list(cell_set):
    copy = set(cell_set)
    copy.discard(np.nan) 
    if len(copy) == 0:
        return np.nan
    if len(copy) == 1:
        return copy.pop()
    return list(copy)
        
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)

roar_df.head()
Out[19]:
eprintid rev_number eprint_status userid importid source dir datestamp lastmod status_changed type succeeds commentary metadata_visibility latitude longitude relation_type relation_uri item_issues_id item_issues_type item_issues_description item_issues_timestamp item_issues_status item_issues_reported_by item_issues_resolved_by item_issues_comment item_issues_count sword_depositor sword_slug exemplar home_page title oai_pmh sword_endpoint rss_feed twitter_feed description fulltext open_access mandate organisation_title organisation_home_page location_country location_city location_latitude location_longitude software geoname version subjects date note suggestions activity_low activity_medium activity_high recordcount recordhistory fulltexts_total fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id submit_to submitted_to_name submitted_to_done webometrics_rank webometrics_size webometrics_visibility webometrics_rich_files webometrics_scholar monthly_deposits total_deposits association
0 1 633 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-18 05:40:07 2010-01-06 13:43:48 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://archivesic.ccsd.cnrs.fr/ @RCHIVESIC http://archivesic.ccsd.cnrs.fr/oai/oai.php NaN NaN NaN NaN NaN NaN NaN NaN NaN fr NaN NaN NaN hal geoname_2_FR other NaN 2002-05-17 19:24:41 NaN NaN 0 0 0 25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... NaN NaN NaN NaN [opendoar, celestial] [58, 669] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 10 511 archive 1 NaN NaN disk0/00/00/00/10 2010-01-06 13:43:48 2011-07-18 05:40:13 2010-01-06 13:43:48 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://www.diva-portal.org/mdh/ Academic Archive On-line (Mälardalen Universit... http://www.diva-portal.org/oai/mdh/OAI NaN NaN NaN NaN TRUE TRUE NaN NaN NaN se Uppsala 59.8667 17.6333 diva geoname_2_SE other NaN 2005-12-08 13:15:22 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... NaN NaN NaN NaN [opendoar, celestial] [526, 258] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1000 274 archive 1 NaN NaN disk0/00/00/10/00 2010-01-06 13:45:01 2011-07-06 08:21:21 2010-01-06 13:45:01 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://pam.pisharp.org/ PAM - Portuguese Archive of Mathematics NaN NaN NaN NaN NaN TRUE TRUE NaN NaN NaN pt Bellevue, WA 47.6034 -122.155 dspace geoname_2_PT other NaN 2006-05-04 10:48:14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 10001 20 archive 91 NaN NaN disk0/00/01/00/01 2015-08-08 14:52:11 2016-03-21 19:44:01 2015-08-08 14:52:11 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://edoc.sub.uni-hamburg.de/klimawandel/ Klimawandel Dokumentenserver http://edoc.sub.uni-hamburg.de/klimawandel/oai NaN NaN NaN The "Documentenserver Klimawandel" (Repository... TRUE TRUE TRUE [Climate Service Center 2.0, KLIMZUG projects,... [http://www.climateservicecenter.de/, http://w... de Hamburg 53.5511 9.9937 opus geoname_2_DE other [GE, GF, HD, S1, G1] 2015-07-02 08:08:31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [opendoar, celestial] [5881, 3408] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 10008 11 archive 404 NaN NaN disk0/00/01/00/08 2015-08-08 14:52:26 2016-03-21 19:43:51 2015-08-08 14:52:26 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://creativematter.skidmore.edu/ Creative Matter | Skidmore College Research http://creativematter.skidmore.edu/do/oai/ NaN http://creativematter.skidmore.edu/recent.rss NaN Welcome to Creative Matter, a repository for t... TRUE FALSE FALSE Skidmore College http://www.skidmore.edu/ us Saratoga Springs 43.0961 -73.7818 bepress geoname_2_US other NaN 2015-07-06 17:35:50 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN celestial 5882 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [20]:
roar_df.describe(include='all')
Out[20]:
eprintid rev_number eprint_status userid importid source dir datestamp lastmod status_changed type succeeds commentary metadata_visibility latitude longitude relation_type relation_uri item_issues_id item_issues_type item_issues_description item_issues_timestamp item_issues_status item_issues_reported_by item_issues_resolved_by item_issues_comment item_issues_count sword_depositor sword_slug exemplar home_page title oai_pmh sword_endpoint rss_feed twitter_feed description fulltext open_access mandate organisation_title organisation_home_page location_country location_city location_latitude location_longitude software geoname version subjects date note suggestions activity_low activity_medium activity_high recordcount recordhistory fulltexts_total fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id submit_to submitted_to_name submitted_to_done webometrics_rank webometrics_size webometrics_visibility webometrics_rich_files webometrics_scholar monthly_deposits total_deposits association
count 5444 5444 5444 5444 0.0 0.0 5444 5444 5444 5444 5444 108 0.0 5444 0.0 0.0 0.0 0.0 63 63 63 63 63 0.0 0.0 0.0 2242 0.0 0.0 268 5437 5442 4332 178 1538 116 3837 4197 4197 3746 4460 4286 5138 3714 3725 3708 4700 4730 5444 1289 5429 218 189 2288 2288 2288 2290 2288 270 258 270 258 4605 4580 375 205 205 148 148 148 148 148 756 756 223
unique 5444 660 1 2189 NaN NaN 5444 4198 4043 4230 12 108 NaN 2 NaN NaN NaN NaN 48 5 62 4 3 NaN NaN NaN 4 NaN NaN 2 5271 5143 4059 172 1485 112 3359 2 2 2 3858 3831 144 1884 2923 2953 31 126 53 938 4898 210 173 72 54 16 741 1702 135 118 134 117 7 4259 7 1 1 148 148 148 146 143 346 342 3
top 1 11 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-06 08:24:53 2010-01-06 13:43:48 institutional 10164 NaN show NaN NaN NaN NaN bad_oai_pmh_url_0 duplicate_title Duplicate title to <xhtml:table xmlns:xhtml="h... 2010-01-13 10:44:49 discovered NaN NaN NaN 0 NaN NaN FALSE http://eprints.upnjatim.ac.id/ Repositorio Institucional http://kce.docressources.info/ws/PMBWs_2 http://producao.usp.br/sword/servicedocument http://eprints.upnjatim.ac.id/cgi/latest_tool?... http://my.indexcopernicus.com/fredemoreno info:other:archives.eprints.org:import TRUE TRUE FALSE Chinese Academy of Science (中国科学院) http://www.cas.cn/ us Lima 34.1607 -118.139 dspace geoname_2_US other K1 2006-05-04 10:48:14 ¿Quién puede depositar documentos en el reposi... This repository is hosted by the Texas Digital... 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 0 0 0 0 [opendoar, celestial] 2479 [roarmap, opendoar, celestial] opendoar 2021-01-25 24 46 20 824 806 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 0 russell_group
freq 1 333 5444 1330 NaN NaN 1 16 8 16 3853 1 NaN 5402 NaN NaN NaN NaN 15 33 2 45 38 NaN NaN NaN 2201 NaN NaN 261 4 7 4 2 5 2 112 2805 2696 2748 9 9 891 74 25 25 2341 845 4841 53 99 2 9 2012 2074 2210 730 95 113 114 113 114 2106 4 119 205 205 1 1 1 3 5 387 387 130
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

FAIRsharing

In [21]:
with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:
    lines = f.read().splitlines()
    
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))

fairsharing_df.head()
Out[21]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone
0 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te Cell Image Library ready [{'contact-name': 'David Orloff', 'contact-ema... http://www.cellimagelibrary.org 1723 This library is a public and easily accessible... [{'url': 'http://www.cellimagelibrary.org/page... 2010.0 [{'name': 'live update', 'type': 'data release... [biodbcore-000180, bsg-d000180] Database repository [Cell Biology, Life Science] [Cell, Microscopy, Light microscopy, Electron ... [All] [] [United States] FAIRsharing record for: Cell Image Library None https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [{'id': 232, 'pubmed_id': 23203874, 'title': '... [{'licence-name': 'Cell Image Library Data Pol... NaN NaN NaN NaN NaN NaN NaN
1 3101 fairsharing-records 2020-09-16T08:49:13.000Z 2021-09-30T11:36:45.452Z NaN WHOI Ship Data-Grabber System ready NaN http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html 3101 The WHOI Ship DataGrabber system provides the ... [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... 2004.0 [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... [biodbcore-001609, bsg-d001609] Database repository [Earth Science, Water Research, Oceanography] [] [Not applicable] [subseafloor environments] [United States] FAIRsharing record for: WHOI Ship Data-Grabber... None https://fairsharing.org/fairsharing_records/3101 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WHOI Sh... [] [{'licence-name': 'NDSF Data Archive Policy', ... NaN NaN NaN NaN NaN NaN NaN
2 2649 fairsharing-records 2018-08-07T20:23:32.000Z 2021-09-30T11:39:07.898Z NaN Electron Microscope Public Image Archive ready [{'contact-name': 'General contact', 'contact-... https://www.ebi.ac.uk/pdbe/emdb/empiar/ 2649 EMPIAR, the Electron Microscopy Public Image A... [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... 2015.0 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [biodbcore-001140, bsg-d001140] Database repository [Bioinformatics, Biology] [Protein image, Microscopy, Electron microscop... [All] [] [Greece, Czech Republic, United Kingdom, Icela... FAIRsharing record for: Electron Microscope Pu... EMPIAR https://fairsharing.org/fairsharing_records/2649 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: EMPIAR, the... [{'id': 2232, 'pubmed_id': 27067018, 'title': ... [{'licence-name': 'EMBL-EBI Terms of Use', 'li... [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... EMPIAR [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... NaN NaN NaN
3 2657 fairsharing-records 2018-08-13T15:12:11.000Z 2021-09-30T11:37:28.736Z 10.25504/FAIRsharing.tnByoG ClinicalStudyDataRequest.com ready [{'contact-email': 'support@clinicalstudydatar... https://clinicalstudydatarequest.com/ 2657 ClinicalStudyDataRequest.com (CSDR) is a conso... [{'url': 'https://clinicalstudydatarequest.com... 2014.0 [{'url': 'https://clinicalstudydatarequest.com... [biodbcore-001149, bsg-d001149] Database repository [Preclinical Studies, Biomedical Science] [] [Homo sapiens] [] [Worldwide] FAIRsharing record for: ClinicalStudyDataReque... CSDR https://fairsharing.org/10.25504/FAIRsharing.t... 10.25504/FAIRsharing.tnByoG https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: ClinicalStu... [] [{'licence-name': 'CSDR Data Sharing Agreement... NaN CSDR NaN NaN NaN NaN NaN
4 2078 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:34:43.129Z 10.25504/FAIRsharing.3axym7 Germplasm Resources Information Network ready [{'contact-email': 'dbmu@ars-grin.gov'}] https://www.ars-grin.gov/ 2078 GRIN provides National Genetic Resources Progr... [{'url': 'https://www.ars-grin.gov/Pages/Colle... 2010.0 [{'url': 'https://www.ars-grin.gov/', 'name': ... [biodbcore-000546, bsg-d000546] Database repository [Life Science] [Cell, Cell culture, Germplasm] [Bacteria, Metazoa, Viridiplantae] [] [United States] FAIRsharing record for: Germplasm Resources In... GRIN https://fairsharing.org/10.25504/FAIRsharing.3... 10.25504/FAIRsharing.3axym7 https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: GRIN provid... [] [] NaN GRIN NaN NaN NaN NaN NaN
In [22]:
fairsharing_df.describe(include='all')
Out[22]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone
count 1797 1797 1797 1797 1354 1797 1797 1678 1797 1797.000000 1797 1608 1492.000000 1565 1797 1797 1797 1797 1797 1797 1797 1797 1797 1638 1797 1354 1797 1797 1797 1797 326 1638 449 618 217 217 1
unique 1797 1 1162 1797 1354 1796 4 1576 1797 NaN 1797 1594 NaN 1563 1797 1 3 888 1163 378 384 185 1796 1626 1797 1354 1 1797 1109 1082 320 1626 444 615 55 86 1
top 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te OmicsDB ready [{'contact-name': 'Sam Hokin', 'contact-email'... http://www.cellimagelibrary.org NaN This library is a public and easily accessible... [{'url': 'https://github.com/gbif/ipt/wiki/IPT... NaN [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... [biodbcore-000180, bsg-d000180] Database repository [Life Science] [] [All] [] [United States] FAIRsharing record for: OmicsDB CGD https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [] [] [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... CGD [{'url': 'https://github.com/Ensembl', 'name':... [{'url': 'http://www.h-invitational.jp/hinv/bl... 2021-9-17 This resource is no longer available at the st... True
freq 1 1797 636 1 1 2 1540 6 1 NaN 1 6 NaN 2 1 1797 926 350 265 502 1193 594 2 3 1 1 1797 1 661 716 6 3 3 2 84 113 1
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN 2446.100167 NaN NaN 2007.636059 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN 520.058757 NaN NaN 10.953269 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN 1547.000000 NaN NaN 1894.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996.000000 NaN NaN 2004.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2445.000000 NaN NaN 2010.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2897.000000 NaN NaN 2014.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN 3346.000000 NaN NaN 2021.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Subjects analysis

re3data

In [23]:
re3data_subjects = re3data_df[['orgIdentifier', 'subject']].explode('subject')
re3data_subjects['subject'] = re3data_subjects['subject'].apply(lambda x: x['name'] if x is not np.nan else np.nan)
re3data_subjects
Out[23]:
orgIdentifier subject
0 r3d100000001 1 Humanities and Social Sciences
0 r3d100000001 111 Social Sciences
0 r3d100000001 11104 Political Science
0 r3d100000001 112 Economics
0 r3d100000001 12 Social and Behavioural Sciences
... ... ...
2791 r3d100013733 4 Engineering Sciences
2792 r3d100013735 2 Life Sciences
2792 r3d100013735 204 Microbiology, Virology and Immunology
2792 r3d100013735 21 Biology
2792 r3d100013735 22 Medicine

17032 rows × 2 columns

In [24]:
data = re3data_subjects.groupby('subject')[['orgIdentifier']].count().sort_values('subject', ascending=False)
data
plot = [
    go.Bar(
        x=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)].index,
        y=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)]['orgIdentifier'],
        name='re3data tier %s-digits' % tier
    ) for tier in [1,2,3,5]
] 

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

OpenDOAR

In [25]:
opendoar_subjects = opendoar_df.explode('repository_metadata.content_subjects')
In [26]:
data = opendoar_subjects.groupby('repository_metadata.content_subjects')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['system_metadata.id'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

ROAR

In [27]:
roar_subjects = roar_df.explode('subjects')
In [28]:
data = roar_subjects.groupby('subjects')[['eprintid']].count().sort_values('eprintid', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['eprintid'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

FAIRsharing

In [29]:
fairsharing_subjects = fairsharing_df.explode('attributes.subjects')
In [30]:
data = fairsharing_subjects.groupby('attributes.subjects')[['id']].count().sort_values('id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['id'],
        name='FAIRsharing'
    )
]

layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

Geographic analysis

re3data

In [31]:
re3data_institutions = re3data_df.explode('institution')[['orgIdentifier', 'institution']]
re3data_institutions = re3data_institutions[~re3data_institutions.institution.isna()].reset_index(drop=True)
re3data_institutions = re3data_institutions.join(pd.json_normalize(re3data_institutions.institution))
re3data_institutions.head()
Out[31]:
orgIdentifier institution institutionName institutionAdditionalName institutionCountry responsabilityType institutionType institutionURL institutionIdentifier responsibilityStartDate responsibilityEndDate institutionContact
0 r3d100000001 {'institutionName': 'Odum Institute for Resear... Odum Institute for Research in Social Science [] USA [general] non-profit https://odum.unc.edu/archive/ [] []
1 r3d100000002 {'institutionName': 'The U.S. National Archive... The U.S. National Archives and Records Adminis... [NARA, National Archives] USA [general] non-profit http://www.archives.gov/ [] [http://www.archives.gov/contact/]
2 r3d100000002 {'institutionName': 'The USA.gov', 'institutio... The USA.gov [] USA [general] non-profit http://www.usa.gov/ [] [http://www.usa.gov/Contact.shtml]
3 r3d100000004 {'institutionName': 'Institut für Deutsche Spr... Institut für Deutsche Sprache, Archiv für Gesp... [AGD] DEU [funding, general] non-profit http://agd.ids-mannheim.de/index.shtml [] 2004 [agd@ids-mannheim.de]
4 r3d100000005 {'institutionName': 'Odum Institute for Resear... Odum Institute for Research in Social Science [] USA [technical] non-profit https://odum.unc.edu/ [] [https://odum.unc.edu/contact/contact-form/, o...
In [32]:
re3data_institutions['org_continent'] = re3data_institutions.institutionCountry.map(countrycode_to_continent)
In [33]:
re3data_institutions[re3data_institutions.org_continent.isna()].institutionCountry.unique()
Out[33]:
array(['AAA', 'EEC'], dtype=object)

AAA is used for international collaborations; we skip this. EEC is used for the EU commission; we fix the continent manually.

In [34]:
re3data_institutions.loc[re3data_institutions.institutionCountry == 'EEC', 'org_continent'] = 'EU'

OpenDOAR

In [35]:
opendoar_institutions = opendoar_df.explode('organization')[['system_metadata.id', 'organization']]
opendoar_institutions = opendoar_institutions[~opendoar_institutions.organization.isna()].reset_index(drop=True)
opendoar_institutions = opendoar_institutions.join(pd.json_normalize(opendoar_institutions.organization))
opendoar_institutions['country'] = opendoar_institutions.country.map(str.upper, na_action='ignore')
opendoar_institutions['country'] = opendoar_institutions.country.map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
opendoar_institutions.head()
Out[35]:
system_metadata.id organization name alternativeName country url identifier location.latitude location.longiture
0 134 {'name': 'technische universität dortmund', 'a... technische universität dortmund tu dortmund DEU https://www.tu-dortmund.de [{'identifier': 'https://ror.org/01k97gp34', '...
1 58 {'name': 'centre pour la communication scienti... centre pour la communication scientifique directe ccsd FRA https://www.ccsd.cnrs.fr []
2 93 {'name': 'texas medical center', 'alternativeN... texas medical center tmc USA https://www.tmc.edu [{'identifier': 'https://ror.org/00dqsbj20', '...
3 68 {'name': 'university of southampton', 'alterna... university of southampton GBR https://www.southampton.ac.uk/ [{'identifier': 'https://ror.org/01ryk1543', '...
4 84 {'name': 'carleton college', 'alternativeName'... carleton college USA https://www.carleton.edu [{'identifier': 'https://ror.org/03jep7677', '...
In [36]:
opendoar_institutions['org_continent'] = opendoar_institutions.country.map(countrycode_to_continent)
In [37]:
opendoar_institutions[opendoar_institutions.org_continent.isna()].country.unique()
Out[37]:
array([nan, 'UMI'], dtype=object)
In [38]:
opendoar_institutions.loc[opendoar_institutions.country == 'UMI', 'org_continent'] = 'NA'
opendoar_institutions[opendoar_institutions.country == 'UMI']
Out[38]:
system_metadata.id organization name alternativeName country url identifier location.latitude location.longiture org_continent
4233 5379 {'name': 'kettering university', 'alternativeN... kettering university UMI https://www.kettering.edu [{'identifier': 'https://ror.org/03rcspa57', '... NA

ROAR

In [39]:
roar_institutions = roar_df.explode('location_country')
roar_institutions['location_country'] = roar_institutions.location_country.map(str.upper, na_action='ignore')
roar_institutions['location_country'] = roar_institutions.location_country.map(countrycode_iso2_to_countrycode_iso3)
roar_institutions['continent'] = roar_institutions.location_country.map(countrycode_to_continent)

FAIRsharing

In [40]:
fairsharing_countries = fairsharing_df.explode('attributes.countries')
fairsharing_countries['countrycode'] = fairsharing_countries['attributes.countries'].map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)
In [41]:
fairsharing_countries[fairsharing_countries.countrycode.isna()]['attributes.countries'].unique()
Out[41]:
array(['Worldwide', 'European Union', nan], dtype=object)
In [42]:
fairsharing_countries[fairsharing_countries.continent.isna()]['attributes.countries'].unique()
Out[42]:
array(['Worldwide', 'European Union', nan, 'Antarctica'], dtype=object)

Fix manually some rows

In [43]:
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Republic of Ireland', ['attributes.countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

Make Antactica disappear (only one repo)

In [44]:
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[44]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone countrycode continent
782 2462 fairsharing-records 2017-06-27T13:30:19.000Z 2021-09-30T11:35:28.523Z 10.25504/FAIRsharing.ewyejx Antabif IPT - AntOBIS IPT - GBIF Belgium ready [{'contact-name': 'Anton Van de Putte', 'conta... http://ipt.biodiversity.aq/ 2462 The Belgium Biodiversity Platform hosts this d... [{'url': 'a.heughebaert@biodiversity.be', 'nam... NaN NaN [biodbcore-000944, bsg-d000944] Database repository [Biodiversity, Life Science] [Taxonomic classification] [All] [] Antarctica FAIRsharing record for: Antabif IPT - AntOBIS ... None https://fairsharing.org/10.25504/FAIRsharing.e... 10.25504/FAIRsharing.ewyejx https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The Belgium... [] [{'licence-name': 'Apache License 2.0', 'licen... NaN NaN NaN NaN NaN NaN NaN AQ NaN

Country coverage

In [45]:
data1 = re3data_institutions.groupby('institutionCountry')[['orgIdentifier']].count().sort_values('orgIdentifier', ascending=False)
data2 = opendoar_institutions.groupby('country')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
data3 = roar_institutions.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['id']].count().sort_values('id', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['orgIdentifier'],
        name='re3data'
    ),
    go.Bar(
        x=data2.index,
        y=data2['system_metadata.id'],
        name='openDOAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['eprintid'],
        name='ROAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data4.index,
        y=data4['id'],
        name='FAIRsharing',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage

In [46]:
data1 = re3data_institutions.groupby('org_continent')[['orgIdentifier']].count()
data2 = opendoar_institutions.groupby('org_continent')[['system_metadata.id']].count()
data3 = roar_institutions.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['id']].count()

plot = [
    go.Scatterpolar(
        r=data1.orgIdentifier,
        theta=data1.index,
        fill='toself',
        name='re3data'),
    go.Scatterpolar(
        r=data2['system_metadata.id'],
        theta=data2.index,
        fill='toself',
        name='OpenDOAR'),
    go.Scatterpolar(
        r=data3.eprintid,
        theta=data3.index,
        fill='toself',
        name='ROAR'),
    go.Scatterpolar(
        r=data4.id,
        theta=data4.index,
        fill='toself',
        name='FAIRsharing')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()