registries_analysis/notebooks/02-subjects&geographic.ipynb

3.9 MiB
Raw Blame History

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

pd.set_option('display.max_columns', None)
In [2]:
def country_to_countrycode(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(country)
        except:
            return np.nan
        
def countrycode_iso2_to_countrycode_iso3(country):
    if pd.isna(country):
        return np.nan
    else:
        try:
            return pycountry_convert.country_name_to_country_alpha3(pycountry_convert.country_alpha2_to_country_name(country))
        except:
            return np.nan

def countrycode_to_continent(country_code):
    if pd.isna(country_code):
        return np.nan
    else:
        try:
            return pycountry_convert.country_alpha2_to_continent_code(pycountry_convert.country_alpha3_to_country_alpha2(country_code))
        except:
            return np.nan

Loading datasets

re3data

In [3]:
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                        converters={'subject': ast.literal_eval,
                                    'keyword': ast.literal_eval,
                                    'additionalName': ast.literal_eval,
                                    'repositoryIdentifier': ast.literal_eval,
                                    'type': ast.literal_eval,
                                    'contentType': ast.literal_eval,
                                    'providerType': ast.literal_eval,
                                    'institution': ast.literal_eval
                                    })

re3data_df.head()
Out[3]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... NaN [{'name': 'Databases', 'scheme': 'parse'}, {'n... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN {} ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [{'name': '1 Humanities and Social Sciences', ... https://www.archives.gov/publications/general-... [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no {"api": "https://www.archives.gov/developer#to... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [{'name': '1 Humanities and Social Sciences', ... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [{'name': 'Audiovisual data', 'scheme': 'parse... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes {} ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://odum.unc.edu/about/mission-vision/ [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes {"api": "https://guides.dataverse.org/en/lates... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-08-11
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://archaeologydataservice.ac.uk/about/our... [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes {"api": "https://archaeologydataservice.ac.uk/... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02
In [4]:
re3data_df.describe(include='all')
Out[4]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
count 2739 2739 2739 2739 2716 2739 2739 2739 2739 2739 2739 1776 157 2739 2739 2318 2739 2739 2739 2739 2739 2739 2739 2739 2739 2711 2739 2739 1316 2739 2739 1512 2739 2737 2739 2739 2739 2739 1674 2739 2739
unique 2739 2736 19 2162 2713 864 2459 2737 6 9 1289 352 80 107 1389 2249 1338 5 2504 2720 2319 12 375 145 2263 3 681 23 2 1146 29 1321 12 3 3 14 172 563 1656 1275 740
top r3d100000001 Språkbanken eng [] http://icgem.gfz-potsdam.de/home [] [] The National Archives and Records Administrati... eng [disciplinary] {"size": "", "updatedp": ""} 2008 2015 ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://learn.scholarsportal.info/all-guides/d... [{'name': 'Standard office documents', 'scheme... [dataProvider] [multidisciplinary] [{'institutionName': 'National Center for Biot... [][] {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["unknown"] yes {} ["none"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [] {} is covered by Elsevier. 2016-05-10 2021-09-03
freq 1 2 2554 569 2 1876 202 2 2723 1733 1450 92 11 2063 226 14 30 1771 193 6 312 2571 2159 1269 64 1793 2013 1226 1108 1498 1361 72 2155 1608 1515 2509 1669 2162 14 20 137

openDOAR

In [48]:
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                         converters={'repository_metadata.content_subjects': ast.literal_eval,
                                    'repository_metadata.alternativename': ast.literal_eval,
                                    'repository_metadata.content_types': ast.literal_eval,
                                    'organization': ast.literal_eval
                                    },
                         dtype={'system_metadata.id': str})

opendoar_df.head()
Out[48]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
0 175 {"name": "hku theses online", "language": "en"} [] http://hub.hku.hk/handle/10722/1057 this is an institutional repository providing ... institutional ["zh", "en"] 2021-03-25 10:16:18 2005-12-21 12:44:08 [multidisciplinary] [bibliographic_references, theses_and_disserta... [{'name': 'university of hong kong', 'alternat... [] {"name": "dspace", "version": "cris-5.3.1-snap... NaN yes fully_functional NaN 11850.0
1 64 {"name": "research support scheme - central eu... [] http://rss.archives.ceu.hu/ this is an institutional repository collecting... institutional ["cs", "en", "hu", "ru"] 2021-03-25 09:48:31 2006-01-04 14:59:30 [multidisciplinary] [unpub_reports_and_working_papers] [{'name': 'central european university', 'alte... [] {"name": "eprints", "version": "2.2.1"} http://rss.archives.ceu.hu/perl/oai2 yes fully_functional NaN 164.0
2 151 {"name": "cadmus, eui research repository", "l... [] http://cadmus.eui.eu/ cadmus is the name of the eui research reposit... institutional ["nl", "en", "fr", "de", "it"] 2021-09-13 13:35:36 2006-01-04 12:07:07 [history and archaeology, multidisciplinary, s... [journal_articles, theses_and_dissertations, u... [{'name': 'european university institute', 'al... [{"policy_url": "https://www.eui.eu/research/e... {"name": "dspace", "version": "5.2"} http://cadmus.eui.eu/oai/request yes fully_functional 3867.0 24869.0
3 105 {"name": "document server@uhasselt", "language... [] https://doclib.uhasselt.be/dspace/ this site is a university repository providing... institutional ["nl", "en", "fr", "de"] 2021-04-16 15:23:52 2006-01-24 15:46:44 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [{'name': 'uhasselt', 'alternativeName': 'hass... [] {"name": "dspace", "version": "1.7.2"} http://doclib.uhasselt.be/dspace-oai/request yes fully_functional 0.0 27376.0
4 101 {"name": "utrecht university repository", "lan... [] http://dspace.library.uu.nl this site is a university repository providing... institutional ["nl", "en"] 2021-04-16 15:22:03 2006-01-13 12:55:13 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [{'name': 'university of utrecht', 'alternativ... [] {"name": "dspace", "version": ""} https://dspace.library.uu.nl/oai/request yes fully_functional 1686.0 185637.0
In [6]:
opendoar_df.describe(include='all')
Out[6]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
count 5742 5742 5742 5742 5421 5742 5742 5742 5742 5742 5742 5742 5742 5742 4402 5742 5595 2.299000e+03 4.197000e+03
unique 5742 5713 2108 5705 4619 4 330 2372 5573 821 478 5201 642 321 4370 1 7 NaN NaN
top 175 {"name": "hiroshima associated repository port... [] http://harp.lib.hiroshima-u.ac.jp/ this site provides access to the research outp... institutional ["en"] 2020-09-18 12:53:48 2020-09-18 12:53:48 ["multidisciplinary"] [theses_and_dissertations] [{'name': 'rijksuniversiteit groningen', 'alte... [] {"name": "dspace", "version": ""} https://kidoks.bsz-bw.de/oai yes fully_functional NaN NaN
freq 1 3 3595 3 95 5096 1917 82 82 3227 465 26 5098 822 3 5742 5276 NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.010186e+03 1.760546e+05
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.206295e+04 6.600825e+06
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 0.000000e+00
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 8.950000e+02
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.220000e+02 4.026000e+03
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.930500e+03 1.630400e+04
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.817531e+06 4.200000e+08

ROAR

In [9]:
roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
roar_df = roar_df.groupby('eprintid').aggregate(set)

def value_or_list(cell_set):
    copy = set(cell_set)
    copy.discard(np.nan) 
    if len(copy) == 0:
        return np.nan
    if len(copy) == 1:
        return copy.pop()
    return list(copy)
        
roar_df = roar_df.applymap(value_or_list)
roar_df.reset_index(inplace=True)

roar_df.head()
Out[9]:
eprintid rev_number eprint_status userid importid source dir datestamp lastmod status_changed type succeeds commentary metadata_visibility latitude longitude relation_type relation_uri item_issues_id item_issues_type item_issues_description item_issues_timestamp item_issues_status item_issues_reported_by item_issues_resolved_by item_issues_comment item_issues_count sword_depositor sword_slug exemplar home_page title oai_pmh sword_endpoint rss_feed twitter_feed description fulltext open_access mandate organisation_title organisation_home_page location_country location_city location_latitude location_longitude software geoname version subjects date note suggestions activity_low activity_medium activity_high recordcount recordhistory fulltexts_total fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id submit_to submitted_to_name submitted_to_done webometrics_rank webometrics_size webometrics_visibility webometrics_rich_files webometrics_scholar monthly_deposits total_deposits association
0 1 633 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-18 05:40:07 2010-01-06 13:43:48 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://archivesic.ccsd.cnrs.fr/ @RCHIVESIC http://archivesic.ccsd.cnrs.fr/oai/oai.php NaN NaN NaN NaN NaN NaN NaN NaN NaN fr NaN NaN NaN hal geoname_2_FR other NaN 2002-05-17 19:24:41 NaN NaN 0 0 0 25 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... NaN NaN NaN NaN [opendoar, celestial] [58, 669] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 10 511 archive 1 NaN NaN disk0/00/00/00/10 2010-01-06 13:43:48 2011-07-18 05:40:13 2010-01-06 13:43:48 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://www.diva-portal.org/mdh/ Academic Archive On-line (Mälardalen Universit... http://www.diva-portal.org/oai/mdh/OAI NaN NaN NaN NaN TRUE TRUE NaN NaN NaN se Uppsala 59.8667 17.6333 diva geoname_2_SE other NaN 2005-12-08 13:15:22 NaN NaN 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,8,39,100,100,100... NaN NaN NaN NaN [opendoar, celestial] [258, 526] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 1000 274 archive 1 NaN NaN disk0/00/00/10/00 2010-01-06 13:45:01 2011-07-06 08:21:21 2010-01-06 13:45:01 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN http://pam.pisharp.org/ PAM - Portuguese Archive of Mathematics NaN NaN NaN NaN NaN TRUE TRUE NaN NaN NaN pt Bellevue, WA 47.6034 -122.155 dspace geoname_2_PT other NaN 2006-05-04 10:48:14 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 10001 20 archive 91 NaN NaN disk0/00/01/00/01 2015-08-08 14:52:11 2016-03-21 19:44:01 2015-08-08 14:52:11 subject NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://edoc.sub.uni-hamburg.de/klimawandel/ Klimawandel Dokumentenserver http://edoc.sub.uni-hamburg.de/klimawandel/oai NaN NaN NaN The "Documentenserver Klimawandel" (Repository... TRUE TRUE TRUE [KLIMZUG projects, Climate Service Center 2.0,... [http://www.climateservicecenter.de/, http://w... de Hamburg 53.5511 9.9937 opus geoname_2_DE other [GF, GE, HD, G1, S1] 2015-07-02 08:08:31 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN [opendoar, celestial] [3408, 5881] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
4 10008 11 archive 404 NaN NaN disk0/00/01/00/08 2015-08-08 14:52:26 2016-03-21 19:43:51 2015-08-08 14:52:26 institutional NaN NaN show NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN http://creativematter.skidmore.edu/ Creative Matter | Skidmore College Research http://creativematter.skidmore.edu/do/oai/ NaN http://creativematter.skidmore.edu/recent.rss NaN Welcome to Creative Matter, a repository for t... TRUE FALSE FALSE Skidmore College http://www.skidmore.edu/ us Saratoga Springs 43.0961 -73.7818 bepress geoname_2_US other NaN 2015-07-06 17:35:50 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN celestial 5882 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [12]:
roar_df.describe(include='all')
Out[12]:
eprintid rev_number eprint_status userid importid source dir datestamp lastmod status_changed type succeeds commentary metadata_visibility latitude longitude relation_type relation_uri item_issues_id item_issues_type item_issues_description item_issues_timestamp item_issues_status item_issues_reported_by item_issues_resolved_by item_issues_comment item_issues_count sword_depositor sword_slug exemplar home_page title oai_pmh sword_endpoint rss_feed twitter_feed description fulltext open_access mandate organisation_title organisation_home_page location_country location_city location_latitude location_longitude software geoname version subjects date note suggestions activity_low activity_medium activity_high recordcount recordhistory fulltexts_total fulltexts_docs fulltexts_rtotal fulltexts_rdocs registry_name registry_id submit_to submitted_to_name submitted_to_done webometrics_rank webometrics_size webometrics_visibility webometrics_rich_files webometrics_scholar monthly_deposits total_deposits association
count 5375 5375 5375 5375 0.0 0.0 5375 5375 5375 5375 5375 107 0.0 5375 0.0 0.0 0.0 0.0 63 63 63 63 63 0.0 0.0 0.0 2245 0.0 0.0 265 5368 5373 4267 176 1521 115 3782 4127 4127 3676 4396 4226 5080 3655 3681 3664 4637 4671 5375 1250 5360 215 187 2291 2291 2291 2293 2291 270 258 270 258 4603 4578 293 205 205 148 148 148 148 148 756 756 217
unique 5375 658 1 2135 NaN NaN 5375 4127 3966 4158 12 107 NaN 2 NaN NaN NaN NaN 48 5 62 4 3 NaN NaN NaN 4 NaN NaN 2 5202 5076 3994 170 1468 111 3304 2 2 2 3802 3771 143 1861 2887 2917 31 126 53 906 4830 207 171 72 54 16 741 1704 135 118 134 117 9 4256 7 1 1 148 148 148 146 143 346 342 3
top 1 11 archive 1 NaN NaN disk0/00/00/00/01 2010-01-06 13:43:48 2011-07-06 08:24:53 2010-01-06 13:43:48 institutional 10164 NaN show NaN NaN NaN NaN bad_oai_pmh_url_0 duplicate_title Duplicate title to <xhtml:table xmlns:xhtml="h... 2010-01-13 10:44:49 discovered NaN NaN NaN 0 NaN NaN FALSE http://eprints.upnjatim.ac.id/ Repositorio Institucional http://kce.docressources.info/ws/PMBWs_2 http://producao.usp.br/sword/servicedocument http://eprints.upnjatim.ac.id/cgi/latest_tool?... http://my.indexcopernicus.com/fredemoreno info:other:archives.eprints.org:import TRUE TRUE FALSE Chinese Academy of Science (中国科学院) http://www.cas.cn/ us Lima 34.1607 -118.139 dspace geoname_2_US other K1 2006-05-04 10:48:14 DSpace@Işık is a growing collection of Işık Un... This repository is hosted by the Texas Digital... 0 0 0 100 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 0 0 0 0 [opendoar, celestial] 2479 [opendoar, celestial, roarmap] opendoar 2021-01-25 24 46 20 824 806 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,... 0 russell_group
freq 1 332 5375 1333 NaN NaN 1 16 8 16 3795 1 NaN 5334 NaN NaN NaN NaN 15 33 2 45 38 NaN NaN NaN 2204 NaN NaN 258 4 7 4 2 5 2 112 2758 2652 2699 9 9 886 69 25 25 2307 840 4771 53 99 2 9 2015 2077 2213 733 95 113 114 113 114 1775 4 92 205 205 1 1 1 3 5 387 387 127
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

FAIRsharing

In [57]:
with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:
    lines = f.read().splitlines()
    
fairsharing_df = pd.DataFrame(lines)
fairsharing_df.columns = ['json_element']
fairsharing_df['json_element'].apply(json.loads)
fairsharing_df = pd.json_normalize(fairsharing_df['json_element'].apply(json.loads))

fairsharing_df.head()
Out[57]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone
0 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te Cell Image Library ready [{'contact-name': 'David Orloff', 'contact-ema... http://www.cellimagelibrary.org 1723 This library is a public and easily accessible... [{'url': 'http://www.cellimagelibrary.org/page... 2010.0 [{'name': 'live update', 'type': 'data release... [biodbcore-000180, bsg-d000180] Database repository [Cell Biology, Life Science] [Cell, Microscopy, Light microscopy, Electron ... [All] [] [United States] FAIRsharing record for: Cell Image Library None https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [{'id': 232, 'pubmed_id': 23203874, 'title': '... [{'licence-name': 'Cell Image Library Data Pol... NaN NaN NaN NaN NaN NaN NaN
1 3101 fairsharing-records 2020-09-16T08:49:13.000Z 2021-09-30T11:36:45.452Z NaN WHOI Ship Data-Grabber System ready NaN http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html 3101 The WHOI Ship DataGrabber system provides the ... [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... 2004.0 [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... [biodbcore-001609, bsg-d001609] Database repository [Earth Science, Water Research, Oceanography] [] [Not applicable] [subseafloor environments] [United States] FAIRsharing record for: WHOI Ship Data-Grabber... None https://fairsharing.org/fairsharing_records/3101 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WHOI Sh... [] [{'licence-name': 'NDSF Data Archive Policy', ... NaN NaN NaN NaN NaN NaN NaN
2 2649 fairsharing-records 2018-08-07T20:23:32.000Z 2021-09-30T11:39:07.898Z NaN Electron Microscope Public Image Archive ready [{'contact-name': 'General contact', 'contact-... https://www.ebi.ac.uk/pdbe/emdb/empiar/ 2649 EMPIAR, the Electron Microscopy Public Image A... [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... 2015.0 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [biodbcore-001140, bsg-d001140] Database repository [Bioinformatics, Biology] [Protein image, Microscopy, Electron microscop... [All] [] [Greece, Czech Republic, United Kingdom, Icela... FAIRsharing record for: Electron Microscope Pu... EMPIAR https://fairsharing.org/fairsharing_records/2649 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: EMPIAR, the... [{'id': 2232, 'pubmed_id': 27067018, 'title': ... [{'licence-name': 'EMBL-EBI Terms of Use', 'li... [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... EMPIAR [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... NaN NaN NaN
3 2657 fairsharing-records 2018-08-13T15:12:11.000Z 2021-09-30T11:37:28.736Z 10.25504/FAIRsharing.tnByoG ClinicalStudyDataRequest.com ready [{'contact-email': 'support@clinicalstudydatar... https://clinicalstudydatarequest.com/ 2657 ClinicalStudyDataRequest.com (CSDR) is a conso... [{'url': 'https://clinicalstudydatarequest.com... 2014.0 [{'url': 'https://clinicalstudydatarequest.com... [biodbcore-001149, bsg-d001149] Database repository [Preclinical Studies, Biomedical Science] [] [Homo sapiens] [] [Worldwide] FAIRsharing record for: ClinicalStudyDataReque... CSDR https://fairsharing.org/10.25504/FAIRsharing.t... 10.25504/FAIRsharing.tnByoG https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: ClinicalStu... [] [{'licence-name': 'CSDR Data Sharing Agreement... NaN CSDR NaN NaN NaN NaN NaN
4 2078 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:34:43.129Z 10.25504/FAIRsharing.3axym7 Germplasm Resources Information Network ready [{'contact-email': 'dbmu@ars-grin.gov'}] https://www.ars-grin.gov/ 2078 GRIN provides National Genetic Resources Progr... [{'url': 'https://www.ars-grin.gov/Pages/Colle... 2010.0 [{'url': 'https://www.ars-grin.gov/', 'name': ... [biodbcore-000546, bsg-d000546] Database repository [Life Science] [Cell, Cell culture, Germplasm] [Bacteria, Metazoa, Viridiplantae] [] [United States] FAIRsharing record for: Germplasm Resources In... GRIN https://fairsharing.org/10.25504/FAIRsharing.3... 10.25504/FAIRsharing.3axym7 https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: GRIN provid... [] [] NaN GRIN NaN NaN NaN NaN NaN
In [58]:
fairsharing_df.describe(include='all')
Out[58]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone
count 1797 1797 1797 1797 1354 1797 1797 1678 1797 1797.000000 1797 1608 1492.000000 1565 1797 1797 1797 1797 1797 1797 1797 1797 1797 1638 1797 1354 1797 1797 1797 1797 326 1638 449 618 217 217 1
unique 1797 1 1162 1797 1354 1796 4 1576 1797 NaN 1797 1594 NaN 1563 1797 1 3 888 1163 378 384 185 1796 1626 1797 1354 1 1797 1109 1082 320 1626 444 615 55 86 1
top 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te OmicsDB ready [{'contact-name': 'Sam Hokin', 'contact-email'... http://www.cellimagelibrary.org NaN This library is a public and easily accessible... [{'url': 'https://github.com/gbif/ipt/wiki/IPT... NaN [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... [biodbcore-000180, bsg-d000180] Database repository [Life Science] [] [All] [] [United States] FAIRsharing record for: OmicsDB CGD https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [] [] [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... CGD [{'url': 'https://github.com/Ensembl', 'name':... [{'url': 'http://www.h-invitational.jp/hinv/bl... 2021-9-17 This resource is no longer available at the st... True
freq 1 1797 636 1 1 2 1540 6 1 NaN 1 6 NaN 2 1 1797 926 350 265 502 1193 594 2 3 1 1 1797 1 661 716 6 3 3 2 84 113 1
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN 2446.100167 NaN NaN 2007.636059 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN 520.058757 NaN NaN 10.953269 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN 1547.000000 NaN NaN 1894.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996.000000 NaN NaN 2004.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2445.000000 NaN NaN 2010.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2897.000000 NaN NaN 2014.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN 3346.000000 NaN NaN 2021.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

Subjects analysis

re3data

In [38]:
re3data_subjects = re3data_df[['orgIdentifier', 'subject']].explode('subject')
re3data_subjects['subject'] = re3data_subjects['subject'].apply(lambda x: x['name'] if x is not np.nan else np.nan)
re3data_subjects
Out[38]:
orgIdentifier subject
0 r3d100000001 1 Humanities and Social Sciences
0 r3d100000001 111 Social Sciences
0 r3d100000001 11104 Political Science
0 r3d100000001 112 Economics
0 r3d100000001 12 Social and Behavioural Sciences
... ... ...
2738 r3d100013652 102 History
2738 r3d100013652 105 Literary Studies
2738 r3d100013652 108 Philosophy
2738 r3d100013652 10801 History of Philosophy
2738 r3d100013652 11 Humanities

16654 rows × 2 columns

In [42]:
data = re3data_subjects.groupby('subject')[['orgIdentifier']].count().sort_values('subject', ascending=False)
data
plot = [
    go.Bar(
        x=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)].index,
        y=data[data.index.str.contains('^\d{%s}\s' % tier, regex=True)]['orgIdentifier'],
        name='re3data tier %s-digits' % tier
    ) for tier in [1,2,3,5]
] 

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

OpenDOAR

In [49]:
opendoar_subjects = opendoar_df.explode('repository_metadata.content_subjects')
In [50]:
data = opendoar_subjects.groupby('repository_metadata.content_subjects')[['system_metadata.id']].count().sort_values('system_metadata.id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['system_metadata.id'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

ROAR

In [55]:
roar_subjects = roar_df.explode('subjects')
In [56]:
data = roar_subjects.groupby('subjects')[['eprintid']].count().sort_values('eprintid', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['eprintid'],
    ) 
] 

layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

FAIRsharing

In [59]:
fairsharing_subjects = fairsharing_df.explode('attributes.subjects')
In [61]:
data = fairsharing_subjects.groupby('attributes.subjects')[['id']].count().sort_values('id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['id'],
        name='FAIRsharing'
    )
]

layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

fig = go.Figure(plot, layout).show()

Geographic analysis

re3data

In [21]:
re3data_institutions = re3data_df.explode('institution')[['re3data_id', 'institution']]
re3data_institutions = re3data_institutions[~re3data_institutions.institution.isna()].reset_index(drop=True)
re3data_institutions = re3data_institutions.join(pd.DataFrame(re3data_institutions.institution.to_list(), columns=['org_name', 'org_other_names', 
                                                                                              'org_country', 'org_3', 'org_noprofit', 
                                                                                              'org_url', 'org_ids', 'org_date1', 
                                                                                              'org_date2', 'org_contact']))
re3data_institutions.head()
Out[21]:
re3data_id institution org_name org_other_names org_country org_3 org_noprofit org_url org_ids org_date1 org_date2 org_contact
0 r3d100000001 [Odum Institute for Research in Social Science... Odum Institute for Research in Social Science [] USA [general] non-profit https://odum.unc.edu/archive/ [] []
1 r3d100000002 [The U.S. National Archives and Records Admini... The U.S. National Archives and Records Adminis... [NARA, National Archives] USA [general] non-profit http://www.archives.gov/ [] [http://www.archives.gov/contact/]
2 r3d100000002 [The USA.gov, [], USA, [general], non-profit, ... The USA.gov [] USA [general] non-profit http://www.usa.gov/ [] [http://www.usa.gov/Contact.shtml]
3 r3d100000004 [Institut für Deutsche Sprache, Archiv für Ges... Institut für Deutsche Sprache, Archiv für Gesp... [AGD] DEU [funding, general] non-profit http://agd.ids-mannheim.de/index.shtml [] 2004 [agd@ids-mannheim.de]
4 r3d100000005 [Odum Institute for Research in Social Science... Odum Institute for Research in Social Science [] USA [technical] non-profit https://odum.unc.edu/ [] [https://odum.unc.edu/contact/contact-form/, o...
In [22]:
re3data_institutions['org_continent'] = re3data_institutions.org_country.map(countrycode_to_continent)
In [23]:
re3data_institutions[re3data_institutions.org_continent.isna()].org_country.unique()
Out[23]:
array(['AAA', 'EEC'], dtype=object)

AAA is used for international collaborations; we skip this. EEC is used for the EU commission; we fix the continent manually.

In [24]:
re3data_institutions.loc[re3data_institutions.org_country == 'EEC', 'org_continent'] = 'EU'

OpenDOAR

In [25]:
opendoar_institutions = opendoar_df.explode('institution')[['opendoar_id', 'institution']]
opendoar_institutions = opendoar_institutions[~opendoar_institutions.institution.isna()].reset_index(drop=True)
opendoar_institutions = opendoar_institutions.join(pd.DataFrame(opendoar_institutions.institution.to_list(), columns=['org_name', 'org_other_names', 
                                                                                              'org_country', 'org_3', 'org_noprofit', 
                                                                                              'org_url', 'org_ids', 'org_date1', 
                                                                                              'org_date2', 'org_contact']))
opendoar_institutions['org_country'] = opendoar_institutions.org_country.map(str.upper, na_action='ignore')
opendoar_institutions['org_country'] = opendoar_institutions.org_country.map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
opendoar_institutions.head()
Out[25]:
opendoar_id institution org_name org_other_names org_country org_3 org_noprofit org_url org_ids org_date1 org_date2 org_contact
0 101 [university of utrecht, [universiteit utrecht]... university of utrecht [universiteit utrecht] NLD [] https://www.uu.nl [https://ror.org/04pp8hn57] []
1 115 [indian institute of management kozhikode, [ii... indian institute of management kozhikode [iimk] IND [] http://www.iimk.ac.in/ [https://ror.org/03m1xdc36] []
2 41 [california institute of technology, [caltech]... california institute of technology [caltech] USA [] http://www.caltech.edu/ [https://ror.org/05dxps055] []
3 119 [dublin city university, [dcu], ie, [], , http... dublin city university [dcu] IRL [] http://www.dcu.ie/ [https://ror.org/04a1a1e81] []
4 129 [istituto nazionale di geofisica e vulcanologi... istituto nazionale di geofisica e vulcanologia [ingv] ITA [] http://www.ingv.it [https://ror.org/00qps9a02] []
In [26]:
opendoar_institutions['org_continent'] = opendoar_institutions.org_country.map(countrycode_to_continent)
In [27]:
opendoar_institutions[opendoar_institutions.org_continent.isna()].org_country.unique()
Out[27]:
array([nan, 'UMI'], dtype=object)
In [28]:
opendoar_institutions.loc[opendoar_institutions.org_country == 'UMI', 'org_continent'] = 'NA'
opendoar_institutions[opendoar_institutions.org_country == 'UMI']
Out[28]:
opendoar_id institution org_name org_other_names org_country org_3 org_noprofit org_url org_ids org_date1 org_date2 org_contact org_continent
4349 5379 [kettering university, [], um, [], , https://w... kettering university [] UMI [] https://www.kettering.edu [https://ror.org/03rcspa57] [] NA

ROAR

In [29]:
roar_df['location_country'] = roar_df.location_country.map(str.upper, na_action='ignore')
roar_df['location_country'] = roar_df.location_country.map(countrycode_iso2_to_countrycode_iso3)
roar_df['continent'] = roar_df.location_country.map(countrycode_to_continent)

FAIRsharing

In [30]:
fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')
fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')

fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)
In [31]:
fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()
Out[31]:
array(['European Union', 'Republic of Ireland', 'Worldwide', nan],
      dtype=object)
In [32]:
fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()
Out[32]:
array(['European Union', 'Republic of Ireland', 'Worldwide', 'Antarctica',
       nan], dtype=object)

Fix manually some rows

In [33]:
fairsharing_countries.loc[fairsharing_countries.countries == 'Republic of Ireland', ['countries', 'countrycode', 'continent']] = ['Ireland', 'IE', 'EU']
fairsharing_countries.loc[fairsharing_countries.countries == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']

Make Antactica disappear (only one repo)

In [34]:
fairsharing_countries.loc[fairsharing_countries.countries == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[34]:
full_name short_name fs_url url countries subjects countrycode continent
915 Antabif IPT - AntOBIS IPT - GBIF Belgium Antabif IPT - AntOBIS IPT - GBIF Belgium https://fairsharing.org/10.25504/FAIRsharing.e... http://ipt.biodiversity.aq/ Antarctica [Biodiversity, Life Science] AQ NaN

Country coverage

In [35]:
data1 = re3data_institutions.groupby('org_country')[['re3data_id']].count().sort_values('re3data_id', ascending=False)
data2 = opendoar_institutions.groupby('org_country')[['opendoar_id']].count().sort_values('opendoar_id', ascending=False)
data3 = roar_df.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)

plot = [
    go.Bar(
        x=data1.index,
        y=data1['re3data_id'],
        name='re3data'
    ),
    go.Bar(
        x=data2.index,
        y=data2['opendoar_id'],
        name='openDOAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['eprintid'],
        name='ROAR',
        visible = 'legendonly'
    ),
    go.Bar(
        x=data4.index,
        y=data4['url'],
        name='FAIRsharing',
        visible = 'legendonly'
    )
]

layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)

go.Figure(plot, layout).show()

Continental coverage

In [36]:
data1 = re3data_institutions.groupby('org_continent')[['re3data_id']].count()
data2 = opendoar_institutions.groupby('org_continent')[['opendoar_id']].count()
data3 = roar_df.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['url']].count()

plot = [
    go.Scatterpolar(
        r=data1.re3data_id,
        theta=data1.index,
        fill='toself',
        name='re3data'),
    go.Scatterpolar(
        r=data2.opendoar_id,
        theta=data2.index,
        fill='toself',
        name='OpenDOAR'),
    go.Scatterpolar(
        r=data3.eprintid,
        theta=data3.index,
        fill='toself',
        name='ROAR'),
    go.Scatterpolar(
        r=data4.url,
        theta=data4.index,
        fill='toself',
        name='FAIRsharing')
]

layout = go.Layout(polar=dict(
    radialaxis=dict(
      visible=True
    ),
  )
)

go.Figure(plot, layout).show()