registries_analysis/notebooks/01.1-exploration-re3data.ipynb

48 KiB

In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Lift pandas' column-display limit so wide frames are shown with every column.
pd.options.display.max_columns = None

Loading dataset

In [2]:
# The columns named below are parsed with ast.literal_eval while the TSV is
# read, so their cells become Python objects instead of raw strings. Every
# other column is left to pandas' default parsing.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'subject',
            'keyword',
            'additionalName',
            'repositoryIdentifier',
            'type',
            'contentType',
            'providerType',
            'institution',
        )
    },
)

re3data_df.head()
Out[2]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... NaN [{'name': 'Databases', 'scheme': 'parse'}, {'n... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN {} ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [{'name': '1 Humanities and Social Sciences', ... https://www.archives.gov/publications/general-... [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no {"api": "https://www.archives.gov/developer#to... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [{'name': '1 Humanities and Social Sciences', ... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [{'name': 'Audiovisual data', 'scheme': 'parse... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes {} ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://odum.unc.edu/about/mission-vision/ [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes {"api": "https://guides.dataverse.org/en/lates... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-08-11
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://archaeologydataservice.ac.uk/about/our... [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes {"api": "https://archaeologydataservice.ac.uk/... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02
In [3]:
# List every column name of the loaded frame.
re3data_df.columns
Out[3]:
Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',
       'additionalName', 'repositoryURL', 'repositoryIdentifier',
       'repositoryContact', 'description', 'description.language', 'type',
       'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',
       'missionStatementURL', 'contentType', 'providerType', 'keyword',
       'institution', 'policy', 'databaseAccess', 'databaseLicense',
       'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',
       'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',
       'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',
       'metadataStandard', 'syndication', 'remarks', 'entryDate',
       'lastUpdate'],
      dtype='object')
In [4]:
def empty_list_is_nan(cell):
    """Return NaN for an empty list and the value itself for anything else.

    Only ``list`` instances are inspected; non-empty lists and values of any
    other type (strings, numbers, tuples, None, ...) are returned unchanged.

    Args:
        cell: A single cell value of any type.

    Returns:
        ``np.nan`` if ``cell`` is a list of length zero, otherwise ``cell``.
    """
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
    
# Apply empty_list_is_nan to every cell of the frame.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map, so
# use the new name when this pandas version provides it and fall back to
# applymap on older versions. The check is made on the class, not the instance,
# so a column that happened to be called 'map' could not be mistaken for it.
if hasattr(pd.DataFrame, 'map'):
    re3data_df = re3data_df.map(empty_list_is_nan)
else:
    re3data_df = re3data_df.applymap(empty_list_is_nan)
In [5]:
# Summary statistics for all columns; include='all' also covers the
# non-numeric (object) columns, not just numeric ones.
re3data_df.describe(include='all')
Out[5]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
count 2739 2739 2739 2170 2716 863 2739 2739 2739 2710 2739 1776 157 2739 2720 2318 2732 2735 2732 2738 2739 2739 2739 2739 2739 2711 2739 2739 1316 2739 2739 1512 2739 2737 2739 2739 2739 2739 1674 2739 2739
unique 2739 2736 19 2161 2713 863 2459 2737 6 8 1289 352 80 107 1388 2249 1337 4 2503 2719 2319 12 375 145 2263 3 681 23 2 1146 29 1321 12 3 3 14 172 563 1656 1275 740
top r3d100000001 Språkbanken eng [{'additionalName': 'MPC', 'additionalNameLang... http://icgem.gfz-potsdam.de/home [RRID:SCR_010479, RRID:nlx_157752] [] The National Archives and Records Administrati... eng [disciplinary] {"size": "", "updatedp": ""} 2008 2015 ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://learn.scholarsportal.info/all-guides/d... [{'name': 'Standard office documents', 'scheme... [dataProvider] [multidisciplinary] [{'institutionName': 'National Center for Biot... [][] {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["unknown"] yes {} ["none"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [] {} is covered by Elsevier. 2016-05-10 2021-09-03
freq 1 2 2554 2 2 1 202 2 2723 1733 1450 92 11 2063 226 14 30 1771 193 6 312 2571 2159 1269 64 1793 2013 1226 1108 1498 1361 72 2155 1608 1515 2509 1669 2162 14 20 137
In [6]:
# Count missing values per column.
# NOTE: only the columns parsed with ast.literal_eval in the load cell had their
# empty lists turned into NaN. Columns left as raw strings (e.g. policy,
# databaseLicense) still hold the text '[]' for "no entries", so they show up
# here with 0 missing values.
re3data_df.isna().sum()
Out[6]:
orgIdentifier                 0
repositoryName                0
repositoryName.language       0
additionalName              569
repositoryURL                23
repositoryIdentifier       1876
repositoryContact             0
description                   0
description.language          0
type                         29
size                          0
startDate                   963
endDate                    2582
repositoryLanguage            0
subject                      19
missionStatementURL         421
contentType                   7
providerType                  4
keyword                       7
institution                   1
policy                        0
databaseAccess                0
databaseLicense               0
dataAccess                    0
dataLicense                   0
dataUploadType               28
dataUploadLicense             0
software                      0
versioning                 1423
api                           0
pidSystem                     0
citationGuidelineURL       1227
aidSystem                     0
enhancedPublication           2
qualityManagement             0
certificate                   0
metadataStandard              0
syndication                   0
remarks                    1065
entryDate                     0
lastUpdate                    0
dtype: int64
In [7]:
# One row per (repository, content type) pair, reduced to the content type's
# 'name'. Repositories without any content type carry a missing value, which is
# passed through as NaN.
# pd.isna is used instead of an identity comparison with np.nan: `x is np.nan`
# only matches that one singleton object, so any other missing marker (None,
# float('nan'), pd.NA) would fall through to the ['name'] lookup and raise.
types = (
    re3data_df['contentType']
    .explode()
    .apply(lambda entry: np.nan if pd.isna(entry) else entry['name'])
)

# Number of repositories per content type; groupby drops the NaN rows.
pd.DataFrame(types).groupby('contentType').size()
Out[7]:
contentType
Archived data                               658
Audiovisual data                            542
Configuration data                           79
Databases                                   586
Images                                     1378
Networkbased data                           153
Plain text                                 1158
Raw data                                   1197
Scientific and statistical data formats    1685
Software applications                       456
Source code                                 209
Standard office documents                  1684
Structured graphics                         917
Structured text                             848
other                                       962
dtype: int64
In [8]:
# Number of repositories per provider type: one row per list element, then
# count the rows in each group.
(
    re3data_df['providerType']
    .explode()
    .to_frame()
    .groupby('providerType')
    .size()
)
Out[8]:
providerType
dataProvider       2491
serviceProvider     963
dtype: int64