registries_analysis/notebooks/01.1-exploration-re3data.ipynb

48 KiB

In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Lift the display cap on columns so wide frames are rendered with every column visible.
pd.set_option('display.max_columns', None)

Loading dataset

In [2]:
# Read the tab-separated re3data export. The columns named below are run
# through ast.literal_eval on load, so their cells arrive as Python objects
# (e.g. lists) rather than as raw strings.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        [
            'subject',
            'keyword',
            'additionalName',
            'repositoryIdentifier',
            'type',
            'contentType',
            'providerType',
            'institution',
        ],
        ast.literal_eval,
    ),
)

re3data_df.head()
Out[2]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
0 r3d100000001 Odum Institute Archive Dataverse eng [] https://dataverse.unc.edu/dataverse/odum [] ["https://dataverse.unc.edu/dataverse/odum#", ... The Odum Institute Archive Dataverse contains ... eng [disciplinary] {"size": "13 dataverses; 3.050 datasets", "upd... NaN NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... NaN [{'name': 'Databases', 'scheme': 'parse'}, {'n... [dataProvider] [FAIR, Middle East, crime, demography, economy... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC0", "databaseLicen... [{"dataAccessType": "embargoed", "dataAccessRe... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["DataVerse"] NaN {} ["DOI"] NaN [] unknown yes ["other"] [{"metadataStandardName": "DDI - Data Document... {} Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06
1 r3d100000002 Access to Archival Databases eng [{'additionalName': 'AAD', 'additionalNameLang... https://aad.archives.gov/aad/ [RRID:SCR_010479, RRID:nlx_157752] ["https://www.archives.gov/contact"] You will find in the Access to Archival Databa... eng [disciplinary] {"size": "", "updatedp": ""} 1985 NaN ["eng", "spa"] [{'name': '1 Humanities and Social Sciences', ... https://www.archives.gov/publications/general-... [{'name': 'Images', 'scheme': 'parse'}, {'name... [dataProvider] [US History] [{'institutionName': 'The U.S. National Archiv... [{"policyName": "Contribution Policy", "policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "Copyrights", "dataLicens... restricted [] ["unknown"] no {"api": "https://www.archives.gov/developer#to... ["none"] https://aad.archives.gov/aad/help/getting-star... [] unknown unknown [] [] {"syndication": "http://www.archives.gov/socia... NaN 2012-07-04 2021-05-25
2 r3d100000004 Datenbank Gesprochenes Deutsch deu [{'additionalName': 'DGD', 'additionalNameLang... https://dgd.ids-mannheim.de/ [] ["dgd@ids-mannheim.de"] The "Database for Spoken German (DGD)" is a co... eng [disciplinary] {"size": "34 corpora", "updatedp": "2020-02-03"} 2012 NaN ["deu"] [{'name': '1 Humanities and Social Sciences', ... https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... [{'name': 'Audiovisual data', 'scheme': 'parse... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [{'institutionName': 'Institut für Deutsche Sp... [{"policyName": "Erfurter Aufruf zur Sicherung... {"databaseAccessType": "restricted", "databas... [] [{"dataAccessType": "restricted", "dataAccessR... [{"dataLicenseName": "other", "dataLicenseURL"... restricted [] ["other"] yes {} ["none"] http://agd.ids-mannheim.de/konditionen.shtml [] unknown unknown ["RatSWD"] [] {} NaN 2012-07-20 2020-08-27
3 r3d100000005 UNC Dataverse eng [{'additionalName': 'University of North Carol... https://dataverse.unc.edu/ [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] ["https://dataverse.unc.edu/", "odumarchive@un... UNC Dataverse is an open-source repository sof... eng [institutional] {"size": "186 dataverses; 25.272 studies; 229.... 2011 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://odum.unc.edu/about/mission-vision/ [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [{'institutionName': 'Odum Institute for Resea... [{"policyName": "Collection Development Policy... {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Data Deposit Form"... ["DataVerse"] yes {"api": "https://guides.dataverse.org/en/lates... ["ARK", "DOI", "PURL", "URN", "hdl"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [{"metadataStandardName": "DDI - Data Document... {} UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-10-25
4 r3d100000006 Archaeology Data Service eng [{'additionalName': 'ADS', 'additionalNameLang... https://archaeologydataservice.ac.uk/ [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] ["help@archaeologydataservice.ac.uk", "https:/... The ADS is an accredited digital repository fo... eng [disciplinary] {"size": "1837 results", "updatedp": "2020-05-... 1996-10-01 NaN ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://archaeologydataservice.ac.uk/about/our... [{'name': 'Archived data', 'scheme': 'parse'},... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [{'institutionName': 'Arts and Humanities Rese... [{"policyName": "ADS Guides to good practice",... {"databaseAccessType": "open", "databaseAcces... [{"databaseLicenseName": "CC", "databaseLicens... [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [{"dataUploadLicenseName": "Guidelines for Dep... ["other"] yes {"api": "https://archaeologydataservice.ac.uk/... ["DOI"] https://archaeologydataservice.ac.uk/advice/te... [] unknown yes ["other"] [{"metadataStandardName": "DataCite Metadata S... {"syndication": "https://archaeologydataservic... ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02
In [3]:
# Show the column labels of the loaded frame.
re3data_df.columns
Out[3]:
Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',
       'additionalName', 'repositoryURL', 'repositoryIdentifier',
       'repositoryContact', 'description', 'description.language', 'type',
       'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',
       'missionStatementURL', 'contentType', 'providerType', 'keyword',
       'institution', 'policy', 'databaseAccess', 'databaseLicense',
       'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',
       'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',
       'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',
       'metadataStandard', 'syndication', 'remarks', 'entryDate',
       'lastUpdate'],
      dtype='object')
In [4]:
def empty_list_is_nan(cell):
    """Turn an empty list into NaN and pass any other value through untouched.

    Parameters
    ----------
    cell : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a list with no elements; otherwise the
        very same ``cell`` object that was passed in.
    """
    # Anything that is not a list (strings, dicts, numbers, NaN, ...) is left alone.
    if not isinstance(cell, list):
        return cell
    # Only a list with zero elements is treated as a missing value.
    if len(cell) == 0:
        return np.nan
    return cell
    
# Run empty_list_is_nan over every cell so that empty lists are counted as missing values.
# NOTE(review): DataFrame.applymap is deprecated in newer pandas releases (2.1+) in favour of
# DataFrame.map — confirm the pandas version this notebook targets before switching.
re3data_df = re3data_df.applymap(empty_list_is_nan)
In [5]:
# Summary statistics for all columns; include='all' keeps the non-numeric columns in the report.
re3data_df.describe(include='all')
Out[5]:
orgIdentifier repositoryName repositoryName.language additionalName repositoryURL repositoryIdentifier repositoryContact description description.language type size startDate endDate repositoryLanguage subject missionStatementURL contentType providerType keyword institution policy databaseAccess databaseLicense dataAccess dataLicense dataUploadType dataUploadLicense software versioning api pidSystem citationGuidelineURL aidSystem enhancedPublication qualityManagement certificate metadataStandard syndication remarks entryDate lastUpdate
count 2793 2793 2793 2206 2769 1024 2793 2793 2793 2777 2793 1800 172 2793 2789 2373 2787 2788 2785 2792 2793 2793 2793 2793 2793 2778 2793 2793 1339 2793 2793 1532 2793 2793 2793 2793 2793 2793 1694 2793 2793
unique 2793 2791 19 2196 2766 1023 2532 2792 6 8 1321 362 86 110 1417 2304 1350 5 2543 2772 2366 12 377 146 2294 3 695 23 2 1170 29 1337 13 3 3 16 175 544 1673 1316 722
top r3d100000001 EarthChem Library eng [{'additionalName': 'FRED', 'additionalNameLan... http://icgem.gfz-potsdam.de/home [biodbcore-001574] [] The National Archives and Records Administrati... eng [disciplinary] {"size": "", "updatedp": ""} 2008 2015 ["eng"] [{'name': '1 Humanities and Social Sciences', ... https://learn.scholarsportal.info/all-guides/d... [{'name': 'Standard office documents', 'scheme... [dataProvider] [multidisciplinary] [{'institutionName': 'National Center for Biot... [][] {"databaseAccessType": "open", "databaseAcces... [] [{"dataAccessType": "open", "dataAccessRestric... [{"dataLicenseName": "CC", "dataLicenseURL": "... restricted [] ["unknown"] yes {} ["none"] https://dataverse.org/best-practices/data-cita... [] unknown yes [] [] {} is covered by Elsevier. 2018-08-10 2021-09-03
freq 1 2 2596 2 2 2 170 2 2776 1768 1472 93 12 2088 240 14 29 1806 205 7 319 2624 2201 1292 71 1851 2054 1216 1131 1526 1359 76 2199 1643 1569 2557 1693 2235 17 20 104
In [6]:
# Number of missing values in each column.
re3data_df.isna().sum()
Out[6]:
orgIdentifier                 0
repositoryName                0
repositoryName.language       0
additionalName              587
repositoryURL                24
repositoryIdentifier       1769
repositoryContact             0
description                   0
description.language          0
type                         16
size                          0
startDate                   993
endDate                    2621
repositoryLanguage            0
subject                       4
missionStatementURL         420
contentType                   6
providerType                  5
keyword                       8
institution                   1
policy                        0
databaseAccess                0
databaseLicense               0
dataAccess                    0
dataLicense                   0
dataUploadType               15
dataUploadLicense             0
software                      0
versioning                 1454
api                           0
pidSystem                     0
citationGuidelineURL       1261
aidSystem                     0
enhancedPublication           0
qualityManagement             0
certificate                   0
metadataStandard              0
syndication                   0
remarks                    1099
entryDate                     0
lastUpdate                    0
dtype: int64
In [7]:
# Flatten contentType to one row per entry and keep only each entry's 'name'.
# Missing entries stay NaN. pd.notna is used instead of an identity check
# against np.nan because a missing value is not guaranteed to be the np.nan
# singleton (it can be None or a different NaN object), in which case the
# identity check would fall through to x['name'] and raise a TypeError.
types = re3data_df.contentType.explode().apply(lambda x: x['name'] if pd.notna(x) else np.nan)
# Number of entries per content type name (groupby leaves NaN keys out by default).
pd.DataFrame(types).groupby('contentType').size()
Out[7]:
contentType
Archived data                               677
Audiovisual data                            561
Configuration data                           84
Databases                                   593
Images                                     1407
Networkbased data                           155
Plain text                                 1173
Raw data                                   1224
Scientific and statistical data formats    1725
Software applications                       462
Source code                                 217
Standard office documents                  1719
Structured graphics                         937
Structured text                             878
other                                       979
dtype: int64
In [8]:
# Flatten providerType to one row per entry, then count how often each value occurs.
(
    re3data_df.providerType
    .explode()
    .to_frame()
    .groupby('providerType')
    .size()
)
Out[8]:
providerType
dataProvider       2539
serviceProvider     982
dtype: int64