In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Display option: do not truncate columns when a wide DataFrame is rendered,
# so every column of the frame is visible in the cell outputs.
pd.set_option('display.max_columns', None)

## Loading dataset

In [2]:
# Load the tab-separated re3data export. The columns listed below are passed
# through ast.literal_eval while reading, so their cells arrive as Python
# objects (lists in the preview) instead of raw strings. Every other column is
# left to pandas' default parsing.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        (
            'subject',
            'keyword',
            'additionalName',
            'repositoryIdentifier',
            'type',
            'contentType',
            'providerType',
            'institution',
        ),
        ast.literal_eval,
    ),
)

# Preview the first rows of the loaded frame.
re3data_df.head()

Unnamed: 0,orgIdentifier,repositoryName,repositoryName.language,additionalName,repositoryURL,repositoryIdentifier,repositoryContact,description,description.language,type,size,startDate,endDate,repositoryLanguage,subject,missionStatementURL,contentType,providerType,keyword,institution,policy,databaseAccess,databaseLicense,dataAccess,dataLicense,dataUploadType,dataUploadLicense,software,versioning,api,pidSystem,citationGuidelineURL,aidSystem,enhancedPublication,qualityManagement,certificate,metadataStandard,syndication,remarks,entryDate,lastUpdate
0,r3d100000001,Odum Institute Archive Dataverse,eng,[],https://dataverse.unc.edu/dataverse/odum,[],"[""https://dataverse.unc.edu/dataverse/odum#"", ...",The Odum Institute Archive Dataverse contains ...,eng,[disciplinary],"{""size"": ""13 dataverses; 3.050 datasets"", ""upd...",,,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",,"[{'name': 'Databases', 'scheme': 'parse'}, {'n...",[dataProvider],"[FAIR, Middle East, crime, demography, economy...",[{'institutionName': 'Odum Institute for Resea...,"[{""policyName"": ""Collection Development Policy...","{""databaseAccessType"": ""open"", ""databaseAcces...","[{""databaseLicenseName"": ""CC0"", ""databaseLicen...","[{""dataAccessType"": ""embargoed"", ""dataAccessRe...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,[],"[""DataVerse""]",,{},"[""DOI""]",,[],unknown,yes,"[""other""]","[{""metadataStandardName"": ""DDI - Data Document...",{},Odum Dataverse is covered by Thomson Reuters D...,2013-06-10,2021-07-06
1,r3d100000002,Access to Archival Databases,eng,"[{'additionalName': 'AAD', 'additionalNameLang...",https://aad.archives.gov/aad/,"[RRID:SCR_010479, RRID:nlx_157752]","[""https://www.archives.gov/contact""]",You will find in the Access to Archival Databa...,eng,[disciplinary],"{""size"": """", ""updatedp"": """"}",1985,,"[""eng"", ""spa""]","[{'name': '1 Humanities and Social Sciences', ...",https://www.archives.gov/publications/general-...,"[{'name': 'Images', 'scheme': 'parse'}, {'name...",[dataProvider],[US History],[{'institutionName': 'The U.S. National Archiv...,"[{""policyName"": ""Contribution Policy"", ""policy...","{""databaseAccessType"": ""open"", ""databaseAcces...",[],"[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""Copyrights"", ""dataLicens...",restricted,[],"[""unknown""]",no,"{""api"": ""https://www.archives.gov/developer#to...","[""none""]",https://aad.archives.gov/aad/help/getting-star...,[],unknown,unknown,[],[],"{""syndication"": ""http://www.archives.gov/socia...",,2012-07-04,2021-05-25
2,r3d100000004,Datenbank Gesprochenes Deutsch,deu,"[{'additionalName': 'DGD', 'additionalNameLang...",https://dgd.ids-mannheim.de/,[],"[""dgd@ids-mannheim.de""]","The ""Database for Spoken German (DGD)"" is a co...",eng,[disciplinary],"{""size"": ""34 corpora"", ""updatedp"": ""2020-02-03""}",2012,,"[""deu""]","[{'name': '1 Humanities and Social Sciences', ...",https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext...,"[{'name': 'Audiovisual data', 'scheme': 'parse...","[dataProvider, serviceProvider]","[Australian German, FOLK, German dialects, Pfe...",[{'institutionName': 'Institut für Deutsche Sp...,"[{""policyName"": ""Erfurter Aufruf zur Sicherung...","{""databaseAccessType"": ""restricted"", ""databas...",[],"[{""dataAccessType"": ""restricted"", ""dataAccessR...","[{""dataLicenseName"": ""other"", ""dataLicenseURL""...",restricted,[],"[""other""]",yes,{},"[""none""]",http://agd.ids-mannheim.de/konditionen.shtml,[],unknown,unknown,"[""RatSWD""]",[],{},,2012-07-20,2020-08-27
3,r3d100000005,UNC Dataverse,eng,[{'additionalName': 'University of North Carol...,https://dataverse.unc.edu/,[FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c],"[""https://dataverse.unc.edu/"", ""odumarchive@un...",UNC Dataverse is an open-source repository sof...,eng,[institutional],"{""size"": ""186 dataverses; 25.272 studies; 229....",2011,,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",https://odum.unc.edu/about/mission-vision/,"[{'name': 'Archived data', 'scheme': 'parse'},...","[dataProvider, serviceProvider]","[FAIR, census, demographic survey, demography,...",[{'institutionName': 'Odum Institute for Resea...,"[{""policyName"": ""Collection Development Policy...","{""databaseAccessType"": ""open"", ""databaseAcces...",[],"[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,"[{""dataUploadLicenseName"": ""Data Deposit Form""...","[""DataVerse""]",yes,"{""api"": ""https://guides.dataverse.org/en/lates...","[""ARK"", ""DOI"", ""PURL"", ""URN"", ""hdl""]",https://dataverse.org/best-practices/data-cita...,[],unknown,yes,[],"[{""metadataStandardName"": ""DDI - Data Document...",{},UNC Dataverse is covered by Clarivate Data Cit...,2012-07-23,2021-10-25
4,r3d100000006,Archaeology Data Service,eng,"[{'additionalName': 'ADS', 'additionalNameLang...",https://archaeologydataservice.ac.uk/,[FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg],"[""help@archaeologydataservice.ac.uk"", ""https:/...",The ADS is an accredited digital repository fo...,eng,[disciplinary],"{""size"": ""1837 results"", ""updatedp"": ""2020-05-...",1996-10-01,,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",https://archaeologydataservice.ac.uk/about/our...,"[{'name': 'Archived data', 'scheme': 'parse'},...","[dataProvider, serviceProvider]","[FAIR, archaeology, cultural heritage, prehist...",[{'institutionName': 'Arts and Humanities Rese...,"[{""policyName"": ""ADS Guides to good practice"",...","{""databaseAccessType"": ""open"", ""databaseAcces...","[{""databaseLicenseName"": ""CC"", ""databaseLicens...","[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,"[{""dataUploadLicenseName"": ""Guidelines for Dep...","[""other""]",yes,"{""api"": ""https://archaeologydataservice.ac.uk/...","[""DOI""]",https://archaeologydataservice.ac.uk/advice/te...,[],unknown,yes,"[""other""]","[{""metadataStandardName"": ""DataCite Metadata S...","{""syndication"": ""https://archaeologydataservic...",ADS is covered by Clarivate Data Citation Inde...,2012-07-23,2021-09-02


In [3]:
# Show the column labels of the loaded frame.
re3data_df.columns

Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',
       'additionalName', 'repositoryURL', 'repositoryIdentifier',
       'repositoryContact', 'description', 'description.language', 'type',
       'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',
       'missionStatementURL', 'contentType', 'providerType', 'keyword',
       'institution', 'policy', 'databaseAccess', 'databaseLicense',
       'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',
       'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',
       'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',
       'metadataStandard', 'syndication', 'remarks', 'entryDate',
       'lastUpdate'],
      dtype='object')

In [4]:
def empty_list_is_nan(cell):
    """Turn an empty list into NaN; return any other value unchanged.

    Parameters
    ----------
    cell : object
        A single cell value of any type.

    Returns
    -------
    object
        ``np.nan`` if ``cell`` is a list with zero elements, otherwise the
        very same ``cell`` object that was passed in.
    """
    # Only real lists are inspected; strings, dicts, tuples, NaN, ... pass through.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
    
# Replace every empty list in the frame with NaN so that pandas' missing-value
# tooling (isna(), count(), describe(), ...) treats "no entries" as missing.
# The function is applied element-wise, column by column, via Series.map:
# DataFrame.applymap is deprecated since pandas 2.1 (renamed DataFrame.map) and
# emits a FutureWarning there, whereas Series.map works on old and new versions.
re3data_df = re3data_df.apply(lambda column: column.map(empty_list_is_nan))

In [5]:
# Summary statistics for all columns; include='all' also covers the
# non-numeric (object) columns, reporting count / unique / top / freq.
re3data_df.describe(include='all')

Unnamed: 0,orgIdentifier,repositoryName,repositoryName.language,additionalName,repositoryURL,repositoryIdentifier,repositoryContact,description,description.language,type,size,startDate,endDate,repositoryLanguage,subject,missionStatementURL,contentType,providerType,keyword,institution,policy,databaseAccess,databaseLicense,dataAccess,dataLicense,dataUploadType,dataUploadLicense,software,versioning,api,pidSystem,citationGuidelineURL,aidSystem,enhancedPublication,qualityManagement,certificate,metadataStandard,syndication,remarks,entryDate,lastUpdate
count,2793,2793,2793,2206,2769,1024,2793,2793,2793,2777,2793,1800,172,2793,2789,2373,2787,2788,2785,2792,2793,2793,2793,2793,2793,2778,2793,2793,1339,2793,2793,1532,2793,2793,2793,2793,2793,2793,1694,2793,2793
unique,2793,2791,19,2196,2766,1023,2532,2792,6,8,1321,362,86,110,1417,2304,1350,5,2543,2772,2366,12,377,146,2294,3,695,23,2,1170,29,1337,13,3,3,16,175,544,1673,1316,722
top,r3d100000001,EarthChem Library,eng,"[{'additionalName': 'FRED', 'additionalNameLan...",http://icgem.gfz-potsdam.de/home,[biodbcore-001574],[],The National Archives and Records Administrati...,eng,[disciplinary],"{""size"": """", ""updatedp"": """"}",2008,2015,"[""eng""]","[{'name': '1 Humanities and Social Sciences', ...",https://learn.scholarsportal.info/all-guides/d...,"[{'name': 'Standard office documents', 'scheme...",[dataProvider],[multidisciplinary],[{'institutionName': 'National Center for Biot...,[][],"{""databaseAccessType"": ""open"", ""databaseAcces...",[],"[{""dataAccessType"": ""open"", ""dataAccessRestric...","[{""dataLicenseName"": ""CC"", ""dataLicenseURL"": ""...",restricted,[],"[""unknown""]",yes,{},"[""none""]",https://dataverse.org/best-practices/data-cita...,[],unknown,yes,[],[],{},is covered by Elsevier.,2018-08-10,2021-09-03
freq,1,2,2596,2,2,2,170,2,2776,1768,1472,93,12,2088,240,14,29,1806,205,7,319,2624,2201,1292,71,1851,2054,1216,1131,1526,1359,76,2199,1643,1569,2557,1693,2235,17,20,104


In [6]:
# Number of missing (NaN) values per column. Empty lists in the parsed columns
# were converted to NaN by empty_list_is_nan above, so they are counted here.
# NOTE(review): columns that were not given a converter in read_csv are still
# raw strings, so a value such as '[]' or '{}' is NOT counted as missing here
# (e.g. repositoryContact reports 0 although '[]' is its most frequent value).
# Confirm whether those columns should be parsed as well.
re3data_df.isna().sum()

orgIdentifier                 0
repositoryName                0
repositoryName.language       0
additionalName              587
repositoryURL                24
repositoryIdentifier       1769
repositoryContact             0
description                   0
description.language          0
type                         16
size                          0
startDate                   993
endDate                    2621
repositoryLanguage            0
subject                       4
missionStatementURL         420
contentType                   6
providerType                  5
keyword                       8
institution                   1
policy                        0
databaseAccess                0
databaseLicense               0
dataAccess                    0
dataLicense                   0
dataUploadType               15
dataUploadLicense             0
software                      0
versioning                 1454
api                           0
pidSystem                     0
citation

In [7]:
# Flatten the per-repository contentType lists into one row per entry and keep
# only each entry's 'name'. Repositories without content types are NaN after
# explode(). pd.notna() is used instead of an identity test against np.nan so
# that any NaN object (not only the np.nan singleton) is treated as missing
# rather than being subscripted with ['name'].
types = re3data_df.contentType.explode().apply(
    lambda entry: entry['name'] if pd.notna(entry) else np.nan
)

# Number of occurrences of each content type; groupby() leaves out the NaN rows.
pd.DataFrame(types).groupby('contentType').size()

contentType
Archived data                               677
Audiovisual data                            561
Configuration data                           84
Databases                                   593
Images                                     1407
Networkbased data                           155
Plain text                                 1173
Raw data                                   1224
Scientific and statistical data formats    1725
Software applications                       462
Source code                                 217
Standard office documents                  1719
Structured graphics                         937
Structured text                             878
other                                       979
dtype: int64

In [8]:
# Number of occurrences of each provider type across all repositories:
# explode() yields one row per list entry, then the rows are grouped by value
# and counted (rows that are NaN are left out by groupby()).
(
    re3data_df['providerType']
    .explode()
    .to_frame()
    .groupby('providerType')
    .size()
)

providerType
dataProvider       2539
serviceProvider     982
dtype: int64