Information to check
- names
- description
- url
- subjects & keywords
- content type
- repo type
- policies



In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is displayed (None = no column limit).
pd.set_option('display.max_columns', None)

## Loading dataset

In [2]:
# Load the re3data export. The columns listed below are run through
# ast.literal_eval while reading, so their cells arrive as parsed Python
# objects instead of raw strings.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        ['subject', 'keyword', 'additional_name', 'repository_id',
         'type', 'content_type', 'provider_type', 'institution'],
        ast.literal_eval,
    ),
)
re3data_df.head()

Unnamed: 0,openaire_id,re3data_id,repository_name,additional_name,repository_url,repository_id,description,type,size,update_date,start_date,end_date,subject,mission_statement,content_type,provider_type,keyword,institution,policy,database_access,database_license,data_access,data_license,data_upload,data_upload_license,software,versioning,api,pid_system,citation_guideline_url,aid_system,enhanced_publication,quality_management,certificate,metadata_standard,syndication,remarks,entry_date,last_update
0,re3data_____::91780fe96da5ba32f804e43359c154ba,r3d100000001,Odum Institute Archive Dataverse,[],https://dataverse.unc.edu/dataverse/odum,[],The Odum Institute Archive Dataverse contains ...,[disciplinary],13 dataverses; 3.050 datasets,2020-12-04,,,"[1 Humanities and Social Sciences, 111 Social ...",False,"[Databases, Plain text, Scientific and statist...",[dataProvider],"[FAIR, Middle East, crime, demography, economy...",[[Odum Institute for Research in Social Scienc...,True,True,True,True,True,True,False,True,,False,True,True,True,unknown,yes,True,True,False,Odum Dataverse is covered by Thomson Reuters D...,2013-06-10,2021-07-06
1,re3data_____::cc3ea05c863cd49af75f7f54e0e86f09,r3d100000002,Access to Archival Databases,[AAD],https://aad.archives.gov/aad/,"[RRID:SCR_010479, RRID:nlx_157752]",You will find in the Access to Archival Databa...,[disciplinary],,,1985,,"[1 Humanities and Social Sciences, 102 History...",True,"[Images, Standard office documents, Structured...",[dataProvider],[US History],[[The U.S. National Archives and Records Admin...,True,True,False,True,True,True,False,True,no,True,True,True,True,unknown,unknown,False,False,True,,2012-07-04,2021-05-25
2,re3data_____::a2f73fbe91311f4356d0d7957c441773,r3d100000004,Datenbank Gesprochenes Deutsch,"[DGD, DGD2 (formerly), Database for Spoken Ger...",https://dgd.ids-mannheim.de/,[],"The ""Database for Spoken German (DGD)"" is a co...",[disciplinary],34 corpora,2020-02-03,2012,,"[1 Humanities and Social Sciences, 104 Linguis...",True,"[Audiovisual data, Standard office documents, ...","[dataProvider, serviceProvider]","[Australian German, FOLK, German dialects, Pfe...","[[Institut für Deutsche Sprache, Archiv für Ge...",True,True,False,True,True,True,False,True,yes,False,True,True,True,unknown,unknown,True,False,False,,2012-07-20,2020-08-27
3,re3data_____::0394b97eb11f19785cbca1ec830429da,r3d100000005,UNC Dataverse,[University of North Carolina Dataverse],https://dataverse.unc.edu/,[],UNC Dataverse is an open-source repository sof...,[institutional],186 dataverses; 25.272 studies; 229.442 files,2020-11-30,2011,,"[1 Humanities and Social Sciences, 111 Social ...",True,"[Archived data, Plain text, Raw data, Scientif...","[dataProvider, serviceProvider]","[FAIR, census, demographic survey, demography,...",[[Odum Institute for Research in Social Scienc...,True,True,False,True,True,True,True,True,yes,True,True,True,True,unknown,yes,False,True,False,The Odum Institute houses one of the oldest an...,2012-07-23,2020-11-30
4,re3data_____::a48f09c562b247a9919acfe195549b47,r3d100000006,Archaeology Data Service,[ADS],https://archaeologydataservice.ac.uk/,[FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg],The ADS is an accredited digital repository fo...,[disciplinary],1837 results,2020-05-20,1996-10-01,,"[1 Humanities and Social Sciences, 101 Ancient...",True,"[Archived data, Audiovisual data, Databases, I...","[dataProvider, serviceProvider]","[FAIR, archaeology, cultural heritage, prehist...","[[Arts and Humanities Research Council, [AHRC]...",True,True,True,True,True,True,True,True,yes,True,True,True,True,unknown,yes,True,True,True,ADS is covered by Clarivate Data Citation Inde...,2012-07-23,2021-06-11


In [3]:
# Display the column labels of the loaded frame.
re3data_df.columns

Index(['openaire_id', 're3data_id', 'repository_name', 'additional_name',
       'repository_url', 'repository_id', 'description', 'type', 'size',
       'update_date', 'start_date', 'end_date', 'subject', 'mission_statement',
       'content_type', 'provider_type', 'keyword', 'institution', 'policy',
       'database_access', 'database_license', 'data_access', 'data_license',
       'data_upload', 'data_upload_license', 'software', 'versioning', 'api',
       'pid_system', 'citation_guideline_url', 'aid_system',
       'enhanced_publication', 'quality_management', 'certificate',
       'metadata_standard', 'syndication', 'remarks', 'entry_date',
       'last_update'],
      dtype='object')

In [4]:
def empty_list_is_nan(cell):
    """Return NaN for an empty list; return any other value unchanged.

    Only genuine ``list`` instances of length zero are replaced. Non-empty
    lists and every non-list value (strings, tuples, None, numbers, ...)
    are handed back as-is.
    """
    is_empty_list = isinstance(cell, list) and len(cell) == 0
    if is_empty_list:
        return np.nan
    return cell
    
# Replace every empty-list cell with NaN so that it is counted as missing.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of the
# equivalent DataFrame.map; use the new name when this pandas provides it
# and fall back to applymap on older releases.
if hasattr(pd.DataFrame, 'map'):
    re3data_df = re3data_df.map(empty_list_is_nan)
else:
    re3data_df = re3data_df.applymap(empty_list_is_nan)

In [5]:
# Summary statistics for all columns (include='all' covers non-numeric ones too).
re3data_df.describe(include='all')

Unnamed: 0,openaire_id,re3data_id,repository_name,additional_name,repository_url,repository_id,description,type,size,update_date,start_date,end_date,subject,mission_statement,content_type,provider_type,keyword,institution,policy,database_access,database_license,data_access,data_license,data_upload,data_upload_license,software,versioning,api,pid_system,citation_guideline_url,aid_system,enhanced_publication,quality_management,certificate,metadata_standard,syndication,remarks,entry_date,last_update
count,2707,2707,2707,2137,2686,829,2707,2677,1260,1248,1762,146,2685,2707,2700,2699,2699,2706,2707,2707,2707,2707,2707,2707,2707,2707,1292,2707,2707,2707,2707,2704,2705,2707,2707,2707,1637,2707,2707
unique,2707,2707,2704,2128,2683,828,2705,8,1233,687,351,79,1367,2,1323,4,2474,2685,2,1,2,2,2,2,2,2,2,2,2,1,1,3,3,2,2,2,1632,1259,814
top,re3data_____::4cea5a5ea78542232a51190879756661,r3d100011254,EarthChem Library,[IRIS],http://www.jcvi.org/cms/home/,[doi:10.17171/1-6],The repository is no longer available. >>>!!!<...,[disciplinary],2 datasets,2019-05-15,2008,2015,"[1 Humanities and Social Sciences, 2 Life Scie...",true,[Standard office documents],[dataProvider],[multidisciplinary],[[National Center for Biotechnology Informatio...,true,true,false,true,true,true,false,true,yes,false,true,true,true,unknown,yes,false,false,false,The National Institute of Standards and Techno...,2016-05-10,2021-07-02
freq,1,1,2,2,2,2,2,1713,6,15,92,11,222,2286,30,1748,190,6,2394,2707,2134,2701,2693,2681,1988,2227,1086,1485,2448,2707,2707,1592,1492,2481,1655,2129,3,20,47


In [6]:
# Number of missing (NaN) values in each column.
re3data_df.isna().sum()

openaire_id                  0
re3data_id                   0
repository_name              0
additional_name            570
repository_url              21
repository_id             1878
description                  0
type                        30
size                      1447
update_date               1459
start_date                 945
end_date                  2561
subject                     22
mission_statement            0
content_type                 7
provider_type                8
keyword                      8
institution                  1
policy                       0
database_access              0
database_license             0
data_access                  0
data_license                 0
data_upload                  0
data_upload_license          0
software                     0
versioning                1415
api                          0
pid_system                   0
citation_guideline_url       0
aid_system                   0
enhanced_publication         3
quality_

In [7]:
# Distinct content-type values: explode() gives each list element its own row
# before unique() collects the different values.
re3data_df['content_type'].explode().unique()

array(['Databases', 'Plain text',
       'Scientific and statistical data formats',
       'Standard office documents', 'other', 'Images', 'Structured text',
       'Audiovisual data', 'Archived data', 'Raw data',
       'Software applications', 'Source code', 'Structured graphics',
       'Configuration data', 'Networkbased data', nan], dtype=object)

In [8]:
# Distinct provider-type values: explode() gives each list element its own row
# before unique() collects the different values.
re3data_df['provider_type'].explode().unique()

array(['dataProvider', 'serviceProvider', nan], dtype=object)