48 KiB
48 KiB
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Lift the column display cap so wide DataFrames are rendered with every column.
pd.set_option('display.max_columns', None)
Loading dataset
In [2]:
# Read the tab-separated re3data export. Every column named in the tuple below
# is passed through ast.literal_eval while reading, so its textual cell content
# arrives in the frame as a Python object rather than as a raw string.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'subject',
            'keyword',
            'additionalName',
            'repositoryIdentifier',
            'type',
            'contentType',
            'providerType',
            'institution',
        )
    },
)
# Preview the first rows of the loaded frame.
re3data_df.head()
Out[2]:
orgIdentifier | repositoryName | repositoryName.language | additionalName | repositoryURL | repositoryIdentifier | repositoryContact | description | description.language | type | size | startDate | endDate | repositoryLanguage | subject | missionStatementURL | contentType | providerType | keyword | institution | policy | databaseAccess | databaseLicense | dataAccess | dataLicense | dataUploadType | dataUploadLicense | software | versioning | api | pidSystem | citationGuidelineURL | aidSystem | enhancedPublication | qualityManagement | certificate | metadataStandard | syndication | remarks | entryDate | lastUpdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | r3d100000001 | Odum Institute Archive Dataverse | eng | [] | https://dataverse.unc.edu/dataverse/odum | [] | ["https://dataverse.unc.edu/dataverse/odum#", ... | The Odum Institute Archive Dataverse contains ... | eng | [disciplinary] | {"size": "13 dataverses; 3.050 datasets", "upd... | NaN | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | NaN | [{'name': 'Databases', 'scheme': 'parse'}, {'n... | [dataProvider] | [FAIR, Middle East, crime, demography, economy... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC0", "databaseLicen... | [{"dataAccessType": "embargoed", "dataAccessRe... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [] | ["DataVerse"] | NaN | {} | ["DOI"] | NaN | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DDI - Data Document... | {} | Odum Dataverse is covered by Thomson Reuters D... | 2013-06-10 | 2021-07-06 |
1 | r3d100000002 | Access to Archival Databases | eng | [{'additionalName': 'AAD', 'additionalNameLang... | https://aad.archives.gov/aad/ | [RRID:SCR_010479, RRID:nlx_157752] | ["https://www.archives.gov/contact"] | You will find in the Access to Archival Databa... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 1985 | NaN | ["eng", "spa"] | [{'name': '1 Humanities and Social Sciences', ... | https://www.archives.gov/publications/general-... | [{'name': 'Images', 'scheme': 'parse'}, {'name... | [dataProvider] | [US History] | [{'institutionName': 'The U.S. National Archiv... | [{"policyName": "Contribution Policy", "policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "Copyrights", "dataLicens... | restricted | [] | ["unknown"] | no | {"api": "https://www.archives.gov/developer#to... | ["none"] | https://aad.archives.gov/aad/help/getting-star... | [] | unknown | unknown | [] | [] | {"syndication": "http://www.archives.gov/socia... | NaN | 2012-07-04 | 2021-05-25 |
2 | r3d100000004 | Datenbank Gesprochenes Deutsch | deu | [{'additionalName': 'DGD', 'additionalNameLang... | https://dgd.ids-mannheim.de/ | [] | ["dgd@ids-mannheim.de"] | The "Database for Spoken German (DGD)" is a co... | eng | [disciplinary] | {"size": "34 corpora", "updatedp": "2020-02-03"} | 2012 | NaN | ["deu"] | [{'name': '1 Humanities and Social Sciences', ... | https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... | [{'name': 'Audiovisual data', 'scheme': 'parse... | [dataProvider, serviceProvider] | [Australian German, FOLK, German dialects, Pfe... | [{'institutionName': 'Institut für Deutsche Sp... | [{"policyName": "Erfurter Aufruf zur Sicherung... | {"databaseAccessType": "restricted", "databas... | [] | [{"dataAccessType": "restricted", "dataAccessR... | [{"dataLicenseName": "other", "dataLicenseURL"... | restricted | [] | ["other"] | yes | {} | ["none"] | http://agd.ids-mannheim.de/konditionen.shtml | [] | unknown | unknown | ["RatSWD"] | [] | {} | NaN | 2012-07-20 | 2020-08-27 |
3 | r3d100000005 | UNC Dataverse | eng | [{'additionalName': 'University of North Carol... | https://dataverse.unc.edu/ | [] | ["https://dataverse.unc.edu/", "odumarchive@un... | UNC Dataverse is an open-source repository sof... | eng | [institutional] | {"size": "186 dataverses; 25.272 studies; 229.... | 2011 | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://odum.unc.edu/about/mission-vision/ | [{'name': 'Archived data', 'scheme': 'parse'},... | [dataProvider, serviceProvider] | [FAIR, census, demographic survey, demography,... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Data Deposit Form"... | ["DataVerse"] | yes | {"api": "https://guides.dataverse.org/en/lates... | ["ARK", "DOI", "PURL", "URN", "hdl"] | https://dataverse.org/best-practices/data-cita... | [] | unknown | yes | [] | [{"metadataStandardName": "DDI - Data Document... | {} | UNC Dataverse is covered by Clarivate Data Cit... | 2012-07-23 | 2021-08-11 |
4 | r3d100000006 | Archaeology Data Service | eng | [{'additionalName': 'ADS', 'additionalNameLang... | https://archaeologydataservice.ac.uk/ | [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] | ["help@archaeologydataservice.ac.uk", "https:/... | The ADS is an accredited digital repository fo... | eng | [disciplinary] | {"size": "1837 results", "updatedp": "2020-05-... | 1996-10-01 | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://archaeologydataservice.ac.uk/about/our... | [{'name': 'Archived data', 'scheme': 'parse'},... | [dataProvider, serviceProvider] | [FAIR, archaeology, cultural heritage, prehist... | [{'institutionName': 'Arts and Humanities Rese... | [{"policyName": "ADS Guides to good practice",... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC", "databaseLicens... | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Guidelines for Dep... | ["other"] | yes | {"api": "https://archaeologydataservice.ac.uk/... | ["DOI"] | https://archaeologydataservice.ac.uk/advice/te... | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DataCite Metadata S... | {"syndication": "https://archaeologydataservic... | ADS is covered by Clarivate Data Citation Inde... | 2012-07-23 | 2021-09-02 |
In [3]:
# Show the column labels of the loaded frame.
re3data_df.columns
Out[3]:
Index(['orgIdentifier', 'repositoryName', 'repositoryName.language', 'additionalName', 'repositoryURL', 'repositoryIdentifier', 'repositoryContact', 'description', 'description.language', 'type', 'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject', 'missionStatementURL', 'contentType', 'providerType', 'keyword', 'institution', 'policy', 'databaseAccess', 'databaseLicense', 'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense', 'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL', 'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate', 'metadataStandard', 'syndication', 'remarks', 'entryDate', 'lastUpdate'], dtype='object')
In [4]:
def empty_list_is_nan(cell):
    """Map an empty list to NaN and hand every other value back unchanged.

    Args:
        cell: Any single value (intended for element-wise use on a DataFrame).

    Returns:
        ``np.nan`` when ``cell`` is a list with no elements; otherwise the
        very same object that was passed in (non-empty lists included).
    """
    # Guard clause: only a list of length zero is treated as "missing".
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
# Replace every empty-list cell with NaN so that pandas' missing-value tooling
# (isna, describe counts, ...) treats "no entries" as missing.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when the installed pandas provides it and fall back to
# applymap on older versions so the cell runs without a FutureWarning either way.
if hasattr(pd.DataFrame, 'map'):
    re3data_df = re3data_df.map(empty_list_is_nan)
else:
    re3data_df = re3data_df.applymap(empty_list_is_nan)
In [5]:
# Summary statistics for the frame; include='all' asks for every column
# rather than only the numeric ones.
re3data_df.describe(include='all')
Out[5]:
orgIdentifier | repositoryName | repositoryName.language | additionalName | repositoryURL | repositoryIdentifier | repositoryContact | description | description.language | type | size | startDate | endDate | repositoryLanguage | subject | missionStatementURL | contentType | providerType | keyword | institution | policy | databaseAccess | databaseLicense | dataAccess | dataLicense | dataUploadType | dataUploadLicense | software | versioning | api | pidSystem | citationGuidelineURL | aidSystem | enhancedPublication | qualityManagement | certificate | metadataStandard | syndication | remarks | entryDate | lastUpdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2739 | 2739 | 2739 | 2170 | 2716 | 863 | 2739 | 2739 | 2739 | 2710 | 2739 | 1776 | 157 | 2739 | 2720 | 2318 | 2732 | 2735 | 2732 | 2738 | 2739 | 2739 | 2739 | 2739 | 2739 | 2711 | 2739 | 2739 | 1316 | 2739 | 2739 | 1512 | 2739 | 2737 | 2739 | 2739 | 2739 | 2739 | 1674 | 2739 | 2739 |
unique | 2739 | 2736 | 19 | 2161 | 2713 | 863 | 2459 | 2737 | 6 | 8 | 1289 | 352 | 80 | 107 | 1388 | 2249 | 1337 | 4 | 2503 | 2719 | 2319 | 12 | 375 | 145 | 2263 | 3 | 681 | 23 | 2 | 1146 | 29 | 1321 | 12 | 3 | 3 | 14 | 172 | 563 | 1656 | 1275 | 740 |
top | r3d100000001 | Språkbanken | eng | [{'additionalName': 'MPC', 'additionalNameLang... | http://icgem.gfz-potsdam.de/home | [RRID:SCR_010479, RRID:nlx_157752] | [] | The National Archives and Records Administrati... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 2008 | 2015 | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://learn.scholarsportal.info/all-guides/d... | [{'name': 'Standard office documents', 'scheme... | [dataProvider] | [multidisciplinary] | [{'institutionName': 'National Center for Biot... | [][] | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [] | ["unknown"] | yes | {} | ["none"] | https://dataverse.org/best-practices/data-cita... | [] | unknown | yes | [] | [] | {} | is covered by Elsevier. | 2016-05-10 | 2021-09-03 |
freq | 1 | 2 | 2554 | 2 | 2 | 1 | 202 | 2 | 2723 | 1733 | 1450 | 92 | 11 | 2063 | 226 | 14 | 30 | 1771 | 193 | 6 | 312 | 2571 | 2159 | 1269 | 64 | 1793 | 2013 | 1226 | 1108 | 1498 | 1361 | 72 | 2155 | 1608 | 1515 | 2509 | 1669 | 2162 | 14 | 20 | 137 |
In [6]:
# Count the missing values in each column.
re3data_df.isna().sum()
Out[6]:
orgIdentifier 0 repositoryName 0 repositoryName.language 0 additionalName 569 repositoryURL 23 repositoryIdentifier 1876 repositoryContact 0 description 0 description.language 0 type 29 size 0 startDate 963 endDate 2582 repositoryLanguage 0 subject 19 missionStatementURL 421 contentType 7 providerType 4 keyword 7 institution 1 policy 0 databaseAccess 0 databaseLicense 0 dataAccess 0 dataLicense 0 dataUploadType 28 dataUploadLicense 0 software 0 versioning 1423 api 0 pidSystem 0 citationGuidelineURL 1227 aidSystem 0 enhancedPublication 2 qualityManagement 0 certificate 0 metadataStandard 0 syndication 0 remarks 1065 entryDate 0 lastUpdate 0 dtype: int64
In [7]:
# Flatten the per-row contentType lists to one entry per row and keep only each
# entry's 'name'. Missing entries are detected with pd.isna instead of an
# identity test against np.nan, so None, pd.NA or any other NaN object is
# passed through as NaN rather than failing on the ['name'] lookup.
types = re3data_df.contentType.explode().apply(
    lambda entry: np.nan if pd.isna(entry) else entry['name']
)
# Occurrences of each content type name (groupby leaves out the NaN rows).
pd.DataFrame(types).groupby('contentType').size()
Out[7]:
contentType Archived data 658 Audiovisual data 542 Configuration data 79 Databases 586 Images 1378 Networkbased data 153 Plain text 1158 Raw data 1197 Scientific and statistical data formats 1685 Software applications 456 Source code 209 Standard office documents 1684 Structured graphics 917 Structured text 848 other 962 dtype: int64
In [8]:
# Occurrences of each provider type: flatten the per-row lists to one value per
# row, turn the result into a one-column frame, then count rows per value.
(
    re3data_df.providerType
    .explode()
    .to_frame()
    .groupby('providerType')
    .size()
)
Out[8]:
providerType dataProvider 2491 serviceProvider 963 dtype: int64