48 KiB
48 KiB
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None
## Loading dataset
In [2]:
# Columns whose raw cell text is passed through ast.literal_eval while loading,
# so they arrive as parsed Python objects instead of strings. Columns that are
# not listed here get pandas' default parsing.
_literal_eval_columns = [
    'subject',
    'keyword',
    'additionalName',
    'repositoryIdentifier',
    'type',
    'contentType',
    'providerType',
    'institution',
]

# The raw file is tab-separated.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    converters=dict.fromkeys(_literal_eval_columns, ast.literal_eval),
)
re3data_df.head()
Out[2]:
orgIdentifier | repositoryName | repositoryName.language | additionalName | repositoryURL | repositoryIdentifier | repositoryContact | description | description.language | type | size | startDate | endDate | repositoryLanguage | subject | missionStatementURL | contentType | providerType | keyword | institution | policy | databaseAccess | databaseLicense | dataAccess | dataLicense | dataUploadType | dataUploadLicense | software | versioning | api | pidSystem | citationGuidelineURL | aidSystem | enhancedPublication | qualityManagement | certificate | metadataStandard | syndication | remarks | entryDate | lastUpdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | r3d100000001 | Odum Institute Archive Dataverse | eng | [] | https://dataverse.unc.edu/dataverse/odum | [] | ["https://dataverse.unc.edu/dataverse/odum#", ... | The Odum Institute Archive Dataverse contains ... | eng | [disciplinary] | {"size": "13 dataverses; 3.050 datasets", "upd... | NaN | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | NaN | [{'name': 'Databases', 'scheme': 'parse'}, {'n... | [dataProvider] | [FAIR, Middle East, crime, demography, economy... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC0", "databaseLicen... | [{"dataAccessType": "embargoed", "dataAccessRe... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [] | ["DataVerse"] | NaN | {} | ["DOI"] | NaN | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DDI - Data Document... | {} | Odum Dataverse is covered by Thomson Reuters D... | 2013-06-10 | 2021-07-06 |
1 | r3d100000002 | Access to Archival Databases | eng | [{'additionalName': 'AAD', 'additionalNameLang... | https://aad.archives.gov/aad/ | [RRID:SCR_010479, RRID:nlx_157752] | ["https://www.archives.gov/contact"] | You will find in the Access to Archival Databa... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 1985 | NaN | ["eng", "spa"] | [{'name': '1 Humanities and Social Sciences', ... | https://www.archives.gov/publications/general-... | [{'name': 'Images', 'scheme': 'parse'}, {'name... | [dataProvider] | [US History] | [{'institutionName': 'The U.S. National Archiv... | [{"policyName": "Contribution Policy", "policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "Copyrights", "dataLicens... | restricted | [] | ["unknown"] | no | {"api": "https://www.archives.gov/developer#to... | ["none"] | https://aad.archives.gov/aad/help/getting-star... | [] | unknown | unknown | [] | [] | {"syndication": "http://www.archives.gov/socia... | NaN | 2012-07-04 | 2021-05-25 |
2 | r3d100000004 | Datenbank Gesprochenes Deutsch | deu | [{'additionalName': 'DGD', 'additionalNameLang... | https://dgd.ids-mannheim.de/ | [] | ["dgd@ids-mannheim.de"] | The "Database for Spoken German (DGD)" is a co... | eng | [disciplinary] | {"size": "34 corpora", "updatedp": "2020-02-03"} | 2012 | NaN | ["deu"] | [{'name': '1 Humanities and Social Sciences', ... | https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... | [{'name': 'Audiovisual data', 'scheme': 'parse... | [dataProvider, serviceProvider] | [Australian German, FOLK, German dialects, Pfe... | [{'institutionName': 'Institut für Deutsche Sp... | [{"policyName": "Erfurter Aufruf zur Sicherung... | {"databaseAccessType": "restricted", "databas... | [] | [{"dataAccessType": "restricted", "dataAccessR... | [{"dataLicenseName": "other", "dataLicenseURL"... | restricted | [] | ["other"] | yes | {} | ["none"] | http://agd.ids-mannheim.de/konditionen.shtml | [] | unknown | unknown | ["RatSWD"] | [] | {} | NaN | 2012-07-20 | 2020-08-27 |
3 | r3d100000005 | UNC Dataverse | eng | [{'additionalName': 'University of North Carol... | https://dataverse.unc.edu/ | [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] | ["https://dataverse.unc.edu/", "odumarchive@un... | UNC Dataverse is an open-source repository sof... | eng | [institutional] | {"size": "186 dataverses; 25.272 studies; 229.... | 2011 | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://odum.unc.edu/about/mission-vision/ | [{'name': 'Archived data', 'scheme': 'parse'},... | [dataProvider, serviceProvider] | [FAIR, census, demographic survey, demography,... | [{'institutionName': 'Odum Institute for Resea... | [{"policyName": "Collection Development Policy... | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Data Deposit Form"... | ["DataVerse"] | yes | {"api": "https://guides.dataverse.org/en/lates... | ["ARK", "DOI", "PURL", "URN", "hdl"] | https://dataverse.org/best-practices/data-cita... | [] | unknown | yes | [] | [{"metadataStandardName": "DDI - Data Document... | {} | UNC Dataverse is covered by Clarivate Data Cit... | 2012-07-23 | 2021-10-25 |
4 | r3d100000006 | Archaeology Data Service | eng | [{'additionalName': 'ADS', 'additionalNameLang... | https://archaeologydataservice.ac.uk/ | [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] | ["help@archaeologydataservice.ac.uk", "https:/... | The ADS is an accredited digital repository fo... | eng | [disciplinary] | {"size": "1837 results", "updatedp": "2020-05-... | 1996-10-01 | NaN | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://archaeologydataservice.ac.uk/about/our... | [{'name': 'Archived data', 'scheme': 'parse'},... | [dataProvider, serviceProvider] | [FAIR, archaeology, cultural heritage, prehist... | [{'institutionName': 'Arts and Humanities Rese... | [{"policyName": "ADS Guides to good practice",... | {"databaseAccessType": "open", "databaseAcces... | [{"databaseLicenseName": "CC", "databaseLicens... | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [{"dataUploadLicenseName": "Guidelines for Dep... | ["other"] | yes | {"api": "https://archaeologydataservice.ac.uk/... | ["DOI"] | https://archaeologydataservice.ac.uk/advice/te... | [] | unknown | yes | ["other"] | [{"metadataStandardName": "DataCite Metadata S... | {"syndication": "https://archaeologydataservic... | ADS is covered by Clarivate Data Citation Inde... | 2012-07-23 | 2021-09-02 |
In [3]:
# Display the column labels of the loaded frame.
re3data_df.columns
Out[3]:
Index(['orgIdentifier', 'repositoryName', 'repositoryName.language', 'additionalName', 'repositoryURL', 'repositoryIdentifier', 'repositoryContact', 'description', 'description.language', 'type', 'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject', 'missionStatementURL', 'contentType', 'providerType', 'keyword', 'institution', 'policy', 'databaseAccess', 'databaseLicense', 'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense', 'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL', 'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate', 'metadataStandard', 'syndication', 'remarks', 'entryDate', 'lastUpdate'], dtype='object')
In [4]:
def empty_list_is_nan(cell):
    """Turn an empty list into NaN and pass any other value through unchanged.

    Parameters
    ----------
    cell : object
        A single value, e.g. one cell of a DataFrame.

    Returns
    -------
    object
        ``np.nan`` if ``cell`` is a ``list`` of length zero; otherwise ``cell``
        itself (non-empty lists and all non-list values are returned as-is).
    """
    is_empty_list = isinstance(cell, list) and len(cell) == 0
    return np.nan if is_empty_list else cell
# Apply empty_list_is_nan to every cell so that empty lists become NaN and are
# picked up by the missing-value checks that follow.
# DataFrame.applymap was renamed to DataFrame.map in pandas 2.1 and the old
# name is deprecated; use the new name when this pandas version provides it
# and fall back to applymap on older releases.
if hasattr(pd.DataFrame, 'map'):
    re3data_df = re3data_df.map(empty_list_is_nan)
else:
    re3data_df = re3data_df.applymap(empty_list_is_nan)
In [5]:
# Summary statistics for the frame; include='all' asks describe() to report on
# every column rather than only the numeric ones.
re3data_df.describe(include='all')
Out[5]:
orgIdentifier | repositoryName | repositoryName.language | additionalName | repositoryURL | repositoryIdentifier | repositoryContact | description | description.language | type | size | startDate | endDate | repositoryLanguage | subject | missionStatementURL | contentType | providerType | keyword | institution | policy | databaseAccess | databaseLicense | dataAccess | dataLicense | dataUploadType | dataUploadLicense | software | versioning | api | pidSystem | citationGuidelineURL | aidSystem | enhancedPublication | qualityManagement | certificate | metadataStandard | syndication | remarks | entryDate | lastUpdate | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2793 | 2793 | 2793 | 2206 | 2769 | 1024 | 2793 | 2793 | 2793 | 2777 | 2793 | 1800 | 172 | 2793 | 2789 | 2373 | 2787 | 2788 | 2785 | 2792 | 2793 | 2793 | 2793 | 2793 | 2793 | 2778 | 2793 | 2793 | 1339 | 2793 | 2793 | 1532 | 2793 | 2793 | 2793 | 2793 | 2793 | 2793 | 1694 | 2793 | 2793 |
unique | 2793 | 2791 | 19 | 2196 | 2766 | 1023 | 2532 | 2792 | 6 | 8 | 1321 | 362 | 86 | 110 | 1417 | 2304 | 1350 | 5 | 2543 | 2772 | 2366 | 12 | 377 | 146 | 2294 | 3 | 695 | 23 | 2 | 1170 | 29 | 1337 | 13 | 3 | 3 | 16 | 175 | 544 | 1673 | 1316 | 722 |
top | r3d100000001 | EarthChem Library | eng | [{'additionalName': 'FRED', 'additionalNameLan... | http://icgem.gfz-potsdam.de/home | [biodbcore-001574] | [] | The National Archives and Records Administrati... | eng | [disciplinary] | {"size": "", "updatedp": ""} | 2008 | 2015 | ["eng"] | [{'name': '1 Humanities and Social Sciences', ... | https://learn.scholarsportal.info/all-guides/d... | [{'name': 'Standard office documents', 'scheme... | [dataProvider] | [multidisciplinary] | [{'institutionName': 'National Center for Biot... | [][] | {"databaseAccessType": "open", "databaseAcces... | [] | [{"dataAccessType": "open", "dataAccessRestric... | [{"dataLicenseName": "CC", "dataLicenseURL": "... | restricted | [] | ["unknown"] | yes | {} | ["none"] | https://dataverse.org/best-practices/data-cita... | [] | unknown | yes | [] | [] | {} | is covered by Elsevier. | 2018-08-10 | 2021-09-03 |
freq | 1 | 2 | 2596 | 2 | 2 | 2 | 170 | 2 | 2776 | 1768 | 1472 | 93 | 12 | 2088 | 240 | 14 | 29 | 1806 | 205 | 7 | 319 | 2624 | 2201 | 1292 | 71 | 1851 | 2054 | 1216 | 1131 | 1526 | 1359 | 76 | 2199 | 1643 | 1569 | 2557 | 1693 | 2235 | 17 | 20 | 104 |
In [6]:
# Number of missing (NaN) cells in each column.
re3data_df.isna().sum()
Out[6]:
orgIdentifier 0 repositoryName 0 repositoryName.language 0 additionalName 587 repositoryURL 24 repositoryIdentifier 1769 repositoryContact 0 description 0 description.language 0 type 16 size 0 startDate 993 endDate 2621 repositoryLanguage 0 subject 4 missionStatementURL 420 contentType 6 providerType 5 keyword 8 institution 1 policy 0 databaseAccess 0 databaseLicense 0 dataAccess 0 dataLicense 0 dataUploadType 15 dataUploadLicense 0 software 0 versioning 1454 api 0 pidSystem 0 citationGuidelineURL 1261 aidSystem 0 enhancedPublication 0 qualityManagement 0 certificate 0 metadataStandard 0 syndication 0 remarks 1099 entryDate 0 lastUpdate 0 dtype: int64
In [7]:
# explode() unpacks each list in `contentType` into one row per entry; the
# 'name' field is then pulled out of every entry.
# Entries that are not dicts (the NaN placeholders for rows without a content
# type) stay NaN. An isinstance check is used instead of `x is not np.nan`:
# an identity test only recognises the np.nan singleton, so any other
# missing-value marker (None, float('nan'), pd.NA) would be subscripted and
# raise a TypeError.
types = re3data_df['contentType'].explode().map(
    lambda entry: entry['name'] if isinstance(entry, dict) else np.nan
)

# Number of occurrences of each content type name (NaN rows are not counted).
types.to_frame().groupby('contentType').size()
Out[7]:
contentType Archived data 677 Audiovisual data 561 Configuration data 84 Databases 593 Images 1407 Networkbased data 155 Plain text 1173 Raw data 1224 Scientific and statistical data formats 1725 Software applications 462 Source code 217 Standard office documents 1719 Structured graphics 937 Structured text 878 other 979 dtype: int64
In [8]:
# Unpack the per-row `providerType` lists into one row per entry, then count
# how many rows fall into each provider type.
(
    re3data_df['providerType']
    .explode()
    .to_frame()
    .groupby('providerType')
    .size()
)
Out[8]:
providerType dataProvider 2539 serviceProvider 982 dtype: int64