registries_analysis/notebooks/01.1-exploration-re3data.ipynb

1102 lines
48 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"# Show every column when a DataFrame is displayed instead of truncating wide frames.\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading dataset"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orgIdentifier</th>\n",
" <th>repositoryName</th>\n",
" <th>repositoryName.language</th>\n",
" <th>additionalName</th>\n",
" <th>repositoryURL</th>\n",
" <th>repositoryIdentifier</th>\n",
" <th>repositoryContact</th>\n",
" <th>description</th>\n",
" <th>description.language</th>\n",
" <th>type</th>\n",
" <th>size</th>\n",
" <th>startDate</th>\n",
" <th>endDate</th>\n",
" <th>repositoryLanguage</th>\n",
" <th>subject</th>\n",
" <th>missionStatementURL</th>\n",
" <th>contentType</th>\n",
" <th>providerType</th>\n",
" <th>keyword</th>\n",
" <th>institution</th>\n",
" <th>policy</th>\n",
" <th>databaseAccess</th>\n",
" <th>databaseLicense</th>\n",
" <th>dataAccess</th>\n",
" <th>dataLicense</th>\n",
" <th>dataUploadType</th>\n",
" <th>dataUploadLicense</th>\n",
" <th>software</th>\n",
" <th>versioning</th>\n",
" <th>api</th>\n",
" <th>pidSystem</th>\n",
" <th>citationGuidelineURL</th>\n",
" <th>aidSystem</th>\n",
" <th>enhancedPublication</th>\n",
" <th>qualityManagement</th>\n",
" <th>certificate</th>\n",
" <th>metadataStandard</th>\n",
" <th>syndication</th>\n",
" <th>remarks</th>\n",
" <th>entryDate</th>\n",
" <th>lastUpdate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>r3d100000001</td>\n",
" <td>Odum Institute Archive Dataverse</td>\n",
" <td>eng</td>\n",
" <td>[]</td>\n",
" <td>https://dataverse.unc.edu/dataverse/odum</td>\n",
" <td>[]</td>\n",
" <td>[\"https://dataverse.unc.edu/dataverse/odum#\", ...</td>\n",
" <td>The Odum Institute Archive Dataverse contains ...</td>\n",
" <td>eng</td>\n",
" <td>[disciplinary]</td>\n",
" <td>{\"size\": \"13 dataverses; 3.050 datasets\", \"upd...</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>[\"eng\"]</td>\n",
" <td>[{'name': '1 Humanities and Social Sciences', ...</td>\n",
" <td>NaN</td>\n",
" <td>[{'name': 'Databases', 'scheme': 'parse'}, {'n...</td>\n",
" <td>[dataProvider]</td>\n",
" <td>[FAIR, Middle East, crime, demography, economy...</td>\n",
" <td>[{'institutionName': 'Odum Institute for Resea...</td>\n",
" <td>[{\"policyName\": \"Collection Development Policy...</td>\n",
" <td>{\"databaseAccessType\": \"open\", \"databaseAcces...</td>\n",
" <td>[{\"databaseLicenseName\": \"CC0\", \"databaseLicen...</td>\n",
" <td>[{\"dataAccessType\": \"embargoed\", \"dataAccessRe...</td>\n",
" <td>[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...</td>\n",
" <td>restricted</td>\n",
" <td>[]</td>\n",
" <td>[\"DataVerse\"]</td>\n",
" <td>NaN</td>\n",
" <td>{}</td>\n",
" <td>[\"DOI\"]</td>\n",
" <td>NaN</td>\n",
" <td>[]</td>\n",
" <td>unknown</td>\n",
" <td>yes</td>\n",
" <td>[\"other\"]</td>\n",
" <td>[{\"metadataStandardName\": \"DDI - Data Document...</td>\n",
" <td>{}</td>\n",
" <td>Odum Dataverse is covered by Thomson Reuters D...</td>\n",
" <td>2013-06-10</td>\n",
" <td>2021-07-06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r3d100000002</td>\n",
" <td>Access to Archival Databases</td>\n",
" <td>eng</td>\n",
" <td>[{'additionalName': 'AAD', 'additionalNameLang...</td>\n",
" <td>https://aad.archives.gov/aad/</td>\n",
" <td>[RRID:SCR_010479, RRID:nlx_157752]</td>\n",
" <td>[\"https://www.archives.gov/contact\"]</td>\n",
" <td>You will find in the Access to Archival Databa...</td>\n",
" <td>eng</td>\n",
" <td>[disciplinary]</td>\n",
" <td>{\"size\": \"\", \"updatedp\": \"\"}</td>\n",
" <td>1985</td>\n",
" <td>NaN</td>\n",
" <td>[\"eng\", \"spa\"]</td>\n",
" <td>[{'name': '1 Humanities and Social Sciences', ...</td>\n",
" <td>https://www.archives.gov/publications/general-...</td>\n",
" <td>[{'name': 'Images', 'scheme': 'parse'}, {'name...</td>\n",
" <td>[dataProvider]</td>\n",
" <td>[US History]</td>\n",
" <td>[{'institutionName': 'The U.S. National Archiv...</td>\n",
" <td>[{\"policyName\": \"Contribution Policy\", \"policy...</td>\n",
" <td>{\"databaseAccessType\": \"open\", \"databaseAcces...</td>\n",
" <td>[]</td>\n",
" <td>[{\"dataAccessType\": \"open\", \"dataAccessRestric...</td>\n",
" <td>[{\"dataLicenseName\": \"Copyrights\", \"dataLicens...</td>\n",
" <td>restricted</td>\n",
" <td>[]</td>\n",
" <td>[\"unknown\"]</td>\n",
" <td>no</td>\n",
" <td>{\"api\": \"https://www.archives.gov/developer#to...</td>\n",
" <td>[\"none\"]</td>\n",
" <td>https://aad.archives.gov/aad/help/getting-star...</td>\n",
" <td>[]</td>\n",
" <td>unknown</td>\n",
" <td>unknown</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>{\"syndication\": \"http://www.archives.gov/socia...</td>\n",
" <td>NaN</td>\n",
" <td>2012-07-04</td>\n",
" <td>2021-05-25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>r3d100000004</td>\n",
" <td>Datenbank Gesprochenes Deutsch</td>\n",
" <td>deu</td>\n",
" <td>[{'additionalName': 'DGD', 'additionalNameLang...</td>\n",
" <td>https://dgd.ids-mannheim.de/</td>\n",
" <td>[]</td>\n",
" <td>[\"dgd@ids-mannheim.de\"]</td>\n",
" <td>The \"Database for Spoken German (DGD)\" is a co...</td>\n",
" <td>eng</td>\n",
" <td>[disciplinary]</td>\n",
" <td>{\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"}</td>\n",
" <td>2012</td>\n",
" <td>NaN</td>\n",
" <td>[\"deu\"]</td>\n",
" <td>[{'name': '1 Humanities and Social Sciences', ...</td>\n",
" <td>https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext...</td>\n",
" <td>[{'name': 'Audiovisual data', 'scheme': 'parse...</td>\n",
" <td>[dataProvider, serviceProvider]</td>\n",
" <td>[Australian German, FOLK, German dialects, Pfe...</td>\n",
" <td>[{'institutionName': 'Institut für Deutsche Sp...</td>\n",
" <td>[{\"policyName\": \"Erfurter Aufruf zur Sicherung...</td>\n",
" <td>{\"databaseAccessType\": \"restricted\", \"databas...</td>\n",
" <td>[]</td>\n",
" <td>[{\"dataAccessType\": \"restricted\", \"dataAccessR...</td>\n",
" <td>[{\"dataLicenseName\": \"other\", \"dataLicenseURL\"...</td>\n",
" <td>restricted</td>\n",
" <td>[]</td>\n",
" <td>[\"other\"]</td>\n",
" <td>yes</td>\n",
" <td>{}</td>\n",
" <td>[\"none\"]</td>\n",
" <td>http://agd.ids-mannheim.de/konditionen.shtml</td>\n",
" <td>[]</td>\n",
" <td>unknown</td>\n",
" <td>unknown</td>\n",
" <td>[\"RatSWD\"]</td>\n",
" <td>[]</td>\n",
" <td>{}</td>\n",
" <td>NaN</td>\n",
" <td>2012-07-20</td>\n",
" <td>2020-08-27</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>r3d100000005</td>\n",
" <td>UNC Dataverse</td>\n",
" <td>eng</td>\n",
" <td>[{'additionalName': 'University of North Carol...</td>\n",
" <td>https://dataverse.unc.edu/</td>\n",
" <td>[]</td>\n",
" <td>[\"https://dataverse.unc.edu/\", \"odumarchive@un...</td>\n",
" <td>UNC Dataverse is an open-source repository sof...</td>\n",
" <td>eng</td>\n",
" <td>[institutional]</td>\n",
" <td>{\"size\": \"186 dataverses; 25.272 studies; 229....</td>\n",
" <td>2011</td>\n",
" <td>NaN</td>\n",
" <td>[\"eng\"]</td>\n",
" <td>[{'name': '1 Humanities and Social Sciences', ...</td>\n",
" <td>https://odum.unc.edu/about/mission-vision/</td>\n",
" <td>[{'name': 'Archived data', 'scheme': 'parse'},...</td>\n",
" <td>[dataProvider, serviceProvider]</td>\n",
" <td>[FAIR, census, demographic survey, demography,...</td>\n",
" <td>[{'institutionName': 'Odum Institute for Resea...</td>\n",
" <td>[{\"policyName\": \"Collection Development Policy...</td>\n",
" <td>{\"databaseAccessType\": \"open\", \"databaseAcces...</td>\n",
" <td>[]</td>\n",
" <td>[{\"dataAccessType\": \"open\", \"dataAccessRestric...</td>\n",
" <td>[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...</td>\n",
" <td>restricted</td>\n",
" <td>[{\"dataUploadLicenseName\": \"Data Deposit Form\"...</td>\n",
" <td>[\"DataVerse\"]</td>\n",
" <td>yes</td>\n",
" <td>{\"api\": \"https://guides.dataverse.org/en/lates...</td>\n",
" <td>[\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"]</td>\n",
" <td>https://dataverse.org/best-practices/data-cita...</td>\n",
" <td>[]</td>\n",
" <td>unknown</td>\n",
" <td>yes</td>\n",
" <td>[]</td>\n",
" <td>[{\"metadataStandardName\": \"DDI - Data Document...</td>\n",
" <td>{}</td>\n",
" <td>UNC Dataverse is covered by Clarivate Data Cit...</td>\n",
" <td>2012-07-23</td>\n",
" <td>2021-08-11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>r3d100000006</td>\n",
" <td>Archaeology Data Service</td>\n",
" <td>eng</td>\n",
" <td>[{'additionalName': 'ADS', 'additionalNameLang...</td>\n",
" <td>https://archaeologydataservice.ac.uk/</td>\n",
" <td>[FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg]</td>\n",
" <td>[\"help@archaeologydataservice.ac.uk\", \"https:/...</td>\n",
" <td>The ADS is an accredited digital repository fo...</td>\n",
" <td>eng</td>\n",
" <td>[disciplinary]</td>\n",
" <td>{\"size\": \"1837 results\", \"updatedp\": \"2020-05-...</td>\n",
" <td>1996-10-01</td>\n",
" <td>NaN</td>\n",
" <td>[\"eng\"]</td>\n",
" <td>[{'name': '1 Humanities and Social Sciences', ...</td>\n",
" <td>https://archaeologydataservice.ac.uk/about/our...</td>\n",
" <td>[{'name': 'Archived data', 'scheme': 'parse'},...</td>\n",
" <td>[dataProvider, serviceProvider]</td>\n",
" <td>[FAIR, archaeology, cultural heritage, prehist...</td>\n",
" <td>[{'institutionName': 'Arts and Humanities Rese...</td>\n",
" <td>[{\"policyName\": \"ADS Guides to good practice\",...</td>\n",
" <td>{\"databaseAccessType\": \"open\", \"databaseAcces...</td>\n",
" <td>[{\"databaseLicenseName\": \"CC\", \"databaseLicens...</td>\n",
" <td>[{\"dataAccessType\": \"open\", \"dataAccessRestric...</td>\n",
" <td>[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...</td>\n",
" <td>restricted</td>\n",
" <td>[{\"dataUploadLicenseName\": \"Guidelines for Dep...</td>\n",
" <td>[\"other\"]</td>\n",
" <td>yes</td>\n",
" <td>{\"api\": \"https://archaeologydataservice.ac.uk/...</td>\n",
" <td>[\"DOI\"]</td>\n",
" <td>https://archaeologydataservice.ac.uk/advice/te...</td>\n",
" <td>[]</td>\n",
" <td>unknown</td>\n",
" <td>yes</td>\n",
" <td>[\"other\"]</td>\n",
" <td>[{\"metadataStandardName\": \"DataCite Metadata S...</td>\n",
" <td>{\"syndication\": \"https://archaeologydataservic...</td>\n",
" <td>ADS is covered by Clarivate Data Citation Inde...</td>\n",
" <td>2012-07-23</td>\n",
" <td>2021-09-02</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orgIdentifier repositoryName repositoryName.language \\\n",
"0 r3d100000001 Odum Institute Archive Dataverse eng \n",
"1 r3d100000002 Access to Archival Databases eng \n",
"2 r3d100000004 Datenbank Gesprochenes Deutsch deu \n",
"3 r3d100000005 UNC Dataverse eng \n",
"4 r3d100000006 Archaeology Data Service eng \n",
"\n",
" additionalName \\\n",
"0 [] \n",
"1 [{'additionalName': 'AAD', 'additionalNameLang... \n",
"2 [{'additionalName': 'DGD', 'additionalNameLang... \n",
"3 [{'additionalName': 'University of North Carol... \n",
"4 [{'additionalName': 'ADS', 'additionalNameLang... \n",
"\n",
" repositoryURL \\\n",
"0 https://dataverse.unc.edu/dataverse/odum \n",
"1 https://aad.archives.gov/aad/ \n",
"2 https://dgd.ids-mannheim.de/ \n",
"3 https://dataverse.unc.edu/ \n",
"4 https://archaeologydataservice.ac.uk/ \n",
"\n",
" repositoryIdentifier \\\n",
"0 [] \n",
"1 [RRID:SCR_010479, RRID:nlx_157752] \n",
"2 [] \n",
"3 [] \n",
"4 [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] \n",
"\n",
" repositoryContact \\\n",
"0 [\"https://dataverse.unc.edu/dataverse/odum#\", ... \n",
"1 [\"https://www.archives.gov/contact\"] \n",
"2 [\"dgd@ids-mannheim.de\"] \n",
"3 [\"https://dataverse.unc.edu/\", \"odumarchive@un... \n",
"4 [\"help@archaeologydataservice.ac.uk\", \"https:/... \n",
"\n",
" description description.language \\\n",
"0 The Odum Institute Archive Dataverse contains ... eng \n",
"1 You will find in the Access to Archival Databa... eng \n",
"2 The \"Database for Spoken German (DGD)\" is a co... eng \n",
"3 UNC Dataverse is an open-source repository sof... eng \n",
"4 The ADS is an accredited digital repository fo... eng \n",
"\n",
" type size \\\n",
"0 [disciplinary] {\"size\": \"13 dataverses; 3.050 datasets\", \"upd... \n",
"1 [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n",
"2 [disciplinary] {\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"} \n",
"3 [institutional] {\"size\": \"186 dataverses; 25.272 studies; 229.... \n",
"4 [disciplinary] {\"size\": \"1837 results\", \"updatedp\": \"2020-05-... \n",
"\n",
" startDate endDate repositoryLanguage \\\n",
"0 NaN NaN [\"eng\"] \n",
"1 1985 NaN [\"eng\", \"spa\"] \n",
"2 2012 NaN [\"deu\"] \n",
"3 2011 NaN [\"eng\"] \n",
"4 1996-10-01 NaN [\"eng\"] \n",
"\n",
" subject \\\n",
"0 [{'name': '1 Humanities and Social Sciences', ... \n",
"1 [{'name': '1 Humanities and Social Sciences', ... \n",
"2 [{'name': '1 Humanities and Social Sciences', ... \n",
"3 [{'name': '1 Humanities and Social Sciences', ... \n",
"4 [{'name': '1 Humanities and Social Sciences', ... \n",
"\n",
" missionStatementURL \\\n",
"0 NaN \n",
"1 https://www.archives.gov/publications/general-... \n",
"2 https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... \n",
"3 https://odum.unc.edu/about/mission-vision/ \n",
"4 https://archaeologydataservice.ac.uk/about/our... \n",
"\n",
" contentType \\\n",
"0 [{'name': 'Databases', 'scheme': 'parse'}, {'n... \n",
"1 [{'name': 'Images', 'scheme': 'parse'}, {'name... \n",
"2 [{'name': 'Audiovisual data', 'scheme': 'parse... \n",
"3 [{'name': 'Archived data', 'scheme': 'parse'},... \n",
"4 [{'name': 'Archived data', 'scheme': 'parse'},... \n",
"\n",
" providerType \\\n",
"0 [dataProvider] \n",
"1 [dataProvider] \n",
"2 [dataProvider, serviceProvider] \n",
"3 [dataProvider, serviceProvider] \n",
"4 [dataProvider, serviceProvider] \n",
"\n",
" keyword \\\n",
"0 [FAIR, Middle East, crime, demography, economy... \n",
"1 [US History] \n",
"2 [Australian German, FOLK, German dialects, Pfe... \n",
"3 [FAIR, census, demographic survey, demography,... \n",
"4 [FAIR, archaeology, cultural heritage, prehist... \n",
"\n",
" institution \\\n",
"0 [{'institutionName': 'Odum Institute for Resea... \n",
"1 [{'institutionName': 'The U.S. National Archiv... \n",
"2 [{'institutionName': 'Institut für Deutsche Sp... \n",
"3 [{'institutionName': 'Odum Institute for Resea... \n",
"4 [{'institutionName': 'Arts and Humanities Rese... \n",
"\n",
" policy \\\n",
"0 [{\"policyName\": \"Collection Development Policy... \n",
"1 [{\"policyName\": \"Contribution Policy\", \"policy... \n",
"2 [{\"policyName\": \"Erfurter Aufruf zur Sicherung... \n",
"3 [{\"policyName\": \"Collection Development Policy... \n",
"4 [{\"policyName\": \"ADS Guides to good practice\",... \n",
"\n",
" databaseAccess \\\n",
"0 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
"1 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
"2 {\"databaseAccessType\": \"restricted\", \"databas... \n",
"3 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
"4 {\"databaseAccessType\": \"open\", \"databaseAcces... \n",
"\n",
" databaseLicense \\\n",
"0 [{\"databaseLicenseName\": \"CC0\", \"databaseLicen... \n",
"1 [] \n",
"2 [] \n",
"3 [] \n",
"4 [{\"databaseLicenseName\": \"CC\", \"databaseLicens... \n",
"\n",
" dataAccess \\\n",
"0 [{\"dataAccessType\": \"embargoed\", \"dataAccessRe... \n",
"1 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
"2 [{\"dataAccessType\": \"restricted\", \"dataAccessR... \n",
"3 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
"4 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
"\n",
" dataLicense dataUploadType \\\n",
"0 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
"1 [{\"dataLicenseName\": \"Copyrights\", \"dataLicens... restricted \n",
"2 [{\"dataLicenseName\": \"other\", \"dataLicenseURL\"... restricted \n",
"3 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
"4 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
"\n",
" dataUploadLicense software \\\n",
"0 [] [\"DataVerse\"] \n",
"1 [] [\"unknown\"] \n",
"2 [] [\"other\"] \n",
"3 [{\"dataUploadLicenseName\": \"Data Deposit Form\"... [\"DataVerse\"] \n",
"4 [{\"dataUploadLicenseName\": \"Guidelines for Dep... [\"other\"] \n",
"\n",
" versioning api \\\n",
"0 NaN {} \n",
"1 no {\"api\": \"https://www.archives.gov/developer#to... \n",
"2 yes {} \n",
"3 yes {\"api\": \"https://guides.dataverse.org/en/lates... \n",
"4 yes {\"api\": \"https://archaeologydataservice.ac.uk/... \n",
"\n",
" pidSystem \\\n",
"0 [\"DOI\"] \n",
"1 [\"none\"] \n",
"2 [\"none\"] \n",
"3 [\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"] \n",
"4 [\"DOI\"] \n",
"\n",
" citationGuidelineURL aidSystem \\\n",
"0 NaN [] \n",
"1 https://aad.archives.gov/aad/help/getting-star... [] \n",
"2 http://agd.ids-mannheim.de/konditionen.shtml [] \n",
"3 https://dataverse.org/best-practices/data-cita... [] \n",
"4 https://archaeologydataservice.ac.uk/advice/te... [] \n",
"\n",
" enhancedPublication qualityManagement certificate \\\n",
"0 unknown yes [\"other\"] \n",
"1 unknown unknown [] \n",
"2 unknown unknown [\"RatSWD\"] \n",
"3 unknown yes [] \n",
"4 unknown yes [\"other\"] \n",
"\n",
" metadataStandard \\\n",
"0 [{\"metadataStandardName\": \"DDI - Data Document... \n",
"1 [] \n",
"2 [] \n",
"3 [{\"metadataStandardName\": \"DDI - Data Document... \n",
"4 [{\"metadataStandardName\": \"DataCite Metadata S... \n",
"\n",
" syndication \\\n",
"0 {} \n",
"1 {\"syndication\": \"http://www.archives.gov/socia... \n",
"2 {} \n",
"3 {} \n",
"4 {\"syndication\": \"https://archaeologydataservic... \n",
"\n",
" remarks entryDate lastUpdate \n",
"0 Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06 \n",
"1 NaN 2012-07-04 2021-05-25 \n",
"2 NaN 2012-07-20 2020-08-27 \n",
"3 UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-08-11 \n",
"4 ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the tab-separated re3data export. The eight columns listed below are\n",
"# passed through ast.literal_eval so they load as Python objects; every other\n",
"# column is left to pandas' default parsing.\n",
"# NOTE(review): the path is relative, so this presumably expects the kernel's\n",
"# working directory to be the notebooks/ folder -- confirm.\n",
"re3data_df = pd.read_csv(\n",
"    '../data/raw/re3data.tsv',\n",
"    sep='\\t',\n",
"    converters=dict.fromkeys(\n",
"        [\n",
"            'subject',\n",
"            'keyword',\n",
"            'additionalName',\n",
"            'repositoryIdentifier',\n",
"            'type',\n",
"            'contentType',\n",
"            'providerType',\n",
"            'institution',\n",
"        ],\n",
"        ast.literal_eval,\n",
"    ),\n",
")\n",
"\n",
"# Preview the first rows.\n",
"re3data_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',\n",
" 'additionalName', 'repositoryURL', 'repositoryIdentifier',\n",
" 'repositoryContact', 'description', 'description.language', 'type',\n",
" 'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',\n",
" 'missionStatementURL', 'contentType', 'providerType', 'keyword',\n",
" 'institution', 'policy', 'databaseAccess', 'databaseLicense',\n",
" 'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',\n",
" 'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',\n",
" 'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',\n",
" 'metadataStandard', 'syndication', 'remarks', 'entryDate',\n",
" 'lastUpdate'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the column labels of the loaded frame.\n",
"re3data_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
"    \"\"\"Map an empty list to NaN and pass every other value through unchanged.\"\"\"\n",
"    # Only genuine list objects are inspected; strings, tuples, dicts, etc. are returned as-is.\n",
"    if isinstance(cell, list) and len(cell) == 0:\n",
"        return np.nan\n",
"    return cell\n",
" \n",
"# Apply empty_list_is_nan to every cell and rebind re3data_df to the result.\n",
"# Only real Python lists are affected; string cells such as '[]' are not lists\n",
"# and pass through unchanged.\n",
"# NOTE(review): DataFrame.applymap is deprecated in pandas >= 2.1 in favour of\n",
"# DataFrame.map -- revisit when the environment is upgraded.\n",
"re3data_df = re3data_df.applymap(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orgIdentifier</th>\n",
" <th>repositoryName</th>\n",
" <th>repositoryName.language</th>\n",
" <th>additionalName</th>\n",
" <th>repositoryURL</th>\n",
" <th>repositoryIdentifier</th>\n",
" <th>repositoryContact</th>\n",
" <th>description</th>\n",
" <th>description.language</th>\n",
" <th>type</th>\n",
" <th>size</th>\n",
" <th>startDate</th>\n",
" <th>endDate</th>\n",
" <th>repositoryLanguage</th>\n",
" <th>subject</th>\n",
" <th>missionStatementURL</th>\n",
" <th>contentType</th>\n",
" <th>providerType</th>\n",
" <th>keyword</th>\n",
" <th>institution</th>\n",
" <th>policy</th>\n",
" <th>databaseAccess</th>\n",
" <th>databaseLicense</th>\n",
" <th>dataAccess</th>\n",
" <th>dataLicense</th>\n",
" <th>dataUploadType</th>\n",
" <th>dataUploadLicense</th>\n",
" <th>software</th>\n",
" <th>versioning</th>\n",
" <th>api</th>\n",
" <th>pidSystem</th>\n",
" <th>citationGuidelineURL</th>\n",
" <th>aidSystem</th>\n",
" <th>enhancedPublication</th>\n",
" <th>qualityManagement</th>\n",
" <th>certificate</th>\n",
" <th>metadataStandard</th>\n",
" <th>syndication</th>\n",
" <th>remarks</th>\n",
" <th>entryDate</th>\n",
" <th>lastUpdate</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2170</td>\n",
" <td>2716</td>\n",
" <td>863</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2710</td>\n",
" <td>2739</td>\n",
" <td>1776</td>\n",
" <td>157</td>\n",
" <td>2739</td>\n",
" <td>2720</td>\n",
" <td>2318</td>\n",
" <td>2732</td>\n",
" <td>2735</td>\n",
" <td>2732</td>\n",
" <td>2738</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2711</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>1316</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>1512</td>\n",
" <td>2739</td>\n",
" <td>2737</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" <td>1674</td>\n",
" <td>2739</td>\n",
" <td>2739</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>2739</td>\n",
" <td>2736</td>\n",
" <td>19</td>\n",
" <td>2161</td>\n",
" <td>2713</td>\n",
" <td>863</td>\n",
" <td>2459</td>\n",
" <td>2737</td>\n",
" <td>6</td>\n",
" <td>8</td>\n",
" <td>1289</td>\n",
" <td>352</td>\n",
" <td>80</td>\n",
" <td>107</td>\n",
" <td>1388</td>\n",
" <td>2249</td>\n",
" <td>1337</td>\n",
" <td>4</td>\n",
" <td>2503</td>\n",
" <td>2719</td>\n",
" <td>2319</td>\n",
" <td>12</td>\n",
" <td>375</td>\n",
" <td>145</td>\n",
" <td>2263</td>\n",
" <td>3</td>\n",
" <td>681</td>\n",
" <td>23</td>\n",
" <td>2</td>\n",
" <td>1146</td>\n",
" <td>29</td>\n",
" <td>1321</td>\n",
" <td>12</td>\n",
" <td>3</td>\n",
" <td>3</td>\n",
" <td>14</td>\n",
" <td>172</td>\n",
" <td>563</td>\n",
" <td>1656</td>\n",
" <td>1275</td>\n",
" <td>740</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>r3d100000001</td>\n",
" <td>Språkbanken</td>\n",
" <td>eng</td>\n",
" <td>[{'additionalName': 'MPC', 'additionalNameLang...</td>\n",
" <td>http://icgem.gfz-potsdam.de/home</td>\n",
" <td>[RRID:SCR_010479, RRID:nlx_157752]</td>\n",
" <td>[]</td>\n",
" <td>The National Archives and Records Administrati...</td>\n",
" <td>eng</td>\n",
" <td>[disciplinary]</td>\n",
" <td>{\"size\": \"\", \"updatedp\": \"\"}</td>\n",
" <td>2008</td>\n",
" <td>2015</td>\n",
" <td>[\"eng\"]</td>\n",
" <td>[{'name': '1 Humanities and Social Sciences', ...</td>\n",
" <td>https://learn.scholarsportal.info/all-guides/d...</td>\n",
" <td>[{'name': 'Standard office documents', 'scheme...</td>\n",
" <td>[dataProvider]</td>\n",
" <td>[multidisciplinary]</td>\n",
" <td>[{'institutionName': 'National Center for Biot...</td>\n",
" <td>[][]</td>\n",
" <td>{\"databaseAccessType\": \"open\", \"databaseAcces...</td>\n",
" <td>[]</td>\n",
" <td>[{\"dataAccessType\": \"open\", \"dataAccessRestric...</td>\n",
" <td>[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...</td>\n",
" <td>restricted</td>\n",
" <td>[]</td>\n",
" <td>[\"unknown\"]</td>\n",
" <td>yes</td>\n",
" <td>{}</td>\n",
" <td>[\"none\"]</td>\n",
" <td>https://dataverse.org/best-practices/data-cita...</td>\n",
" <td>[]</td>\n",
" <td>unknown</td>\n",
" <td>yes</td>\n",
" <td>[]</td>\n",
" <td>[]</td>\n",
" <td>{}</td>\n",
" <td>is covered by Elsevier.</td>\n",
" <td>2016-05-10</td>\n",
" <td>2021-09-03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>2</td>\n",
" <td>2554</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>202</td>\n",
" <td>2</td>\n",
" <td>2723</td>\n",
" <td>1733</td>\n",
" <td>1450</td>\n",
" <td>92</td>\n",
" <td>11</td>\n",
" <td>2063</td>\n",
" <td>226</td>\n",
" <td>14</td>\n",
" <td>30</td>\n",
" <td>1771</td>\n",
" <td>193</td>\n",
" <td>6</td>\n",
" <td>312</td>\n",
" <td>2571</td>\n",
" <td>2159</td>\n",
" <td>1269</td>\n",
" <td>64</td>\n",
" <td>1793</td>\n",
" <td>2013</td>\n",
" <td>1226</td>\n",
" <td>1108</td>\n",
" <td>1498</td>\n",
" <td>1361</td>\n",
" <td>72</td>\n",
" <td>2155</td>\n",
" <td>1608</td>\n",
" <td>1515</td>\n",
" <td>2509</td>\n",
" <td>1669</td>\n",
" <td>2162</td>\n",
" <td>14</td>\n",
" <td>20</td>\n",
" <td>137</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" orgIdentifier repositoryName repositoryName.language \\\n",
"count 2739 2739 2739 \n",
"unique 2739 2736 19 \n",
"top r3d100000001 Språkbanken eng \n",
"freq 1 2 2554 \n",
"\n",
" additionalName \\\n",
"count 2170 \n",
"unique 2161 \n",
"top [{'additionalName': 'MPC', 'additionalNameLang... \n",
"freq 2 \n",
"\n",
" repositoryURL repositoryIdentifier \\\n",
"count 2716 863 \n",
"unique 2713 863 \n",
"top http://icgem.gfz-potsdam.de/home [RRID:SCR_010479, RRID:nlx_157752] \n",
"freq 2 1 \n",
"\n",
" repositoryContact description \\\n",
"count 2739 2739 \n",
"unique 2459 2737 \n",
"top [] The National Archives and Records Administrati... \n",
"freq 202 2 \n",
"\n",
" description.language type size \\\n",
"count 2739 2710 2739 \n",
"unique 6 8 1289 \n",
"top eng [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n",
"freq 2723 1733 1450 \n",
"\n",
" startDate endDate repositoryLanguage \\\n",
"count 1776 157 2739 \n",
"unique 352 80 107 \n",
"top 2008 2015 [\"eng\"] \n",
"freq 92 11 2063 \n",
"\n",
" subject \\\n",
"count 2720 \n",
"unique 1388 \n",
"top [{'name': '1 Humanities and Social Sciences', ... \n",
"freq 226 \n",
"\n",
" missionStatementURL \\\n",
"count 2318 \n",
"unique 2249 \n",
"top https://learn.scholarsportal.info/all-guides/d... \n",
"freq 14 \n",
"\n",
" contentType providerType \\\n",
"count 2732 2735 \n",
"unique 1337 4 \n",
"top [{'name': 'Standard office documents', 'scheme... [dataProvider] \n",
"freq 30 1771 \n",
"\n",
" keyword \\\n",
"count 2732 \n",
"unique 2503 \n",
"top [multidisciplinary] \n",
"freq 193 \n",
"\n",
" institution policy \\\n",
"count 2738 2739 \n",
"unique 2719 2319 \n",
"top [{'institutionName': 'National Center for Biot... [][] \n",
"freq 6 312 \n",
"\n",
" databaseAccess databaseLicense \\\n",
"count 2739 2739 \n",
"unique 12 375 \n",
"top {\"databaseAccessType\": \"open\", \"databaseAcces... [] \n",
"freq 2571 2159 \n",
"\n",
" dataAccess \\\n",
"count 2739 \n",
"unique 145 \n",
"top [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n",
"freq 1269 \n",
"\n",
" dataLicense dataUploadType \\\n",
"count 2739 2711 \n",
"unique 2263 3 \n",
"top [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n",
"freq 64 1793 \n",
"\n",
" dataUploadLicense software versioning api pidSystem \\\n",
"count 2739 2739 1316 2739 2739 \n",
"unique 681 23 2 1146 29 \n",
"top [] [\"unknown\"] yes {} [\"none\"] \n",
"freq 2013 1226 1108 1498 1361 \n",
"\n",
" citationGuidelineURL aidSystem \\\n",
"count 1512 2739 \n",
"unique 1321 12 \n",
"top https://dataverse.org/best-practices/data-cita... [] \n",
"freq 72 2155 \n",
"\n",
" enhancedPublication qualityManagement certificate metadataStandard \\\n",
"count 2737 2739 2739 2739 \n",
"unique 3 3 14 172 \n",
"top unknown yes [] [] \n",
"freq 1608 1515 2509 1669 \n",
"\n",
" syndication remarks entryDate lastUpdate \n",
"count 2739 1674 2739 2739 \n",
"unique 563 1656 1275 740 \n",
"top {} is covered by Elsevier. 2016-05-10 2021-09-03 \n",
"freq 2162 14 20 137 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every column, including the non-numeric ones.\n",
"re3data_df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"orgIdentifier 0\n",
"repositoryName 0\n",
"repositoryName.language 0\n",
"additionalName 569\n",
"repositoryURL 23\n",
"repositoryIdentifier 1876\n",
"repositoryContact 0\n",
"description 0\n",
"description.language 0\n",
"type 29\n",
"size 0\n",
"startDate 963\n",
"endDate 2582\n",
"repositoryLanguage 0\n",
"subject 19\n",
"missionStatementURL 421\n",
"contentType 7\n",
"providerType 4\n",
"keyword 7\n",
"institution 1\n",
"policy 0\n",
"databaseAccess 0\n",
"databaseLicense 0\n",
"dataAccess 0\n",
"dataLicense 0\n",
"dataUploadType 28\n",
"dataUploadLicense 0\n",
"software 0\n",
"versioning 1423\n",
"api 0\n",
"pidSystem 0\n",
"citationGuidelineURL 1227\n",
"aidSystem 0\n",
"enhancedPublication 2\n",
"qualityManagement 0\n",
"certificate 0\n",
"metadataStandard 0\n",
"syndication 0\n",
"remarks 1065\n",
"entryDate 0\n",
"lastUpdate 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count missing (NaN) values per column.\n",
"# NOTE(review): columns not parsed by the read_csv converters appear to still hold\n",
"# strings such as '[]' or '{}', which isna() does not count -- confirm whether\n",
"# those should be treated as missing too.\n",
"re3data_df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"contentType\n",
"Archived data 658\n",
"Audiovisual data 542\n",
"Configuration data 79\n",
"Databases 586\n",
"Images 1378\n",
"Networkbased data 153\n",
"Plain text 1158\n",
"Raw data 1197\n",
"Scientific and statistical data formats 1685\n",
"Software applications 456\n",
"Source code 209\n",
"Standard office documents 1684\n",
"Structured graphics 917\n",
"Structured text 848\n",
"other 962\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# One row per list element of contentType; keep only each entry's 'name'.\n",
"# Missing values are detected with pd.isna rather than an identity check against\n",
"# np.nan, because a NaN is not guaranteed to be the np.nan object itself (any other\n",
"# NaN float would otherwise be subscripted and raise a TypeError).\n",
"types = re3data_df.contentType.explode().apply(\n",
"    lambda entry: np.nan if pd.isna(entry) else entry['name']\n",
")\n",
"\n",
"# Number of occurrences of each content type name (groupby leaves out NaN keys).\n",
"pd.DataFrame(types).groupby('contentType').size()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"providerType\n",
"dataProvider 2491\n",
"serviceProvider 963\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of occurrences of each provider type, counting one row per list element.\n",
"(\n",
"    re3data_df['providerType']\n",
"    .explode()\n",
"    .to_frame()\n",
"    .groupby('providerType')\n",
"    .size()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}