{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ast\n", "import csv\n", "import json\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.graph_objs as go\n", "import plotly.express as px\n", "\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading dataset" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orgIdentifierrepositoryNamerepositoryName.languageadditionalNamerepositoryURLrepositoryIdentifierrepositoryContactdescriptiondescription.languagetypesizestartDateendDaterepositoryLanguagesubjectmissionStatementURLcontentTypeproviderTypekeywordinstitutionpolicydatabaseAccessdatabaseLicensedataAccessdataLicensedataUploadTypedataUploadLicensesoftwareversioningapipidSystemcitationGuidelineURLaidSystemenhancedPublicationqualityManagementcertificatemetadataStandardsyndicationremarksentryDatelastUpdate
0r3d100000001Odum Institute Archive Dataverseeng[]https://dataverse.unc.edu/dataverse/odum[][\"https://dataverse.unc.edu/dataverse/odum#\", ...The Odum Institute Archive Dataverse contains ...eng[disciplinary]{\"size\": \"13 dataverses; 3.050 datasets\", \"upd...NaNNaN[\"eng\"][{'name': '1 Humanities and Social Sciences', ...NaN[{'name': 'Databases', 'scheme': 'parse'}, {'n...[dataProvider][FAIR, Middle East, crime, demography, economy...[{'institutionName': 'Odum Institute for Resea...[{\"policyName\": \"Collection Development Policy...{\"databaseAccessType\": \"open\", \"databaseAcces...[{\"databaseLicenseName\": \"CC0\", \"databaseLicen...[{\"dataAccessType\": \"embargoed\", \"dataAccessRe...[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...restricted[][\"DataVerse\"]NaN{}[\"DOI\"]NaN[]unknownyes[\"other\"][{\"metadataStandardName\": \"DDI - Data Document...{}Odum Dataverse is covered by Thomson Reuters D...2013-06-102021-07-06
1r3d100000002Access to Archival Databaseseng[{'additionalName': 'AAD', 'additionalNameLang...https://aad.archives.gov/aad/[RRID:SCR_010479, RRID:nlx_157752][\"https://www.archives.gov/contact\"]You will find in the Access to Archival Databa...eng[disciplinary]{\"size\": \"\", \"updatedp\": \"\"}1985NaN[\"eng\", \"spa\"][{'name': '1 Humanities and Social Sciences', ...https://www.archives.gov/publications/general-...[{'name': 'Images', 'scheme': 'parse'}, {'name...[dataProvider][US History][{'institutionName': 'The U.S. National Archiv...[{\"policyName\": \"Contribution Policy\", \"policy...{\"databaseAccessType\": \"open\", \"databaseAcces...[][{\"dataAccessType\": \"open\", \"dataAccessRestric...[{\"dataLicenseName\": \"Copyrights\", \"dataLicens...restricted[][\"unknown\"]no{\"api\": \"https://www.archives.gov/developer#to...[\"none\"]https://aad.archives.gov/aad/help/getting-star...[]unknownunknown[][]{\"syndication\": \"http://www.archives.gov/socia...NaN2012-07-042021-05-25
2r3d100000004Datenbank Gesprochenes Deutschdeu[{'additionalName': 'DGD', 'additionalNameLang...https://dgd.ids-mannheim.de/[][\"dgd@ids-mannheim.de\"]The \"Database for Spoken German (DGD)\" is a co...eng[disciplinary]{\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"}2012NaN[\"deu\"][{'name': '1 Humanities and Social Sciences', ...https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext...[{'name': 'Audiovisual data', 'scheme': 'parse...[dataProvider, serviceProvider][Australian German, FOLK, German dialects, Pfe...[{'institutionName': 'Institut für Deutsche Sp...[{\"policyName\": \"Erfurter Aufruf zur Sicherung...{\"databaseAccessType\": \"restricted\", \"databas...[][{\"dataAccessType\": \"restricted\", \"dataAccessR...[{\"dataLicenseName\": \"other\", \"dataLicenseURL\"...restricted[][\"other\"]yes{}[\"none\"]http://agd.ids-mannheim.de/konditionen.shtml[]unknownunknown[\"RatSWD\"][]{}NaN2012-07-202020-08-27
3r3d100000005UNC Dataverseeng[{'additionalName': 'University of North Carol...https://dataverse.unc.edu/[FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c][\"https://dataverse.unc.edu/\", \"odumarchive@un...UNC Dataverse is an open-source repository sof...eng[institutional]{\"size\": \"186 dataverses; 25.272 studies; 229....2011NaN[\"eng\"][{'name': '1 Humanities and Social Sciences', ...https://odum.unc.edu/about/mission-vision/[{'name': 'Archived data', 'scheme': 'parse'},...[dataProvider, serviceProvider][FAIR, census, demographic survey, demography,...[{'institutionName': 'Odum Institute for Resea...[{\"policyName\": \"Collection Development Policy...{\"databaseAccessType\": \"open\", \"databaseAcces...[][{\"dataAccessType\": \"open\", \"dataAccessRestric...[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...restricted[{\"dataUploadLicenseName\": \"Data Deposit Form\"...[\"DataVerse\"]yes{\"api\": \"https://guides.dataverse.org/en/lates...[\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"]https://dataverse.org/best-practices/data-cita...[]unknownyes[][{\"metadataStandardName\": \"DDI - Data Document...{}UNC Dataverse is covered by Clarivate Data Cit...2012-07-232021-10-25
4r3d100000006Archaeology Data Serviceeng[{'additionalName': 'ADS', 'additionalNameLang...https://archaeologydataservice.ac.uk/[FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg][\"help@archaeologydataservice.ac.uk\", \"https:/...The ADS is an accredited digital repository fo...eng[disciplinary]{\"size\": \"1837 results\", \"updatedp\": \"2020-05-...1996-10-01NaN[\"eng\"][{'name': '1 Humanities and Social Sciences', ...https://archaeologydataservice.ac.uk/about/our...[{'name': 'Archived data', 'scheme': 'parse'},...[dataProvider, serviceProvider][FAIR, archaeology, cultural heritage, prehist...[{'institutionName': 'Arts and Humanities Rese...[{\"policyName\": \"ADS Guides to good practice\",...{\"databaseAccessType\": \"open\", \"databaseAcces...[{\"databaseLicenseName\": \"CC\", \"databaseLicens...[{\"dataAccessType\": \"open\", \"dataAccessRestric...[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...restricted[{\"dataUploadLicenseName\": \"Guidelines for Dep...[\"other\"]yes{\"api\": \"https://archaeologydataservice.ac.uk/...[\"DOI\"]https://archaeologydataservice.ac.uk/advice/te...[]unknownyes[\"other\"][{\"metadataStandardName\": \"DataCite Metadata S...{\"syndication\": \"https://archaeologydataservic...ADS is covered by Clarivate Data Citation Inde...2012-07-232021-09-02
\n", "
" ], "text/plain": [ " orgIdentifier repositoryName repositoryName.language \\\n", "0 r3d100000001 Odum Institute Archive Dataverse eng \n", "1 r3d100000002 Access to Archival Databases eng \n", "2 r3d100000004 Datenbank Gesprochenes Deutsch deu \n", "3 r3d100000005 UNC Dataverse eng \n", "4 r3d100000006 Archaeology Data Service eng \n", "\n", " additionalName \\\n", "0 [] \n", "1 [{'additionalName': 'AAD', 'additionalNameLang... \n", "2 [{'additionalName': 'DGD', 'additionalNameLang... \n", "3 [{'additionalName': 'University of North Carol... \n", "4 [{'additionalName': 'ADS', 'additionalNameLang... \n", "\n", " repositoryURL \\\n", "0 https://dataverse.unc.edu/dataverse/odum \n", "1 https://aad.archives.gov/aad/ \n", "2 https://dgd.ids-mannheim.de/ \n", "3 https://dataverse.unc.edu/ \n", "4 https://archaeologydataservice.ac.uk/ \n", "\n", " repositoryIdentifier \\\n", "0 [] \n", "1 [RRID:SCR_010479, RRID:nlx_157752] \n", "2 [] \n", "3 [FAIRsharing_doi:10.25504/FAIRsharing.pS2p8c] \n", "4 [FAIRsharing_doi:10.25504/FAIRsharing.hm1mfg] \n", "\n", " repositoryContact \\\n", "0 [\"https://dataverse.unc.edu/dataverse/odum#\", ... \n", "1 [\"https://www.archives.gov/contact\"] \n", "2 [\"dgd@ids-mannheim.de\"] \n", "3 [\"https://dataverse.unc.edu/\", \"odumarchive@un... \n", "4 [\"help@archaeologydataservice.ac.uk\", \"https:/... \n", "\n", " description description.language \\\n", "0 The Odum Institute Archive Dataverse contains ... eng \n", "1 You will find in the Access to Archival Databa... eng \n", "2 The \"Database for Spoken German (DGD)\" is a co... eng \n", "3 UNC Dataverse is an open-source repository sof... eng \n", "4 The ADS is an accredited digital repository fo... eng \n", "\n", " type size \\\n", "0 [disciplinary] {\"size\": \"13 dataverses; 3.050 datasets\", \"upd... \n", "1 [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n", "2 [disciplinary] {\"size\": \"34 corpora\", \"updatedp\": \"2020-02-03\"} \n", "3 [institutional] {\"size\": \"186 dataverses; 25.272 studies; 229.... \n", "4 [disciplinary] {\"size\": \"1837 results\", \"updatedp\": \"2020-05-... \n", "\n", " startDate endDate repositoryLanguage \\\n", "0 NaN NaN [\"eng\"] \n", "1 1985 NaN [\"eng\", \"spa\"] \n", "2 2012 NaN [\"deu\"] \n", "3 2011 NaN [\"eng\"] \n", "4 1996-10-01 NaN [\"eng\"] \n", "\n", " subject \\\n", "0 [{'name': '1 Humanities and Social Sciences', ... \n", "1 [{'name': '1 Humanities and Social Sciences', ... \n", "2 [{'name': '1 Humanities and Social Sciences', ... \n", "3 [{'name': '1 Humanities and Social Sciences', ... \n", "4 [{'name': '1 Humanities and Social Sciences', ... \n", "\n", " missionStatementURL \\\n", "0 NaN \n", "1 https://www.archives.gov/publications/general-... \n", "2 https://dgd.ids-mannheim.de/dgd/pragdb.dgd_ext... \n", "3 https://odum.unc.edu/about/mission-vision/ \n", "4 https://archaeologydataservice.ac.uk/about/our... \n", "\n", " contentType \\\n", "0 [{'name': 'Databases', 'scheme': 'parse'}, {'n... \n", "1 [{'name': 'Images', 'scheme': 'parse'}, {'name... \n", "2 [{'name': 'Audiovisual data', 'scheme': 'parse... \n", "3 [{'name': 'Archived data', 'scheme': 'parse'},... \n", "4 [{'name': 'Archived data', 'scheme': 'parse'},... \n", "\n", " providerType \\\n", "0 [dataProvider] \n", "1 [dataProvider] \n", "2 [dataProvider, serviceProvider] \n", "3 [dataProvider, serviceProvider] \n", "4 [dataProvider, serviceProvider] \n", "\n", " keyword \\\n", "0 [FAIR, Middle East, crime, demography, economy... \n", "1 [US History] \n", "2 [Australian German, FOLK, German dialects, Pfe... \n", "3 [FAIR, census, demographic survey, demography,... \n", "4 [FAIR, archaeology, cultural heritage, prehist... \n", "\n", " institution \\\n", "0 [{'institutionName': 'Odum Institute for Resea... \n", "1 [{'institutionName': 'The U.S. National Archiv... \n", "2 [{'institutionName': 'Institut für Deutsche Sp... \n", "3 [{'institutionName': 'Odum Institute for Resea... \n", "4 [{'institutionName': 'Arts and Humanities Rese... \n", "\n", " policy \\\n", "0 [{\"policyName\": \"Collection Development Policy... \n", "1 [{\"policyName\": \"Contribution Policy\", \"policy... \n", "2 [{\"policyName\": \"Erfurter Aufruf zur Sicherung... \n", "3 [{\"policyName\": \"Collection Development Policy... \n", "4 [{\"policyName\": \"ADS Guides to good practice\",... \n", "\n", " databaseAccess \\\n", "0 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", "1 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", "2 {\"databaseAccessType\": \"restricted\", \"databas... \n", "3 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", "4 {\"databaseAccessType\": \"open\", \"databaseAcces... \n", "\n", " databaseLicense \\\n", "0 [{\"databaseLicenseName\": \"CC0\", \"databaseLicen... \n", "1 [] \n", "2 [] \n", "3 [] \n", "4 [{\"databaseLicenseName\": \"CC\", \"databaseLicens... \n", "\n", " dataAccess \\\n", "0 [{\"dataAccessType\": \"embargoed\", \"dataAccessRe... \n", "1 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", "2 [{\"dataAccessType\": \"restricted\", \"dataAccessR... \n", "3 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", "4 [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", "\n", " dataLicense dataUploadType \\\n", "0 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", "1 [{\"dataLicenseName\": \"Copyrights\", \"dataLicens... restricted \n", "2 [{\"dataLicenseName\": \"other\", \"dataLicenseURL\"... restricted \n", "3 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", "4 [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", "\n", " dataUploadLicense software \\\n", "0 [] [\"DataVerse\"] \n", "1 [] [\"unknown\"] \n", "2 [] [\"other\"] \n", "3 [{\"dataUploadLicenseName\": \"Data Deposit Form\"... [\"DataVerse\"] \n", "4 [{\"dataUploadLicenseName\": \"Guidelines for Dep... [\"other\"] \n", "\n", " versioning api \\\n", "0 NaN {} \n", "1 no {\"api\": \"https://www.archives.gov/developer#to... \n", "2 yes {} \n", "3 yes {\"api\": \"https://guides.dataverse.org/en/lates... \n", "4 yes {\"api\": \"https://archaeologydataservice.ac.uk/... \n", "\n", " pidSystem \\\n", "0 [\"DOI\"] \n", "1 [\"none\"] \n", "2 [\"none\"] \n", "3 [\"ARK\", \"DOI\", \"PURL\", \"URN\", \"hdl\"] \n", "4 [\"DOI\"] \n", "\n", " citationGuidelineURL aidSystem \\\n", "0 NaN [] \n", "1 https://aad.archives.gov/aad/help/getting-star... [] \n", "2 http://agd.ids-mannheim.de/konditionen.shtml [] \n", "3 https://dataverse.org/best-practices/data-cita... [] \n", "4 https://archaeologydataservice.ac.uk/advice/te... [] \n", "\n", " enhancedPublication qualityManagement certificate \\\n", "0 unknown yes [\"other\"] \n", "1 unknown unknown [] \n", "2 unknown unknown [\"RatSWD\"] \n", "3 unknown yes [] \n", "4 unknown yes [\"other\"] \n", "\n", " metadataStandard \\\n", "0 [{\"metadataStandardName\": \"DDI - Data Document... \n", "1 [] \n", "2 [] \n", "3 [{\"metadataStandardName\": \"DDI - Data Document... \n", "4 [{\"metadataStandardName\": \"DataCite Metadata S... \n", "\n", " syndication \\\n", "0 {} \n", "1 {\"syndication\": \"http://www.archives.gov/socia... \n", "2 {} \n", "3 {} \n", "4 {\"syndication\": \"https://archaeologydataservic... \n", "\n", " remarks entryDate lastUpdate \n", "0 Odum Dataverse is covered by Thomson Reuters D... 2013-06-10 2021-07-06 \n", "1 NaN 2012-07-04 2021-05-25 \n", "2 NaN 2012-07-20 2020-08-27 \n", "3 UNC Dataverse is covered by Clarivate Data Cit... 2012-07-23 2021-10-25 \n", "4 ADS is covered by Clarivate Data Citation Inde... 2012-07-23 2021-09-02 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t',\n", " converters={'subject': ast.literal_eval,\n", " 'keyword': ast.literal_eval,\n", " 'additionalName': ast.literal_eval,\n", " 'repositoryIdentifier': ast.literal_eval,\n", " 'type': ast.literal_eval,\n", " 'contentType': ast.literal_eval,\n", " 'providerType': ast.literal_eval,\n", " 'institution': ast.literal_eval\n", " })\n", "\n", "re3data_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['orgIdentifier', 'repositoryName', 'repositoryName.language',\n", " 'additionalName', 'repositoryURL', 'repositoryIdentifier',\n", " 'repositoryContact', 'description', 'description.language', 'type',\n", " 'size', 'startDate', 'endDate', 'repositoryLanguage', 'subject',\n", " 'missionStatementURL', 'contentType', 'providerType', 'keyword',\n", " 'institution', 'policy', 'databaseAccess', 'databaseLicense',\n", " 'dataAccess', 'dataLicense', 'dataUploadType', 'dataUploadLicense',\n", " 'software', 'versioning', 'api', 'pidSystem', 'citationGuidelineURL',\n", " 'aidSystem', 'enhancedPublication', 'qualityManagement', 'certificate',\n", " 'metadataStandard', 'syndication', 'remarks', 'entryDate',\n", " 'lastUpdate'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re3data_df.columns" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def empty_list_is_nan(cell):\n", " if isinstance(cell, list):\n", " return np.nan if len(cell) == 0 else cell\n", " else:\n", " return cell\n", " \n", "re3data_df = re3data_df.applymap(empty_list_is_nan)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
orgIdentifierrepositoryNamerepositoryName.languageadditionalNamerepositoryURLrepositoryIdentifierrepositoryContactdescriptiondescription.languagetypesizestartDateendDaterepositoryLanguagesubjectmissionStatementURLcontentTypeproviderTypekeywordinstitutionpolicydatabaseAccessdatabaseLicensedataAccessdataLicensedataUploadTypedataUploadLicensesoftwareversioningapipidSystemcitationGuidelineURLaidSystemenhancedPublicationqualityManagementcertificatemetadataStandardsyndicationremarksentryDatelastUpdate
count2793279327932206276910242793279327932777279318001722793278923732787278827852792279327932793279327932778279327931339279327931532279327932793279327932793169427932793
unique2793279119219627661023253227926813213628611014172304135052543277223661237714622943695232117029133713331617554416731316722
topr3d100000001EarthChem Libraryeng[{'additionalName': 'FRED', 'additionalNameLan...http://icgem.gfz-potsdam.de/home[biodbcore-001574][]The National Archives and Records Administrati...eng[disciplinary]{\"size\": \"\", \"updatedp\": \"\"}20082015[\"eng\"][{'name': '1 Humanities and Social Sciences', ...https://learn.scholarsportal.info/all-guides/d...[{'name': 'Standard office documents', 'scheme...[dataProvider][multidisciplinary][{'institutionName': 'National Center for Biot...[][]{\"databaseAccessType\": \"open\", \"databaseAcces...[][{\"dataAccessType\": \"open\", \"dataAccessRestric...[{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"...restricted[][\"unknown\"]yes{}[\"none\"]https://dataverse.org/best-practices/data-cita...[]unknownyes[][]{}is covered by Elsevier.2018-08-102021-09-03
freq12259622217022776176814729312208824014291806205731926242201129271185120541216113115261359762199164315692557169322351720104
\n", "
" ], "text/plain": [ " orgIdentifier repositoryName repositoryName.language \\\n", "count 2793 2793 2793 \n", "unique 2793 2791 19 \n", "top r3d100000001 EarthChem Library eng \n", "freq 1 2 2596 \n", "\n", " additionalName \\\n", "count 2206 \n", "unique 2196 \n", "top [{'additionalName': 'FRED', 'additionalNameLan... \n", "freq 2 \n", "\n", " repositoryURL repositoryIdentifier \\\n", "count 2769 1024 \n", "unique 2766 1023 \n", "top http://icgem.gfz-potsdam.de/home [biodbcore-001574] \n", "freq 2 2 \n", "\n", " repositoryContact description \\\n", "count 2793 2793 \n", "unique 2532 2792 \n", "top [] The National Archives and Records Administrati... \n", "freq 170 2 \n", "\n", " description.language type size \\\n", "count 2793 2777 2793 \n", "unique 6 8 1321 \n", "top eng [disciplinary] {\"size\": \"\", \"updatedp\": \"\"} \n", "freq 2776 1768 1472 \n", "\n", " startDate endDate repositoryLanguage \\\n", "count 1800 172 2793 \n", "unique 362 86 110 \n", "top 2008 2015 [\"eng\"] \n", "freq 93 12 2088 \n", "\n", " subject \\\n", "count 2789 \n", "unique 1417 \n", "top [{'name': '1 Humanities and Social Sciences', ... \n", "freq 240 \n", "\n", " missionStatementURL \\\n", "count 2373 \n", "unique 2304 \n", "top https://learn.scholarsportal.info/all-guides/d... \n", "freq 14 \n", "\n", " contentType providerType \\\n", "count 2787 2788 \n", "unique 1350 5 \n", "top [{'name': 'Standard office documents', 'scheme... [dataProvider] \n", "freq 29 1806 \n", "\n", " keyword \\\n", "count 2785 \n", "unique 2543 \n", "top [multidisciplinary] \n", "freq 205 \n", "\n", " institution policy \\\n", "count 2792 2793 \n", "unique 2772 2366 \n", "top [{'institutionName': 'National Center for Biot... [][] \n", "freq 7 319 \n", "\n", " databaseAccess databaseLicense \\\n", "count 2793 2793 \n", "unique 12 377 \n", "top {\"databaseAccessType\": \"open\", \"databaseAcces... [] \n", "freq 2624 2201 \n", "\n", " dataAccess \\\n", "count 2793 \n", "unique 146 \n", "top [{\"dataAccessType\": \"open\", \"dataAccessRestric... \n", "freq 1292 \n", "\n", " dataLicense dataUploadType \\\n", "count 2793 2778 \n", "unique 2294 3 \n", "top [{\"dataLicenseName\": \"CC\", \"dataLicenseURL\": \"... restricted \n", "freq 71 1851 \n", "\n", " dataUploadLicense software versioning api pidSystem \\\n", "count 2793 2793 1339 2793 2793 \n", "unique 695 23 2 1170 29 \n", "top [] [\"unknown\"] yes {} [\"none\"] \n", "freq 2054 1216 1131 1526 1359 \n", "\n", " citationGuidelineURL aidSystem \\\n", "count 1532 2793 \n", "unique 1337 13 \n", "top https://dataverse.org/best-practices/data-cita... [] \n", "freq 76 2199 \n", "\n", " enhancedPublication qualityManagement certificate metadataStandard \\\n", "count 2793 2793 2793 2793 \n", "unique 3 3 16 175 \n", "top unknown yes [] [] \n", "freq 1643 1569 2557 1693 \n", "\n", " syndication remarks entryDate lastUpdate \n", "count 2793 1694 2793 2793 \n", "unique 544 1673 1316 722 \n", "top {} is covered by Elsevier. 2018-08-10 2021-09-03 \n", "freq 2235 17 20 104 " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re3data_df.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "orgIdentifier 0\n", "repositoryName 0\n", "repositoryName.language 0\n", "additionalName 587\n", "repositoryURL 24\n", "repositoryIdentifier 1769\n", "repositoryContact 0\n", "description 0\n", "description.language 0\n", "type 16\n", "size 0\n", "startDate 993\n", "endDate 2621\n", "repositoryLanguage 0\n", "subject 4\n", "missionStatementURL 420\n", "contentType 6\n", "providerType 5\n", "keyword 8\n", "institution 1\n", "policy 0\n", "databaseAccess 0\n", "databaseLicense 0\n", "dataAccess 0\n", "dataLicense 0\n", "dataUploadType 15\n", "dataUploadLicense 0\n", "software 0\n", "versioning 1454\n", "api 0\n", "pidSystem 0\n", "citationGuidelineURL 1261\n", "aidSystem 0\n", "enhancedPublication 0\n", "qualityManagement 0\n", "certificate 0\n", "metadataStandard 0\n", "syndication 0\n", "remarks 1099\n", "entryDate 0\n", "lastUpdate 0\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "re3data_df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "contentType\n", "Archived data 677\n", "Audiovisual data 561\n", "Configuration data 84\n", "Databases 593\n", "Images 1407\n", "Networkbased data 155\n", "Plain text 1173\n", "Raw data 1224\n", "Scientific and statistical data formats 1725\n", "Software applications 462\n", "Source code 217\n", "Standard office documents 1719\n", "Structured graphics 937\n", "Structured text 878\n", "other 979\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "types = re3data_df.contentType.explode().apply(lambda x: x['name'] if x is not np.nan else np.nan)\n", "pd.DataFrame(types).groupby('contentType').size()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "providerType\n", "dataProvider 2539\n", "serviceProvider 982\n", "dtype: int64" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.DataFrame(re3data_df.providerType.explode()).groupby('providerType').size()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }