2021-07-22 11:35:40 +02:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 1,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import ast\n",
|
|
|
|
"import csv\n",
|
|
|
|
"import json\n",
|
|
|
|
"\n",
|
|
|
|
"import numpy as np\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"\n",
|
|
|
|
"import plotly\n",
|
|
|
|
"from plotly.offline import iplot, init_notebook_mode\n",
|
|
|
|
"import plotly.graph_objs as go\n",
|
|
|
|
"import plotly.express as px\n",
|
|
|
|
"\n",
|
|
|
|
"# Remove the column display limit so wide frames are rendered in full.\n",
"pd.set_option('display.max_columns', None)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Loading datasets"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 2,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <th>system_metadata.id</th>\n",
|
|
|
|
" <th>repository_metadata.name</th>\n",
|
|
|
|
" <th>repository_metadata.alternativename</th>\n",
|
|
|
|
" <th>repository_metadata.url</th>\n",
|
|
|
|
" <th>repository_metadata.description</th>\n",
|
|
|
|
" <th>repository_metadata.type</th>\n",
|
|
|
|
" <th>repository_metadata.content_languages</th>\n",
|
|
|
|
" <th>system_metadata.date_modified</th>\n",
|
|
|
|
" <th>system_metadata.date_created</th>\n",
|
|
|
|
" <th>repository_metadata.content_subjects</th>\n",
|
|
|
|
" <th>repository_metadata.content_types</th>\n",
|
|
|
|
" <th>organization</th>\n",
|
|
|
|
" <th>policy_urls</th>\n",
|
|
|
|
" <th>repository_metadata.software</th>\n",
|
|
|
|
" <th>repository_metadata.oai_url</th>\n",
|
|
|
|
" <th>system_metadata.publicly_visible</th>\n",
|
|
|
|
" <th>repository_metadata.repository_status</th>\n",
|
|
|
|
" <th>repository_metadata.fulltext_record_count</th>\n",
|
|
|
|
" <th>repository_metadata.metadata_record_count</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>175</td>\n",
|
|
|
|
" <td>{\"name\": \"hku theses online\", \"language\": \"en\"}</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[]</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>http://hub.hku.hk/handle/10722/1057</td>\n",
|
|
|
|
" <td>this is an institutional repository providing ...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[\"zh\", \"en\"]</td>\n",
|
|
|
|
" <td>2021-03-25 10:16:18</td>\n",
|
|
|
|
" <td>2005-12-21 12:44:08</td>\n",
|
|
|
|
" <td>[\"multidisciplinary\"]</td>\n",
|
|
|
|
" <td>[bibliographic_references, theses_and_disserta...</td>\n",
|
|
|
|
" <td>[{'name': 'university of hong kong', 'alternat...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>{\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap...</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>yes</td>\n",
|
|
|
|
" <td>fully_functional</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>11850.0</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>64</td>\n",
|
|
|
|
" <td>{\"name\": \"research support scheme - central eu...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>http://rss.archives.ceu.hu/</td>\n",
|
|
|
|
" <td>this is an institutional repository collecting...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[\"cs\", \"en\", \"hu\", \"ru\"]</td>\n",
|
|
|
|
" <td>2021-03-25 09:48:31</td>\n",
|
|
|
|
" <td>2006-01-04 14:59:30</td>\n",
|
|
|
|
" <td>[\"multidisciplinary\"]</td>\n",
|
|
|
|
" <td>[unpub_reports_and_working_papers]</td>\n",
|
|
|
|
" <td>[{'name': 'central european university', 'alte...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>{\"name\": \"eprints\", \"version\": \"2.2.1\"}</td>\n",
|
|
|
|
" <td>http://rss.archives.ceu.hu/perl/oai2</td>\n",
|
|
|
|
" <td>yes</td>\n",
|
|
|
|
" <td>fully_functional</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>164.0</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>151</td>\n",
|
|
|
|
" <td>{\"name\": \"cadmus, eui research repository\", \"l...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[]</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>http://cadmus.eui.eu/</td>\n",
|
|
|
|
" <td>cadmus is the name of the eui research reposit...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[\"nl\", \"en\", \"fr\", \"de\", \"it\"]</td>\n",
|
|
|
|
" <td>2021-09-13 13:35:36</td>\n",
|
|
|
|
" <td>2006-01-04 12:07:07</td>\n",
|
|
|
|
" <td>[\"history and archaeology\", \"multidisciplinary...</td>\n",
|
|
|
|
" <td>[journal_articles, theses_and_dissertations, u...</td>\n",
|
|
|
|
" <td>[{'name': 'european university institute', 'al...</td>\n",
|
|
|
|
" <td>[{\"policy_url\": \"https://www.eui.eu/research/e...</td>\n",
|
|
|
|
" <td>{\"name\": \"dspace\", \"version\": \"5.2\"}</td>\n",
|
|
|
|
" <td>http://cadmus.eui.eu/oai/request</td>\n",
|
|
|
|
" <td>yes</td>\n",
|
|
|
|
" <td>fully_functional</td>\n",
|
|
|
|
" <td>3867.0</td>\n",
|
|
|
|
" <td>24869.0</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>105</td>\n",
|
|
|
|
" <td>{\"name\": \"document server@uhasselt\", \"language...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>https://doclib.uhasselt.be/dspace/</td>\n",
|
|
|
|
" <td>this site is a university repository providing...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[\"nl\", \"en\", \"fr\", \"de\"]</td>\n",
|
|
|
|
" <td>2021-04-16 15:23:52</td>\n",
|
|
|
|
" <td>2006-01-24 15:46:44</td>\n",
|
|
|
|
" <td>[\"multidisciplinary\"]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[{'name': 'uhasselt', 'alternativeName': 'hass...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>{\"name\": \"dspace\", \"version\": \"1.7.2\"}</td>\n",
|
|
|
|
" <td>http://doclib.uhasselt.be/dspace-oai/request</td>\n",
|
|
|
|
" <td>yes</td>\n",
|
|
|
|
" <td>fully_functional</td>\n",
|
|
|
|
" <td>0.0</td>\n",
|
|
|
|
" <td>27376.0</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>101</td>\n",
|
|
|
|
" <td>{\"name\": \"utrecht university repository\", \"lan...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[]</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>http://dspace.library.uu.nl</td>\n",
|
|
|
|
" <td>this site is a university repository providing...</td>\n",
|
|
|
|
" <td>institutional</td>\n",
|
|
|
|
" <td>[\"nl\", \"en\"]</td>\n",
|
|
|
|
" <td>2021-04-16 15:22:03</td>\n",
|
|
|
|
" <td>2006-01-13 12:55:13</td>\n",
|
|
|
|
" <td>[\"multidisciplinary\"]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[{'name': 'university of utrecht', 'alternativ...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
|
|
|
|
" <td>https://dspace.library.uu.nl/oai/request</td>\n",
|
|
|
|
" <td>yes</td>\n",
|
|
|
|
" <td>fully_functional</td>\n",
|
|
|
|
" <td>1686.0</td>\n",
|
|
|
|
" <td>185637.0</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2021-10-08 12:46:14 +02:00
|
|
|
" system_metadata.id repository_metadata.name \\\n",
|
|
|
|
"0 175 {\"name\": \"hku theses online\", \"language\": \"en\"} \n",
|
|
|
|
"1 64 {\"name\": \"research support scheme - central eu... \n",
|
|
|
|
"2 151 {\"name\": \"cadmus, eui research repository\", \"l... \n",
|
|
|
|
"3 105 {\"name\": \"document server@uhasselt\", \"language... \n",
|
|
|
|
"4 101 {\"name\": \"utrecht university repository\", \"lan... \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.alternativename repository_metadata.url \\\n",
|
|
|
|
"0 [] http://hub.hku.hk/handle/10722/1057 \n",
|
|
|
|
"1 [] http://rss.archives.ceu.hu/ \n",
|
|
|
|
"2 [] http://cadmus.eui.eu/ \n",
|
|
|
|
"3 [] https://doclib.uhasselt.be/dspace/ \n",
|
|
|
|
"4 [] http://dspace.library.uu.nl \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.description repository_metadata.type \\\n",
|
|
|
|
"0 this is an institutional repository providing ... institutional \n",
|
|
|
|
"1 this is an institutional repository collecting... institutional \n",
|
|
|
|
"2 cadmus is the name of the eui research reposit... institutional \n",
|
|
|
|
"3 this site is a university repository providing... institutional \n",
|
|
|
|
"4 this site is a university repository providing... institutional \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.content_languages system_metadata.date_modified \\\n",
|
|
|
|
"0 [\"zh\", \"en\"] 2021-03-25 10:16:18 \n",
|
|
|
|
"1 [\"cs\", \"en\", \"hu\", \"ru\"] 2021-03-25 09:48:31 \n",
|
|
|
|
"2 [\"nl\", \"en\", \"fr\", \"de\", \"it\"] 2021-09-13 13:35:36 \n",
|
|
|
|
"3 [\"nl\", \"en\", \"fr\", \"de\"] 2021-04-16 15:23:52 \n",
|
|
|
|
"4 [\"nl\", \"en\"] 2021-04-16 15:22:03 \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" system_metadata.date_created \\\n",
|
|
|
|
"0 2005-12-21 12:44:08 \n",
|
|
|
|
"1 2006-01-04 14:59:30 \n",
|
|
|
|
"2 2006-01-04 12:07:07 \n",
|
|
|
|
"3 2006-01-24 15:46:44 \n",
|
|
|
|
"4 2006-01-13 12:55:13 \n",
|
2021-07-22 11:35:40 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.content_subjects \\\n",
|
|
|
|
"0 [\"multidisciplinary\"] \n",
|
|
|
|
"1 [\"multidisciplinary\"] \n",
|
|
|
|
"2 [\"history and archaeology\", \"multidisciplinary... \n",
|
|
|
|
"3 [\"multidisciplinary\"] \n",
|
|
|
|
"4 [\"multidisciplinary\"] \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.content_types \\\n",
|
|
|
|
"0 [bibliographic_references, theses_and_disserta... \n",
|
|
|
|
"1 [unpub_reports_and_working_papers] \n",
|
|
|
|
"2 [journal_articles, theses_and_dissertations, u... \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"3 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"4 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" organization \\\n",
|
|
|
|
"0 [{'name': 'university of hong kong', 'alternat... \n",
|
|
|
|
"1 [{'name': 'central european university', 'alte... \n",
|
|
|
|
"2 [{'name': 'european university institute', 'al... \n",
|
|
|
|
"3 [{'name': 'uhasselt', 'alternativeName': 'hass... \n",
|
|
|
|
"4 [{'name': 'university of utrecht', 'alternativ... \n",
|
|
|
|
"\n",
|
|
|
|
" policy_urls \\\n",
|
|
|
|
"0 [] \n",
|
|
|
|
"1 [] \n",
|
|
|
|
"2 [{\"policy_url\": \"https://www.eui.eu/research/e... \n",
|
|
|
|
"3 [] \n",
|
|
|
|
"4 [] \n",
|
|
|
|
"\n",
|
|
|
|
" repository_metadata.software \\\n",
|
|
|
|
"0 {\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap... \n",
|
|
|
|
"1 {\"name\": \"eprints\", \"version\": \"2.2.1\"} \n",
|
|
|
|
"2 {\"name\": \"dspace\", \"version\": \"5.2\"} \n",
|
|
|
|
"3 {\"name\": \"dspace\", \"version\": \"1.7.2\"} \n",
|
|
|
|
"4 {\"name\": \"dspace\", \"version\": \"\"} \n",
|
|
|
|
"\n",
|
|
|
|
" repository_metadata.oai_url \\\n",
|
|
|
|
"0 NaN \n",
|
|
|
|
"1 http://rss.archives.ceu.hu/perl/oai2 \n",
|
|
|
|
"2 http://cadmus.eui.eu/oai/request \n",
|
|
|
|
"3 http://doclib.uhasselt.be/dspace-oai/request \n",
|
|
|
|
"4 https://dspace.library.uu.nl/oai/request \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" system_metadata.publicly_visible repository_metadata.repository_status \\\n",
|
|
|
|
"0 yes fully_functional \n",
|
|
|
|
"1 yes fully_functional \n",
|
|
|
|
"2 yes fully_functional \n",
|
|
|
|
"3 yes fully_functional \n",
|
|
|
|
"4 yes fully_functional \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.fulltext_record_count \\\n",
|
|
|
|
"0 NaN \n",
|
|
|
|
"1 NaN \n",
|
|
|
|
"2 3867.0 \n",
|
|
|
|
"3 0.0 \n",
|
|
|
|
"4 1686.0 \n",
|
|
|
|
"\n",
|
|
|
|
" repository_metadata.metadata_record_count \n",
|
|
|
|
"0 11850.0 \n",
|
|
|
|
"1 164.0 \n",
|
|
|
|
"2 24869.0 \n",
|
|
|
|
"3 27376.0 \n",
|
|
|
|
"4 185637.0 "
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 2,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
"# Load the tab-separated openDoar.tsv export.\n",
"# Columns listed in `converters` hold Python/JSON-style list literals, so\n",
"# ast.literal_eval turns each cell into a real list while the file is read.\n",
"# Converter keys must match the column names exactly: a key that matches no\n",
"# column is skipped without any error, which is why the subjects converter is\n",
"# keyed on 'repository_metadata.content_subjects' (the name present in the file).\n",
"opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n",
"                          converters={'repository_metadata.content_subjects': ast.literal_eval,\n",
"                                      'repository_metadata.alternativename': ast.literal_eval,\n",
"                                      'repository_metadata.content_types': ast.literal_eval,\n",
"                                      'organization': ast.literal_eval\n",
"                                      },\n",
"                          # read ids as strings rather than letting pandas infer a numeric dtype\n",
"                          dtype={'system_metadata.id': str})\n",
"\n",
"opendoar_df.head()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 3,
|
2021-07-23 15:28:23 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2021-10-08 12:46:14 +02:00
|
|
|
"Index(['system_metadata.id', 'repository_metadata.name',\n",
|
|
|
|
" 'repository_metadata.alternativename', 'repository_metadata.url',\n",
|
|
|
|
" 'repository_metadata.description', 'repository_metadata.type',\n",
|
|
|
|
" 'repository_metadata.content_languages',\n",
|
|
|
|
" 'system_metadata.date_modified', 'system_metadata.date_created',\n",
|
|
|
|
" 'repository_metadata.content_subjects',\n",
|
|
|
|
" 'repository_metadata.content_types', 'organization', 'policy_urls',\n",
|
|
|
|
" 'repository_metadata.software', 'repository_metadata.oai_url',\n",
|
|
|
|
" 'system_metadata.publicly_visible',\n",
|
|
|
|
" 'repository_metadata.repository_status',\n",
|
|
|
|
" 'repository_metadata.fulltext_record_count',\n",
|
|
|
|
" 'repository_metadata.metadata_record_count'],\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 3,
|
2021-07-23 15:28:23 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"# List the column labels of the loaded frame.\n",
"opendoar_df.columns"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 4,
|
2021-07-23 15:28:23 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
"def empty_list_is_nan(cell):\n",
"    \"\"\"Map an empty list to NaN; hand back any other value untouched.\"\"\"\n",
"    is_empty_list = isinstance(cell, list) and len(cell) == 0\n",
"    if is_empty_list:\n",
"        return np.nan\n",
"    return cell\n",
|
|
|
|
" \n",
|
|
|
|
"# Run empty_list_is_nan on every cell so that empty lists are counted as missing values.\n",
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 5,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <th>system_metadata.id</th>\n",
|
|
|
|
" <th>repository_metadata.name</th>\n",
|
|
|
|
" <th>repository_metadata.alternativename</th>\n",
|
|
|
|
" <th>repository_metadata.url</th>\n",
|
|
|
|
" <th>repository_metadata.description</th>\n",
|
|
|
|
" <th>repository_metadata.type</th>\n",
|
|
|
|
" <th>repository_metadata.content_languages</th>\n",
|
|
|
|
" <th>system_metadata.date_modified</th>\n",
|
|
|
|
" <th>system_metadata.date_created</th>\n",
|
|
|
|
" <th>repository_metadata.content_subjects</th>\n",
|
|
|
|
" <th>repository_metadata.content_types</th>\n",
|
|
|
|
" <th>organization</th>\n",
|
|
|
|
" <th>policy_urls</th>\n",
|
|
|
|
" <th>repository_metadata.software</th>\n",
|
|
|
|
" <th>repository_metadata.oai_url</th>\n",
|
|
|
|
" <th>system_metadata.publicly_visible</th>\n",
|
|
|
|
" <th>repository_metadata.repository_status</th>\n",
|
|
|
|
" <th>repository_metadata.fulltext_record_count</th>\n",
|
|
|
|
" <th>repository_metadata.metadata_record_count</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>count</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>2147</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5421</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5598</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>4402</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5595</td>\n",
|
|
|
|
" <td>2.299000e+03</td>\n",
|
|
|
|
" <td>4.197000e+03</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>unique</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5713</td>\n",
|
|
|
|
" <td>2107</td>\n",
|
|
|
|
" <td>5705</td>\n",
|
|
|
|
" <td>4619</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>4</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>330</td>\n",
|
|
|
|
" <td>2372</td>\n",
|
|
|
|
" <td>5573</td>\n",
|
|
|
|
" <td>821</td>\n",
|
|
|
|
" <td>477</td>\n",
|
|
|
|
" <td>5201</td>\n",
|
|
|
|
" <td>642</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>321</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>4370</td>\n",
|
|
|
|
" <td>1</td>\n",
|
|
|
|
" <td>7</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>top</th>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>175</td>\n",
|
|
|
|
" <td>{\"name\": \"hiroshima associated repository port...</td>\n",
|
|
|
|
" <td>[{'acronym': 'aura'}]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>http://harp.lib.hiroshima-u.ac.jp/</td>\n",
|
|
|
|
" <td>this site provides access to the research outp...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[\"en\"]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2020-09-18 12:53:48</td>\n",
|
|
|
|
" <td>2020-09-18 12:53:48</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[\"multidisciplinary\"]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[theses_and_dissertations]</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>[{'name': 'rijksuniversiteit groningen', 'alte...</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
|
|
|
|
" <td>https://kidoks.bsz-bw.de/oai</td>\n",
|
|
|
|
" <td>yes</td>\n",
|
|
|
|
" <td>fully_functional</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>freq</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>1</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>3</td>\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" <td>4</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>3</td>\n",
|
|
|
|
" <td>95</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>5096</td>\n",
|
|
|
|
" <td>1917</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>82</td>\n",
|
|
|
|
" <td>82</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>3227</td>\n",
|
|
|
|
" <td>465</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>26</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>5098</td>\n",
|
|
|
|
" <td>822</td>\n",
|
|
|
|
" <td>3</td>\n",
|
|
|
|
" <td>5742</td>\n",
|
|
|
|
" <td>5276</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>mean</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>5.010186e+03</td>\n",
|
|
|
|
" <td>1.760546e+05</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>std</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>4.206295e+04</td>\n",
|
|
|
|
" <td>6.600825e+06</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>min</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>0.000000e+00</td>\n",
|
|
|
|
" <td>0.000000e+00</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>25%</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>0.000000e+00</td>\n",
|
|
|
|
" <td>8.950000e+02</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>50%</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>4.220000e+02</td>\n",
|
|
|
|
" <td>4.026000e+03</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>75%</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>2.930500e+03</td>\n",
|
|
|
|
" <td>1.630400e+04</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>max</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" <td>1.817531e+06</td>\n",
|
|
|
|
" <td>4.200000e+08</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2021-10-08 12:46:14 +02:00
|
|
|
" system_metadata.id repository_metadata.name \\\n",
|
|
|
|
"count 5742 5742 \n",
|
|
|
|
"unique 5742 5713 \n",
|
|
|
|
"top 175 {\"name\": \"hiroshima associated repository port... \n",
|
|
|
|
"freq 1 3 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.alternativename \\\n",
|
|
|
|
"count 2147 \n",
|
|
|
|
"unique 2107 \n",
|
|
|
|
"top [{'acronym': 'aura'}] \n",
|
|
|
|
"freq 4 \n",
|
|
|
|
"mean NaN \n",
|
|
|
|
"std NaN \n",
|
|
|
|
"min NaN \n",
|
|
|
|
"25% NaN \n",
|
|
|
|
"50% NaN \n",
|
|
|
|
"75% NaN \n",
|
|
|
|
"max NaN \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.url \\\n",
|
|
|
|
"count 5742 \n",
|
|
|
|
"unique 5705 \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"top http://harp.lib.hiroshima-u.ac.jp/ \n",
|
|
|
|
"freq 3 \n",
|
|
|
|
"mean NaN \n",
|
|
|
|
"std NaN \n",
|
|
|
|
"min NaN \n",
|
|
|
|
"25% NaN \n",
|
|
|
|
"50% NaN \n",
|
|
|
|
"75% NaN \n",
|
|
|
|
"max NaN \n",
|
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.description \\\n",
|
|
|
|
"count 5421 \n",
|
|
|
|
"unique 4619 \n",
|
|
|
|
"top this site provides access to the research outp... \n",
|
|
|
|
"freq 95 \n",
|
|
|
|
"mean NaN \n",
|
|
|
|
"std NaN \n",
|
|
|
|
"min NaN \n",
|
|
|
|
"25% NaN \n",
|
|
|
|
"50% NaN \n",
|
|
|
|
"75% NaN \n",
|
|
|
|
"max NaN \n",
|
|
|
|
"\n",
|
|
|
|
" repository_metadata.type repository_metadata.content_languages \\\n",
|
|
|
|
"count 5742 5742 \n",
|
|
|
|
"unique 4 330 \n",
|
|
|
|
"top institutional [\"en\"] \n",
|
|
|
|
"freq 5096 1917 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
|
|
|
"\n",
|
|
|
|
" system_metadata.date_modified system_metadata.date_created \\\n",
|
|
|
|
"count 5742 5742 \n",
|
|
|
|
"unique 2372 5573 \n",
|
|
|
|
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 \n",
|
|
|
|
"freq 82 82 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
|
|
|
"\n",
|
|
|
|
" repository_metadata.content_subjects repository_metadata.content_types \\\n",
|
|
|
|
"count 5742 5598 \n",
|
|
|
|
"unique 821 477 \n",
|
|
|
|
"top [\"multidisciplinary\"] [theses_and_dissertations] \n",
|
|
|
|
"freq 3227 465 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" organization policy_urls \\\n",
|
|
|
|
"count 5742 5742 \n",
|
|
|
|
"unique 5201 642 \n",
|
|
|
|
"top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n",
|
|
|
|
"freq 26 5098 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.software repository_metadata.oai_url \\\n",
|
|
|
|
"count 5742 4402 \n",
|
|
|
|
"unique 321 4370 \n",
|
|
|
|
"top {\"name\": \"dspace\", \"version\": \"\"} https://kidoks.bsz-bw.de/oai \n",
|
|
|
|
"freq 822 3 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" system_metadata.publicly_visible repository_metadata.repository_status \\\n",
|
|
|
|
"count 5742 5595 \n",
|
|
|
|
"unique 1 7 \n",
|
|
|
|
"top yes fully_functional \n",
|
|
|
|
"freq 5742 5276 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"\n",
|
2021-10-08 12:46:14 +02:00
|
|
|
" repository_metadata.fulltext_record_count \\\n",
|
|
|
|
"count 2.299000e+03 \n",
|
|
|
|
"unique NaN \n",
|
|
|
|
"top NaN \n",
|
|
|
|
"freq NaN \n",
|
|
|
|
"mean 5.010186e+03 \n",
|
|
|
|
"std 4.206295e+04 \n",
|
|
|
|
"min 0.000000e+00 \n",
|
|
|
|
"25% 0.000000e+00 \n",
|
|
|
|
"50% 4.220000e+02 \n",
|
|
|
|
"75% 2.930500e+03 \n",
|
|
|
|
"max 1.817531e+06 \n",
|
|
|
|
"\n",
|
|
|
|
" repository_metadata.metadata_record_count \n",
|
|
|
|
"count 4.197000e+03 \n",
|
|
|
|
"unique NaN \n",
|
|
|
|
"top NaN \n",
|
|
|
|
"freq NaN \n",
|
|
|
|
"mean 1.760546e+05 \n",
|
|
|
|
"std 6.600825e+06 \n",
|
|
|
|
"min 0.000000e+00 \n",
|
|
|
|
"25% 8.950000e+02 \n",
|
|
|
|
"50% 4.026000e+03 \n",
|
|
|
|
"75% 1.630400e+04 \n",
|
|
|
|
"max 4.200000e+08 "
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 5,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2021-07-23 12:41:17 +02:00
|
|
|
"# Summary statistics for every column; include='all' also covers the non-numeric ones.\n",
"opendoar_df.describe(include='all')"
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 6,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2021-10-08 12:46:14 +02:00
|
|
|
"system_metadata.id 0\n",
|
|
|
|
"repository_metadata.name 0\n",
|
|
|
|
"repository_metadata.alternativename 3595\n",
|
|
|
|
"repository_metadata.url 0\n",
|
|
|
|
"repository_metadata.description 321\n",
|
|
|
|
"repository_metadata.type 0\n",
|
|
|
|
"repository_metadata.content_languages 0\n",
|
|
|
|
"system_metadata.date_modified 0\n",
|
|
|
|
"system_metadata.date_created 0\n",
|
|
|
|
"repository_metadata.content_subjects 0\n",
|
|
|
|
"repository_metadata.content_types 144\n",
|
|
|
|
"organization 0\n",
|
|
|
|
"policy_urls 0\n",
|
|
|
|
"repository_metadata.software 0\n",
|
|
|
|
"repository_metadata.oai_url 1340\n",
|
|
|
|
"system_metadata.publicly_visible 0\n",
|
|
|
|
"repository_metadata.repository_status 147\n",
|
|
|
|
"repository_metadata.fulltext_record_count 3443\n",
|
|
|
|
"repository_metadata.metadata_record_count 1545\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"dtype: int64"
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 6,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2021-07-23 12:41:17 +02:00
|
|
|
"# Number of missing values in each column.\n",
"opendoar_df.isna().sum()"
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-07-23 15:28:23 +02:00
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-10-08 12:46:14 +02:00
|
|
|
"execution_count": 7,
|
2021-07-23 15:28:23 +02:00
|
|
|
"metadata": {},
|
2021-10-08 12:46:14 +02:00
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"repository_metadata.content_types\n",
|
|
|
|
"bibliographic_references 865\n",
|
|
|
|
"books_chapters_and_sections 2194\n",
|
|
|
|
"conference_and_workshop_papers 1981\n",
|
|
|
|
"datasets 401\n",
|
|
|
|
"journal_articles 4030\n",
|
|
|
|
"learning_objects 789\n",
|
|
|
|
"other_special_item_types 1759\n",
|
|
|
|
"patents 182\n",
|
|
|
|
"software 92\n",
|
|
|
|
"theses_and_dissertations 3319\n",
|
|
|
|
"unpub_reports_and_working_papers 1904\n",
|
|
|
|
"dtype: int64"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 7,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
"# Count the occurrences of each content type: explode() yields one row per list\n",
"# element, and rows whose value is NaN are left out by the groupby.\n",
"(\n",
"    opendoar_df['repository_metadata.content_types']\n",
"    .explode()\n",
"    .to_frame()\n",
"    .groupby('repository_metadata.content_types')\n",
"    .size()\n",
")"
|
|
|
|
]
|
2021-07-23 15:28:23 +02:00
|
|
|
},
|
2021-07-22 11:35:40 +02:00
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-07-23 12:41:17 +02:00
|
|
|
"execution_count": null,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
2021-07-23 12:41:17 +02:00
|
|
|
"outputs": [],
|
|
|
|
"source": []
|
2021-07-22 11:35:40 +02:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.8.3"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 4
|
|
|
|
}
|