registries_analysis/notebooks/01.2-exploration-opendoar.i...

919 lines
40 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"# Do not truncate wide DataFrames when displaying them: show every column.\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>system_metadata.id</th>\n",
" <th>repository_metadata.name</th>\n",
" <th>repository_metadata.alternativename</th>\n",
" <th>repository_metadata.url</th>\n",
" <th>repository_metadata.description</th>\n",
" <th>repository_metadata.type</th>\n",
" <th>repository_metadata.content_languages</th>\n",
" <th>system_metadata.date_modified</th>\n",
" <th>system_metadata.date_created</th>\n",
" <th>repository_metadata.content_subjects</th>\n",
" <th>repository_metadata.content_types</th>\n",
" <th>organization</th>\n",
" <th>policy_urls</th>\n",
" <th>repository_metadata.software</th>\n",
" <th>repository_metadata.oai_url</th>\n",
" <th>system_metadata.publicly_visible</th>\n",
" <th>repository_metadata.repository_status</th>\n",
" <th>repository_metadata.fulltext_record_count</th>\n",
" <th>repository_metadata.metadata_record_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>175</td>\n",
" <td>{\"name\": \"hku theses online\", \"language\": \"en\"}</td>\n",
" <td>[]</td>\n",
" <td>http://hub.hku.hk/handle/10722/1057</td>\n",
" <td>this is an institutional repository providing ...</td>\n",
" <td>institutional</td>\n",
" <td>[\"zh\", \"en\"]</td>\n",
" <td>2021-03-25 10:16:18</td>\n",
" <td>2005-12-21 12:44:08</td>\n",
" <td>[\"multidisciplinary\"]</td>\n",
" <td>[bibliographic_references, theses_and_disserta...</td>\n",
" <td>[{'name': 'university of hong kong', 'alternat...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap...</td>\n",
" <td>NaN</td>\n",
" <td>yes</td>\n",
" <td>fully_functional</td>\n",
" <td>NaN</td>\n",
" <td>11850.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>64</td>\n",
" <td>{\"name\": \"research support scheme - central eu...</td>\n",
" <td>[]</td>\n",
" <td>http://rss.archives.ceu.hu/</td>\n",
" <td>this is an institutional repository collecting...</td>\n",
" <td>institutional</td>\n",
" <td>[\"cs\", \"en\", \"hu\", \"ru\"]</td>\n",
" <td>2021-03-25 09:48:31</td>\n",
" <td>2006-01-04 14:59:30</td>\n",
" <td>[\"multidisciplinary\"]</td>\n",
" <td>[unpub_reports_and_working_papers]</td>\n",
" <td>[{'name': 'central european university', 'alte...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"eprints\", \"version\": \"2.2.1\"}</td>\n",
" <td>http://rss.archives.ceu.hu/perl/oai2</td>\n",
" <td>yes</td>\n",
" <td>fully_functional</td>\n",
" <td>NaN</td>\n",
" <td>164.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>151</td>\n",
" <td>{\"name\": \"cadmus, eui research repository\", \"l...</td>\n",
" <td>[]</td>\n",
" <td>http://cadmus.eui.eu/</td>\n",
" <td>cadmus is the name of the eui research reposit...</td>\n",
" <td>institutional</td>\n",
" <td>[\"nl\", \"en\", \"fr\", \"de\", \"it\"]</td>\n",
" <td>2021-09-13 13:35:36</td>\n",
" <td>2006-01-04 12:07:07</td>\n",
" <td>[\"history and archaeology\", \"multidisciplinary...</td>\n",
" <td>[journal_articles, theses_and_dissertations, u...</td>\n",
" <td>[{'name': 'european university institute', 'al...</td>\n",
" <td>[{\"policy_url\": \"https://www.eui.eu/research/e...</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"5.2\"}</td>\n",
" <td>http://cadmus.eui.eu/oai/request</td>\n",
" <td>yes</td>\n",
" <td>fully_functional</td>\n",
" <td>3867.0</td>\n",
" <td>24869.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>105</td>\n",
" <td>{\"name\": \"document server@uhasselt\", \"language...</td>\n",
" <td>[]</td>\n",
" <td>https://doclib.uhasselt.be/dspace/</td>\n",
" <td>this site is a university repository providing...</td>\n",
" <td>institutional</td>\n",
" <td>[\"nl\", \"en\", \"fr\", \"de\"]</td>\n",
" <td>2021-04-16 15:23:52</td>\n",
" <td>2006-01-24 15:46:44</td>\n",
" <td>[\"multidisciplinary\"]</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'uhasselt', 'alternativeName': 'hass...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"1.7.2\"}</td>\n",
" <td>http://doclib.uhasselt.be/dspace-oai/request</td>\n",
" <td>yes</td>\n",
" <td>fully_functional</td>\n",
" <td>0.0</td>\n",
" <td>27376.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>101</td>\n",
" <td>{\"name\": \"utrecht university repository\", \"lan...</td>\n",
" <td>[]</td>\n",
" <td>http://dspace.library.uu.nl</td>\n",
" <td>this site is a university repository providing...</td>\n",
" <td>institutional</td>\n",
" <td>[\"nl\", \"en\"]</td>\n",
" <td>2021-04-16 15:22:03</td>\n",
" <td>2006-01-13 12:55:13</td>\n",
" <td>[\"multidisciplinary\"]</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'university of utrecht', 'alternativ...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
" <td>https://dspace.library.uu.nl/oai/request</td>\n",
" <td>yes</td>\n",
" <td>fully_functional</td>\n",
" <td>1686.0</td>\n",
" <td>185637.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" system_metadata.id repository_metadata.name \\\n",
"0 175 {\"name\": \"hku theses online\", \"language\": \"en\"} \n",
"1 64 {\"name\": \"research support scheme - central eu... \n",
"2 151 {\"name\": \"cadmus, eui research repository\", \"l... \n",
"3 105 {\"name\": \"document server@uhasselt\", \"language... \n",
"4 101 {\"name\": \"utrecht university repository\", \"lan... \n",
"\n",
" repository_metadata.alternativename repository_metadata.url \\\n",
"0 [] http://hub.hku.hk/handle/10722/1057 \n",
"1 [] http://rss.archives.ceu.hu/ \n",
"2 [] http://cadmus.eui.eu/ \n",
"3 [] https://doclib.uhasselt.be/dspace/ \n",
"4 [] http://dspace.library.uu.nl \n",
"\n",
" repository_metadata.description repository_metadata.type \\\n",
"0 this is an institutional repository providing ... institutional \n",
"1 this is an institutional repository collecting... institutional \n",
"2 cadmus is the name of the eui research reposit... institutional \n",
"3 this site is a university repository providing... institutional \n",
"4 this site is a university repository providing... institutional \n",
"\n",
" repository_metadata.content_languages system_metadata.date_modified \\\n",
"0 [\"zh\", \"en\"] 2021-03-25 10:16:18 \n",
"1 [\"cs\", \"en\", \"hu\", \"ru\"] 2021-03-25 09:48:31 \n",
"2 [\"nl\", \"en\", \"fr\", \"de\", \"it\"] 2021-09-13 13:35:36 \n",
"3 [\"nl\", \"en\", \"fr\", \"de\"] 2021-04-16 15:23:52 \n",
"4 [\"nl\", \"en\"] 2021-04-16 15:22:03 \n",
"\n",
" system_metadata.date_created \\\n",
"0 2005-12-21 12:44:08 \n",
"1 2006-01-04 14:59:30 \n",
"2 2006-01-04 12:07:07 \n",
"3 2006-01-24 15:46:44 \n",
"4 2006-01-13 12:55:13 \n",
"\n",
" repository_metadata.content_subjects \\\n",
"0 [\"multidisciplinary\"] \n",
"1 [\"multidisciplinary\"] \n",
"2 [\"history and archaeology\", \"multidisciplinary... \n",
"3 [\"multidisciplinary\"] \n",
"4 [\"multidisciplinary\"] \n",
"\n",
" repository_metadata.content_types \\\n",
"0 [bibliographic_references, theses_and_disserta... \n",
"1 [unpub_reports_and_working_papers] \n",
"2 [journal_articles, theses_and_dissertations, u... \n",
"3 [journal_articles, conference_and_workshop_pap... \n",
"4 [journal_articles, conference_and_workshop_pap... \n",
"\n",
" organization \\\n",
"0 [{'name': 'university of hong kong', 'alternat... \n",
"1 [{'name': 'central european university', 'alte... \n",
"2 [{'name': 'european university institute', 'al... \n",
"3 [{'name': 'uhasselt', 'alternativeName': 'hass... \n",
"4 [{'name': 'university of utrecht', 'alternativ... \n",
"\n",
" policy_urls \\\n",
"0 [] \n",
"1 [] \n",
"2 [{\"policy_url\": \"https://www.eui.eu/research/e... \n",
"3 [] \n",
"4 [] \n",
"\n",
" repository_metadata.software \\\n",
"0 {\"name\": \"dspace\", \"version\": \"cris-5.3.1-snap... \n",
"1 {\"name\": \"eprints\", \"version\": \"2.2.1\"} \n",
"2 {\"name\": \"dspace\", \"version\": \"5.2\"} \n",
"3 {\"name\": \"dspace\", \"version\": \"1.7.2\"} \n",
"4 {\"name\": \"dspace\", \"version\": \"\"} \n",
"\n",
" repository_metadata.oai_url \\\n",
"0 NaN \n",
"1 http://rss.archives.ceu.hu/perl/oai2 \n",
"2 http://cadmus.eui.eu/oai/request \n",
"3 http://doclib.uhasselt.be/dspace-oai/request \n",
"4 https://dspace.library.uu.nl/oai/request \n",
"\n",
" system_metadata.publicly_visible repository_metadata.repository_status \\\n",
"0 yes fully_functional \n",
"1 yes fully_functional \n",
"2 yes fully_functional \n",
"3 yes fully_functional \n",
"4 yes fully_functional \n",
"\n",
" repository_metadata.fulltext_record_count \\\n",
"0 NaN \n",
"1 NaN \n",
"2 3867.0 \n",
"3 0.0 \n",
"4 1686.0 \n",
"\n",
" repository_metadata.metadata_record_count \n",
"0 11850.0 \n",
"1 164.0 \n",
"2 24869.0 \n",
"3 27376.0 \n",
"4 185637.0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Columns whose raw text cells are run through ast.literal_eval while loading.\n",
"# Columns not listed here are left as read by pandas.\n",
"literal_eval_columns = [\n",
"    'repository_metadata.content_subjects',\n",
"    'repository_metadata.alternativename',\n",
"    'repository_metadata.content_types',\n",
"    'organization',\n",
"]\n",
"\n",
"# Tab-separated file; the id column is read as str instead of being parsed as a number.\n",
"opendoar_df = pd.read_csv(\n",
"    '../data/raw/openDoar.tsv',\n",
"    sep='\\t',\n",
"    converters=dict.fromkeys(literal_eval_columns, ast.literal_eval),\n",
"    dtype={'system_metadata.id': str},\n",
")\n",
"\n",
"opendoar_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['system_metadata.id', 'repository_metadata.name',\n",
" 'repository_metadata.alternativename', 'repository_metadata.url',\n",
" 'repository_metadata.description', 'repository_metadata.type',\n",
" 'repository_metadata.content_languages',\n",
" 'system_metadata.date_modified', 'system_metadata.date_created',\n",
" 'repository_metadata.content_subjects',\n",
" 'repository_metadata.content_types', 'organization', 'policy_urls',\n",
" 'repository_metadata.software', 'repository_metadata.oai_url',\n",
" 'system_metadata.publicly_visible',\n",
" 'repository_metadata.repository_status',\n",
" 'repository_metadata.fulltext_record_count',\n",
" 'repository_metadata.metadata_record_count'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List the column labels of the loaded frame.\n",
"opendoar_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
"    \"\"\"Map empty list cells to NaN so that ``isna()`` counts them as missing.\n",
"\n",
"    Recognises both real ``list`` objects (columns parsed on load) and the\n",
"    serialized text ``'[]'`` found in columns that are still raw strings,\n",
"    e.g. ``policy_urls``. Every other value is returned unchanged.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    cell : object\n",
"        A single DataFrame cell value.\n",
"\n",
"    Returns\n",
"    -------\n",
"    object\n",
"        ``np.nan`` for an empty list (real or serialized), otherwise ``cell``.\n",
"    \"\"\"\n",
"    if isinstance(cell, list):\n",
"        return np.nan if len(cell) == 0 else cell\n",
"    if isinstance(cell, str) and cell.strip() == '[]':\n",
"        # Unparsed text column holding an empty serialized list.\n",
"        return np.nan\n",
"    return cell\n",
" \n",
"# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;\n",
"# pick whichever element-wise method the installed pandas provides.\n",
"elementwise_method = 'map' if hasattr(pd.DataFrame, 'map') else 'applymap'\n",
"\n",
"# Replace empty lists by NaN in every cell so they show up as missing values.\n",
"opendoar_df = getattr(opendoar_df, elementwise_method)(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>system_metadata.id</th>\n",
" <th>repository_metadata.name</th>\n",
" <th>repository_metadata.alternativename</th>\n",
" <th>repository_metadata.url</th>\n",
" <th>repository_metadata.description</th>\n",
" <th>repository_metadata.type</th>\n",
" <th>repository_metadata.content_languages</th>\n",
" <th>system_metadata.date_modified</th>\n",
" <th>system_metadata.date_created</th>\n",
" <th>repository_metadata.content_subjects</th>\n",
" <th>repository_metadata.content_types</th>\n",
" <th>organization</th>\n",
" <th>policy_urls</th>\n",
" <th>repository_metadata.software</th>\n",
" <th>repository_metadata.oai_url</th>\n",
" <th>system_metadata.publicly_visible</th>\n",
" <th>repository_metadata.repository_status</th>\n",
" <th>repository_metadata.fulltext_record_count</th>\n",
" <th>repository_metadata.metadata_record_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>2147</td>\n",
" <td>5742</td>\n",
" <td>5421</td>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>5598</td>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>5742</td>\n",
" <td>4402</td>\n",
" <td>5742</td>\n",
" <td>5595</td>\n",
" <td>2.299000e+03</td>\n",
" <td>4.197000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>5742</td>\n",
" <td>5713</td>\n",
" <td>2107</td>\n",
" <td>5705</td>\n",
" <td>4619</td>\n",
" <td>4</td>\n",
" <td>330</td>\n",
" <td>2372</td>\n",
" <td>5573</td>\n",
" <td>821</td>\n",
" <td>477</td>\n",
" <td>5201</td>\n",
" <td>642</td>\n",
" <td>321</td>\n",
" <td>4370</td>\n",
" <td>1</td>\n",
" <td>7</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>175</td>\n",
" <td>{\"name\": \"hiroshima associated repository port...</td>\n",
" <td>[{'acronym': 'aura'}]</td>\n",
" <td>http://harp.lib.hiroshima-u.ac.jp/</td>\n",
" <td>this site provides access to the research outp...</td>\n",
" <td>institutional</td>\n",
" <td>[\"en\"]</td>\n",
" <td>2020-09-18 12:53:48</td>\n",
" <td>2020-09-18 12:53:48</td>\n",
" <td>[\"multidisciplinary\"]</td>\n",
" <td>[theses_and_dissertations]</td>\n",
" <td>[{'name': 'rijksuniversiteit groningen', 'alte...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
" <td>https://kidoks.bsz-bw.de/oai</td>\n",
" <td>yes</td>\n",
" <td>fully_functional</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>95</td>\n",
" <td>5096</td>\n",
" <td>1917</td>\n",
" <td>82</td>\n",
" <td>82</td>\n",
" <td>3227</td>\n",
" <td>465</td>\n",
" <td>26</td>\n",
" <td>5098</td>\n",
" <td>822</td>\n",
" <td>3</td>\n",
" <td>5742</td>\n",
" <td>5276</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.010186e+03</td>\n",
" <td>1.760546e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.206295e+04</td>\n",
" <td>6.600825e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>8.950000e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.220000e+02</td>\n",
" <td>4.026000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.930500e+03</td>\n",
" <td>1.630400e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.817531e+06</td>\n",
" <td>4.200000e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" system_metadata.id repository_metadata.name \\\n",
"count 5742 5742 \n",
"unique 5742 5713 \n",
"top 175 {\"name\": \"hiroshima associated repository port... \n",
"freq 1 3 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.alternativename \\\n",
"count 2147 \n",
"unique 2107 \n",
"top [{'acronym': 'aura'}] \n",
"freq 4 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.url \\\n",
"count 5742 \n",
"unique 5705 \n",
"top http://harp.lib.hiroshima-u.ac.jp/ \n",
"freq 3 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.description \\\n",
"count 5421 \n",
"unique 4619 \n",
"top this site provides access to the research outp... \n",
"freq 95 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.type repository_metadata.content_languages \\\n",
"count 5742 5742 \n",
"unique 4 330 \n",
"top institutional [\"en\"] \n",
"freq 5096 1917 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" system_metadata.date_modified system_metadata.date_created \\\n",
"count 5742 5742 \n",
"unique 2372 5573 \n",
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 \n",
"freq 82 82 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.content_subjects repository_metadata.content_types \\\n",
"count 5742 5598 \n",
"unique 821 477 \n",
"top [\"multidisciplinary\"] [theses_and_dissertations] \n",
"freq 3227 465 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" organization policy_urls \\\n",
"count 5742 5742 \n",
"unique 5201 642 \n",
"top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n",
"freq 26 5098 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.software repository_metadata.oai_url \\\n",
"count 5742 4402 \n",
"unique 321 4370 \n",
"top {\"name\": \"dspace\", \"version\": \"\"} https://kidoks.bsz-bw.de/oai \n",
"freq 822 3 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" system_metadata.publicly_visible repository_metadata.repository_status \\\n",
"count 5742 5595 \n",
"unique 1 7 \n",
"top yes fully_functional \n",
"freq 5742 5276 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.fulltext_record_count \\\n",
"count 2.299000e+03 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean 5.010186e+03 \n",
"std 4.206295e+04 \n",
"min 0.000000e+00 \n",
"25% 0.000000e+00 \n",
"50% 4.220000e+02 \n",
"75% 2.930500e+03 \n",
"max 1.817531e+06 \n",
"\n",
" repository_metadata.metadata_record_count \n",
"count 4.197000e+03 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean 1.760546e+05 \n",
"std 6.600825e+06 \n",
"min 0.000000e+00 \n",
"25% 8.950000e+02 \n",
"50% 4.026000e+03 \n",
"75% 1.630400e+04 \n",
"max 4.200000e+08 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every column, non-numeric ones included (include='all').\n",
"opendoar_df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"system_metadata.id 0\n",
"repository_metadata.name 0\n",
"repository_metadata.alternativename 3595\n",
"repository_metadata.url 0\n",
"repository_metadata.description 321\n",
"repository_metadata.type 0\n",
"repository_metadata.content_languages 0\n",
"system_metadata.date_modified 0\n",
"system_metadata.date_created 0\n",
"repository_metadata.content_subjects 0\n",
"repository_metadata.content_types 144\n",
"organization 0\n",
"policy_urls 0\n",
"repository_metadata.software 0\n",
"repository_metadata.oai_url 1340\n",
"system_metadata.publicly_visible 0\n",
"repository_metadata.repository_status 147\n",
"repository_metadata.fulltext_record_count 3443\n",
"repository_metadata.metadata_record_count 1545\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of missing (NaN) values per column.\n",
"opendoar_df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"repository_metadata.content_types\n",
"bibliographic_references 865\n",
"books_chapters_and_sections 2194\n",
"conference_and_workshop_papers 1981\n",
"datasets 401\n",
"journal_articles 4030\n",
"learning_objects 789\n",
"other_special_item_types 1759\n",
"patents 182\n",
"software 92\n",
"theses_and_dissertations 3319\n",
"unpub_reports_and_working_papers 1904\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"content_types_column = 'repository_metadata.content_types'\n",
"\n",
"# Put each list element on its own row, then count the rows per content type.\n",
"(\n",
"    opendoar_df[content_types_column]\n",
"    .explode()\n",
"    .to_frame()\n",
"    .groupby(content_types_column)\n",
"    .size()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}