registries_analysis/notebooks/01.2-exploration-opendoar.i...

787 lines
30 KiB
Plaintext
Raw Normal View History

2021-07-22 11:35:40 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading datasets"
]
},
{
"cell_type": "code",
2021-07-23 15:28:23 +02:00
"execution_count": 24,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2021-07-23 12:41:17 +02:00
" <th>openaire_id</th>\n",
2021-07-22 11:35:40 +02:00
" <th>opendoar_id</th>\n",
" <th>repository_name</th>\n",
2021-07-23 12:41:17 +02:00
" <th>additional_name</th>\n",
" <th>repository_url</th>\n",
" <th>description</th>\n",
2021-07-22 11:35:40 +02:00
" <th>type</th>\n",
2021-07-23 12:41:17 +02:00
" <th>update_date</th>\n",
" <th>start_date</th>\n",
2021-07-22 11:35:40 +02:00
" <th>subject</th>\n",
2021-07-23 12:41:17 +02:00
" <th>content_type</th>\n",
2021-07-22 11:35:40 +02:00
" <th>institution</th>\n",
2021-07-23 12:41:17 +02:00
" <th>metadata_policy</th>\n",
" <th>data_policy</th>\n",
" <th>submission_policy</th>\n",
" <th>content_policy</th>\n",
" <th>software</th>\n",
" <th>api</th>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2021-07-23 12:41:17 +02:00
" <td>opendoar____::38b3eff8baf56627478ec76a704e9b52</td>\n",
2021-07-22 11:35:40 +02:00
" <td>101</td>\n",
" <td>utrecht university repository</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[]</td>\n",
" <td>http://dspace.library.uu.nl</td>\n",
" <td>this site is a university repository providing...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2021-04-16 15:22:03</td>\n",
" <td>2006-01-13 12:55:13</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[multidisciplinary]</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[[university of utrecht, [universiteit utrecht...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>False</td>\n",
" <td>True</td>\n",
" <td>dspace</td>\n",
" <td>true</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2021-07-23 12:41:17 +02:00
" <td>opendoar____::2b44928ae11fb9384c4cf38708677c48</td>\n",
2021-07-22 11:35:40 +02:00
" <td>115</td>\n",
" <td>dspace at indian institute of management kozhi...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[dspace@iimk]</td>\n",
" <td>http://dspace.iimk.ac.in/</td>\n",
" <td>this site is a subject based university reposi...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2021-02-18 17:36:43</td>\n",
" <td>2006-01-04 11:54:34</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[ecology and environment, social sciences gene...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[[indian institute of management kozhikode, [i...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>dspace 4.1</td>\n",
" <td>true</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2021-07-23 12:41:17 +02:00
" <td>opendoar____::3416a75f4cea9109507cacd8e2f2aefc</td>\n",
2021-07-22 11:35:40 +02:00
" <td>41</td>\n",
" <td>caltech engineering and science online</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[]</td>\n",
" <td>http://calteches.library.caltech.edu/</td>\n",
" <td>the caltech archives holds approximately 220 c...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2021-02-18 17:36:28</td>\n",
" <td>2006-01-04 14:47:04</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[biology and biochemistry, chemistry and chemi...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[[california institute of technology, [caltech...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>eprints 3.1.3</td>\n",
" <td>true</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2021-07-23 12:41:17 +02:00
" <td>opendoar____::07e1cd7dca89a1678042477183b7ac3f</td>\n",
2021-07-22 11:35:40 +02:00
" <td>119</td>\n",
" <td>dcu online research access service</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[doras]</td>\n",
" <td>http://doras.dcu.ie/</td>\n",
" <td>this site is an institutional repository provi...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2021-02-18 17:36:44</td>\n",
" <td>2006-01-04 11:15:19</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[multidisciplinary]</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[[dublin city university, [dcu], ie, [], , htt...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>eprints 3.0.5</td>\n",
" <td>true</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2021-07-23 12:41:17 +02:00
" <td>opendoar____::d1f491a404d6854880943e5c3cd9ca25</td>\n",
2021-07-22 11:35:40 +02:00
" <td>129</td>\n",
" <td>earth-prints repository</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[]</td>\n",
" <td>http://www.earth-prints.org/</td>\n",
" <td>a subject based repository providing open acce...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>disciplinary</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2021-04-19 08:28:38</td>\n",
" <td>2006-01-30 16:43:11</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[earth and planetary sciences]</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[[istituto nazionale di geofisica e vulcanolog...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>True</td>\n",
" <td>dspace 5.8.1-snapshot</td>\n",
" <td>true</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2021-07-23 12:41:17 +02:00
" openaire_id opendoar_id \\\n",
"0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 \n",
"1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 \n",
"2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 \n",
"3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 \n",
"4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 \n",
"\n",
" repository_name additional_name \\\n",
"0 utrecht university repository [] \n",
"1 dspace at indian institute of management kozhi... [dspace@iimk] \n",
"2 caltech engineering and science online [] \n",
"3 dcu online research access service [doras] \n",
"4 earth-prints repository [] \n",
"\n",
" repository_url \\\n",
"0 http://dspace.library.uu.nl \n",
"1 http://dspace.iimk.ac.in/ \n",
"2 http://calteches.library.caltech.edu/ \n",
"3 http://doras.dcu.ie/ \n",
"4 http://www.earth-prints.org/ \n",
"\n",
" description type \\\n",
"0 this site is a university repository providing... institutional \n",
"1 this site is a subject based university reposi... institutional \n",
"2 the caltech archives holds approximately 220 c... institutional \n",
"3 this site is an institutional repository provi... institutional \n",
"4 a subject based repository providing open acce... disciplinary \n",
"\n",
" update_date start_date \\\n",
"0 2021-04-16 15:22:03 2006-01-13 12:55:13 \n",
"1 2021-02-18 17:36:43 2006-01-04 11:54:34 \n",
"2 2021-02-18 17:36:28 2006-01-04 14:47:04 \n",
"3 2021-02-18 17:36:44 2006-01-04 11:15:19 \n",
"4 2021-04-19 08:28:38 2006-01-30 16:43:11 \n",
2021-07-22 11:35:40 +02:00
"\n",
2021-07-23 12:41:17 +02:00
" subject \\\n",
"0 [multidisciplinary] \n",
"1 [ecology and environment, social sciences gene... \n",
"2 [biology and biochemistry, chemistry and chemi... \n",
"3 [multidisciplinary] \n",
"4 [earth and planetary sciences] \n",
"\n",
" content_type \\\n",
"0 [journal_articles, conference_and_workshop_pap... \n",
"1 [journal_articles, conference_and_workshop_pap... \n",
"2 [journal_articles, conference_and_workshop_pap... \n",
"3 [journal_articles, conference_and_workshop_pap... \n",
"4 [journal_articles, conference_and_workshop_pap... \n",
"\n",
" institution metadata_policy \\\n",
"0 [[university of utrecht, [universiteit utrecht... True \n",
"1 [[indian institute of management kozhikode, [i... True \n",
"2 [[california institute of technology, [caltech... True \n",
"3 [[dublin city university, [dcu], ie, [], , htt... True \n",
"4 [[istituto nazionale di geofisica e vulcanolog... True \n",
"\n",
" data_policy submission_policy content_policy software \\\n",
"0 True False True dspace \n",
"1 True True True dspace 4.1 \n",
"2 True True True eprints 3.1.3 \n",
"3 True True True eprints 3.0.5 \n",
"4 True True True dspace 5.8.1-snapshot \n",
"\n",
" api \n",
"0 true \n",
"1 true \n",
"2 true \n",
"3 true \n",
"4 true "
2021-07-22 11:35:40 +02:00
]
},
2021-07-23 15:28:23 +02:00
"execution_count": 24,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n",
" converters={'subject': ast.literal_eval,\n",
" 'additional_name': ast.literal_eval,\n",
" 'opendoar_id': ast.literal_eval,\n",
" 'content_type': ast.literal_eval,\n",
" 'institution': ast.literal_eval\n",
2021-07-23 12:41:17 +02:00
" })\n",
2021-07-22 11:35:40 +02:00
"opendoar_df.head()"
]
},
{
"cell_type": "code",
2021-07-23 15:28:23 +02:00
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
" 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
" 'subject', 'content_type', 'institution', 'metadata_policy',\n",
" 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
" 'api'],\n",
" dtype='object')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
" if isinstance(cell, list):\n",
" return np.nan if len(cell) == 0 else cell\n",
" else:\n",
" return cell\n",
" \n",
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 29,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
2021-07-23 12:41:17 +02:00
" <th>openaire_id</th>\n",
2021-07-22 11:35:40 +02:00
" <th>opendoar_id</th>\n",
" <th>repository_name</th>\n",
2021-07-23 12:41:17 +02:00
" <th>additional_name</th>\n",
" <th>repository_url</th>\n",
" <th>description</th>\n",
2021-07-22 11:35:40 +02:00
" <th>type</th>\n",
2021-07-23 12:41:17 +02:00
" <th>update_date</th>\n",
" <th>start_date</th>\n",
2021-07-22 11:35:40 +02:00
" <th>subject</th>\n",
2021-07-23 12:41:17 +02:00
" <th>content_type</th>\n",
2021-07-22 11:35:40 +02:00
" <th>institution</th>\n",
2021-07-23 12:41:17 +02:00
" <th>metadata_policy</th>\n",
" <th>data_policy</th>\n",
" <th>submission_policy</th>\n",
" <th>content_policy</th>\n",
" <th>software</th>\n",
" <th>api</th>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
2021-07-23 12:41:17 +02:00
" <td>5707</td>\n",
2021-07-22 11:35:40 +02:00
" <td>5707.000000</td>\n",
" <td>5707</td>\n",
2021-07-23 15:28:23 +02:00
" <td>2138</td>\n",
2021-07-22 11:35:40 +02:00
" <td>5707</td>\n",
2021-07-23 12:41:17 +02:00
" <td>5425</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
2021-07-23 15:28:23 +02:00
" <td>5542</td>\n",
" <td>5563</td>\n",
2021-07-23 12:41:17 +02:00
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
2021-07-22 11:35:40 +02:00
" <td>5707</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
2021-07-23 12:41:17 +02:00
" <td>5707</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>5670</td>\n",
2021-07-23 15:28:23 +02:00
" <td>2096</td>\n",
2021-07-23 12:41:17 +02:00
" <td>5670</td>\n",
" <td>4622</td>\n",
2021-07-22 11:35:40 +02:00
" <td>4</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2501</td>\n",
" <td>5538</td>\n",
2021-07-23 15:28:23 +02:00
" <td>819</td>\n",
" <td>476</td>\n",
2021-07-22 11:35:40 +02:00
" <td>5098</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>321</td>\n",
" <td>2</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
2021-07-23 12:41:17 +02:00
" <td>opendoar____::3cf166c6b73f030b4f67eeaeba301103</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>hiroshima associated repository portal</td>\n",
" <td>[]</td>\n",
" <td>http://harp.lib.hiroshima-u.ac.jp/</td>\n",
" <td>this site provides access to the research outp...</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2020-09-18 12:53:48</td>\n",
" <td>2020-09-18 12:53:48</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[multidisciplinary]</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[theses_and_dissertations]</td>\n",
2021-07-22 11:35:40 +02:00
" <td>[[rijksuniversiteit groningen, [rug], nl, [], ...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>False</td>\n",
" <td>dspace</td>\n",
" <td>true</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
2021-07-23 12:41:17 +02:00
" <td>1</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>3</td>\n",
2021-07-23 15:28:23 +02:00
" <td>4</td>\n",
2021-07-23 12:41:17 +02:00
" <td>3</td>\n",
" <td>95</td>\n",
2021-07-22 11:35:40 +02:00
" <td>5067</td>\n",
2021-07-23 12:41:17 +02:00
" <td>82</td>\n",
" <td>82</td>\n",
2021-07-22 11:35:40 +02:00
" <td>3212</td>\n",
2021-07-23 12:41:17 +02:00
" <td>460</td>\n",
2021-07-22 11:35:40 +02:00
" <td>26</td>\n",
2021-07-23 12:41:17 +02:00
" <td>4116</td>\n",
" <td>4101</td>\n",
" <td>5016</td>\n",
" <td>4075</td>\n",
" <td>800</td>\n",
" <td>4374</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>4008.118801</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>2869.948770</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>2.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>1823.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>3361.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-23 12:41:17 +02:00
" <th>75%</th>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>5095.000000</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
2021-07-23 12:41:17 +02:00
" <th>max</th>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>10175.000000</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2021-07-23 12:41:17 +02:00
" openaire_id opendoar_id \\\n",
"count 5707 5707.000000 \n",
"unique 5707 NaN \n",
"top opendoar____::3cf166c6b73f030b4f67eeaeba301103 NaN \n",
"freq 1 NaN \n",
"mean NaN 4008.118801 \n",
"std NaN 2869.948770 \n",
"min NaN 2.000000 \n",
"25% NaN 1823.000000 \n",
"50% NaN 3361.000000 \n",
"75% NaN 5095.000000 \n",
"max NaN 10175.000000 \n",
"\n",
" repository_name additional_name \\\n",
2021-07-23 15:28:23 +02:00
"count 5707 2138 \n",
"unique 5670 2096 \n",
2021-07-23 12:41:17 +02:00
"top hiroshima associated repository portal [] \n",
2021-07-23 15:28:23 +02:00
"freq 3 4 \n",
2021-07-23 12:41:17 +02:00
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_url \\\n",
"count 5707 \n",
"unique 5670 \n",
"top http://harp.lib.hiroshima-u.ac.jp/ \n",
"freq 3 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" description type \\\n",
"count 5425 5707 \n",
"unique 4622 4 \n",
"top this site provides access to the research outp... institutional \n",
"freq 95 5067 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" update_date start_date subject \\\n",
2021-07-23 15:28:23 +02:00
"count 5707 5707 5542 \n",
"unique 2501 5538 819 \n",
2021-07-23 12:41:17 +02:00
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
"freq 82 82 3212 \n",
"mean NaN NaN NaN \n",
"std NaN NaN NaN \n",
"min NaN NaN NaN \n",
"25% NaN NaN NaN \n",
"50% NaN NaN NaN \n",
"75% NaN NaN NaN \n",
"max NaN NaN NaN \n",
"\n",
" content_type \\\n",
2021-07-23 15:28:23 +02:00
"count 5563 \n",
"unique 476 \n",
2021-07-23 12:41:17 +02:00
"top [theses_and_dissertations] \n",
"freq 460 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" institution metadata_policy \\\n",
"count 5707 5707 \n",
"unique 5098 2 \n",
"top [[rijksuniversiteit groningen, [rug], nl, [], ... False \n",
"freq 26 4116 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" data_policy submission_policy content_policy software api \n",
"count 5707 5707 5707 5707 5707 \n",
"unique 2 2 2 321 2 \n",
"top False False False dspace true \n",
"freq 4101 5016 4075 800 4374 \n",
"mean NaN NaN NaN NaN NaN \n",
"std NaN NaN NaN NaN NaN \n",
"min NaN NaN NaN NaN NaN \n",
"25% NaN NaN NaN NaN NaN \n",
"50% NaN NaN NaN NaN NaN \n",
"75% NaN NaN NaN NaN NaN \n",
"max NaN NaN NaN NaN NaN "
2021-07-22 11:35:40 +02:00
]
},
2021-07-23 15:28:23 +02:00
"execution_count": 29,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-07-23 12:41:17 +02:00
"opendoar_df.describe(include='all')"
2021-07-22 11:35:40 +02:00
]
},
{
"cell_type": "code",
2021-07-23 15:28:23 +02:00
"execution_count": 30,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
2021-07-23 15:28:23 +02:00
"openaire_id 0\n",
"opendoar_id 0\n",
"repository_name 0\n",
"additional_name 3569\n",
"repository_url 0\n",
"description 282\n",
"type 0\n",
"update_date 0\n",
"start_date 0\n",
"subject 165\n",
"content_type 144\n",
"institution 0\n",
"metadata_policy 0\n",
"data_policy 0\n",
"submission_policy 0\n",
"content_policy 0\n",
"software 0\n",
"api 0\n",
2021-07-23 12:41:17 +02:00
"dtype: int64"
2021-07-22 11:35:40 +02:00
]
},
2021-07-23 15:28:23 +02:00
"execution_count": 30,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-07-23 12:41:17 +02:00
"opendoar_df.isna().sum()"
2021-07-22 11:35:40 +02:00
]
},
2021-07-23 15:28:23 +02:00
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
2021-07-22 11:35:40 +02:00
{
"cell_type": "code",
2021-07-23 12:41:17 +02:00
"execution_count": null,
2021-07-22 11:35:40 +02:00
"metadata": {},
2021-07-23 12:41:17 +02:00
"outputs": [],
"source": []
2021-07-22 11:35:40 +02:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}