2021-07-22 11:35:40 +02:00
|
|
|
{
|
|
|
|
"cells": [
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 1,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"import ast\n",
|
|
|
|
"import csv\n",
|
|
|
|
"import json\n",
|
|
|
|
"\n",
|
|
|
|
"import numpy as np\n",
|
|
|
|
"import pandas as pd\n",
|
|
|
|
"\n",
|
|
|
|
"import plotly\n",
|
|
|
|
"from plotly.offline import iplot, init_notebook_mode\n",
|
|
|
|
"import plotly.graph_objs as go\n",
|
|
|
|
"import plotly.express as px\n",
|
|
|
|
"\n",
|
|
|
|
"pd.set_option('display.max_columns', None)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "markdown",
|
|
|
|
"metadata": {},
|
|
|
|
"source": [
|
|
|
|
"## Loading datasets"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-07-23 15:28:23 +02:00
|
|
|
"execution_count": 24,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>openaire_id</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>opendoar_id</th>\n",
|
|
|
|
" <th>repository_name</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>additional_name</th>\n",
|
|
|
|
" <th>repository_url</th>\n",
|
|
|
|
" <th>description</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>type</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>update_date</th>\n",
|
|
|
|
" <th>start_date</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>subject</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>content_type</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>institution</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>metadata_policy</th>\n",
|
|
|
|
" <th>data_policy</th>\n",
|
|
|
|
" <th>submission_policy</th>\n",
|
|
|
|
" <th>content_policy</th>\n",
|
|
|
|
" <th>software</th>\n",
|
|
|
|
" <th>api</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>0</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>opendoar____::38b3eff8baf56627478ec76a704e9b52</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>101</td>\n",
|
|
|
|
" <td>utrecht university repository</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>http://dspace.library.uu.nl</td>\n",
|
|
|
|
" <td>this site is a university repository providing...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2021-04-16 15:22:03</td>\n",
|
|
|
|
" <td>2006-01-13 12:55:13</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[multidisciplinary]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[[university of utrecht, [universiteit utrecht...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>dspace</td>\n",
|
|
|
|
" <td>true</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>1</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>opendoar____::2b44928ae11fb9384c4cf38708677c48</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>115</td>\n",
|
|
|
|
" <td>dspace at indian institute of management kozhi...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[dspace@iimk]</td>\n",
|
|
|
|
" <td>http://dspace.iimk.ac.in/</td>\n",
|
|
|
|
" <td>this site is a subject based university reposi...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2021-02-18 17:36:43</td>\n",
|
|
|
|
" <td>2006-01-04 11:54:34</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[ecology and environment, social sciences gene...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[[indian institute of management kozhikode, [i...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>dspace 4.1</td>\n",
|
|
|
|
" <td>true</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>2</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>opendoar____::3416a75f4cea9109507cacd8e2f2aefc</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>41</td>\n",
|
|
|
|
" <td>caltech engineering and science online</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>http://calteches.library.caltech.edu/</td>\n",
|
|
|
|
" <td>the caltech archives holds approximately 220 c...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2021-02-18 17:36:28</td>\n",
|
|
|
|
" <td>2006-01-04 14:47:04</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[biology and biochemistry, chemistry and chemi...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[[california institute of technology, [caltech...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>eprints 3.1.3</td>\n",
|
|
|
|
" <td>true</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>3</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>opendoar____::07e1cd7dca89a1678042477183b7ac3f</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>119</td>\n",
|
|
|
|
" <td>dcu online research access service</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[doras]</td>\n",
|
|
|
|
" <td>http://doras.dcu.ie/</td>\n",
|
|
|
|
" <td>this site is an institutional repository provi...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2021-02-18 17:36:44</td>\n",
|
|
|
|
" <td>2006-01-04 11:15:19</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[multidisciplinary]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[[dublin city university, [dcu], ie, [], , htt...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>eprints 3.0.5</td>\n",
|
|
|
|
" <td>true</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>4</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>opendoar____::d1f491a404d6854880943e5c3cd9ca25</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>129</td>\n",
|
|
|
|
" <td>earth-prints repository</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>http://www.earth-prints.org/</td>\n",
|
|
|
|
" <td>a subject based repository providing open acce...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>disciplinary</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2021-04-19 08:28:38</td>\n",
|
|
|
|
" <td>2006-01-30 16:43:11</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[earth and planetary sciences]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[[istituto nazionale di geofisica e vulcanolog...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>True</td>\n",
|
|
|
|
" <td>dspace 5.8.1-snapshot</td>\n",
|
|
|
|
" <td>true</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2021-07-23 12:41:17 +02:00
|
|
|
" openaire_id opendoar_id \\\n",
|
|
|
|
"0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 \n",
|
|
|
|
"1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 \n",
|
|
|
|
"2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 \n",
|
|
|
|
"3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 \n",
|
|
|
|
"4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 \n",
|
|
|
|
"\n",
|
|
|
|
" repository_name additional_name \\\n",
|
|
|
|
"0 utrecht university repository [] \n",
|
|
|
|
"1 dspace at indian institute of management kozhi... [dspace@iimk] \n",
|
|
|
|
"2 caltech engineering and science online [] \n",
|
|
|
|
"3 dcu online research access service [doras] \n",
|
|
|
|
"4 earth-prints repository [] \n",
|
|
|
|
"\n",
|
|
|
|
" repository_url \\\n",
|
|
|
|
"0 http://dspace.library.uu.nl \n",
|
|
|
|
"1 http://dspace.iimk.ac.in/ \n",
|
|
|
|
"2 http://calteches.library.caltech.edu/ \n",
|
|
|
|
"3 http://doras.dcu.ie/ \n",
|
|
|
|
"4 http://www.earth-prints.org/ \n",
|
|
|
|
"\n",
|
|
|
|
" description type \\\n",
|
|
|
|
"0 this site is a university repository providing... institutional \n",
|
|
|
|
"1 this site is a subject based university reposi... institutional \n",
|
|
|
|
"2 the caltech archives holds approximately 220 c... institutional \n",
|
|
|
|
"3 this site is an institutional repository provi... institutional \n",
|
|
|
|
"4 a subject based repository providing open acce... disciplinary \n",
|
|
|
|
"\n",
|
|
|
|
" update_date start_date \\\n",
|
|
|
|
"0 2021-04-16 15:22:03 2006-01-13 12:55:13 \n",
|
|
|
|
"1 2021-02-18 17:36:43 2006-01-04 11:54:34 \n",
|
|
|
|
"2 2021-02-18 17:36:28 2006-01-04 14:47:04 \n",
|
|
|
|
"3 2021-02-18 17:36:44 2006-01-04 11:15:19 \n",
|
|
|
|
"4 2021-04-19 08:28:38 2006-01-30 16:43:11 \n",
|
2021-07-22 11:35:40 +02:00
|
|
|
"\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" subject \\\n",
|
|
|
|
"0 [multidisciplinary] \n",
|
|
|
|
"1 [ecology and environment, social sciences gene... \n",
|
|
|
|
"2 [biology and biochemistry, chemistry and chemi... \n",
|
|
|
|
"3 [multidisciplinary] \n",
|
|
|
|
"4 [earth and planetary sciences] \n",
|
|
|
|
"\n",
|
|
|
|
" content_type \\\n",
|
|
|
|
"0 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"1 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"2 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"3 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"4 [journal_articles, conference_and_workshop_pap... \n",
|
|
|
|
"\n",
|
|
|
|
" institution metadata_policy \\\n",
|
|
|
|
"0 [[university of utrecht, [universiteit utrecht... True \n",
|
|
|
|
"1 [[indian institute of management kozhikode, [i... True \n",
|
|
|
|
"2 [[california institute of technology, [caltech... True \n",
|
|
|
|
"3 [[dublin city university, [dcu], ie, [], , htt... True \n",
|
|
|
|
"4 [[istituto nazionale di geofisica e vulcanolog... True \n",
|
|
|
|
"\n",
|
|
|
|
" data_policy submission_policy content_policy software \\\n",
|
|
|
|
"0 True False True dspace \n",
|
|
|
|
"1 True True True dspace 4.1 \n",
|
|
|
|
"2 True True True eprints 3.1.3 \n",
|
|
|
|
"3 True True True eprints 3.0.5 \n",
|
|
|
|
"4 True True True dspace 5.8.1-snapshot \n",
|
|
|
|
"\n",
|
|
|
|
" api \n",
|
|
|
|
"0 true \n",
|
|
|
|
"1 true \n",
|
|
|
|
"2 true \n",
|
|
|
|
"3 true \n",
|
|
|
|
"4 true "
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-07-23 15:28:23 +02:00
|
|
|
"execution_count": 24,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"opendoar_df = pd.read_csv(\n",
|
|
|
|
"    '../data/raw/openDoar.tsv',\n",
|
|
|
|
"    sep='\\t',\n",
|
|
|
|
"    # Every column named here is passed through ast.literal_eval while loading.\n",
|
|
|
|
"    converters=dict.fromkeys(\n",
|
|
|
|
"        ('subject', 'additional_name', 'opendoar_id', 'content_type', 'institution'),\n",
|
|
|
|
"        ast.literal_eval,\n",
|
|
|
|
"    ),\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
")\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
"opendoar_df.head()"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-07-23 15:28:23 +02:00
|
|
|
"execution_count": 25,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
|
|
|
"Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',\n",
|
|
|
|
" 'repository_url', 'description', 'type', 'update_date', 'start_date',\n",
|
|
|
|
" 'subject', 'content_type', 'institution', 'metadata_policy',\n",
|
|
|
|
" 'data_policy', 'submission_policy', 'content_policy', 'software',\n",
|
|
|
|
" 'api'],\n",
|
|
|
|
" dtype='object')"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
"execution_count": 25,
|
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
|
|
|
"opendoar_df.columns"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 28,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": [
|
|
|
|
"def empty_list_is_nan(cell):\n",
|
|
|
|
"    \"\"\"Map an empty list to NaN; return any other value unchanged.\"\"\"\n",
|
|
|
|
"    is_empty_list = isinstance(cell, list) and len(cell) == 0\n",
|
|
|
|
"    return np.nan if is_empty_list else cell\n",
|
|
|
|
"\n",
|
|
|
|
"\n",
|
|
|
|
"# Apply the helper to every cell of the frame.\n",
|
|
|
|
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
|
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": 29,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/html": [
|
|
|
|
"<div>\n",
|
|
|
|
"<style scoped>\n",
|
|
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
|
|
" vertical-align: middle;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe tbody tr th {\n",
|
|
|
|
" vertical-align: top;\n",
|
|
|
|
" }\n",
|
|
|
|
"\n",
|
|
|
|
" .dataframe thead th {\n",
|
|
|
|
" text-align: right;\n",
|
|
|
|
" }\n",
|
|
|
|
"</style>\n",
|
|
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
|
|
" <thead>\n",
|
|
|
|
" <tr style=\"text-align: right;\">\n",
|
|
|
|
" <th></th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>openaire_id</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>opendoar_id</th>\n",
|
|
|
|
" <th>repository_name</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>additional_name</th>\n",
|
|
|
|
" <th>repository_url</th>\n",
|
|
|
|
" <th>description</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>type</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>update_date</th>\n",
|
|
|
|
" <th>start_date</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>subject</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>content_type</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <th>institution</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>metadata_policy</th>\n",
|
|
|
|
" <th>data_policy</th>\n",
|
|
|
|
" <th>submission_policy</th>\n",
|
|
|
|
" <th>content_policy</th>\n",
|
|
|
|
" <th>software</th>\n",
|
|
|
|
" <th>api</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" </thead>\n",
|
|
|
|
" <tbody>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>count</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>5707</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>5707.000000</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" <td>2138</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>5707</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>5425</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" <td>5542</td>\n",
|
|
|
|
" <td>5563</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
|
|
|
" <td>5707</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>5707</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>unique</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>5707</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>5670</td>\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" <td>2096</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>5670</td>\n",
|
|
|
|
" <td>4622</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>4</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2501</td>\n",
|
|
|
|
" <td>5538</td>\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" <td>819</td>\n",
|
|
|
|
" <td>476</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>5098</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>2</td>\n",
|
|
|
|
" <td>321</td>\n",
|
|
|
|
" <td>2</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>top</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>opendoar____::3cf166c6b73f030b4f67eeaeba301103</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>hiroshima associated repository portal</td>\n",
|
|
|
|
" <td>[]</td>\n",
|
|
|
|
" <td>http://harp.lib.hiroshima-u.ac.jp/</td>\n",
|
|
|
|
" <td>this site provides access to the research outp...</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>institutional</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>2020-09-18 12:53:48</td>\n",
|
|
|
|
" <td>2020-09-18 12:53:48</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[multidisciplinary]</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>[theses_and_dissertations]</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>[[rijksuniversiteit groningen, [rug], nl, [], ...</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>False</td>\n",
|
|
|
|
" <td>dspace</td>\n",
|
|
|
|
" <td>true</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>freq</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>1</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>3</td>\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
" <td>4</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>3</td>\n",
|
|
|
|
" <td>95</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>5067</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>82</td>\n",
|
|
|
|
" <td>82</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>3212</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>460</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>26</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>4116</td>\n",
|
|
|
|
" <td>4101</td>\n",
|
|
|
|
" <td>5016</td>\n",
|
|
|
|
" <td>4075</td>\n",
|
|
|
|
" <td>800</td>\n",
|
|
|
|
" <td>4374</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>mean</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>4008.118801</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>std</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>2869.948770</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>min</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>2.000000</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>25%</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>1823.000000</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
|
|
|
" <th>50%</th>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>3361.000000</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>75%</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>5095.000000</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" <tr>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <th>max</th>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
" <td>10175.000000</td>\n",
|
2021-07-22 11:35:40 +02:00
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
" </tr>\n",
|
|
|
|
" </tbody>\n",
|
|
|
|
"</table>\n",
|
|
|
|
"</div>"
|
|
|
|
],
|
|
|
|
"text/plain": [
|
2021-07-23 12:41:17 +02:00
|
|
|
" openaire_id opendoar_id \\\n",
|
|
|
|
"count 5707 5707.000000 \n",
|
|
|
|
"unique 5707 NaN \n",
|
|
|
|
"top opendoar____::3cf166c6b73f030b4f67eeaeba301103 NaN \n",
|
|
|
|
"freq 1 NaN \n",
|
|
|
|
"mean NaN 4008.118801 \n",
|
|
|
|
"std NaN 2869.948770 \n",
|
|
|
|
"min NaN 2.000000 \n",
|
|
|
|
"25% NaN 1823.000000 \n",
|
|
|
|
"50% NaN 3361.000000 \n",
|
|
|
|
"75% NaN 5095.000000 \n",
|
|
|
|
"max NaN 10175.000000 \n",
|
|
|
|
"\n",
|
|
|
|
" repository_name additional_name \\\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
"count 5707 2138 \n",
|
|
|
|
"unique 5670 2096 \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"top hiroshima associated repository portal [] \n",
|
2021-07-23 15:28:23 +02:00
|
|
|
"freq 3 4 \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
|
|
|
"\n",
|
|
|
|
" repository_url \\\n",
|
|
|
|
"count 5707 \n",
|
|
|
|
"unique 5670 \n",
|
|
|
|
"top http://harp.lib.hiroshima-u.ac.jp/ \n",
|
|
|
|
"freq 3 \n",
|
|
|
|
"mean NaN \n",
|
|
|
|
"std NaN \n",
|
|
|
|
"min NaN \n",
|
|
|
|
"25% NaN \n",
|
|
|
|
"50% NaN \n",
|
|
|
|
"75% NaN \n",
|
|
|
|
"max NaN \n",
|
|
|
|
"\n",
|
|
|
|
" description type \\\n",
|
|
|
|
"count 5425 5707 \n",
|
|
|
|
"unique 4622 4 \n",
|
|
|
|
"top this site provides access to the research outp... institutional \n",
|
|
|
|
"freq 95 5067 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
|
|
|
"\n",
|
|
|
|
" update_date start_date subject \\\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
"count 5707 5707 5542 \n",
|
|
|
|
"unique 2501 5538 819 \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"top 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] \n",
|
|
|
|
"freq 82 82 3212 \n",
|
|
|
|
"mean NaN NaN NaN \n",
|
|
|
|
"std NaN NaN NaN \n",
|
|
|
|
"min NaN NaN NaN \n",
|
|
|
|
"25% NaN NaN NaN \n",
|
|
|
|
"50% NaN NaN NaN \n",
|
|
|
|
"75% NaN NaN NaN \n",
|
|
|
|
"max NaN NaN NaN \n",
|
|
|
|
"\n",
|
|
|
|
" content_type \\\n",
|
2021-07-23 15:28:23 +02:00
|
|
|
"count 5563 \n",
|
|
|
|
"unique 476 \n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"top [theses_and_dissertations] \n",
|
|
|
|
"freq 460 \n",
|
|
|
|
"mean NaN \n",
|
|
|
|
"std NaN \n",
|
|
|
|
"min NaN \n",
|
|
|
|
"25% NaN \n",
|
|
|
|
"50% NaN \n",
|
|
|
|
"75% NaN \n",
|
|
|
|
"max NaN \n",
|
|
|
|
"\n",
|
|
|
|
" institution metadata_policy \\\n",
|
|
|
|
"count 5707 5707 \n",
|
|
|
|
"unique 5098 2 \n",
|
|
|
|
"top [[rijksuniversiteit groningen, [rug], nl, [], ... False \n",
|
|
|
|
"freq 26 4116 \n",
|
|
|
|
"mean NaN NaN \n",
|
|
|
|
"std NaN NaN \n",
|
|
|
|
"min NaN NaN \n",
|
|
|
|
"25% NaN NaN \n",
|
|
|
|
"50% NaN NaN \n",
|
|
|
|
"75% NaN NaN \n",
|
|
|
|
"max NaN NaN \n",
|
|
|
|
"\n",
|
|
|
|
" data_policy submission_policy content_policy software api \n",
|
|
|
|
"count 5707 5707 5707 5707 5707 \n",
|
|
|
|
"unique 2 2 2 321 2 \n",
|
|
|
|
"top False False False dspace true \n",
|
|
|
|
"freq 4101 5016 4075 800 4374 \n",
|
|
|
|
"mean NaN NaN NaN NaN NaN \n",
|
|
|
|
"std NaN NaN NaN NaN NaN \n",
|
|
|
|
"min NaN NaN NaN NaN NaN \n",
|
|
|
|
"25% NaN NaN NaN NaN NaN \n",
|
|
|
|
"50% NaN NaN NaN NaN NaN \n",
|
|
|
|
"75% NaN NaN NaN NaN NaN \n",
|
|
|
|
"max NaN NaN NaN NaN NaN "
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-07-23 15:28:23 +02:00
|
|
|
"execution_count": 29,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2021-07-23 12:41:17 +02:00
|
|
|
"opendoar_df.describe(include='all')"
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-07-23 15:28:23 +02:00
|
|
|
"execution_count": 30,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"outputs": [
|
|
|
|
{
|
|
|
|
"data": {
|
|
|
|
"text/plain": [
|
2021-07-23 15:28:23 +02:00
|
|
|
"openaire_id 0\n",
|
|
|
|
"opendoar_id 0\n",
|
|
|
|
"repository_name 0\n",
|
|
|
|
"additional_name 3569\n",
|
|
|
|
"repository_url 0\n",
|
|
|
|
"description 282\n",
|
|
|
|
"type 0\n",
|
|
|
|
"update_date 0\n",
|
|
|
|
"start_date 0\n",
|
|
|
|
"subject 165\n",
|
|
|
|
"content_type 144\n",
|
|
|
|
"institution 0\n",
|
|
|
|
"metadata_policy 0\n",
|
|
|
|
"data_policy 0\n",
|
|
|
|
"submission_policy 0\n",
|
|
|
|
"content_policy 0\n",
|
|
|
|
"software 0\n",
|
|
|
|
"api 0\n",
|
2021-07-23 12:41:17 +02:00
|
|
|
"dtype: int64"
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-07-23 15:28:23 +02:00
|
|
|
"execution_count": 30,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
|
|
|
"output_type": "execute_result"
|
|
|
|
}
|
|
|
|
],
|
|
|
|
"source": [
|
2021-07-23 12:41:17 +02:00
|
|
|
"opendoar_df.isna().sum()"
|
2021-07-22 11:35:40 +02:00
|
|
|
]
|
|
|
|
},
|
2021-07-23 15:28:23 +02:00
|
|
|
{
|
|
|
|
"cell_type": "code",
|
|
|
|
"execution_count": null,
|
|
|
|
"metadata": {},
|
|
|
|
"outputs": [],
|
|
|
|
"source": []
|
|
|
|
},
|
2021-07-22 11:35:40 +02:00
|
|
|
{
|
|
|
|
"cell_type": "code",
|
2021-07-23 12:41:17 +02:00
|
|
|
"execution_count": null,
|
2021-07-22 11:35:40 +02:00
|
|
|
"metadata": {},
|
2021-07-23 12:41:17 +02:00
|
|
|
"outputs": [],
|
|
|
|
"source": []
|
2021-07-22 11:35:40 +02:00
|
|
|
}
|
|
|
|
],
|
|
|
|
"metadata": {
|
|
|
|
"kernelspec": {
|
|
|
|
"display_name": "Python 3",
|
|
|
|
"language": "python",
|
|
|
|
"name": "python3"
|
|
|
|
},
|
|
|
|
"language_info": {
|
|
|
|
"codemirror_mode": {
|
|
|
|
"name": "ipython",
|
|
|
|
"version": 3
|
|
|
|
},
|
|
|
|
"file_extension": ".py",
|
|
|
|
"mimetype": "text/x-python",
|
|
|
|
"name": "python",
|
|
|
|
"nbconvert_exporter": "python",
|
|
|
|
"pygments_lexer": "ipython3",
|
|
|
|
"version": "3.8.3"
|
|
|
|
}
|
|
|
|
},
|
|
|
|
"nbformat": 4,
|
|
|
|
"nbformat_minor": 4
|
|
|
|
}
|