registries_analysis/notebooks/01.3-exploration-opendoar.i...

925 lines
39 KiB
Plaintext
Raw Normal View History

2021-07-22 11:35:40 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"# Never truncate the column list when a wide DataFrame is displayed.\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>system_metadata.id</th>\n",
" <th>repository_metadata.name</th>\n",
" <th>repository_metadata.alternativename</th>\n",
" <th>repository_metadata.url</th>\n",
" <th>repository_metadata.description</th>\n",
" <th>repository_metadata.type</th>\n",
" <th>repository_metadata.content_languages</th>\n",
" <th>system_metadata.date_modified</th>\n",
" <th>system_metadata.date_created</th>\n",
" <th>repository_metadata.content_subjects</th>\n",
" <th>repository_metadata.content_types</th>\n",
" <th>organization</th>\n",
" <th>policy_urls</th>\n",
" <th>repository_metadata.software</th>\n",
" <th>repository_metadata.oai_url</th>\n",
" <th>system_metadata.publicly_visible</th>\n",
" <th>repository_metadata.repository_status</th>\n",
" <th>repository_metadata.fulltext_record_count</th>\n",
" <th>repository_metadata.metadata_record_count</th>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
2022-02-14 13:34:42 +01:00
" <td>134</td>\n",
" <td>{\"name\": \"eldorado - repository of the tu dort...</td>\n",
" <td>[{'name': 'eldorado - ressourcen aus und für l...</td>\n",
" <td>https://eldorado.tu-dortmund.de</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>2022-01-12 15:34:54</td>\n",
" <td>2005-12-19 14:57:52</td>\n",
" <td>[arts, humanities, science, mathematics, socia...</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'technische universität dortmund', '...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
" <td>https://eldorado.tu-dortmund.de/oai/request</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
2022-02-14 13:34:42 +01:00
" <td>9629.0</td>\n",
" <td>20963.0</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
2022-02-14 13:34:42 +01:00
" <td>58</td>\n",
" <td>{\"name\": \"archive ouverte en sciences de linfo...</td>\n",
" <td>[{'acronym': '@rchivesic'}]</td>\n",
" <td>https://archivesic.ccsd.cnrs.fr</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-01-13 12:48:32</td>\n",
" <td>[arts, science, technology, engineering, mathe...</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'centre pour la communication scient...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"hal\", \"version\": \"\"}</td>\n",
" <td>https://api.archives-ouvertes.fr/oai/archivesic</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
2022-02-14 13:34:42 +01:00
" <td>55492.0</td>\n",
" <td>1137498.0</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
2022-02-14 13:34:42 +01:00
" <td>93</td>\n",
" <td>{\"name\": \"digitalcommons@the texas medical cen...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>http://digitalcommons.library.tmc.edu/</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2022-02-14 13:34:42 +01:00
" <td>[]</td>\n",
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-02-14 11:16:12</td>\n",
" <td>[health and medicine]</td>\n",
" <td>[journal_articles, theses_and_dissertations]</td>\n",
" <td>[{'name': 'texas medical center', 'alternative...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"other\", \"version\": \"\"}</td>\n",
" <td>http://digitalcommons.library.tmc.edu/do/oai/</td>\n",
" <td>yes</td>\n",
2022-02-14 13:34:42 +01:00
" <td>NaN</td>\n",
" <td>2658.0</td>\n",
" <td>7268.0</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
2022-02-14 13:34:42 +01:00
" <td>68</td>\n",
" <td>{\"name\": \"cognitive sciences eprint archive\", ...</td>\n",
" <td>[{'acronym': 'cogprints'}]</td>\n",
" <td>http://cogprints.org/</td>\n",
" <td>NaN</td>\n",
" <td>disciplinary</td>\n",
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-01-04 15:01:23</td>\n",
" <td>[humanities, health and medicine, science, soc...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
2022-02-14 13:34:42 +01:00
" <td>[{'name': 'university of southampton', 'altern...</td>\n",
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>{\"name\": \"eprints\", \"version\": \"\"}</td>\n",
" <td>http://cogprints.org/cgi/oai2</td>\n",
" <td>yes</td>\n",
2022-02-14 13:34:42 +01:00
" <td>NaN</td>\n",
" <td>2895.0</td>\n",
" <td>4277.0</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
2022-02-14 13:34:42 +01:00
" <td>84</td>\n",
" <td>{\"name\": \"digital commons@carleton college\", \"...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>http://digitalcommons.carleton.edu/</td>\n",
" <td>NaN</td>\n",
" <td>institutional</td>\n",
" <td>[]</td>\n",
2022-02-14 13:34:42 +01:00
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-01-04 16:07:58</td>\n",
" <td>[humanities, science, social sciences]</td>\n",
" <td>[journal_articles, unpub_reports_and_working_p...</td>\n",
" <td>[{'name': 'carleton college', 'alternativeName...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"other\", \"version\": \"\"}</td>\n",
" <td>NaN</td>\n",
" <td>yes</td>\n",
2022-02-14 13:34:42 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>42.0</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" system_metadata.id repository_metadata.name \\\n",
2022-02-14 13:34:42 +01:00
"0 134 {\"name\": \"eldorado - repository of the tu dort... \n",
"1 58 {\"name\": \"archive ouverte en sciences de linfo... \n",
"2 93 {\"name\": \"digitalcommons@the texas medical cen... \n",
"3 68 {\"name\": \"cognitive sciences eprint archive\", ... \n",
"4 84 {\"name\": \"digital commons@carleton college\", \"... \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.alternativename \\\n",
"0 [{'name': 'eldorado - ressourcen aus und für l... \n",
"1 [{'acronym': '@rchivesic'}] \n",
"2 [] \n",
"3 [{'acronym': 'cogprints'}] \n",
"4 [] \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.url repository_metadata.description \\\n",
"0 https://eldorado.tu-dortmund.de NaN \n",
"1 https://archivesic.ccsd.cnrs.fr NaN \n",
"2 http://digitalcommons.library.tmc.edu/ NaN \n",
"3 http://cogprints.org/ NaN \n",
"4 http://digitalcommons.carleton.edu/ NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.type repository_metadata.content_languages \\\n",
"0 institutional [] \n",
"1 institutional [] \n",
"2 institutional [] \n",
"3 disciplinary [] \n",
"4 institutional [] \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" system_metadata.date_modified system_metadata.date_created \\\n",
"0 2022-01-12 15:34:54 2005-12-19 14:57:52 \n",
"1 2022-01-12 15:34:53 2006-01-13 12:48:32 \n",
"2 2022-01-12 15:34:53 2006-02-14 11:16:12 \n",
"3 2022-01-12 15:34:53 2006-01-04 15:01:23 \n",
"4 2022-01-12 15:34:53 2006-01-04 16:07:58 \n",
2021-07-22 11:35:40 +02:00
"\n",
" repository_metadata.content_subjects \\\n",
2022-02-14 13:34:42 +01:00
"0 [arts, humanities, science, mathematics, socia... \n",
"1 [arts, science, technology, engineering, mathe... \n",
"2 [health and medicine] \n",
"3 [humanities, health and medicine, science, soc... \n",
"4 [humanities, science, social sciences] \n",
2021-07-23 12:41:17 +02:00
"\n",
" repository_metadata.content_types \\\n",
2022-02-14 13:34:42 +01:00
"0 [journal_articles, conference_and_workshop_pap... \n",
"1 [journal_articles, conference_and_workshop_pap... \n",
"2 [journal_articles, theses_and_dissertations] \n",
2021-07-23 12:41:17 +02:00
"3 [journal_articles, conference_and_workshop_pap... \n",
2022-02-14 13:34:42 +01:00
"4 [journal_articles, unpub_reports_and_working_p... \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" organization policy_urls \\\n",
"0 [{'name': 'technische universität dortmund', '... [] \n",
"1 [{'name': 'centre pour la communication scient... [] \n",
"2 [{'name': 'texas medical center', 'alternative... [] \n",
"3 [{'name': 'university of southampton', 'altern... [] \n",
"4 [{'name': 'carleton college', 'alternativeName... [] \n",
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.software \\\n",
"0 {\"name\": \"dspace\", \"version\": \"\"} \n",
"1 {\"name\": \"hal\", \"version\": \"\"} \n",
"2 {\"name\": \"other\", \"version\": \"\"} \n",
"3 {\"name\": \"eprints\", \"version\": \"\"} \n",
"4 {\"name\": \"other\", \"version\": \"\"} \n",
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.oai_url \\\n",
"0 https://eldorado.tu-dortmund.de/oai/request \n",
"1 https://api.archives-ouvertes.fr/oai/archivesic \n",
"2 http://digitalcommons.library.tmc.edu/do/oai/ \n",
"3 http://cogprints.org/cgi/oai2 \n",
"4 NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" system_metadata.publicly_visible repository_metadata.repository_status \\\n",
"0 yes NaN \n",
"1 yes NaN \n",
"2 yes NaN \n",
"3 yes NaN \n",
"4 yes NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
" repository_metadata.fulltext_record_count \\\n",
2022-02-14 13:34:42 +01:00
"0 9629.0 \n",
"1 55492.0 \n",
"2 2658.0 \n",
"3 2895.0 \n",
"4 NaN \n",
"\n",
" repository_metadata.metadata_record_count \n",
2022-02-14 13:34:42 +01:00
"0 20963.0 \n",
"1 1137498.0 \n",
"2 7268.0 \n",
"3 4277.0 \n",
"4 42.0 "
2021-07-22 11:35:40 +02:00
]
},
"execution_count": 2,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# List-valued columns are stored as Python literals in the TSV; parse them back into lists.\n",
"# content_languages is parsed too, so its empty lists can be normalised to NaN like the other list columns.\n",
"# NOTE(review): policy_urls also looks list-valued but is left as raw text; confirm its serialisation before adding it.\n",
"opendoar_df = pd.read_csv(\n",
"    '../data/raw/openDoar.tsv',\n",
"    delimiter='\\t',\n",
"    converters={\n",
"        'repository_metadata.content_subjects': ast.literal_eval,\n",
"        'repository_metadata.alternativename': ast.literal_eval,\n",
"        'repository_metadata.content_types': ast.literal_eval,\n",
"        'repository_metadata.content_languages': ast.literal_eval,\n",
"        'organization': ast.literal_eval,\n",
"    },\n",
"    # Read identifiers as text instead of letting pandas infer a numeric dtype.\n",
"    dtype={'system_metadata.id': str},\n",
")\n",
"\n",
2021-07-22 11:35:40 +02:00
"# Preview the first rows of the loaded frame.\n",
"opendoar_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
2021-07-23 15:28:23 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['system_metadata.id', 'repository_metadata.name',\n",
" 'repository_metadata.alternativename', 'repository_metadata.url',\n",
" 'repository_metadata.description', 'repository_metadata.type',\n",
" 'repository_metadata.content_languages',\n",
" 'system_metadata.date_modified', 'system_metadata.date_created',\n",
" 'repository_metadata.content_subjects',\n",
" 'repository_metadata.content_types', 'organization', 'policy_urls',\n",
" 'repository_metadata.software', 'repository_metadata.oai_url',\n",
" 'system_metadata.publicly_visible',\n",
" 'repository_metadata.repository_status',\n",
" 'repository_metadata.fulltext_record_count',\n",
" 'repository_metadata.metadata_record_count'],\n",
2021-07-23 15:28:23 +02:00
" dtype='object')"
]
},
"execution_count": 3,
2021-07-23 15:28:23 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column names of the loaded frame, for reference.\n",
"opendoar_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
2021-07-23 15:28:23 +02:00
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
"    \"\"\"Return NaN for an empty list and every other value unchanged.\n",
"\n",
"    Args:\n",
"        cell: A single DataFrame cell of any type.\n",
"\n",
"    Returns:\n",
"        ``np.nan`` if ``cell`` is a list of length 0, otherwise ``cell`` itself\n",
"        (non-empty lists and non-list values pass through untouched).\n",
"    \"\"\"\n",
"    # Only genuine list objects are inspected; strings such as '[]' are not lists.\n",
"    if isinstance(cell, list) and len(cell) == 0:\n",
"        return np.nan\n",
"    return cell\n",
" \n",
"# Apply the helper to every cell: empty lists become NaN, everything else is kept as is.\n",
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 5,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>system_metadata.id</th>\n",
" <th>repository_metadata.name</th>\n",
" <th>repository_metadata.alternativename</th>\n",
" <th>repository_metadata.url</th>\n",
" <th>repository_metadata.description</th>\n",
" <th>repository_metadata.type</th>\n",
" <th>repository_metadata.content_languages</th>\n",
" <th>system_metadata.date_modified</th>\n",
" <th>system_metadata.date_created</th>\n",
" <th>repository_metadata.content_subjects</th>\n",
" <th>repository_metadata.content_types</th>\n",
" <th>organization</th>\n",
" <th>policy_urls</th>\n",
" <th>repository_metadata.software</th>\n",
" <th>repository_metadata.oai_url</th>\n",
" <th>system_metadata.publicly_visible</th>\n",
" <th>repository_metadata.repository_status</th>\n",
" <th>repository_metadata.fulltext_record_count</th>\n",
" <th>repository_metadata.metadata_record_count</th>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
2022-02-14 13:34:42 +01:00
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>2155</td>\n",
" <td>5810</td>\n",
" <td>0.0</td>\n",
" <td>5810</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>5644</td>\n",
" <td>5667</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>4447</td>\n",
" <td>5811</td>\n",
" <td>0.0</td>\n",
" <td>2.292000e+03</td>\n",
" <td>4.184000e+03</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
2022-02-14 13:34:42 +01:00
" <td>5811</td>\n",
" <td>5780</td>\n",
" <td>2115</td>\n",
" <td>5772</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>4</td>\n",
" <td>1</td>\n",
2022-02-14 13:34:42 +01:00
" <td>171</td>\n",
" <td>5643</td>\n",
" <td>236</td>\n",
" <td>476</td>\n",
" <td>5212</td>\n",
" <td>678</td>\n",
" <td>32</td>\n",
" <td>4415</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
2022-02-14 13:34:42 +01:00
" <td>134</td>\n",
" <td>{\"name\": \"arch\", \"language\": \"en\"}</td>\n",
" <td>[{'acronym': 'aura'}]</td>\n",
2021-07-23 12:41:17 +02:00
" <td>http://harp.lib.hiroshima-u.ac.jp/</td>\n",
2022-02-14 13:34:42 +01:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>institutional</td>\n",
2022-02-14 13:34:42 +01:00
" <td>[]</td>\n",
" <td>2022-01-12 15:35:47</td>\n",
2021-07-23 12:41:17 +02:00
" <td>2020-09-18 12:53:48</td>\n",
2022-02-14 13:34:42 +01:00
" <td>[science, technology, engineering, mathematics...</td>\n",
2021-07-23 12:41:17 +02:00
" <td>[theses_and_dissertations]</td>\n",
" <td>[{'name': 'rijksuniversiteit groningen', 'alte...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
2022-02-14 13:34:42 +01:00
" <td>https://api.figshare.com/v2/oai</td>\n",
" <td>yes</td>\n",
2022-02-14 13:34:42 +01:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
2021-07-23 12:41:17 +02:00
" <td>1</td>\n",
2021-07-22 11:35:40 +02:00
" <td>3</td>\n",
2021-07-23 15:28:23 +02:00
" <td>4</td>\n",
2021-07-23 12:41:17 +02:00
" <td>3</td>\n",
2022-02-14 13:34:42 +01:00
" <td>NaN</td>\n",
" <td>5161</td>\n",
" <td>5811</td>\n",
" <td>73</td>\n",
" <td>81</td>\n",
" <td>3321</td>\n",
" <td>469</td>\n",
2021-07-22 11:35:40 +02:00
" <td>26</td>\n",
2022-02-14 13:34:42 +01:00
" <td>5131</td>\n",
" <td>2273</td>\n",
" <td>3</td>\n",
2022-02-14 13:34:42 +01:00
" <td>5811</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2022-02-14 13:34:42 +01:00
" <td>5.022890e+03</td>\n",
" <td>1.765556e+05</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2022-02-14 13:34:42 +01:00
" <td>4.212648e+04</td>\n",
" <td>6.611068e+06</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
2022-02-14 13:34:42 +01:00
" <td>8.937500e+02</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-23 12:41:17 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2022-02-14 13:34:42 +01:00
" <td>4.225000e+02</td>\n",
" <td>4.012500e+03</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
2021-07-23 12:41:17 +02:00
" <th>75%</th>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
2022-02-14 13:34:42 +01:00
" <td>2.931500e+03</td>\n",
" <td>1.629350e+04</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" <tr>\n",
2021-07-23 12:41:17 +02:00
" <th>max</th>\n",
2021-07-22 11:35:40 +02:00
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.817531e+06</td>\n",
" <td>4.200000e+08</td>\n",
2021-07-22 11:35:40 +02:00
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2022-02-14 13:34:42 +01:00
" system_metadata.id repository_metadata.name \\\n",
"count 5811 5811 \n",
"unique 5811 5780 \n",
"top 134 {\"name\": \"arch\", \"language\": \"en\"} \n",
"freq 1 3 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
" repository_metadata.alternativename \\\n",
2022-02-14 13:34:42 +01:00
"count 2155 \n",
"unique 2115 \n",
"top [{'acronym': 'aura'}] \n",
"freq 4 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.url repository_metadata.description \\\n",
"count 5810 0.0 \n",
"unique 5772 NaN \n",
"top http://harp.lib.hiroshima-u.ac.jp/ NaN \n",
"freq 3 NaN \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.type repository_metadata.content_languages \\\n",
2022-02-14 13:34:42 +01:00
"count 5810 5811 \n",
"unique 4 1 \n",
"top institutional [] \n",
"freq 5161 5811 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" system_metadata.date_modified system_metadata.date_created \\\n",
2022-02-14 13:34:42 +01:00
"count 5811 5811 \n",
"unique 171 5643 \n",
"top 2022-01-12 15:35:47 2020-09-18 12:53:48 \n",
"freq 73 81 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.content_subjects \\\n",
"count 5644 \n",
"unique 236 \n",
"top [science, technology, engineering, mathematics... \n",
"freq 3321 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.content_types \\\n",
"count 5667 \n",
"unique 476 \n",
"top [theses_and_dissertations] \n",
"freq 469 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
" organization policy_urls \\\n",
2022-02-14 13:34:42 +01:00
"count 5811 5811 \n",
"unique 5212 678 \n",
"top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n",
2022-02-14 13:34:42 +01:00
"freq 26 5131 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.software repository_metadata.oai_url \\\n",
"count 5811 4447 \n",
"unique 32 4415 \n",
"top {\"name\": \"dspace\", \"version\": \"\"} https://api.figshare.com/v2/oai \n",
"freq 2273 3 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" system_metadata.publicly_visible \\\n",
"count 5811 \n",
"unique 1 \n",
"top yes \n",
"freq 5811 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
2022-02-14 13:34:42 +01:00
" repository_metadata.repository_status \\\n",
"count 0.0 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
2021-07-23 12:41:17 +02:00
"\n",
" repository_metadata.fulltext_record_count \\\n",
2022-02-14 13:34:42 +01:00
"count 2.292000e+03 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
2022-02-14 13:34:42 +01:00
"mean 5.022890e+03 \n",
"std 4.212648e+04 \n",
"min 0.000000e+00 \n",
"25% 0.000000e+00 \n",
2022-02-14 13:34:42 +01:00
"50% 4.225000e+02 \n",
"75% 2.931500e+03 \n",
"max 1.817531e+06 \n",
"\n",
" repository_metadata.metadata_record_count \n",
2022-02-14 13:34:42 +01:00
"count 4.184000e+03 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
2022-02-14 13:34:42 +01:00
"mean 1.765556e+05 \n",
"std 6.611068e+06 \n",
"min 0.000000e+00 \n",
2022-02-14 13:34:42 +01:00
"25% 8.937500e+02 \n",
"50% 4.012500e+03 \n",
"75% 1.629350e+04 \n",
"max 4.200000e+08 "
2021-07-22 11:35:40 +02:00
]
},
"execution_count": 5,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-07-23 12:41:17 +02:00
"# include='all' summarises object and numeric columns together; statistics that do not apply to a column's dtype show as NaN.\n",
"opendoar_df.describe(include='all')"
2021-07-22 11:35:40 +02:00
]
},
{
"cell_type": "code",
"execution_count": 6,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"system_metadata.id 0\n",
"repository_metadata.name 0\n",
2022-02-14 13:34:42 +01:00
"repository_metadata.alternativename 3656\n",
"repository_metadata.url 1\n",
"repository_metadata.description 5811\n",
"repository_metadata.type 1\n",
"repository_metadata.content_languages 0\n",
"system_metadata.date_modified 0\n",
"system_metadata.date_created 0\n",
2022-02-14 13:34:42 +01:00
"repository_metadata.content_subjects 167\n",
"repository_metadata.content_types 144\n",
"organization 0\n",
"policy_urls 0\n",
"repository_metadata.software 0\n",
2022-02-14 13:34:42 +01:00
"repository_metadata.oai_url 1364\n",
"system_metadata.publicly_visible 0\n",
2022-02-14 13:34:42 +01:00
"repository_metadata.repository_status 5811\n",
"repository_metadata.fulltext_record_count 3519\n",
"repository_metadata.metadata_record_count 1627\n",
2021-07-23 12:41:17 +02:00
"dtype: int64"
2021-07-22 11:35:40 +02:00
]
},
"execution_count": 6,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
2021-07-23 12:41:17 +02:00
"# Missing values per column. Only real NaN cells are counted: columns left as raw text by read_csv may still hold strings such as '[]'.\n",
"opendoar_df.isna().sum()"
2021-07-22 11:35:40 +02:00
]
},
2021-07-23 15:28:23 +02:00
{
"cell_type": "code",
"execution_count": 7,
2021-07-23 15:28:23 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"repository_metadata.content_types\n",
2022-02-14 13:34:42 +01:00
"bibliographic_references 858\n",
"books_chapters_and_sections 2246\n",
"conference_and_workshop_papers 2037\n",
"datasets 427\n",
"journal_articles 4069\n",
"learning_objects 807\n",
"other_special_item_types 1800\n",
"patents 200\n",
"software 105\n",
"theses_and_dissertations 3377\n",
"unpub_reports_and_working_papers 1953\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Occurrences of each content type: explode puts one list element per row, then rows are counted per value.\n",
"(\n",
"    opendoar_df['repository_metadata.content_types']\n",
"    .explode()\n",
"    .to_frame()\n",
"    .groupby('repository_metadata.content_types')\n",
"    .size()\n",
")"
]
2021-07-23 15:28:23 +02:00
},
2021-07-22 11:35:40 +02:00
{
"cell_type": "code",
2021-07-23 12:41:17 +02:00
"execution_count": null,
2021-07-22 11:35:40 +02:00
"metadata": {},
2021-07-23 12:41:17 +02:00
"outputs": [],
"source": []
2021-07-22 11:35:40 +02:00
}
],
"metadata": {
"kernelspec": {
2022-02-14 13:34:42 +01:00
"display_name": "Python 3 (ipykernel)",
2021-07-22 11:35:40 +02:00
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}