initial commit

master
Andrea Mannocci 2 years ago
commit 1326d79e2c

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.2 KiB

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

@ -0,0 +1,918 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import requests\n",
"\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading datasets"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>system_metadata.id</th>\n",
" <th>repository_metadata.name</th>\n",
" <th>repository_metadata.alternativename</th>\n",
" <th>repository_metadata.url</th>\n",
" <th>repository_metadata.description</th>\n",
" <th>repository_metadata.type</th>\n",
" <th>repository_metadata.content_languages</th>\n",
" <th>system_metadata.date_modified</th>\n",
" <th>system_metadata.date_created</th>\n",
" <th>repository_metadata.content_subjects</th>\n",
" <th>repository_metadata.content_types</th>\n",
" <th>organization</th>\n",
" <th>policy_urls</th>\n",
" <th>repository_metadata.software</th>\n",
" <th>repository_metadata.oai_url</th>\n",
" <th>system_metadata.publicly_visible</th>\n",
" <th>repository_metadata.repository_status</th>\n",
" <th>repository_metadata.fulltext_record_count</th>\n",
" <th>repository_metadata.metadata_record_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>134</td>\n",
" <td>{'name': 'eldorado - repository of the tu dort...</td>\n",
" <td>[{'name': 'eldorado - ressourcen aus und für l...</td>\n",
" <td>https://eldorado.tu-dortmund.de</td>\n",
" <td>NaN</td>\n",
" <td>institutional</td>\n",
" <td>[]</td>\n",
" <td>2022-01-12 15:34:54</td>\n",
" <td>2005-12-19 14:57:52</td>\n",
" <td>[arts, humanities, science, mathematics, socia...</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'technische universität dortmund', '...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
" <td>https://eldorado.tu-dortmund.de/oai/request</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
" <td>9629.0</td>\n",
" <td>20963.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>58</td>\n",
" <td>{'name': 'archive ouverte en sciences de linfo...</td>\n",
" <td>[{'acronym': '@rchivesic'}]</td>\n",
" <td>https://archivesic.ccsd.cnrs.fr</td>\n",
" <td>NaN</td>\n",
" <td>institutional</td>\n",
" <td>[]</td>\n",
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-01-13 12:48:32</td>\n",
" <td>[arts, science, technology, engineering, mathe...</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'centre pour la communication scient...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"hal\", \"version\": \"\"}</td>\n",
" <td>https://api.archives-ouvertes.fr/oai/archivesic</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
" <td>55492.0</td>\n",
" <td>1137498.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>93</td>\n",
" <td>{'name': 'digitalcommons@the texas medical cen...</td>\n",
" <td>[]</td>\n",
" <td>http://digitalcommons.library.tmc.edu/</td>\n",
" <td>NaN</td>\n",
" <td>institutional</td>\n",
" <td>[]</td>\n",
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-02-14 11:16:12</td>\n",
" <td>[health and medicine]</td>\n",
" <td>[journal_articles, theses_and_dissertations]</td>\n",
" <td>[{'name': 'texas medical center', 'alternative...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"other\", \"version\": \"\"}</td>\n",
" <td>http://digitalcommons.library.tmc.edu/do/oai/</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
" <td>2658.0</td>\n",
" <td>7268.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>68</td>\n",
" <td>{'name': 'cognitive sciences eprint archive', ...</td>\n",
" <td>[{'acronym': 'cogprints'}]</td>\n",
" <td>http://cogprints.org/</td>\n",
" <td>NaN</td>\n",
" <td>disciplinary</td>\n",
" <td>[]</td>\n",
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-01-04 15:01:23</td>\n",
" <td>[humanities, health and medicine, science, soc...</td>\n",
" <td>[journal_articles, conference_and_workshop_pap...</td>\n",
" <td>[{'name': 'university of southampton', 'altern...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"eprints\", \"version\": \"\"}</td>\n",
" <td>http://cogprints.org/cgi/oai2</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
" <td>2895.0</td>\n",
" <td>4277.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>84</td>\n",
" <td>{'name': 'digital commons@carleton college', '...</td>\n",
" <td>[]</td>\n",
" <td>http://digitalcommons.carleton.edu/</td>\n",
" <td>NaN</td>\n",
" <td>institutional</td>\n",
" <td>[]</td>\n",
" <td>2022-01-12 15:34:53</td>\n",
" <td>2006-01-04 16:07:58</td>\n",
" <td>[humanities, science, social sciences]</td>\n",
" <td>[journal_articles, unpub_reports_and_working_p...</td>\n",
" <td>[{'name': 'carleton college', 'alternativeName...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"other\", \"version\": \"\"}</td>\n",
" <td>NaN</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>42.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" system_metadata.id repository_metadata.name \\\n",
"0 134 {'name': 'eldorado - repository of the tu dort... \n",
"1 58 {'name': 'archive ouverte en sciences de linfo... \n",
"2 93 {'name': 'digitalcommons@the texas medical cen... \n",
"3 68 {'name': 'cognitive sciences eprint archive', ... \n",
"4 84 {'name': 'digital commons@carleton college', '... \n",
"\n",
" repository_metadata.alternativename \\\n",
"0 [{'name': 'eldorado - ressourcen aus und für l... \n",
"1 [{'acronym': '@rchivesic'}] \n",
"2 [] \n",
"3 [{'acronym': 'cogprints'}] \n",
"4 [] \n",
"\n",
" repository_metadata.url repository_metadata.description \\\n",
"0 https://eldorado.tu-dortmund.de NaN \n",
"1 https://archivesic.ccsd.cnrs.fr NaN \n",
"2 http://digitalcommons.library.tmc.edu/ NaN \n",
"3 http://cogprints.org/ NaN \n",
"4 http://digitalcommons.carleton.edu/ NaN \n",
"\n",
" repository_metadata.type repository_metadata.content_languages \\\n",
"0 institutional [] \n",
"1 institutional [] \n",
"2 institutional [] \n",
"3 disciplinary [] \n",
"4 institutional [] \n",
"\n",
" system_metadata.date_modified system_metadata.date_created \\\n",
"0 2022-01-12 15:34:54 2005-12-19 14:57:52 \n",
"1 2022-01-12 15:34:53 2006-01-13 12:48:32 \n",
"2 2022-01-12 15:34:53 2006-02-14 11:16:12 \n",
"3 2022-01-12 15:34:53 2006-01-04 15:01:23 \n",
"4 2022-01-12 15:34:53 2006-01-04 16:07:58 \n",
"\n",
" repository_metadata.content_subjects \\\n",
"0 [arts, humanities, science, mathematics, socia... \n",
"1 [arts, science, technology, engineering, mathe... \n",
"2 [health and medicine] \n",
"3 [humanities, health and medicine, science, soc... \n",
"4 [humanities, science, social sciences] \n",
"\n",
" repository_metadata.content_types \\\n",
"0 [journal_articles, conference_and_workshop_pap... \n",
"1 [journal_articles, conference_and_workshop_pap... \n",
"2 [journal_articles, theses_and_dissertations] \n",
"3 [journal_articles, conference_and_workshop_pap... \n",
"4 [journal_articles, unpub_reports_and_working_p... \n",
"\n",
" organization policy_urls \\\n",
"0 [{'name': 'technische universität dortmund', '... [] \n",
"1 [{'name': 'centre pour la communication scient... [] \n",
"2 [{'name': 'texas medical center', 'alternative... [] \n",
"3 [{'name': 'university of southampton', 'altern... [] \n",
"4 [{'name': 'carleton college', 'alternativeName... [] \n",
"\n",
" repository_metadata.software \\\n",
"0 {\"name\": \"dspace\", \"version\": \"\"} \n",
"1 {\"name\": \"hal\", \"version\": \"\"} \n",
"2 {\"name\": \"other\", \"version\": \"\"} \n",
"3 {\"name\": \"eprints\", \"version\": \"\"} \n",
"4 {\"name\": \"other\", \"version\": \"\"} \n",
"\n",
" repository_metadata.oai_url \\\n",
"0 https://eldorado.tu-dortmund.de/oai/request \n",
"1 https://api.archives-ouvertes.fr/oai/archivesic \n",
"2 http://digitalcommons.library.tmc.edu/do/oai/ \n",
"3 http://cogprints.org/cgi/oai2 \n",
"4 NaN \n",
"\n",
" system_metadata.publicly_visible repository_metadata.repository_status \\\n",
"0 yes NaN \n",
"1 yes NaN \n",
"2 yes NaN \n",
"3 yes NaN \n",
"4 yes NaN \n",
"\n",
" repository_metadata.fulltext_record_count \\\n",
"0 9629.0 \n",
"1 55492.0 \n",
"2 2658.0 \n",
"3 2895.0 \n",
"4 NaN \n",
"\n",
" repository_metadata.metadata_record_count \n",
"0 20963.0 \n",
"1 1137498.0 \n",
"2 7268.0 \n",
"3 4277.0 \n",
"4 42.0 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n",
" converters={'repository_metadata.content_subjects': ast.literal_eval,\n",
" 'repository_metadata.name': ast.literal_eval,\n",
" 'repository_metadata.alternativename': ast.literal_eval,\n",
" 'repository_metadata.content_types': ast.literal_eval,\n",
" 'organization': ast.literal_eval\n",
" },\n",
" dtype={'system_metadata.id': str})\n",
"\n",
"opendoar_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['system_metadata.id', 'repository_metadata.name',\n",
" 'repository_metadata.alternativename', 'repository_metadata.url',\n",
" 'repository_metadata.description', 'repository_metadata.type',\n",
" 'repository_metadata.content_languages',\n",
" 'system_metadata.date_modified', 'system_metadata.date_created',\n",
" 'repository_metadata.content_subjects',\n",
" 'repository_metadata.content_types', 'organization', 'policy_urls',\n",
" 'repository_metadata.software', 'repository_metadata.oai_url',\n",
" 'system_metadata.publicly_visible',\n",
" 'repository_metadata.repository_status',\n",
" 'repository_metadata.fulltext_record_count',\n",
" 'repository_metadata.metadata_record_count'],\n",
" dtype='object')"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df.columns"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def empty_list_is_nan(cell):\n",
" if isinstance(cell, list):\n",
" return np.nan if len(cell) == 0 else cell\n",
" else:\n",
" return cell\n",
" \n",
"opendoar_df = opendoar_df.applymap(empty_list_is_nan)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>system_metadata.id</th>\n",
" <th>repository_metadata.name</th>\n",
" <th>repository_metadata.alternativename</th>\n",
" <th>repository_metadata.url</th>\n",
" <th>repository_metadata.description</th>\n",
" <th>repository_metadata.type</th>\n",
" <th>repository_metadata.content_languages</th>\n",
" <th>system_metadata.date_modified</th>\n",
" <th>system_metadata.date_created</th>\n",
" <th>repository_metadata.content_subjects</th>\n",
" <th>repository_metadata.content_types</th>\n",
" <th>organization</th>\n",
" <th>policy_urls</th>\n",
" <th>repository_metadata.software</th>\n",
" <th>repository_metadata.oai_url</th>\n",
" <th>system_metadata.publicly_visible</th>\n",
" <th>repository_metadata.repository_status</th>\n",
" <th>repository_metadata.fulltext_record_count</th>\n",
" <th>repository_metadata.metadata_record_count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>2155</td>\n",
" <td>5810</td>\n",
" <td>0.0</td>\n",
" <td>5810</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>5644</td>\n",
" <td>5667</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>5811</td>\n",
" <td>4447</td>\n",
" <td>5811</td>\n",
" <td>0.0</td>\n",
" <td>2.292000e+03</td>\n",
" <td>4.184000e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>5811</td>\n",
" <td>5780</td>\n",
" <td>2115</td>\n",
" <td>5772</td>\n",
" <td>NaN</td>\n",
" <td>4</td>\n",
" <td>1</td>\n",
" <td>171</td>\n",
" <td>5643</td>\n",
" <td>236</td>\n",
" <td>476</td>\n",
" <td>5212</td>\n",
" <td>678</td>\n",
" <td>32</td>\n",
" <td>4415</td>\n",
" <td>1</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>134</td>\n",
" <td>{'name': 'arch', 'language': 'en'}</td>\n",
" <td>[{'acronym': 'aura'}]</td>\n",
" <td>http://harp.lib.hiroshima-u.ac.jp/</td>\n",
" <td>NaN</td>\n",
" <td>institutional</td>\n",
" <td>[]</td>\n",
" <td>2022-01-12 15:35:47</td>\n",
" <td>2020-09-18 12:53:48</td>\n",
" <td>[science, technology, engineering, mathematics...</td>\n",
" <td>[theses_and_dissertations]</td>\n",
" <td>[{'name': 'rijksuniversiteit groningen', 'alte...</td>\n",
" <td>[]</td>\n",
" <td>{\"name\": \"dspace\", \"version\": \"\"}</td>\n",
" <td>https://api.figshare.com/v2/oai</td>\n",
" <td>yes</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>4</td>\n",
" <td>3</td>\n",
" <td>NaN</td>\n",
" <td>5161</td>\n",
" <td>5811</td>\n",
" <td>73</td>\n",
" <td>81</td>\n",
" <td>3321</td>\n",
" <td>469</td>\n",
" <td>26</td>\n",
" <td>5131</td>\n",
" <td>2273</td>\n",
" <td>3</td>\n",
" <td>5811</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>5.022890e+03</td>\n",
" <td>1.765556e+05</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.212648e+04</td>\n",
" <td>6.611068e+06</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>0.000000e+00</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>0.000000e+00</td>\n",
" <td>8.937500e+02</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>4.225000e+02</td>\n",
" <td>4.012500e+03</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>2.931500e+03</td>\n",
" <td>1.629350e+04</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>1.817531e+06</td>\n",
" <td>4.200000e+08</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" system_metadata.id repository_metadata.name \\\n",
"count 5811 5811 \n",
"unique 5811 5780 \n",
"top 134 {'name': 'arch', 'language': 'en'} \n",
"freq 1 3 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.alternativename \\\n",
"count 2155 \n",
"unique 2115 \n",
"top [{'acronym': 'aura'}] \n",
"freq 4 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.url repository_metadata.description \\\n",
"count 5810 0.0 \n",
"unique 5772 NaN \n",
"top http://harp.lib.hiroshima-u.ac.jp/ NaN \n",
"freq 3 NaN \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.type repository_metadata.content_languages \\\n",
"count 5810 5811 \n",
"unique 4 1 \n",
"top institutional [] \n",
"freq 5161 5811 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" system_metadata.date_modified system_metadata.date_created \\\n",
"count 5811 5811 \n",
"unique 171 5643 \n",
"top 2022-01-12 15:35:47 2020-09-18 12:53:48 \n",
"freq 73 81 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.content_subjects \\\n",
"count 5644 \n",
"unique 236 \n",
"top [science, technology, engineering, mathematics... \n",
"freq 3321 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.content_types \\\n",
"count 5667 \n",
"unique 476 \n",
"top [theses_and_dissertations] \n",
"freq 469 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" organization policy_urls \\\n",
"count 5811 5811 \n",
"unique 5212 678 \n",
"top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n",
"freq 26 5131 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" repository_metadata.software repository_metadata.oai_url \\\n",
"count 5811 4447 \n",
"unique 32 4415 \n",
"top {\"name\": \"dspace\", \"version\": \"\"} https://api.figshare.com/v2/oai \n",
"freq 2273 3 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN \n",
"\n",
" system_metadata.publicly_visible \\\n",
"count 5811 \n",
"unique 1 \n",
"top yes \n",
"freq 5811 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.repository_status \\\n",
"count 0.0 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN \n",
"\n",
" repository_metadata.fulltext_record_count \\\n",
"count 2.292000e+03 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean 5.022890e+03 \n",
"std 4.212648e+04 \n",
"min 0.000000e+00 \n",
"25% 0.000000e+00 \n",
"50% 4.225000e+02 \n",
"75% 2.931500e+03 \n",
"max 1.817531e+06 \n",
"\n",
" repository_metadata.metadata_record_count \n",
"count 4.184000e+03 \n",
"unique NaN \n",
"top NaN \n",
"freq NaN \n",
"mean 1.765556e+05 \n",
"std 6.611068e+06 \n",
"min 0.000000e+00 \n",
"25% 8.937500e+02 \n",
"50% 4.012500e+03 \n",
"75% 1.629350e+04 \n",
"max 4.200000e+08 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"system_metadata.id 0\n",
"repository_metadata.name 0\n",
"repository_metadata.alternativename 3656\n",
"repository_metadata.url 1\n",
"repository_metadata.description 5811\n",
"repository_metadata.type 1\n",
"repository_metadata.content_languages 0\n",
"system_metadata.date_modified 0\n",
"system_metadata.date_created 0\n",
"repository_metadata.content_subjects 167\n",
"repository_metadata.content_types 144\n",
"organization 0\n",
"policy_urls 0\n",
"repository_metadata.software 0\n",
"repository_metadata.oai_url 1364\n",
"system_metadata.publicly_visible 0\n",
"repository_metadata.repository_status 5811\n",
"repository_metadata.fulltext_record_count 3519\n",
"repository_metadata.metadata_record_count 1627\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"opendoar_df.isna().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"repository_metadata.content_types\n",
"bibliographic_references 858\n",
"books_chapters_and_sections 2246\n",
"conference_and_workshop_papers 2037\n",
"datasets 427\n",
"journal_articles 4069\n",
"learning_objects 807\n",
"other_special_item_types 1800\n",
"patents 200\n",
"software 105\n",
"theses_and_dissertations 3377\n",
"unpub_reports_and_working_papers 1953\n",
"dtype: int64"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.DataFrame(opendoar_df['repository_metadata.content_types'].explode()).groupby('repository_metadata.content_types').size()"
]
}
],
"metadata": {
"interpreter": {
"hash": "ccdc3acc266150d74575e7f25ef162b022ec22dae7e3244cf5a4f2ecbaf21c19"
},
"kernelspec": {
"display_name": "Python 3.9.12 ('data-science')",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long
Loading…
Cancel
Save