{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ast\n", "import csv\n", "import json\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.graph_objs as go\n", "import plotly.express as px\n", "\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading datasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
system_metadata.idrepository_metadata.namerepository_metadata.alternativenamerepository_metadata.urlrepository_metadata.descriptionrepository_metadata.typerepository_metadata.content_languagessystem_metadata.date_modifiedsystem_metadata.date_createdrepository_metadata.content_subjectsrepository_metadata.content_typesorganizationpolicy_urlsrepository_metadata.softwarerepository_metadata.oai_urlsystem_metadata.publicly_visiblerepository_metadata.repository_statusrepository_metadata.fulltext_record_countrepository_metadata.metadata_record_count
0134{\"name\": \"eldorado - repository of the tu dort...[{'name': 'eldorado - ressourcen aus und für l...https://eldorado.tu-dortmund.deNaNinstitutional[]2022-01-12 15:34:542005-12-19 14:57:52[arts, humanities, science, mathematics, socia...[journal_articles, conference_and_workshop_pap...[{'name': 'technische universität dortmund', '...[]{\"name\": \"dspace\", \"version\": \"\"}https://eldorado.tu-dortmund.de/oai/requestyesNaN9629.020963.0
158{\"name\": \"archive ouverte en sciences de linfo...[{'acronym': '@rchivesic'}]https://archivesic.ccsd.cnrs.frNaNinstitutional[]2022-01-12 15:34:532006-01-13 12:48:32[arts, science, technology, engineering, mathe...[journal_articles, conference_and_workshop_pap...[{'name': 'centre pour la communication scient...[]{\"name\": \"hal\", \"version\": \"\"}https://api.archives-ouvertes.fr/oai/archivesicyesNaN55492.01137498.0
293{\"name\": \"digitalcommons@the texas medical cen...[]http://digitalcommons.library.tmc.edu/NaNinstitutional[]2022-01-12 15:34:532006-02-14 11:16:12[health and medicine][journal_articles, theses_and_dissertations][{'name': 'texas medical center', 'alternative...[]{\"name\": \"other\", \"version\": \"\"}http://digitalcommons.library.tmc.edu/do/oai/yesNaN2658.07268.0
368{\"name\": \"cognitive sciences eprint archive\", ...[{'acronym': 'cogprints'}]http://cogprints.org/NaNdisciplinary[]2022-01-12 15:34:532006-01-04 15:01:23[humanities, health and medicine, science, soc...[journal_articles, conference_and_workshop_pap...[{'name': 'university of southampton', 'altern...[]{\"name\": \"eprints\", \"version\": \"\"}http://cogprints.org/cgi/oai2yesNaN2895.04277.0
484{\"name\": \"digital commons@carleton college\", \"...[]http://digitalcommons.carleton.edu/NaNinstitutional[]2022-01-12 15:34:532006-01-04 16:07:58[humanities, science, social sciences][journal_articles, unpub_reports_and_working_p...[{'name': 'carleton college', 'alternativeName...[]{\"name\": \"other\", \"version\": \"\"}NaNyesNaNNaN42.0
\n", "
" ], "text/plain": [ " system_metadata.id repository_metadata.name \\\n", "0 134 {\"name\": \"eldorado - repository of the tu dort... \n", "1 58 {\"name\": \"archive ouverte en sciences de linfo... \n", "2 93 {\"name\": \"digitalcommons@the texas medical cen... \n", "3 68 {\"name\": \"cognitive sciences eprint archive\", ... \n", "4 84 {\"name\": \"digital commons@carleton college\", \"... \n", "\n", " repository_metadata.alternativename \\\n", "0 [{'name': 'eldorado - ressourcen aus und für l... \n", "1 [{'acronym': '@rchivesic'}] \n", "2 [] \n", "3 [{'acronym': 'cogprints'}] \n", "4 [] \n", "\n", " repository_metadata.url repository_metadata.description \\\n", "0 https://eldorado.tu-dortmund.de NaN \n", "1 https://archivesic.ccsd.cnrs.fr NaN \n", "2 http://digitalcommons.library.tmc.edu/ NaN \n", "3 http://cogprints.org/ NaN \n", "4 http://digitalcommons.carleton.edu/ NaN \n", "\n", " repository_metadata.type repository_metadata.content_languages \\\n", "0 institutional [] \n", "1 institutional [] \n", "2 institutional [] \n", "3 disciplinary [] \n", "4 institutional [] \n", "\n", " system_metadata.date_modified system_metadata.date_created \\\n", "0 2022-01-12 15:34:54 2005-12-19 14:57:52 \n", "1 2022-01-12 15:34:53 2006-01-13 12:48:32 \n", "2 2022-01-12 15:34:53 2006-02-14 11:16:12 \n", "3 2022-01-12 15:34:53 2006-01-04 15:01:23 \n", "4 2022-01-12 15:34:53 2006-01-04 16:07:58 \n", "\n", " repository_metadata.content_subjects \\\n", "0 [arts, humanities, science, mathematics, socia... \n", "1 [arts, science, technology, engineering, mathe... \n", "2 [health and medicine] \n", "3 [humanities, health and medicine, science, soc... \n", "4 [humanities, science, social sciences] \n", "\n", " repository_metadata.content_types \\\n", "0 [journal_articles, conference_and_workshop_pap... \n", "1 [journal_articles, conference_and_workshop_pap... \n", "2 [journal_articles, theses_and_dissertations] \n", "3 [journal_articles, conference_and_workshop_pap... 
\n", "4 [journal_articles, unpub_reports_and_working_p... \n", "\n", " organization policy_urls \\\n", "0 [{'name': 'technische universität dortmund', '... [] \n", "1 [{'name': 'centre pour la communication scient... [] \n", "2 [{'name': 'texas medical center', 'alternative... [] \n", "3 [{'name': 'university of southampton', 'altern... [] \n", "4 [{'name': 'carleton college', 'alternativeName... [] \n", "\n", " repository_metadata.software \\\n", "0 {\"name\": \"dspace\", \"version\": \"\"} \n", "1 {\"name\": \"hal\", \"version\": \"\"} \n", "2 {\"name\": \"other\", \"version\": \"\"} \n", "3 {\"name\": \"eprints\", \"version\": \"\"} \n", "4 {\"name\": \"other\", \"version\": \"\"} \n", "\n", " repository_metadata.oai_url \\\n", "0 https://eldorado.tu-dortmund.de/oai/request \n", "1 https://api.archives-ouvertes.fr/oai/archivesic \n", "2 http://digitalcommons.library.tmc.edu/do/oai/ \n", "3 http://cogprints.org/cgi/oai2 \n", "4 NaN \n", "\n", " system_metadata.publicly_visible repository_metadata.repository_status \\\n", "0 yes NaN \n", "1 yes NaN \n", "2 yes NaN \n", "3 yes NaN \n", "4 yes NaN \n", "\n", " repository_metadata.fulltext_record_count \\\n", "0 9629.0 \n", "1 55492.0 \n", "2 2658.0 \n", "3 2895.0 \n", "4 NaN \n", "\n", " repository_metadata.metadata_record_count \n", "0 20963.0 \n", "1 1137498.0 \n", "2 7268.0 \n", "3 4277.0 \n", "4 42.0 " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "opendoar_df = pd.read_csv(\n", "    '../data/raw/openDoar.tsv',\n", "    delimiter='\\t',\n", "    # These columns hold Python list literals; parse them into lists on load.\n", "    converters={\n", "        'repository_metadata.content_subjects': ast.literal_eval,\n", "        'repository_metadata.alternativename': ast.literal_eval,\n", "        'repository_metadata.content_types': ast.literal_eval,\n", "        # Parsed as well so that '[]' loads as an empty list, not as the\n", "        # two-character string '[]' that isna() cannot flag as missing.\n", "        'repository_metadata.content_languages': ast.literal_eval,\n", "        'organization': ast.literal_eval,\n", "    },\n", "    # NOTE(review): policy_urls also shows '[]' cells; confirm its format\n", "    # before parsing it the same way.\n", "    # Read the identifier column as text instead of an inferred number.\n", "    dtype={'system_metadata.id': str},\n", ")\n", "\n", "opendoar_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['system_metadata.id', 
'repository_metadata.name',\n", " 'repository_metadata.alternativename', 'repository_metadata.url',\n", " 'repository_metadata.description', 'repository_metadata.type',\n", " 'repository_metadata.content_languages',\n", " 'system_metadata.date_modified', 'system_metadata.date_created',\n", " 'repository_metadata.content_subjects',\n", " 'repository_metadata.content_types', 'organization', 'policy_urls',\n", " 'repository_metadata.software', 'repository_metadata.oai_url',\n", " 'system_metadata.publicly_visible',\n", " 'repository_metadata.repository_status',\n", " 'repository_metadata.fulltext_record_count',\n", " 'repository_metadata.metadata_record_count'],\n", " dtype='object')" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "opendoar_df.columns" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "def empty_list_is_nan(cell):\n", "    \"\"\"Return NaN for an empty list; return any other value unchanged.\"\"\"\n", "    if isinstance(cell, list) and not cell:\n", "        return np.nan\n", "    return cell\n", "\n", "\n", "# Apply the helper to every cell through Series.map on each column:\n", "# DataFrame.applymap is deprecated from pandas 2.1 onwards, while\n", "# Series.map is available on old and new pandas versions alike.\n", "opendoar_df = opendoar_df.apply(lambda column: column.map(empty_list_is_nan))" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
system_metadata.idrepository_metadata.namerepository_metadata.alternativenamerepository_metadata.urlrepository_metadata.descriptionrepository_metadata.typerepository_metadata.content_languagessystem_metadata.date_modifiedsystem_metadata.date_createdrepository_metadata.content_subjectsrepository_metadata.content_typesorganizationpolicy_urlsrepository_metadata.softwarerepository_metadata.oai_urlsystem_metadata.publicly_visiblerepository_metadata.repository_statusrepository_metadata.fulltext_record_countrepository_metadata.metadata_record_count
count58115811215558100.0581058115811581156445667581158115811444758110.02.292000e+034.184000e+03
unique5811578021155772NaN41171564323647652126783244151NaNNaNNaN
top134{\"name\": \"arch\", \"language\": \"en\"}[{'acronym': 'aura'}]http://harp.lib.hiroshima-u.ac.jp/NaNinstitutional[]2022-01-12 15:35:472020-09-18 12:53:48[science, technology, engineering, mathematics...[theses_and_dissertations][{'name': 'rijksuniversiteit groningen', 'alte...[]{\"name\": \"dspace\", \"version\": \"\"}https://api.figshare.com/v2/oaiyesNaNNaNNaN
freq1343NaN5161581173813321469265131227335811NaNNaNNaN
meanNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN5.022890e+031.765556e+05
stdNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN4.212648e+046.611068e+06
minNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.000000e+000.000000e+00
25%NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN0.000000e+008.937500e+02
50%NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN4.225000e+024.012500e+03
75%NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN2.931500e+031.629350e+04
maxNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN1.817531e+064.200000e+08
\n", "
" ], "text/plain": [ " system_metadata.id repository_metadata.name \\\n", "count 5811 5811 \n", "unique 5811 5780 \n", "top 134 {\"name\": \"arch\", \"language\": \"en\"} \n", "freq 1 3 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " repository_metadata.alternativename \\\n", "count 2155 \n", "unique 2115 \n", "top [{'acronym': 'aura'}] \n", "freq 4 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " repository_metadata.url repository_metadata.description \\\n", "count 5810 0.0 \n", "unique 5772 NaN \n", "top http://harp.lib.hiroshima-u.ac.jp/ NaN \n", "freq 3 NaN \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " repository_metadata.type repository_metadata.content_languages \\\n", "count 5810 5811 \n", "unique 4 1 \n", "top institutional [] \n", "freq 5161 5811 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " system_metadata.date_modified system_metadata.date_created \\\n", "count 5811 5811 \n", "unique 171 5643 \n", "top 2022-01-12 15:35:47 2020-09-18 12:53:48 \n", "freq 73 81 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " repository_metadata.content_subjects \\\n", "count 5644 \n", "unique 236 \n", "top [science, technology, engineering, mathematics... 
\n", "freq 3321 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " repository_metadata.content_types \\\n", "count 5667 \n", "unique 476 \n", "top [theses_and_dissertations] \n", "freq 469 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " organization policy_urls \\\n", "count 5811 5811 \n", "unique 5212 678 \n", "top [{'name': 'rijksuniversiteit groningen', 'alte... [] \n", "freq 26 5131 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " repository_metadata.software repository_metadata.oai_url \\\n", "count 5811 4447 \n", "unique 32 4415 \n", "top {\"name\": \"dspace\", \"version\": \"\"} https://api.figshare.com/v2/oai \n", "freq 2273 3 \n", "mean NaN NaN \n", "std NaN NaN \n", "min NaN NaN \n", "25% NaN NaN \n", "50% NaN NaN \n", "75% NaN NaN \n", "max NaN NaN \n", "\n", " system_metadata.publicly_visible \\\n", "count 5811 \n", "unique 1 \n", "top yes \n", "freq 5811 \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " repository_metadata.repository_status \\\n", "count 0.0 \n", "unique NaN \n", "top NaN \n", "freq NaN \n", "mean NaN \n", "std NaN \n", "min NaN \n", "25% NaN \n", "50% NaN \n", "75% NaN \n", "max NaN \n", "\n", " repository_metadata.fulltext_record_count \\\n", "count 2.292000e+03 \n", "unique NaN \n", "top NaN \n", "freq NaN \n", "mean 5.022890e+03 \n", "std 4.212648e+04 \n", "min 0.000000e+00 \n", "25% 0.000000e+00 \n", "50% 4.225000e+02 \n", "75% 2.931500e+03 \n", "max 1.817531e+06 \n", "\n", " repository_metadata.metadata_record_count \n", "count 4.184000e+03 \n", "unique NaN \n", "top NaN \n", "freq NaN \n", "mean 1.765556e+05 \n", "std 6.611068e+06 \n", "min 0.000000e+00 \n", "25% 8.937500e+02 \n", "50% 4.012500e+03 \n", "75% 1.629350e+04 \n", "max 4.200000e+08 " ] }, 
"execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Summary statistics for every column, numeric or not.\n", "opendoar_df.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "system_metadata.id 0\n", "repository_metadata.name 0\n", "repository_metadata.alternativename 3656\n", "repository_metadata.url 1\n", "repository_metadata.description 5811\n", "repository_metadata.type 1\n", "repository_metadata.content_languages 0\n", "system_metadata.date_modified 0\n", "system_metadata.date_created 0\n", "repository_metadata.content_subjects 167\n", "repository_metadata.content_types 144\n", "organization 0\n", "policy_urls 0\n", "repository_metadata.software 0\n", "repository_metadata.oai_url 1364\n", "system_metadata.publicly_visible 0\n", "repository_metadata.repository_status 5811\n", "repository_metadata.fulltext_record_count 3519\n", "repository_metadata.metadata_record_count 1627\n", "dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Number of missing values in each column.\n", "opendoar_df.isna().sum()" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "repository_metadata.content_types\n", "bibliographic_references 858\n", "books_chapters_and_sections 2246\n", "conference_and_workshop_papers 2037\n", "datasets 427\n", "journal_articles 4069\n", "learning_objects 807\n", "other_special_item_types 1800\n", "patents 200\n", "software 105\n", "theses_and_dissertations 3377\n", "unpub_reports_and_working_papers 1953\n", "dtype: int64" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Give every listed content type its own row, then count the rows per type.\n", "content_types_column = 'repository_metadata.content_types'\n", "exploded_content_types = pd.DataFrame(opendoar_df[content_types_column].explode())\n", "exploded_content_types.groupby(content_types_column).size()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", 
"name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }