diff --git a/notebooks/01.1-exploration-re3data.ipynb b/notebooks/01.1-exploration-re3data.ipynb new file mode 100644 index 0000000..b79a854 --- /dev/null +++ b/notebooks/01.1-exploration-re3data.ipynb @@ -0,0 +1,1330 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import csv\n", + "import json\n", + "import reverse_geocoder as rg\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import pycountry_convert\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn2_circles\n", + "\n", + "import plotly\n", + "from plotly.offline import iplot, init_notebook_mode\n", + "import plotly.graph_objs as go\n", + "import plotly.express as px\n", + "\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**re3data**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
0r3d100000001Odum Institute Archive Dataverse[disciplinary][1 Humanities and Social Sciences, 111 Social ...[dataProvider][FAIR, Middle East, crime, demography, economy...[[Odum Institute for Research in Social Scienc...
1r3d100000002Access to Archival Databases[disciplinary][1 Humanities and Social Sciences, 102 History...[dataProvider][US History][[The U.S. National Archives and Records Admin...
2r3d100000004Datenbank Gesprochenes Deutsch[disciplinary][1 Humanities and Social Sciences, 104 Linguis...[dataProvider, serviceProvider][Australian German, FOLK, German dialects, Pfe...[[Institut für Deutsche Sprache, Archiv für Ge...
3r3d100000005UNC Dataverse[institutional][1 Humanities and Social Sciences, 111 Social ...[dataProvider, serviceProvider][FAIR, census, demographic survey, demography,...[[Odum Institute for Research in Social Scienc...
4r3d100000006Archaeology Data Service[disciplinary][1 Humanities and Social Sciences, 101 Ancient...[dataProvider, serviceProvider][FAIR, archaeology, cultural heritage, prehist...[[Arts and Humanities Research Council, [AHRC]...
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n", + "1 r3d100000002 Access to Archival Databases [disciplinary] \n", + "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n", + "3 r3d100000005 UNC Dataverse [institutional] \n", + "4 r3d100000006 Archaeology Data Service [disciplinary] \n", + "\n", + " subject \\\n", + "0 [1 Humanities and Social Sciences, 111 Social ... \n", + "1 [1 Humanities and Social Sciences, 102 History... \n", + "2 [1 Humanities and Social Sciences, 104 Linguis... \n", + "3 [1 Humanities and Social Sciences, 111 Social ... \n", + "4 [1 Humanities and Social Sciences, 101 Ancient... \n", + "\n", + " provider_type \\\n", + "0 [dataProvider] \n", + "1 [dataProvider] \n", + "2 [dataProvider, serviceProvider] \n", + "3 [dataProvider, serviceProvider] \n", + "4 [dataProvider, serviceProvider] \n", + "\n", + " keyword \\\n", + "0 [FAIR, Middle East, crime, demography, economy... \n", + "1 [US History] \n", + "2 [Australian German, FOLK, German dialects, Pfe... \n", + "3 [FAIR, census, demographic survey, demography,... \n", + "4 [FAIR, archaeology, cultural heritage, prehist... \n", + "\n", + " institution \n", + "0 [[Odum Institute for Research in Social Scienc... \n", + "1 [[The U.S. National Archives and Records Admin... \n", + "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n", + "3 [[Odum Institute for Research in Social Scienc... \n", + "4 [[Arts and Humanities Research Council, [AHRC]... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", + " converters={'subject': ast.literal_eval,\n", + " 'keyword': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'repository_id': ast.literal_eval,\n", + " 'type': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'provider_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n", + "re3data_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df = re3data_df.explode('provider_type')\n", + "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
count2467246724672467245924672467
unique2466246391282122482447
topr3d100011987Landmap[disciplinary][1 Humanities and Social Sciences, 2 Life Scie...dataProvider[multidisciplinary][[National Center for Biotechnology Informatio...
freq22157320024591816
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "count 2467 2467 2467 \n", + "unique 2466 2463 9 \n", + "top r3d100011987 Landmap [disciplinary] \n", + "freq 2 2 1573 \n", + "\n", + " subject provider_type \\\n", + "count 2467 2459 \n", + "unique 1282 1 \n", + "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n", + "freq 200 2459 \n", + "\n", + " keyword institution \n", + "count 2467 2467 \n", + "unique 2248 2447 \n", + "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n", + "freq 181 6 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**openDOAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
0101utrecht university repositoryinstitutional[multidisciplinary][[university of utrecht, [universiteit utrecht...
1115dspace at indian institute of management kozhi...institutional[ecology and environment, social sciences gene...[[indian institute of management kozhikode, [i...
241caltech engineering and science onlineinstitutional[biology and biochemistry, chemistry and chemi...[[california institute of technology, [caltech...
3119dcu online research access serviceinstitutional[multidisciplinary][[dublin city university, [dcu], ie, [], , htt...
4129earth-prints repositorydisciplinary[earth and planetary sciences][[istituto nazionale di geofisica e vulcanolog...
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name \\\n", + "0 101 utrecht university repository \n", + "1 115 dspace at indian institute of management kozhi... \n", + "2 41 caltech engineering and science online \n", + "3 119 dcu online research access service \n", + "4 129 earth-prints repository \n", + "\n", + " type subject \\\n", + "0 institutional [multidisciplinary] \n", + "1 institutional [ecology and environment, social sciences gene... \n", + "2 institutional [biology and biochemistry, chemistry and chemi... \n", + "3 institutional [multidisciplinary] \n", + "4 disciplinary [earth and planetary sciences] \n", + "\n", + " institution \n", + "0 [[university of utrecht, [universiteit utrecht... \n", + "1 [[indian institute of management kozhikode, [i... \n", + "2 [[california institute of technology, [caltech... \n", + "3 [[dublin city university, [dcu], ie, [], , htt... \n", + "4 [[istituto nazionale di geofisica e vulcanolog... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", + " converters={'subject': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'opendoar_id': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n", + "opendoar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
count5707.0000005707570757075707
uniqueNaN567048205098
topNaNarchinstitutional[multidisciplinary][[rijksuniversiteit groningen, [rug], nl, [], ...
freqNaN35067321226
mean4008.118801NaNNaNNaNNaN
std2869.948770NaNNaNNaNNaN
min2.000000NaNNaNNaNNaN
25%1823.000000NaNNaNNaNNaN
50%3361.000000NaNNaNNaNNaN
75%5095.000000NaNNaNNaNNaN
max10175.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name type subject \\\n", + "count 5707.000000 5707 5707 5707 \n", + "unique NaN 5670 4 820 \n", + "top NaN arch institutional [multidisciplinary] \n", + "freq NaN 3 5067 3212 \n", + "mean 4008.118801 NaN NaN NaN \n", + "std 2869.948770 NaN NaN NaN \n", + "min 2.000000 NaN NaN NaN \n", + "25% 1823.000000 NaN NaN NaN \n", + "50% 3361.000000 NaN NaN NaN \n", + "75% 5095.000000 NaN NaN NaN \n", + "max 10175.000000 NaN NaN NaN \n", + "\n", + " institution \n", + "count 5707 \n", + "unique 5098 \n", + "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n", + "freq 26 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ROAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
0921http://alcme.oclc.org/ndltd/index.htmlNetworked Digital Library of Theses and Disser...usNaN
11489http://prensahistorica.mcu.es/prensahistorica/...Virtual Library of Historical PressesNaN
2606http://hal.archives-ouvertes.fr/HAL: Hyper Article en LignefrNaN
3606NaNNaNNaNNaN
4606NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page \\\n", + "0 921 http://alcme.oclc.org/ndltd/index.html \n", + "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n", + "2 606 http://hal.archives-ouvertes.fr/ \n", + "3 606 NaN \n", + "4 606 NaN \n", + "\n", + " title location_country subjects \n", + "0 Networked Digital Library of Theses and Disser... us NaN \n", + "1 Virtual Library of Historical Press es NaN \n", + "2 HAL: Hyper Article en Ligne fr NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n", + " usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n", + "roar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
1062303NaNFaculty Scholarship at The Claremont CollegesusAS
1072303NaNNaNNaNBF
1082303NaNNaNNaNBL
1092303NaNNaNNaNCC
1102303NaNNaNNaNGN
1112303NaNNaNNaNH1
1122303NaNNaNNaNHB
1132303NaNNaNNaNJA
1142303NaNNaNNaNLB
1152303NaNNaNNaNNX
1162303NaNNaNNaNPQ
1172303NaNNaNNaNQA
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n", + "107 2303 NaN NaN \n", + "108 2303 NaN NaN \n", + "109 2303 NaN NaN \n", + "110 2303 NaN NaN \n", + "111 2303 NaN NaN \n", + "112 2303 NaN NaN \n", + "113 2303 NaN NaN \n", + "114 2303 NaN NaN \n", + "115 2303 NaN NaN \n", + "116 2303 NaN NaN \n", + "117 2303 NaN NaN \n", + "\n", + " location_country subjects \n", + "106 us AS \n", + "107 NaN BF \n", + "108 NaN BL \n", + "109 NaN CC \n", + "110 NaN GN \n", + "111 NaN H1 \n", + "112 NaN HB \n", + "113 NaN JA \n", + "114 NaN LB \n", + "115 NaN NX \n", + "116 NaN PQ \n", + "117 NaN QA " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df[roar_df.eprintid == 2303]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
count5314.0000005263526850241225
uniqueNaN51565027134123
topNaNhttp://ir.lib.isu.edu.tw/Repositorio InstitucionalusH1
freqNaN37877147
mean6389.464434NaNNaNNaNNaN
std5159.573937NaNNaNNaNNaN
min1.000000NaNNaNNaNNaN
25%1490.250000NaNNaNNaNNaN
50%4990.500000NaNNaNNaNNaN
75%10452.750000NaNNaNNaNNaN
max17302.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "count 5314.000000 5263 5268 \n", + "unique NaN 5156 5027 \n", + "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n", + "freq NaN 3 7 \n", + "mean 6389.464434 NaN NaN \n", + "std 5159.573937 NaN NaN \n", + "min 1.000000 NaN NaN \n", + "25% 1490.250000 NaN NaN \n", + "50% 4990.500000 NaN NaN \n", + "75% 10452.750000 NaN NaN \n", + "max 17302.000000 NaN NaN \n", + "\n", + " location_country subjects \n", + "count 5024 1225 \n", + "unique 134 123 \n", + "top us H1 \n", + "freq 877 147 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**FAIRsharing**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/European Union,Japan,United StatesBioinformatics,Data Management,Data Submission...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/JapanChemistry,Glycomics,Life Science,Organic Chemi...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/ChinaBiomedical Science,Genetics
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.eduUnited StatesLife Science
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/ChinaLife Science
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "0 GenBank GenBank \n", + "1 GlycoNAVI GlycoNAVI \n", + "2 ADHDgene ADHDgene \n", + "3 Allele frequency resource for research and tea... ALFRED \n", + "4 Animal Transcription Factor Database AnimalTFDB \n", + "\n", + " fs_url \\\n", + "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", + "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", + "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", + "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", + "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", + "\n", + " url \\\n", + "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", + "1 https://glyconavi.org/ \n", + "2 http://adhd.psych.ac.cn/ \n", + "3 http://alfred.med.yale.edu \n", + "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", + "\n", + " countries \\\n", + "0 European Union,Japan,United States \n", + "1 Japan \n", + "2 China \n", + "3 United States \n", + "4 China \n", + "\n", + " subjects \n", + "0 Bioinformatics,Data Management,Data Submission... \n", + "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", + "2 Biomedical Science,Genetics \n", + "3 Life Science \n", + "4 Life Science " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", + " delimiter='|', header=0,\n", + " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", + "fairsharing_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topCiteAbCGDhttps://fairsharing.org/10.25504/FAIRsharing.1...http://www.plexdb.org/United StatesLife Science
freq1311588367
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top CiteAb CGD \n", + "freq 1 3 \n", + "\n", + " fs_url \\\n", + "count 1752 \n", + "unique 1752 \n", + "top https://fairsharing.org/10.25504/FAIRsharing.1... \n", + "freq 1 \n", + "\n", + " url countries subjects \n", + "count 1752 1749 1690 \n", + "unique 1752 178 834 \n", + "top http://www.plexdb.org/ United States Life Science \n", + "freq 1 588 367 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df.describe(include='all')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb new file mode 100644 index 0000000..b79a854 --- /dev/null +++ b/notebooks/01.2-exploration-opendoar.ipynb @@ -0,0 +1,1330 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import csv\n", + "import json\n", + "import reverse_geocoder as rg\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import pycountry_convert\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn2_circles\n", + "\n", + "import plotly\n", + "from plotly.offline import iplot, init_notebook_mode\n", + "import plotly.graph_objs as go\n", + "import plotly.express as px\n", + "\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**re3data**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
0r3d100000001Odum Institute Archive Dataverse[disciplinary][1 Humanities and Social Sciences, 111 Social ...[dataProvider][FAIR, Middle East, crime, demography, economy...[[Odum Institute for Research in Social Scienc...
1r3d100000002Access to Archival Databases[disciplinary][1 Humanities and Social Sciences, 102 History...[dataProvider][US History][[The U.S. National Archives and Records Admin...
2r3d100000004Datenbank Gesprochenes Deutsch[disciplinary][1 Humanities and Social Sciences, 104 Linguis...[dataProvider, serviceProvider][Australian German, FOLK, German dialects, Pfe...[[Institut für Deutsche Sprache, Archiv für Ge...
3r3d100000005UNC Dataverse[institutional][1 Humanities and Social Sciences, 111 Social ...[dataProvider, serviceProvider][FAIR, census, demographic survey, demography,...[[Odum Institute for Research in Social Scienc...
4r3d100000006Archaeology Data Service[disciplinary][1 Humanities and Social Sciences, 101 Ancient...[dataProvider, serviceProvider][FAIR, archaeology, cultural heritage, prehist...[[Arts and Humanities Research Council, [AHRC]...
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n", + "1 r3d100000002 Access to Archival Databases [disciplinary] \n", + "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n", + "3 r3d100000005 UNC Dataverse [institutional] \n", + "4 r3d100000006 Archaeology Data Service [disciplinary] \n", + "\n", + " subject \\\n", + "0 [1 Humanities and Social Sciences, 111 Social ... \n", + "1 [1 Humanities and Social Sciences, 102 History... \n", + "2 [1 Humanities and Social Sciences, 104 Linguis... \n", + "3 [1 Humanities and Social Sciences, 111 Social ... \n", + "4 [1 Humanities and Social Sciences, 101 Ancient... \n", + "\n", + " provider_type \\\n", + "0 [dataProvider] \n", + "1 [dataProvider] \n", + "2 [dataProvider, serviceProvider] \n", + "3 [dataProvider, serviceProvider] \n", + "4 [dataProvider, serviceProvider] \n", + "\n", + " keyword \\\n", + "0 [FAIR, Middle East, crime, demography, economy... \n", + "1 [US History] \n", + "2 [Australian German, FOLK, German dialects, Pfe... \n", + "3 [FAIR, census, demographic survey, demography,... \n", + "4 [FAIR, archaeology, cultural heritage, prehist... \n", + "\n", + " institution \n", + "0 [[Odum Institute for Research in Social Scienc... \n", + "1 [[The U.S. National Archives and Records Admin... \n", + "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n", + "3 [[Odum Institute for Research in Social Scienc... \n", + "4 [[Arts and Humanities Research Council, [AHRC]... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", + " converters={'subject': ast.literal_eval,\n", + " 'keyword': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'repository_id': ast.literal_eval,\n", + " 'type': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'provider_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n", + "re3data_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df = re3data_df.explode('provider_type')\n", + "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
count2467246724672467245924672467
unique2466246391282122482447
topr3d100011987Landmap[disciplinary][1 Humanities and Social Sciences, 2 Life Scie...dataProvider[multidisciplinary][[National Center for Biotechnology Informatio...
freq22157320024591816
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "count 2467 2467 2467 \n", + "unique 2466 2463 9 \n", + "top r3d100011987 Landmap [disciplinary] \n", + "freq 2 2 1573 \n", + "\n", + " subject provider_type \\\n", + "count 2467 2459 \n", + "unique 1282 1 \n", + "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n", + "freq 200 2459 \n", + "\n", + " keyword institution \n", + "count 2467 2467 \n", + "unique 2248 2447 \n", + "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n", + "freq 181 6 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**openDOAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
0101utrecht university repositoryinstitutional[multidisciplinary][[university of utrecht, [universiteit utrecht...
1115dspace at indian institute of management kozhi...institutional[ecology and environment, social sciences gene...[[indian institute of management kozhikode, [i...
241caltech engineering and science onlineinstitutional[biology and biochemistry, chemistry and chemi...[[california institute of technology, [caltech...
3119dcu online research access serviceinstitutional[multidisciplinary][[dublin city university, [dcu], ie, [], , htt...
4129earth-prints repositorydisciplinary[earth and planetary sciences][[istituto nazionale di geofisica e vulcanolog...
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name \\\n", + "0 101 utrecht university repository \n", + "1 115 dspace at indian institute of management kozhi... \n", + "2 41 caltech engineering and science online \n", + "3 119 dcu online research access service \n", + "4 129 earth-prints repository \n", + "\n", + " type subject \\\n", + "0 institutional [multidisciplinary] \n", + "1 institutional [ecology and environment, social sciences gene... \n", + "2 institutional [biology and biochemistry, chemistry and chemi... \n", + "3 institutional [multidisciplinary] \n", + "4 disciplinary [earth and planetary sciences] \n", + "\n", + " institution \n", + "0 [[university of utrecht, [universiteit utrecht... \n", + "1 [[indian institute of management kozhikode, [i... \n", + "2 [[california institute of technology, [caltech... \n", + "3 [[dublin city university, [dcu], ie, [], , htt... \n", + "4 [[istituto nazionale di geofisica e vulcanolog... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", + " converters={'subject': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'opendoar_id': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n", + "opendoar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
count5707.0000005707570757075707
uniqueNaN567048205098
topNaNarchinstitutional[multidisciplinary][[rijksuniversiteit groningen, [rug], nl, [], ...
freqNaN35067321226
mean4008.118801NaNNaNNaNNaN
std2869.948770NaNNaNNaNNaN
min2.000000NaNNaNNaNNaN
25%1823.000000NaNNaNNaNNaN
50%3361.000000NaNNaNNaNNaN
75%5095.000000NaNNaNNaNNaN
max10175.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name type subject \\\n", + "count 5707.000000 5707 5707 5707 \n", + "unique NaN 5670 4 820 \n", + "top NaN arch institutional [multidisciplinary] \n", + "freq NaN 3 5067 3212 \n", + "mean 4008.118801 NaN NaN NaN \n", + "std 2869.948770 NaN NaN NaN \n", + "min 2.000000 NaN NaN NaN \n", + "25% 1823.000000 NaN NaN NaN \n", + "50% 3361.000000 NaN NaN NaN \n", + "75% 5095.000000 NaN NaN NaN \n", + "max 10175.000000 NaN NaN NaN \n", + "\n", + " institution \n", + "count 5707 \n", + "unique 5098 \n", + "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n", + "freq 26 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ROAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
0921http://alcme.oclc.org/ndltd/index.htmlNetworked Digital Library of Theses and Disser...usNaN
11489http://prensahistorica.mcu.es/prensahistorica/...Virtual Library of Historical PressesNaN
2606http://hal.archives-ouvertes.fr/HAL: Hyper Article en LignefrNaN
3606NaNNaNNaNNaN
4606NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page \\\n", + "0 921 http://alcme.oclc.org/ndltd/index.html \n", + "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n", + "2 606 http://hal.archives-ouvertes.fr/ \n", + "3 606 NaN \n", + "4 606 NaN \n", + "\n", + " title location_country subjects \n", + "0 Networked Digital Library of Theses and Disser... us NaN \n", + "1 Virtual Library of Historical Press es NaN \n", + "2 HAL: Hyper Article en Ligne fr NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n", + " usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n", + "roar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
1062303NaNFaculty Scholarship at The Claremont CollegesusAS
1072303NaNNaNNaNBF
1082303NaNNaNNaNBL
1092303NaNNaNNaNCC
1102303NaNNaNNaNGN
1112303NaNNaNNaNH1
1122303NaNNaNNaNHB
1132303NaNNaNNaNJA
1142303NaNNaNNaNLB
1152303NaNNaNNaNNX
1162303NaNNaNNaNPQ
1172303NaNNaNNaNQA
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n", + "107 2303 NaN NaN \n", + "108 2303 NaN NaN \n", + "109 2303 NaN NaN \n", + "110 2303 NaN NaN \n", + "111 2303 NaN NaN \n", + "112 2303 NaN NaN \n", + "113 2303 NaN NaN \n", + "114 2303 NaN NaN \n", + "115 2303 NaN NaN \n", + "116 2303 NaN NaN \n", + "117 2303 NaN NaN \n", + "\n", + " location_country subjects \n", + "106 us AS \n", + "107 NaN BF \n", + "108 NaN BL \n", + "109 NaN CC \n", + "110 NaN GN \n", + "111 NaN H1 \n", + "112 NaN HB \n", + "113 NaN JA \n", + "114 NaN LB \n", + "115 NaN NX \n", + "116 NaN PQ \n", + "117 NaN QA " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df[roar_df.eprintid == 2303]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
count5314.0000005263526850241225
uniqueNaN51565027134123
topNaNhttp://ir.lib.isu.edu.tw/Repositorio InstitucionalusH1
freqNaN37877147
mean6389.464434NaNNaNNaNNaN
std5159.573937NaNNaNNaNNaN
min1.000000NaNNaNNaNNaN
25%1490.250000NaNNaNNaNNaN
50%4990.500000NaNNaNNaNNaN
75%10452.750000NaNNaNNaNNaN
max17302.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "count 5314.000000 5263 5268 \n", + "unique NaN 5156 5027 \n", + "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n", + "freq NaN 3 7 \n", + "mean 6389.464434 NaN NaN \n", + "std 5159.573937 NaN NaN \n", + "min 1.000000 NaN NaN \n", + "25% 1490.250000 NaN NaN \n", + "50% 4990.500000 NaN NaN \n", + "75% 10452.750000 NaN NaN \n", + "max 17302.000000 NaN NaN \n", + "\n", + " location_country subjects \n", + "count 5024 1225 \n", + "unique 134 123 \n", + "top us H1 \n", + "freq 877 147 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**FAIRsharing**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/European Union,Japan,United StatesBioinformatics,Data Management,Data Submission...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/JapanChemistry,Glycomics,Life Science,Organic Chemi...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/ChinaBiomedical Science,Genetics
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.eduUnited StatesLife Science
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/ChinaLife Science
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "0 GenBank GenBank \n", + "1 GlycoNAVI GlycoNAVI \n", + "2 ADHDgene ADHDgene \n", + "3 Allele frequency resource for research and tea... ALFRED \n", + "4 Animal Transcription Factor Database AnimalTFDB \n", + "\n", + " fs_url \\\n", + "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", + "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", + "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", + "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", + "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", + "\n", + " url \\\n", + "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", + "1 https://glyconavi.org/ \n", + "2 http://adhd.psych.ac.cn/ \n", + "3 http://alfred.med.yale.edu \n", + "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", + "\n", + " countries \\\n", + "0 European Union,Japan,United States \n", + "1 Japan \n", + "2 China \n", + "3 United States \n", + "4 China \n", + "\n", + " subjects \n", + "0 Bioinformatics,Data Management,Data Submission... \n", + "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", + "2 Biomedical Science,Genetics \n", + "3 Life Science \n", + "4 Life Science " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", + " delimiter='|', header=0,\n", + " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", + "fairsharing_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topCiteAbCGDhttps://fairsharing.org/10.25504/FAIRsharing.1...http://www.plexdb.org/United StatesLife Science
freq1311588367
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top CiteAb CGD \n", + "freq 1 3 \n", + "\n", + " fs_url \\\n", + "count 1752 \n", + "unique 1752 \n", + "top https://fairsharing.org/10.25504/FAIRsharing.1... \n", + "freq 1 \n", + "\n", + " url countries subjects \n", + "count 1752 1749 1690 \n", + "unique 1752 178 834 \n", + "top http://www.plexdb.org/ United States Life Science \n", + "freq 1 588 367 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df.describe(include='all')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/01.3-exploration-roar.ipynb b/notebooks/01.3-exploration-roar.ipynb new file mode 100644 index 0000000..b79a854 --- /dev/null +++ b/notebooks/01.3-exploration-roar.ipynb @@ -0,0 +1,1330 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import csv\n", + "import json\n", + "import reverse_geocoder as rg\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import pycountry_convert\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn2_circles\n", + "\n", + "import plotly\n", + "from plotly.offline import iplot, init_notebook_mode\n", + "import plotly.graph_objs as go\n", + "import plotly.express as px\n", + "\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**re3data**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
0r3d100000001Odum Institute Archive Dataverse[disciplinary][1 Humanities and Social Sciences, 111 Social ...[dataProvider][FAIR, Middle East, crime, demography, economy...[[Odum Institute for Research in Social Scienc...
1r3d100000002Access to Archival Databases[disciplinary][1 Humanities and Social Sciences, 102 History...[dataProvider][US History][[The U.S. National Archives and Records Admin...
2r3d100000004Datenbank Gesprochenes Deutsch[disciplinary][1 Humanities and Social Sciences, 104 Linguis...[dataProvider, serviceProvider][Australian German, FOLK, German dialects, Pfe...[[Institut für Deutsche Sprache, Archiv für Ge...
3r3d100000005UNC Dataverse[institutional][1 Humanities and Social Sciences, 111 Social ...[dataProvider, serviceProvider][FAIR, census, demographic survey, demography,...[[Odum Institute for Research in Social Scienc...
4r3d100000006Archaeology Data Service[disciplinary][1 Humanities and Social Sciences, 101 Ancient...[dataProvider, serviceProvider][FAIR, archaeology, cultural heritage, prehist...[[Arts and Humanities Research Council, [AHRC]...
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n", + "1 r3d100000002 Access to Archival Databases [disciplinary] \n", + "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n", + "3 r3d100000005 UNC Dataverse [institutional] \n", + "4 r3d100000006 Archaeology Data Service [disciplinary] \n", + "\n", + " subject \\\n", + "0 [1 Humanities and Social Sciences, 111 Social ... \n", + "1 [1 Humanities and Social Sciences, 102 History... \n", + "2 [1 Humanities and Social Sciences, 104 Linguis... \n", + "3 [1 Humanities and Social Sciences, 111 Social ... \n", + "4 [1 Humanities and Social Sciences, 101 Ancient... \n", + "\n", + " provider_type \\\n", + "0 [dataProvider] \n", + "1 [dataProvider] \n", + "2 [dataProvider, serviceProvider] \n", + "3 [dataProvider, serviceProvider] \n", + "4 [dataProvider, serviceProvider] \n", + "\n", + " keyword \\\n", + "0 [FAIR, Middle East, crime, demography, economy... \n", + "1 [US History] \n", + "2 [Australian German, FOLK, German dialects, Pfe... \n", + "3 [FAIR, census, demographic survey, demography,... \n", + "4 [FAIR, archaeology, cultural heritage, prehist... \n", + "\n", + " institution \n", + "0 [[Odum Institute for Research in Social Scienc... \n", + "1 [[The U.S. National Archives and Records Admin... \n", + "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n", + "3 [[Odum Institute for Research in Social Scienc... \n", + "4 [[Arts and Humanities Research Council, [AHRC]... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", + " converters={'subject': ast.literal_eval,\n", + " 'keyword': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'repository_id': ast.literal_eval,\n", + " 'type': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'provider_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n", + "re3data_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df = re3data_df.explode('provider_type')\n", + "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
count2467246724672467245924672467
unique2466246391282122482447
topr3d100011987Landmap[disciplinary][1 Humanities and Social Sciences, 2 Life Scie...dataProvider[multidisciplinary][[National Center for Biotechnology Informatio...
freq22157320024591816
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "count 2467 2467 2467 \n", + "unique 2466 2463 9 \n", + "top r3d100011987 Landmap [disciplinary] \n", + "freq 2 2 1573 \n", + "\n", + " subject provider_type \\\n", + "count 2467 2459 \n", + "unique 1282 1 \n", + "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n", + "freq 200 2459 \n", + "\n", + " keyword institution \n", + "count 2467 2467 \n", + "unique 2248 2447 \n", + "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n", + "freq 181 6 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**openDOAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
0101utrecht university repositoryinstitutional[multidisciplinary][[university of utrecht, [universiteit utrecht...
1115dspace at indian institute of management kozhi...institutional[ecology and environment, social sciences gene...[[indian institute of management kozhikode, [i...
241caltech engineering and science onlineinstitutional[biology and biochemistry, chemistry and chemi...[[california institute of technology, [caltech...
3119dcu online research access serviceinstitutional[multidisciplinary][[dublin city university, [dcu], ie, [], , htt...
4129earth-prints repositorydisciplinary[earth and planetary sciences][[istituto nazionale di geofisica e vulcanolog...
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name \\\n", + "0 101 utrecht university repository \n", + "1 115 dspace at indian institute of management kozhi... \n", + "2 41 caltech engineering and science online \n", + "3 119 dcu online research access service \n", + "4 129 earth-prints repository \n", + "\n", + " type subject \\\n", + "0 institutional [multidisciplinary] \n", + "1 institutional [ecology and environment, social sciences gene... \n", + "2 institutional [biology and biochemistry, chemistry and chemi... \n", + "3 institutional [multidisciplinary] \n", + "4 disciplinary [earth and planetary sciences] \n", + "\n", + " institution \n", + "0 [[university of utrecht, [universiteit utrecht... \n", + "1 [[indian institute of management kozhikode, [i... \n", + "2 [[california institute of technology, [caltech... \n", + "3 [[dublin city university, [dcu], ie, [], , htt... \n", + "4 [[istituto nazionale di geofisica e vulcanolog... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", + " converters={'subject': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'opendoar_id': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n", + "opendoar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
count5707.0000005707570757075707
uniqueNaN567048205098
topNaNarchinstitutional[multidisciplinary][[rijksuniversiteit groningen, [rug], nl, [], ...
freqNaN35067321226
mean4008.118801NaNNaNNaNNaN
std2869.948770NaNNaNNaNNaN
min2.000000NaNNaNNaNNaN
25%1823.000000NaNNaNNaNNaN
50%3361.000000NaNNaNNaNNaN
75%5095.000000NaNNaNNaNNaN
max10175.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name type subject \\\n", + "count 5707.000000 5707 5707 5707 \n", + "unique NaN 5670 4 820 \n", + "top NaN arch institutional [multidisciplinary] \n", + "freq NaN 3 5067 3212 \n", + "mean 4008.118801 NaN NaN NaN \n", + "std 2869.948770 NaN NaN NaN \n", + "min 2.000000 NaN NaN NaN \n", + "25% 1823.000000 NaN NaN NaN \n", + "50% 3361.000000 NaN NaN NaN \n", + "75% 5095.000000 NaN NaN NaN \n", + "max 10175.000000 NaN NaN NaN \n", + "\n", + " institution \n", + "count 5707 \n", + "unique 5098 \n", + "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n", + "freq 26 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ROAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
0921http://alcme.oclc.org/ndltd/index.htmlNetworked Digital Library of Theses and Disser...usNaN
11489http://prensahistorica.mcu.es/prensahistorica/...Virtual Library of Historical PressesNaN
2606http://hal.archives-ouvertes.fr/HAL: Hyper Article en LignefrNaN
3606NaNNaNNaNNaN
4606NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page \\\n", + "0 921 http://alcme.oclc.org/ndltd/index.html \n", + "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n", + "2 606 http://hal.archives-ouvertes.fr/ \n", + "3 606 NaN \n", + "4 606 NaN \n", + "\n", + " title location_country subjects \n", + "0 Networked Digital Library of Theses and Disser... us NaN \n", + "1 Virtual Library of Historical Press es NaN \n", + "2 HAL: Hyper Article en Ligne fr NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n", + " usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n", + "roar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
1062303NaNFaculty Scholarship at The Claremont CollegesusAS
1072303NaNNaNNaNBF
1082303NaNNaNNaNBL
1092303NaNNaNNaNCC
1102303NaNNaNNaNGN
1112303NaNNaNNaNH1
1122303NaNNaNNaNHB
1132303NaNNaNNaNJA
1142303NaNNaNNaNLB
1152303NaNNaNNaNNX
1162303NaNNaNNaNPQ
1172303NaNNaNNaNQA
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n", + "107 2303 NaN NaN \n", + "108 2303 NaN NaN \n", + "109 2303 NaN NaN \n", + "110 2303 NaN NaN \n", + "111 2303 NaN NaN \n", + "112 2303 NaN NaN \n", + "113 2303 NaN NaN \n", + "114 2303 NaN NaN \n", + "115 2303 NaN NaN \n", + "116 2303 NaN NaN \n", + "117 2303 NaN NaN \n", + "\n", + " location_country subjects \n", + "106 us AS \n", + "107 NaN BF \n", + "108 NaN BL \n", + "109 NaN CC \n", + "110 NaN GN \n", + "111 NaN H1 \n", + "112 NaN HB \n", + "113 NaN JA \n", + "114 NaN LB \n", + "115 NaN NX \n", + "116 NaN PQ \n", + "117 NaN QA " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df[roar_df.eprintid == 2303]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
count5314.0000005263526850241225
uniqueNaN51565027134123
topNaNhttp://ir.lib.isu.edu.tw/Repositorio InstitucionalusH1
freqNaN37877147
mean6389.464434NaNNaNNaNNaN
std5159.573937NaNNaNNaNNaN
min1.000000NaNNaNNaNNaN
25%1490.250000NaNNaNNaNNaN
50%4990.500000NaNNaNNaNNaN
75%10452.750000NaNNaNNaNNaN
max17302.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "count 5314.000000 5263 5268 \n", + "unique NaN 5156 5027 \n", + "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n", + "freq NaN 3 7 \n", + "mean 6389.464434 NaN NaN \n", + "std 5159.573937 NaN NaN \n", + "min 1.000000 NaN NaN \n", + "25% 1490.250000 NaN NaN \n", + "50% 4990.500000 NaN NaN \n", + "75% 10452.750000 NaN NaN \n", + "max 17302.000000 NaN NaN \n", + "\n", + " location_country subjects \n", + "count 5024 1225 \n", + "unique 134 123 \n", + "top us H1 \n", + "freq 877 147 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**FAIRsharing**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/European Union,Japan,United StatesBioinformatics,Data Management,Data Submission...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/JapanChemistry,Glycomics,Life Science,Organic Chemi...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/ChinaBiomedical Science,Genetics
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.eduUnited StatesLife Science
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/ChinaLife Science
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "0 GenBank GenBank \n", + "1 GlycoNAVI GlycoNAVI \n", + "2 ADHDgene ADHDgene \n", + "3 Allele frequency resource for research and tea... ALFRED \n", + "4 Animal Transcription Factor Database AnimalTFDB \n", + "\n", + " fs_url \\\n", + "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", + "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", + "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", + "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", + "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", + "\n", + " url \\\n", + "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", + "1 https://glyconavi.org/ \n", + "2 http://adhd.psych.ac.cn/ \n", + "3 http://alfred.med.yale.edu \n", + "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", + "\n", + " countries \\\n", + "0 European Union,Japan,United States \n", + "1 Japan \n", + "2 China \n", + "3 United States \n", + "4 China \n", + "\n", + " subjects \n", + "0 Bioinformatics,Data Management,Data Submission... \n", + "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", + "2 Biomedical Science,Genetics \n", + "3 Life Science \n", + "4 Life Science " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", + " delimiter='|', header=0,\n", + " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", + "fairsharing_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topCiteAbCGDhttps://fairsharing.org/10.25504/FAIRsharing.1...http://www.plexdb.org/United StatesLife Science
freq1311588367
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top CiteAb CGD \n", + "freq 1 3 \n", + "\n", + " fs_url \\\n", + "count 1752 \n", + "unique 1752 \n", + "top https://fairsharing.org/10.25504/FAIRsharing.1... \n", + "freq 1 \n", + "\n", + " url countries subjects \n", + "count 1752 1749 1690 \n", + "unique 1752 178 834 \n", + "top http://www.plexdb.org/ United States Life Science \n", + "freq 1 588 367 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df.describe(include='all')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/01.4-exploration-fairsharing.ipynb b/notebooks/01.4-exploration-fairsharing.ipynb new file mode 100644 index 0000000..b79a854 --- /dev/null +++ b/notebooks/01.4-exploration-fairsharing.ipynb @@ -0,0 +1,1330 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import csv\n", + "import json\n", + "import reverse_geocoder as rg\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "import pycountry_convert\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn2_circles\n", + "\n", + "import plotly\n", + "from plotly.offline import iplot, init_notebook_mode\n", + "import plotly.graph_objs as go\n", + "import plotly.express as px\n", + "\n", + "pd.set_option('display.max_columns', None)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Loading datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**re3data**" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
0r3d100000001Odum Institute Archive Dataverse[disciplinary][1 Humanities and Social Sciences, 111 Social ...[dataProvider][FAIR, Middle East, crime, demography, economy...[[Odum Institute for Research in Social Scienc...
1r3d100000002Access to Archival Databases[disciplinary][1 Humanities and Social Sciences, 102 History...[dataProvider][US History][[The U.S. National Archives and Records Admin...
2r3d100000004Datenbank Gesprochenes Deutsch[disciplinary][1 Humanities and Social Sciences, 104 Linguis...[dataProvider, serviceProvider][Australian German, FOLK, German dialects, Pfe...[[Institut für Deutsche Sprache, Archiv für Ge...
3r3d100000005UNC Dataverse[institutional][1 Humanities and Social Sciences, 111 Social ...[dataProvider, serviceProvider][FAIR, census, demographic survey, demography,...[[Odum Institute for Research in Social Scienc...
4r3d100000006Archaeology Data Service[disciplinary][1 Humanities and Social Sciences, 101 Ancient...[dataProvider, serviceProvider][FAIR, archaeology, cultural heritage, prehist...[[Arts and Humanities Research Council, [AHRC]...
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n", + "1 r3d100000002 Access to Archival Databases [disciplinary] \n", + "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n", + "3 r3d100000005 UNC Dataverse [institutional] \n", + "4 r3d100000006 Archaeology Data Service [disciplinary] \n", + "\n", + " subject \\\n", + "0 [1 Humanities and Social Sciences, 111 Social ... \n", + "1 [1 Humanities and Social Sciences, 102 History... \n", + "2 [1 Humanities and Social Sciences, 104 Linguis... \n", + "3 [1 Humanities and Social Sciences, 111 Social ... \n", + "4 [1 Humanities and Social Sciences, 101 Ancient... \n", + "\n", + " provider_type \\\n", + "0 [dataProvider] \n", + "1 [dataProvider] \n", + "2 [dataProvider, serviceProvider] \n", + "3 [dataProvider, serviceProvider] \n", + "4 [dataProvider, serviceProvider] \n", + "\n", + " keyword \\\n", + "0 [FAIR, Middle East, crime, demography, economy... \n", + "1 [US History] \n", + "2 [Australian German, FOLK, German dialects, Pfe... \n", + "3 [FAIR, census, demographic survey, demography,... \n", + "4 [FAIR, archaeology, cultural heritage, prehist... \n", + "\n", + " institution \n", + "0 [[Odum Institute for Research in Social Scienc... \n", + "1 [[The U.S. National Archives and Records Admin... \n", + "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n", + "3 [[Odum Institute for Research in Social Scienc... \n", + "4 [[Arts and Humanities Research Council, [AHRC]... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n", + " converters={'subject': ast.literal_eval,\n", + " 'keyword': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'repository_id': ast.literal_eval,\n", + " 'type': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'provider_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n", + "re3data_df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**HERE I AM FILTERING SERVICE PROVIDERS OUT!!**" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df = re3data_df.explode('provider_type')\n", + "re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
re3data_idrepository_nametypesubjectprovider_typekeywordinstitution
count2467246724672467245924672467
unique2466246391282122482447
topr3d100011987Landmap[disciplinary][1 Humanities and Social Sciences, 2 Life Scie...dataProvider[multidisciplinary][[National Center for Biotechnology Informatio...
freq22157320024591816
\n", + "
" + ], + "text/plain": [ + " re3data_id repository_name type \\\n", + "count 2467 2467 2467 \n", + "unique 2466 2463 9 \n", + "top r3d100011987 Landmap [disciplinary] \n", + "freq 2 2 1573 \n", + "\n", + " subject provider_type \\\n", + "count 2467 2459 \n", + "unique 1282 1 \n", + "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n", + "freq 200 2459 \n", + "\n", + " keyword institution \n", + "count 2467 2467 \n", + "unique 2248 2447 \n", + "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n", + "freq 181 6 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**openDOAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
0101utrecht university repositoryinstitutional[multidisciplinary][[university of utrecht, [universiteit utrecht...
1115dspace at indian institute of management kozhi...institutional[ecology and environment, social sciences gene...[[indian institute of management kozhikode, [i...
241caltech engineering and science onlineinstitutional[biology and biochemistry, chemistry and chemi...[[california institute of technology, [caltech...
3119dcu online research access serviceinstitutional[multidisciplinary][[dublin city university, [dcu], ie, [], , htt...
4129earth-prints repositorydisciplinary[earth and planetary sciences][[istituto nazionale di geofisica e vulcanolog...
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name \\\n", + "0 101 utrecht university repository \n", + "1 115 dspace at indian institute of management kozhi... \n", + "2 41 caltech engineering and science online \n", + "3 119 dcu online research access service \n", + "4 129 earth-prints repository \n", + "\n", + " type subject \\\n", + "0 institutional [multidisciplinary] \n", + "1 institutional [ecology and environment, social sciences gene... \n", + "2 institutional [biology and biochemistry, chemistry and chemi... \n", + "3 institutional [multidisciplinary] \n", + "4 disciplinary [earth and planetary sciences] \n", + "\n", + " institution \n", + "0 [[university of utrecht, [universiteit utrecht... \n", + "1 [[indian institute of management kozhikode, [i... \n", + "2 [[california institute of technology, [caltech... \n", + "3 [[dublin city university, [dcu], ie, [], , htt... \n", + "4 [[istituto nazionale di geofisica e vulcanolog... " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n", + " converters={'subject': ast.literal_eval,\n", + " 'additional_name': ast.literal_eval,\n", + " 'opendoar_id': ast.literal_eval,\n", + " 'content_type': ast.literal_eval,\n", + " 'institution': ast.literal_eval\n", + " },\n", + " usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n", + "opendoar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
opendoar_idrepository_nametypesubjectinstitution
count5707.0000005707570757075707
uniqueNaN567048205098
topNaNarchinstitutional[multidisciplinary][[rijksuniversiteit groningen, [rug], nl, [], ...
freqNaN35067321226
mean4008.118801NaNNaNNaNNaN
std2869.948770NaNNaNNaNNaN
min2.000000NaNNaNNaNNaN
25%1823.000000NaNNaNNaNNaN
50%3361.000000NaNNaNNaNNaN
75%5095.000000NaNNaNNaNNaN
max10175.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " opendoar_id repository_name type subject \\\n", + "count 5707.000000 5707 5707 5707 \n", + "unique NaN 5670 4 820 \n", + "top NaN arch institutional [multidisciplinary] \n", + "freq NaN 3 5067 3212 \n", + "mean 4008.118801 NaN NaN NaN \n", + "std 2869.948770 NaN NaN NaN \n", + "min 2.000000 NaN NaN NaN \n", + "25% 1823.000000 NaN NaN NaN \n", + "50% 3361.000000 NaN NaN NaN \n", + "75% 5095.000000 NaN NaN NaN \n", + "max 10175.000000 NaN NaN NaN \n", + "\n", + " institution \n", + "count 5707 \n", + "unique 5098 \n", + "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n", + "freq 26 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**ROAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
0921http://alcme.oclc.org/ndltd/index.htmlNetworked Digital Library of Theses and Disser...usNaN
11489http://prensahistorica.mcu.es/prensahistorica/...Virtual Library of Historical PressesNaN
2606http://hal.archives-ouvertes.fr/HAL: Hyper Article en LignefrNaN
3606NaNNaNNaNNaN
4606NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page \\\n", + "0 921 http://alcme.oclc.org/ndltd/index.html \n", + "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n", + "2 606 http://hal.archives-ouvertes.fr/ \n", + "3 606 NaN \n", + "4 606 NaN \n", + "\n", + " title location_country subjects \n", + "0 Networked Digital Library of Theses and Disser... us NaN \n", + "1 Virtual Library of Historical Press es NaN \n", + "2 HAL: Hyper Article en Ligne fr NaN \n", + "3 NaN NaN NaN \n", + "4 NaN NaN NaN " + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n", + " usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n", + "roar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
1062303NaNFaculty Scholarship at The Claremont CollegesusAS
1072303NaNNaNNaNBF
1082303NaNNaNNaNBL
1092303NaNNaNNaNCC
1102303NaNNaNNaNGN
1112303NaNNaNNaNH1
1122303NaNNaNNaNHB
1132303NaNNaNNaNJA
1142303NaNNaNNaNLB
1152303NaNNaNNaNNX
1162303NaNNaNNaNPQ
1172303NaNNaNNaNQA
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n", + "107 2303 NaN NaN \n", + "108 2303 NaN NaN \n", + "109 2303 NaN NaN \n", + "110 2303 NaN NaN \n", + "111 2303 NaN NaN \n", + "112 2303 NaN NaN \n", + "113 2303 NaN NaN \n", + "114 2303 NaN NaN \n", + "115 2303 NaN NaN \n", + "116 2303 NaN NaN \n", + "117 2303 NaN NaN \n", + "\n", + " location_country subjects \n", + "106 us AS \n", + "107 NaN BF \n", + "108 NaN BL \n", + "109 NaN CC \n", + "110 NaN GN \n", + "111 NaN H1 \n", + "112 NaN HB \n", + "113 NaN JA \n", + "114 NaN LB \n", + "115 NaN NX \n", + "116 NaN PQ \n", + "117 NaN QA " + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df[roar_df.eprintid == 2303]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
eprintidhome_pagetitlelocation_countrysubjects
count5314.0000005263526850241225
uniqueNaN51565027134123
topNaNhttp://ir.lib.isu.edu.tw/Repositorio InstitucionalusH1
freqNaN37877147
mean6389.464434NaNNaNNaNNaN
std5159.573937NaNNaNNaNNaN
min1.000000NaNNaNNaNNaN
25%1490.250000NaNNaNNaNNaN
50%4990.500000NaNNaNNaNNaN
75%10452.750000NaNNaNNaNNaN
max17302.000000NaNNaNNaNNaN
\n", + "
" + ], + "text/plain": [ + " eprintid home_page title \\\n", + "count 5314.000000 5263 5268 \n", + "unique NaN 5156 5027 \n", + "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n", + "freq NaN 3 7 \n", + "mean 6389.464434 NaN NaN \n", + "std 5159.573937 NaN NaN \n", + "min 1.000000 NaN NaN \n", + "25% 1490.250000 NaN NaN \n", + "50% 4990.500000 NaN NaN \n", + "75% 10452.750000 NaN NaN \n", + "max 17302.000000 NaN NaN \n", + "\n", + " location_country subjects \n", + "count 5024 1225 \n", + "unique 134 123 \n", + "top us H1 \n", + "freq 877 147 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "roar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**FAIRsharing**" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/European Union,Japan,United StatesBioinformatics,Data Management,Data Submission...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/JapanChemistry,Glycomics,Life Science,Organic Chemi...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/ChinaBiomedical Science,Genetics
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.eduUnited StatesLife Science
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/ChinaLife Science
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "0 GenBank GenBank \n", + "1 GlycoNAVI GlycoNAVI \n", + "2 ADHDgene ADHDgene \n", + "3 Allele frequency resource for research and tea... ALFRED \n", + "4 Animal Transcription Factor Database AnimalTFDB \n", + "\n", + " fs_url \\\n", + "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", + "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", + "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", + "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", + "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", + "\n", + " url \\\n", + "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", + "1 https://glyconavi.org/ \n", + "2 http://adhd.psych.ac.cn/ \n", + "3 http://alfred.med.yale.edu \n", + "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", + "\n", + " countries \\\n", + "0 European Union,Japan,United States \n", + "1 Japan \n", + "2 China \n", + "3 United States \n", + "4 China \n", + "\n", + " subjects \n", + "0 Bioinformatics,Data Management,Data Submission... \n", + "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", + "2 Biomedical Science,Genetics \n", + "3 Life Science \n", + "4 Life Science " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", + " delimiter='|', header=0,\n", + " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", + "fairsharing_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topCiteAbCGDhttps://fairsharing.org/10.25504/FAIRsharing.1...http://www.plexdb.org/United StatesLife Science
freq1311588367
\n", + "
" + ], + "text/plain": [ + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top CiteAb CGD \n", + "freq 1 3 \n", + "\n", + " fs_url \\\n", + "count 1752 \n", + "unique 1752 \n", + "top https://fairsharing.org/10.25504/FAIRsharing.1... \n", + "freq 1 \n", + "\n", + " url countries subjects \n", + "count 1752 1749 1690 \n", + "unique 1752 178 834 \n", + "top http://www.plexdb.org/ United States Life Science \n", + "freq 1 588 367 " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "fairsharing_df.describe(include='all')" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/01.1-explorative.ipynb b/notebooks/02-subjects&geographic.ipynb similarity index 100% rename from notebooks/01.1-explorative.ipynb rename to notebooks/02-subjects&geographic.ipynb diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb deleted file mode 100644 index 34f374f..0000000 --- a/notebooks/Untitled.ipynb +++ /dev/null @@ -1,32 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.3" - } - }, - "nbformat": 4, - "nbformat_minor": 4 -}