From c052601c9050d97e7eccfa96149e3faf79b264f2 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Tue, 6 Jul 2021 15:23:24 +0200 Subject: [PATCH] restructured analysis --- notebooks/01-Explorative.ipynb | 6936 +++++++++++++------------------- 1 file changed, 2882 insertions(+), 4054 deletions(-) diff --git a/notebooks/01-Explorative.ipynb b/notebooks/01-Explorative.ipynb index af4a892..dc6fe31 100644 --- a/notebooks/01-Explorative.ipynb +++ b/notebooks/01-Explorative.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": 1, + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ @@ -14,22 +14,59 @@ "import numpy as np\n", "import pandas as pd\n", "\n", + "import pycountry_convert\n", + "\n", + "import matplotlib.pyplot as plt\n", + "from matplotlib_venn import venn2, venn2_circles\n", + "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.graph_objs as go\n", "import plotly.express as px" ] }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def country_to_countrycode(country):\n", + " if pd.isna(country):\n", + " return np.nan\n", + " else:\n", + " try:\n", + " return pycountry_convert.country_name_to_country_alpha2(country)\n", + " except:\n", + " return np.nan\n", + "\n", + "def countrycode_to_continent(country_code):\n", + " if pd.isna(country_code):\n", + " return np.nan\n", + " else:\n", + " try:\n", + " return pycountry_convert.country_alpha2_to_continent_code(country_code)\n", + " except:\n", + " return np.nan" + ] + }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# FAIRsharing" + "## Loading datasets" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**FAIRsharing**" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -148,7 +185,7 @@ "4 [Life Science] " ] }, - "execution_count": 2, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -164,7 +201,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -217,10 +254,10 @@ " \n", " \n", " top\n", - " The Cardiovascular Research Grid\n", + " FunTree: A Resource For Exploring The Function...\n", " CGD\n", - " https://fairsharing.org/bsg-d001750\n", - " http://www.bmrb.wisc.edu/\n", + " https://fairsharing.org/10.25504/FAIRsharing.5...\n", + " https://idn.ceos.org\n", " [United States]\n", " [Life Science]\n", " \n", @@ -238,26 +275,26 @@ "" ], "text/plain": [ - " full_name short_name \\\n", - "count 1752 1752 \n", - "unique 1752 1741 \n", - "top The Cardiovascular Research Grid CGD \n", - "freq 1 3 \n", + " full_name short_name \\\n", + "count 1752 1752 \n", + "unique 1752 1741 \n", + "top FunTree: A Resource For Exploring The Function... CGD \n", + "freq 1 3 \n", "\n", - " fs_url url \\\n", - "count 1752 1752 \n", - "unique 1752 1752 \n", - "top https://fairsharing.org/bsg-d001750 http://www.bmrb.wisc.edu/ \n", - "freq 1 1 \n", + " fs_url \\\n", + "count 1752 \n", + "unique 1752 \n", + "top https://fairsharing.org/10.25504/FAIRsharing.5... \n", + "freq 1 \n", "\n", - " countries subjects \n", - "count 1749 1690 \n", - "unique 178 834 \n", - "top [United States] [Life Science] \n", - "freq 588 367 " + " url countries subjects \n", + "count 1752 1749 1690 \n", + "unique 1752 178 834 \n", + "top https://idn.ceos.org [United States] [Life Science] \n", + "freq 1 588 367 " ] }, - "execution_count": 3, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -266,9 +303,1096 @@ "fairsharing_df.describe()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**re3data**" + ] + }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexidurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
0410|re3data_____::3f2e20af26ead0432f5470d8b739638dhttp://planttfdb.cbi.pku.edu.cn/Plant Transcription Factor DatabasePlantTFDBNaN0.00.0['Life Sciences', 'Basic Biological and Medica...
1710|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfchttps://spdf.gsfc.nasa.gov/Space Physics Data FacilityNASA's Space Physics Data Facility SPDFNaN0.00.0['Natural Sciences', 'Astrophysics and Astrono...
21310|re3data_____::59521daca59ac29b811343cc4cd370cfhttp://card.westgis.ac.cn/Cold and Arid Regions Science Data Center at L...CARD WDC for Glaciology and Geocryology World ...NaN0.00.0['Natural Sciences', 'Geosciences (including G...
31410|re3data_____::ec1ba1674c852466c266acb64c618d15https://www.psycharchives.org/PsycharchivesNaNNaN0.00.0['Humanities and Social Sciences', 'Psychology...
41910|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76https://www.ihfc-iugg.org/products/global-heat...The Global Heat Flow Database of the Internati...International Heat-flow DatabaseNaN0.00.0['Natural Sciences', 'Geology and Palaeontolog...
\n", + "
" + ], + "text/plain": [ + " index id \\\n", + "0 4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d \n", + "1 7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc \n", + "2 13 10|re3data_____::59521daca59ac29b811343cc4cd370cf \n", + "3 14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 \n", + "4 19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 \n", + "\n", + " url \\\n", + "0 http://planttfdb.cbi.pku.edu.cn/ \n", + "1 https://spdf.gsfc.nasa.gov/ \n", + "2 http://card.westgis.ac.cn/ \n", + "3 https://www.psycharchives.org/ \n", + "4 https://www.ihfc-iugg.org/products/global-heat... \n", + "\n", + " official_name \\\n", + "0 Plant Transcription Factor Database \n", + "1 Space Physics Data Facility \n", + "2 Cold and Arid Regions Science Data Center at L... \n", + "3 Psycharchives \n", + "4 The Global Heat Flow Database of the Internati... \n", + "\n", + " english_name description latitude \\\n", + "0 PlantTFDB NaN 0.0 \n", + "1 NASA's Space Physics Data Facility SPDF NaN 0.0 \n", + "2 CARD WDC for Glaciology and Geocryology World ... NaN 0.0 \n", + "3 NaN NaN 0.0 \n", + "4 International Heat-flow Database NaN 0.0 \n", + "\n", + " longitude subjects \n", + "0 0.0 ['Life Sciences', 'Basic Biological and Medica... \n", + "1 0.0 ['Natural Sciences', 'Astrophysics and Astrono... \n", + "2 0.0 ['Natural Sciences', 'Geosciences (including G... \n", + "3 0.0 ['Humanities and Social Sciences', 'Psychology... \n", + "4 0.0 ['Natural Sciences', 'Geology and Palaeontolog... " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n", + "re3data_df = re3data_df[re3data_df.id.str.contains('re3data')].reset_index()\n", + "re3data_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexidurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
count2693.0000002693267326932034382693.0000002693.0000002693
uniqueNaN269326612668201038NaNNaN1427
topNaN10|re3data_____::fc8141eebc533cb225498718479f4e66http://wdcpc.org/European Climate Assessment & Dataset projectECA&DThe Atmospheric Science Data Center (ASDC) at ...NaNNaN['Humanities and Social Sciences', 'Life Scien...
freqNaN12221NaNNaN209
mean4443.650947NaNNaNNaNNaNNaN0.1144970.067998NaN
std2518.294468NaNNaNNaNNaNNaN4.5854692.447173NaN
min4.000000NaNNaNNaNNaNNaN0.0000000.000000NaN
25%2266.000000NaNNaNNaNNaNNaN0.0000000.000000NaN
50%4506.000000NaNNaNNaNNaNNaN0.0000000.000000NaN
75%6660.000000NaNNaNNaNNaNNaN0.0000000.000000NaN
max8705.000000NaNNaNNaNNaNNaN234.000000123.000000NaN
\n", + "
" + ], + "text/plain": [ + " index id \\\n", + "count 2693.000000 2693 \n", + "unique NaN 2693 \n", + "top NaN 10|re3data_____::fc8141eebc533cb225498718479f4e66 \n", + "freq NaN 1 \n", + "mean 4443.650947 NaN \n", + "std 2518.294468 NaN \n", + "min 4.000000 NaN \n", + "25% 2266.000000 NaN \n", + "50% 4506.000000 NaN \n", + "75% 6660.000000 NaN \n", + "max 8705.000000 NaN \n", + "\n", + " url official_name \\\n", + "count 2673 2693 \n", + "unique 2661 2668 \n", + "top http://wdcpc.org/ European Climate Assessment & Dataset project \n", + "freq 2 2 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " english_name description \\\n", + "count 2034 38 \n", + "unique 2010 38 \n", + "top ECA&D The Atmospheric Science Data Center (ASDC) at ... \n", + "freq 2 1 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " latitude longitude \\\n", + "count 2693.000000 2693.000000 \n", + "unique NaN NaN \n", + "top NaN NaN \n", + "freq NaN NaN \n", + "mean 0.114497 0.067998 \n", + "std 4.585469 2.447173 \n", + "min 0.000000 0.000000 \n", + "25% 0.000000 0.000000 \n", + "50% 0.000000 0.000000 \n", + "75% 0.000000 0.000000 \n", + "max 234.000000 123.000000 \n", + "\n", + " subjects \n", + "count 2693 \n", + "unique 1427 \n", + "top ['Humanities and Social Sciences', 'Life Scien... \n", + "freq 209 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**OpenDOAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexidurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
0010|opendoar____::e833e042f509c996b1b25324d56659fbhttp://www.bilbao.net/bldBLD - Bilboko Liburutegi DigitalaBLD - Bilboko Liburutegi DigitalaBLD is a repository of digital documents, desi...43.256699-2.924100[]
1110|opendoar____::f621585df244e9596dc70a39b579efb1https://researchdirect.westernsydney.edu.au/Western Sydney ResearchDirectWestern Sydney ResearchDirectNaN0.0000000.000000[]
2210|opendoar____::437d7d1d97917cd627a34a6a0fb41136http://redress.lancs.ac.uk/Learning_Space/Learning Space CatalogueNaNThis repository is a Social Science e-Science ...54.010760-2.784990['Social Sciences General', 'Science General',...
3310|opendoar____::d840cc5d906c3e9c84374c8919d2074ehttp://digitallibrary.usc.edu/search/controlle...USC Digital LibraryUSC Digital LibraryThis is an institutional repository providing ...34.052200-118.242996[]
4510|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010http://www.ufgd.edu.br:8080/jspui/Repositório de Divulgação das Produções Cientí...Repositório de Divulgação das Produções Cientí...This site provides access to the research outp...-22.221800-54.806400[]
\n", + "
" + ], + "text/plain": [ + " index id \\\n", + "0 0 10|opendoar____::e833e042f509c996b1b25324d56659fb \n", + "1 1 10|opendoar____::f621585df244e9596dc70a39b579efb1 \n", + "2 2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 \n", + "3 3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e \n", + "4 5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 \n", + "\n", + " url \\\n", + "0 http://www.bilbao.net/bld \n", + "1 https://researchdirect.westernsydney.edu.au/ \n", + "2 http://redress.lancs.ac.uk/Learning_Space/ \n", + "3 http://digitallibrary.usc.edu/search/controlle... \n", + "4 http://www.ufgd.edu.br:8080/jspui/ \n", + "\n", + " official_name \\\n", + "0 BLD - Bilboko Liburutegi Digitala \n", + "1 Western Sydney ResearchDirect \n", + "2 Learning Space Catalogue \n", + "3 USC Digital Library \n", + "4 Repositório de Divulgação das Produções Cientí... \n", + "\n", + " english_name \\\n", + "0 BLD - Bilboko Liburutegi Digitala \n", + "1 Western Sydney ResearchDirect \n", + "2 NaN \n", + "3 USC Digital Library \n", + "4 Repositório de Divulgação das Produções Cientí... \n", + "\n", + " description latitude longitude \\\n", + "0 BLD is a repository of digital documents, desi... 43.256699 -2.924100 \n", + "1 NaN 0.000000 0.000000 \n", + "2 This repository is a Social Science e-Science ... 54.010760 -2.784990 \n", + "3 This is an institutional repository providing ... 34.052200 -118.242996 \n", + "4 This site provides access to the research outp... -22.221800 -54.806400 \n", + "\n", + " subjects \n", + "0 [] \n", + "1 [] \n", + "2 ['Social Sciences General', 'Science General',... \n", + "3 [] \n", + "4 [] " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n", + "opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')].reset_index()\n", + "opendoar_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
indexidurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
count6014.000000601460136014550057766014.0000006014.0000006014
uniqueNaN60145953594654134920NaNNaN201
topNaN10|opendoar____::17256f049f1e3fede17c7a313f7657f4http://harp.lib.hiroshima-u.ac.jp/Hiroshima Associated Repository PortalAURAThis site provides access to the research outp...NaNNaN[]
freqNaN133498NaNNaN5273
mean4312.407549NaNNaNNaNNaNNaN38.6493937.810948NaN
std2510.699848NaNNaNNaNNaNNaN788.40617371.689788NaN
min0.000000NaNNaNNaNNaNNaN-79.029999-683.103027NaN
25%2129.250000NaNNaNNaNNaNNaN4.644632-49.273300NaN
50%4297.000000NaNNaNNaNNaNNaN37.9304494.788870NaN
75%6476.750000NaNNaNNaNNaNNaN47.29440030.685501NaN
max8706.000000NaNNaNNaNNaNNaN61138.800781178.438995NaN
\n", + "
" + ], + "text/plain": [ + " index id \\\n", + "count 6014.000000 6014 \n", + "unique NaN 6014 \n", + "top NaN 10|opendoar____::17256f049f1e3fede17c7a313f7657f4 \n", + "freq NaN 1 \n", + "mean 4312.407549 NaN \n", + "std 2510.699848 NaN \n", + "min 0.000000 NaN \n", + "25% 2129.250000 NaN \n", + "50% 4297.000000 NaN \n", + "75% 6476.750000 NaN \n", + "max 8706.000000 NaN \n", + "\n", + " url \\\n", + "count 6013 \n", + "unique 5953 \n", + "top http://harp.lib.hiroshima-u.ac.jp/ \n", + "freq 3 \n", + "mean NaN \n", + "std NaN \n", + "min NaN \n", + "25% NaN \n", + "50% NaN \n", + "75% NaN \n", + "max NaN \n", + "\n", + " official_name english_name \\\n", + "count 6014 5500 \n", + "unique 5946 5413 \n", + "top Hiroshima Associated Repository Portal AURA \n", + "freq 3 4 \n", + "mean NaN NaN \n", + "std NaN NaN \n", + "min NaN NaN \n", + "25% NaN NaN \n", + "50% NaN NaN \n", + "75% NaN NaN \n", + "max NaN NaN \n", + "\n", + " description latitude \\\n", + "count 5776 6014.000000 \n", + "unique 4920 NaN \n", + "top This site provides access to the research outp... NaN \n", + "freq 98 NaN \n", + "mean NaN 38.649393 \n", + "std NaN 788.406173 \n", + "min NaN -79.029999 \n", + "25% NaN 4.644632 \n", + "50% NaN 37.930449 \n", + "75% NaN 47.294400 \n", + "max NaN 61138.800781 \n", + "\n", + " longitude subjects \n", + "count 6014.000000 6014 \n", + "unique NaN 201 \n", + "top NaN [] \n", + "freq NaN 5273 \n", + "mean 7.810948 NaN \n", + "std 71.689788 NaN \n", + "min -683.103027 NaN \n", + "25% -49.273300 NaN \n", + "50% 4.788870 NaN \n", + "75% 30.685501 NaN \n", + "max 178.438995 NaN " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.describe(include='all')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Basic cleaning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**re3data**" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude']] = [np.nan, np.nan]" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 ['Life Sciences', 'Basic Biological and Medica...\n", + "1 ['Natural Sciences', 'Astrophysics and Astrono...\n", + "2 ['Natural Sciences', 'Geosciences (including G...\n", + "3 ['Humanities and Social Sciences', 'Psychology...\n", + "4 ['Natural Sciences', 'Geology and Palaeontolog...\n", + " ... \n", + "2688 ['Life Sciences', 'Basic Biological and Medica...\n", + "2689 ['Natural Sciences', 'Atmospheric Science and ...\n", + "2690 ['Natural Sciences', 'Atmospheric Science and ...\n", + "2691 ['Natural Sciences', 'Atmospheric Science and ...\n", + "2692 ['Life Sciences', 'Plant Sciences', 'Plant Gen...\n", + "Name: subjects, Length: 2693, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_df.subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def merge_lists(lists):\n", + " res = []\n", + " for l in lists:\n", + " res = res + l\n", + " return res\n", + "\n", + "re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n", + " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n", + " .reset_index()\\\n", + " .groupby('index')[0].apply(lambda x: merge_lists(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "0 [Life Sciences, Basic Biological, Medical Rese...\n", + "1 [Natural Sciences, Astrophysics, Astronomy, Ph...\n", + "2 [Natural Sciences, Geosciences (including Geog...\n", + "3 [Humanities, Social Sciences, Psychology, Soci...\n", + "4 [Natural Sciences, Geology, Palaeontology, Geo...\n", + " ... \n", + "2688 [Life Sciences, Basic Biological, Medical Rese...\n", + "2689 [Natural Sciences, Atmospheric Science, Oceano...\n", + "2690 [Natural Sciences, Atmospheric Science, Oceano...\n", + "2691 [Natural Sciences, Atmospheric Science, Oceano...\n", + "2692 [Life Sciences, Plant Sciences, Plant Genetics...\n", + "Name: 0, Length: 2693, dtype: object" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "re3data_cleaned_subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df = re3data_df.join(re3data_cleaned_subjects)" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "re3data_df.drop(columns=['subjects'], inplace=True)\n", + "re3data_df.rename(columns={0:'subjects'}, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**OpenDOAR**" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0 []\n", + "1 []\n", + "2 ['Social Sciences General', 'Science General',...\n", + "3 []\n", + "4 []\n", + " ... \n", + "6009 ['Multidisciplinary']\n", + "6010 []\n", + "6011 ['Business and Economics']\n", + "6012 ['Earth and Planetary Sciences', 'Ecology and ...\n", + "6013 []\n", + "Name: subjects, Length: 6014, dtype: object" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_df.subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n", + " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n", + " .reset_index()\\\n", + " .groupby('index')[0].apply(lambda x: merge_lists(x))" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "index\n", + "0 []\n", + "1 []\n", + "2 [Social Sciences General, Science General, Com...\n", + "3 []\n", + "4 []\n", + " ... \n", + "6009 [Multidisciplinary]\n", + "6010 []\n", + "6011 [Business, Economics]\n", + "6012 [Earth, Planetary Sciences, Ecology, Environme...\n", + "6013 []\n", + "Name: 0, Length: 6014, dtype: object" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "opendoar_cleaned_subjects" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [], + "source": [ + "opendoar_df.drop(columns=['subjects'], inplace=True)\n", + "opendoar_df.rename(columns={0: 'subjects'}, inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Subjects analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "fairsharing_subjects = fairsharing_df.explode('subjects')\n", + "re3data_subjects = re3data_df.explode('subjects')\n", + "opendoar_subjects = opendoar_df.explode('subjects')" + ] + }, + { + "cell_type": "code", + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -357,12 +1481,11 @@ "data": { "application/vnd.plotly.v1+json": { "config": { - "linkText": "Export to plot.ly", - "plotlyServerURL": "https://plot.ly", - "showLink": false + "plotlyServerURL": "https://plot.ly" }, "data": [ { + "name": "FAIRsharing", "type": "bar", "x": [ "Life Science", @@ -988,2455 +2111,11 @@ 1, 1 ] - } - ], - "layout": { - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } }, - "title": { - "text": "Fairsharing subject coverage" - }, - "xaxis": { - "tickangle": 45, - "tickfont": { - "size": 12 - } - } - } - }, - "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fairsharing_subjects = fairsharing_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n", - "\n", - "data = [\n", - " go.Bar(\n", - " x=fairsharing_subjects.index,\n", - " y=fairsharing_subjects['url']\n", - " )\n", - "]\n", - "\n", - "layout = go.Layout(\n", - " title='Fairsharing subject coverage',\n", - " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", - ")\n", - "fig = go.Figure(data=data, layout=layout)\n", - "plotly.offline.iplot(fig)" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "linkText": "Export to plot.ly", - "plotlyServerURL": "https://plot.ly", - "showLink": false - }, - "data": [ - { - "type": "bar", - "x": [ - "United States", - "United Kingdom", - "Germany", - "France", - "Switzerland", - "China", - "Netherlands", - "Italy", - "Canada", - "Belgium", - "Spain", - "Japan", - "Sweden", - "Czech Republic", - "Norway", - "Denmark", - "European Union", - "Austria", - "Finland", - "Republic of Ireland", - "Australia", - "Israel", - "Portugal", - "Hungary", - "Greece", - "Malta", - "Lithuania", - "Slovakia", - "Iceland", - "Luxembourg", - "Montenegro", - "Croatia", - "Worldwide", - "India", - "Poland", - "Singapore", - "South Korea", - "Russia", - "South Africa", - "Taiwan", - "Brazil", - "New Zealand", - "Mexico", - "Saudi Arabia", - "Bulgaria", - "Hong Kong", - "Argentina", - "Turkey", - "Cyprus", - "Morocco", - "Uganda", - "Estonia", - "Romania", - "Thailand", - "Pakistan", - "Costa Rica", - "Uruguay", - "United Arab Emirates", - "Togo", - "Antarctica", - "Panama", - "Honduras", - "Benin", - "Cameroon", - "Chile", - "Colombia", - "Egypt", - "El Salvador", - "Ethiopia", - "Faroe Islands", - "Greenland", - "Indonesia", - "Nigeria", - "Kenya", - "Latvia", - "Madagascar", - "Malawi", - "Mali", - "Mauritania", - "Mozambique", - "Nicaragua", - "Niger", - "Zimbabwe" - ], - "y": [ - 686, - 248, - 192, - 162, - 114, - 99, - 96, - 91, - 86, - 83, - 83, - 80, - 76, - 71, - 69, - 67, - 66, - 64, - 63, - 62, - 62, - 61, - 60, - 59, - 58, - 53, - 52, - 52, - 52, - 52, - 51, - 51, - 49, - 32, - 11, - 10, - 10, - 9, - 9, - 8, - 8, - 8, - 8, - 6, - 3, - 3, - 3, - 3, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 2, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1, - 1 - ] - } - ], - "layout": { - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "text": "Fairsharing country coverage" - }, - "xaxis": { - "tickangle": 45, - "tickfont": { - "size": 12 - } - } - } - }, - "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "fairsharing_countries = fairsharing_df.explode('countries').groupby('countries')[['url']].count().sort_values('url', ascending=False)\n", - "\n", - "data = [\n", - " go.Bar(\n", - " x=fairsharing_countries.index,\n", - " y=fairsharing_countries['url']\n", - " )\n", - "]\n", - "\n", - "layout = go.Layout(\n", - " title='Fairsharing country coverage',\n", - " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", - ")\n", - "fig = go.Figure(data=data, layout=layout)\n", - "plotly.offline.iplot(fig)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# re3data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
410|re3data_____::3f2e20af26ead0432f5470d8b739638dhttp://planttfdb.cbi.pku.edu.cn/Plant Transcription Factor DatabasePlantTFDBNaN0.00.0['Life Sciences', 'Basic Biological and Medica...
710|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfchttps://spdf.gsfc.nasa.gov/Space Physics Data FacilityNASA's Space Physics Data Facility SPDFNaN0.00.0['Natural Sciences', 'Astrophysics and Astrono...
1310|re3data_____::59521daca59ac29b811343cc4cd370cfhttp://card.westgis.ac.cn/Cold and Arid Regions Science Data Center at L...CARD WDC for Glaciology and Geocryology World ...NaN0.00.0['Natural Sciences', 'Geosciences (including G...
1410|re3data_____::ec1ba1674c852466c266acb64c618d15https://www.psycharchives.org/PsycharchivesNaNNaN0.00.0['Humanities and Social Sciences', 'Psychology...
1910|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76https://www.ihfc-iugg.org/products/global-heat...The Global Heat Flow Database of the Internati...International Heat-flow DatabaseNaN0.00.0['Natural Sciences', 'Geology and Palaeontolog...
\n", - "
" - ], - "text/plain": [ - " id \\\n", - "4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d \n", - "7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc \n", - "13 10|re3data_____::59521daca59ac29b811343cc4cd370cf \n", - "14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 \n", - "19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 \n", - "\n", - " url \\\n", - "4 http://planttfdb.cbi.pku.edu.cn/ \n", - "7 https://spdf.gsfc.nasa.gov/ \n", - "13 http://card.westgis.ac.cn/ \n", - "14 https://www.psycharchives.org/ \n", - "19 https://www.ihfc-iugg.org/products/global-heat... \n", - "\n", - " official_name \\\n", - "4 Plant Transcription Factor Database \n", - "7 Space Physics Data Facility \n", - "13 Cold and Arid Regions Science Data Center at L... \n", - "14 Psycharchives \n", - "19 The Global Heat Flow Database of the Internati... \n", - "\n", - " english_name description latitude \\\n", - "4 PlantTFDB NaN 0.0 \n", - "7 NASA's Space Physics Data Facility SPDF NaN 0.0 \n", - "13 CARD WDC for Glaciology and Geocryology World ... NaN 0.0 \n", - "14 NaN NaN 0.0 \n", - "19 International Heat-flow Database NaN 0.0 \n", - "\n", - " longitude subjects \n", - "4 0.0 ['Life Sciences', 'Basic Biological and Medica... \n", - "7 0.0 ['Natural Sciences', 'Astrophysics and Astrono... \n", - "13 0.0 ['Natural Sciences', 'Geosciences (including G... \n", - "14 0.0 ['Humanities and Social Sciences', 'Psychology... \n", - "19 0.0 ['Natural Sciences', 'Geology and Palaeontolog... " - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n", - "re3data_df = re3data_df[re3data_df.id.str.contains('re3data')]\n", - "re3data_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude']] = [np.nan, np.nan]" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "4 ['Life Sciences', 'Basic Biological and Medica...\n", - "7 ['Natural Sciences', 'Astrophysics and Astrono...\n", - "13 ['Natural Sciences', 'Geosciences (including G...\n", - "14 ['Humanities and Social Sciences', 'Psychology...\n", - "19 ['Natural Sciences', 'Geology and Palaeontolog...\n", - " ... \n", - "8693 ['Life Sciences', 'Basic Biological and Medica...\n", - "8695 ['Natural Sciences', 'Atmospheric Science and ...\n", - "8697 ['Natural Sciences', 'Atmospheric Science and ...\n", - "8699 ['Natural Sciences', 'Atmospheric Science and ...\n", - "8705 ['Life Sciences', 'Plant Sciences', 'Plant Gen...\n", - "Name: subjects, Length: 2693, dtype: object" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_df.subjects" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "def merge_lists(lists):\n", - " res = []\n", - " for l in lists:\n", - " res = res + l\n", - " return res\n", - "\n", - "re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n", - " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n", - " .reset_index()\\\n", - " .groupby('index')[0].apply(lambda x: merge_lists(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "index\n", - "4 [Life Sciences, Basic Biological, Medical Rese...\n", - "7 [Natural Sciences, Astrophysics, Astronomy, Ph...\n", - "13 [Natural Sciences, Geosciences (including Geog...\n", - "14 [Humanities, Social Sciences, Psychology, Soci...\n", - "19 [Natural Sciences, Geology, Palaeontology, Geo...\n", - " ... \n", - "8693 [Life Sciences, Basic Biological, Medical Rese...\n", - "8695 [Natural Sciences, Atmospheric Science, Oceano...\n", - "8697 [Natural Sciences, Atmospheric Science, Oceano...\n", - "8699 [Natural Sciences, Atmospheric Science, Oceano...\n", - "8705 [Life Sciences, Plant Sciences, Plant Genetics...\n", - "Name: 0, Length: 2693, dtype: object" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_cleaned_subjects" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "re3data_df = re3data_df.join(re3data_cleaned_subjects)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "re3data_df.drop(columns=['subjects'], inplace=True)\n", - "re3data_df.rename(columns={0:'subjects'}, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
count2693267326932034385.0000005.0000002693
unique269326612668201038NaNNaN1427
top10|re3data_____::e59f89142e8d47d32523c53a9137f07bhttp://iubio.bio.indiana.edu/IUBio-ArchiveResearch Data RepositoryIUBio Archive is an archive of biology data an...NaNNaN[Humanities, Social Sciences, Life Sciences, N...
freq12221NaNNaN209
meanNaNNaNNaNNaNNaN61.66811336.623678NaN
stdNaNNaNNaNNaNNaN96.98445748.547521NaN
minNaNNaNNaNNaNNaN12.12300012.123000NaN
25%NaNNaNNaNNaNNaN12.12300012.123400NaN
50%NaNNaNNaNNaNNaN12.12340012.123400NaN
75%NaNNaNNaNNaNNaN37.97116323.748590NaN
maxNaNNaNNaNNaNNaN234.000000123.000000NaN
\n", - "
" - ], - "text/plain": [ - " id \\\n", - "count 2693 \n", - "unique 2693 \n", - "top 10|re3data_____::e59f89142e8d47d32523c53a9137f07b \n", - "freq 1 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN \n", - "\n", - " url official_name \\\n", - "count 2673 2693 \n", - "unique 2661 2668 \n", - "top http://iubio.bio.indiana.edu/ IUBio-Archive \n", - "freq 2 2 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", - "\n", - " english_name \\\n", - "count 2034 \n", - "unique 2010 \n", - "top Research Data Repository \n", - "freq 2 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN \n", - "\n", - " description latitude \\\n", - "count 38 5.000000 \n", - "unique 38 NaN \n", - "top IUBio Archive is an archive of biology data an... NaN \n", - "freq 1 NaN \n", - "mean NaN 61.668113 \n", - "std NaN 96.984457 \n", - "min NaN 12.123000 \n", - "25% NaN 12.123000 \n", - "50% NaN 12.123400 \n", - "75% NaN 37.971163 \n", - "max NaN 234.000000 \n", - "\n", - " longitude subjects \n", - "count 5.000000 2693 \n", - "unique NaN 1427 \n", - "top NaN [Humanities, Social Sciences, Life Sciences, N... \n", - "freq NaN 209 \n", - "mean 36.623678 NaN \n", - "std 48.547521 NaN \n", - "min 12.123000 NaN \n", - "25% 12.123400 NaN \n", - "50% 12.123400 NaN \n", - "75% 23.748590 NaN \n", - "max 123.000000 NaN " - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "re3data_df.describe(include='all')" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "linkText": "Export to plot.ly", - "plotlyServerURL": "https://plot.ly", - "showLink": false - }, - "data": [ { + "name": "re3data", "type": "bar", + "visible": "legendonly", "x": [ "Life Sciences", "Natural Sciences", @@ -4267,1391 +2946,11 @@ 1, 1 ] - } - ], - "layout": { - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } }, - "title": { - "text": "re3data subject coverage" - }, - "xaxis": { - "tickangle": 45, - "tickfont": { - "size": 12 - } - } - } - }, - "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "re3data_subjects = re3data_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n", - "\n", - "data = [\n", - " go.Bar(\n", - " x=re3data_subjects.index,\n", - " y=re3data_subjects['url']\n", - " )\n", - "]\n", - "\n", - "layout = go.Layout(\n", - " title='re3data subject coverage',\n", - " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", - ")\n", - "fig = go.Figure(data=data, layout=layout)\n", - "plotly.offline.iplot(fig)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# OpenDOAR" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
010|opendoar____::e833e042f509c996b1b25324d56659fbhttp://www.bilbao.net/bldBLD - Bilboko Liburutegi DigitalaBLD - Bilboko Liburutegi DigitalaBLD is a repository of digital documents, desi...43.256699-2.924100[]
110|opendoar____::f621585df244e9596dc70a39b579efb1https://researchdirect.westernsydney.edu.au/Western Sydney ResearchDirectWestern Sydney ResearchDirectNaN0.0000000.000000[]
210|opendoar____::437d7d1d97917cd627a34a6a0fb41136http://redress.lancs.ac.uk/Learning_Space/Learning Space CatalogueNaNThis repository is a Social Science e-Science ...54.010760-2.784990['Social Sciences General', 'Science General',...
310|opendoar____::d840cc5d906c3e9c84374c8919d2074ehttp://digitallibrary.usc.edu/search/controlle...USC Digital LibraryUSC Digital LibraryThis is an institutional repository providing ...34.052200-118.242996[]
510|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010http://www.ufgd.edu.br:8080/jspui/Repositório de Divulgação das Produções Cientí...Repositório de Divulgação das Produções Cientí...This site provides access to the research outp...-22.221800-54.806400[]
\n", - "
" - ], - "text/plain": [ - " id \\\n", - "0 10|opendoar____::e833e042f509c996b1b25324d56659fb \n", - "1 10|opendoar____::f621585df244e9596dc70a39b579efb1 \n", - "2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 \n", - "3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e \n", - "5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 \n", - "\n", - " url \\\n", - "0 http://www.bilbao.net/bld \n", - "1 https://researchdirect.westernsydney.edu.au/ \n", - "2 http://redress.lancs.ac.uk/Learning_Space/ \n", - "3 http://digitallibrary.usc.edu/search/controlle... \n", - "5 http://www.ufgd.edu.br:8080/jspui/ \n", - "\n", - " official_name \\\n", - "0 BLD - Bilboko Liburutegi Digitala \n", - "1 Western Sydney ResearchDirect \n", - "2 Learning Space Catalogue \n", - "3 USC Digital Library \n", - "5 Repositório de Divulgação das Produções Cientí... \n", - "\n", - " english_name \\\n", - "0 BLD - Bilboko Liburutegi Digitala \n", - "1 Western Sydney ResearchDirect \n", - "2 NaN \n", - "3 USC Digital Library \n", - "5 Repositório de Divulgação das Produções Cientí... \n", - "\n", - " description latitude longitude \\\n", - "0 BLD is a repository of digital documents, desi... 43.256699 -2.924100 \n", - "1 NaN 0.000000 0.000000 \n", - "2 This repository is a Social Science e-Science ... 54.010760 -2.784990 \n", - "3 This is an institutional repository providing ... 34.052200 -118.242996 \n", - "5 This site provides access to the research outp... -22.221800 -54.806400 \n", - "\n", - " subjects \n", - "0 [] \n", - "1 [] \n", - "2 ['Social Sciences General', 'Science General',... \n", - "3 [] \n", - "5 [] " - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n", - "opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')]\n", - "opendoar_df.head()" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 []\n", - "1 []\n", - "2 ['Social Sciences General', 'Science General',...\n", - "3 []\n", - "5 []\n", - " ... \n", - "8701 ['Multidisciplinary']\n", - "8702 []\n", - "8703 ['Business and Economics']\n", - "8704 ['Earth and Planetary Sciences', 'Ecology and ...\n", - "8706 []\n", - "Name: subjects, Length: 6014, dtype: object" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "opendoar_df.subjects" - ] - }, - { - "cell_type": "code", - "execution_count": 85, - "metadata": {}, - "outputs": [], - "source": [ - "opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 86, - "metadata": {}, - "outputs": [], - "source": [ - "opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n", - " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n", - " .reset_index()\\\n", - " .groupby('index')[0].apply(lambda x: merge_lists(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 87, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "index\n", - "0 []\n", - "1 []\n", - "2 [Social Sciences General, Science General, Com...\n", - "3 []\n", - "5 []\n", - " ... \n", - "8701 [Multidisciplinary]\n", - "8702 []\n", - "8703 [Business, Economics]\n", - "8704 [Earth, Planetary Sciences, Ecology, Environme...\n", - "8706 []\n", - "Name: 0, Length: 6014, dtype: object" - ] - }, - "execution_count": 87, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "opendoar_cleaned_subjects" - ] - }, - { - "cell_type": "code", - "execution_count": 88, - "metadata": {}, - "outputs": [], - "source": [ - "opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)" - ] - }, - { - "cell_type": "code", - "execution_count": 89, - "metadata": {}, - "outputs": [], - "source": [ - "opendoar_df.drop(columns=['subjects'], inplace=True)\n", - "opendoar_df.rename(columns={0: 'subjects'}, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 90, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idurlofficial_nameenglish_namedescriptionlatitudelongitudesubjects
count601460136014550057766014.0000006014.0000006014
unique60145953594654134920NaNNaN201
top10|opendoar____::a2557a7b2e94197ff767970b67041697http://harp.lib.hiroshima-u.ac.jp/Hiroshima Associated Repository PortalAURAThis site provides access to the research outp...NaNNaN[]
freq133498NaNNaN5273
meanNaNNaNNaNNaNNaN38.6493937.810948NaN
stdNaNNaNNaNNaNNaN788.40617371.689788NaN
minNaNNaNNaNNaNNaN-79.029999-683.103027NaN
25%NaNNaNNaNNaNNaN4.644632-49.273300NaN
50%NaNNaNNaNNaNNaN37.9304494.788870NaN
75%NaNNaNNaNNaNNaN47.29440030.685501NaN
maxNaNNaNNaNNaNNaN61138.800781178.438995NaN
\n", - "
" - ], - "text/plain": [ - " id \\\n", - "count 6014 \n", - "unique 6014 \n", - "top 10|opendoar____::a2557a7b2e94197ff767970b67041697 \n", - "freq 1 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN \n", - "\n", - " url \\\n", - "count 6013 \n", - "unique 5953 \n", - "top http://harp.lib.hiroshima-u.ac.jp/ \n", - "freq 3 \n", - "mean NaN \n", - "std NaN \n", - "min NaN \n", - "25% NaN \n", - "50% NaN \n", - "75% NaN \n", - "max NaN \n", - "\n", - " official_name english_name \\\n", - "count 6014 5500 \n", - "unique 5946 5413 \n", - "top Hiroshima Associated Repository Portal AURA \n", - "freq 3 4 \n", - "mean NaN NaN \n", - "std NaN NaN \n", - "min NaN NaN \n", - "25% NaN NaN \n", - "50% NaN NaN \n", - "75% NaN NaN \n", - "max NaN NaN \n", - "\n", - " description latitude \\\n", - "count 5776 6014.000000 \n", - "unique 4920 NaN \n", - "top This site provides access to the research outp... NaN \n", - "freq 98 NaN \n", - "mean NaN 38.649393 \n", - "std NaN 788.406173 \n", - "min NaN -79.029999 \n", - "25% NaN 4.644632 \n", - "50% NaN 37.930449 \n", - "75% NaN 47.294400 \n", - "max NaN 61138.800781 \n", - "\n", - " longitude subjects \n", - "count 6014.000000 6014 \n", - "unique NaN 201 \n", - "top NaN [] \n", - "freq NaN 5273 \n", - "mean 7.810948 NaN \n", - "std 71.689788 NaN \n", - "min -683.103027 NaN \n", - "25% -49.273300 NaN \n", - "50% 4.788870 NaN \n", - "75% 30.685501 NaN \n", - "max 178.438995 NaN " - ] - }, - "execution_count": 90, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "opendoar_df.describe(include='all')" - ] - }, - { - "cell_type": "code", - "execution_count": 91, - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "linkText": "Export to plot.ly", - "plotlyServerURL": "https://plot.ly", - "showLink": false - }, - "data": [ { + "name": "OpenDOAR", "type": "bar", + "visible": "legendonly", "x": [ "Multidisciplinary", "Medicine", @@ -6592,7 +3891,7 @@ } }, "title": { - "text": "OpenDOAR subject coverage" + "text": "Subject coverage" }, "xaxis": { "tickangle": 45, @@ -6603,9 +3902,9 @@ } }, "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data1 = fairsharing_countries.groupby('continent')[['url']].count()\n", + "data2 = opendoar_df.groupby('continent')[['url']].count()\n", + "\n", + "plot = [\n", + " go.Scatterpolar(\n", + " r=data1.url,\n", + " theta=data1.index,\n", + " fill='toself',\n", + " name='FAIRsharing'),\n", + " go.Scatterpolar(\n", + " r=data2.url,\n", + " theta=data2.index,\n", + " fill='toself',\n", + " name='OpenDOAR')\n", + "]\n", + "\n", + "layout = go.Layout(polar=dict(\n", + " radialaxis=dict(\n", + " visible=True\n", + " ),\n", + " )\n", + ")\n", + "\n", + "go.Figure(plot, layout).show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, { "cell_type": "code", "execution_count": null,