diff --git a/notebooks/01-Explorative.ipynb b/notebooks/01-Explorative.ipynb
index af4a892..dc6fe31 100644
--- a/notebooks/01-Explorative.ipynb
+++ b/notebooks/01-Explorative.ipynb
@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
- "execution_count": 1,
+ "execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
@@ -14,22 +14,59 @@
"import numpy as np\n",
"import pandas as pd\n",
"\n",
+ "import pycountry_convert\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib_venn import venn2, venn2_circles\n",
+ "\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px"
]
},
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def country_to_countrycode(country):\n",
+ "    \"\"\"Map a country name to its two-letter (alpha-2) country code.\n",
+ "\n",
+ "    Args:\n",
+ "        country: Country name as understood by ``pycountry_convert``, or a\n",
+ "            missing value (``None``/``NaN``).\n",
+ "\n",
+ "    Returns:\n",
+ "        The alpha-2 code, or ``np.nan`` when the input is missing or the name\n",
+ "        cannot be resolved.\n",
+ "    \"\"\"\n",
+ "    if pd.isna(country):\n",
+ "        return np.nan\n",
+ "    try:\n",
+ "        return pycountry_convert.country_name_to_country_alpha2(country)\n",
+ "    except (KeyError, TypeError):\n",
+ "        # KeyError: name unknown to pycountry_convert; TypeError: unhashable input.\n",
+ "        # A bare except would also hide real failures such as a missing import.\n",
+ "        return np.nan\n",
+ "\n",
+ "def countrycode_to_continent(country_code):\n",
+ "    \"\"\"Map a two-letter (alpha-2) country code to its continent code.\n",
+ "\n",
+ "    Args:\n",
+ "        country_code: Alpha-2 country code, or a missing value\n",
+ "            (``None``/``NaN``).\n",
+ "\n",
+ "    Returns:\n",
+ "        The continent code reported by ``pycountry_convert``, or ``np.nan``\n",
+ "        when the input is missing or the code cannot be resolved.\n",
+ "    \"\"\"\n",
+ "    if pd.isna(country_code):\n",
+ "        return np.nan\n",
+ "    try:\n",
+ "        return pycountry_convert.country_alpha2_to_continent_code(country_code)\n",
+ "    except (KeyError, TypeError):\n",
+ "        # KeyError: code unknown to pycountry_convert; TypeError: unhashable input.\n",
+ "        # A bare except would also hide real failures such as a missing import.\n",
+ "        return np.nan"
+ ]
+ },
{
"cell_type": "markdown",
"metadata": {},
"source": [
- "# FAIRsharing"
+ "## Loading datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**FAIRsharing**"
]
},
{
"cell_type": "code",
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"outputs": [
{
@@ -148,7 +185,7 @@
"4 [Life Science] "
]
},
- "execution_count": 2,
+ "execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
@@ -164,7 +201,7 @@
},
{
"cell_type": "code",
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -217,10 +254,10 @@
" \n",
"
\n",
" top | \n",
- " The Cardiovascular Research Grid | \n",
+ " FunTree: A Resource For Exploring The Function... | \n",
" CGD | \n",
- " https://fairsharing.org/bsg-d001750 | \n",
- " http://www.bmrb.wisc.edu/ | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.5... | \n",
+ " https://idn.ceos.org | \n",
" [United States] | \n",
" [Life Science] | \n",
"
\n",
@@ -238,26 +275,26 @@
""
],
"text/plain": [
- " full_name short_name \\\n",
- "count 1752 1752 \n",
- "unique 1752 1741 \n",
- "top The Cardiovascular Research Grid CGD \n",
- "freq 1 3 \n",
+ " full_name short_name \\\n",
+ "count 1752 1752 \n",
+ "unique 1752 1741 \n",
+ "top FunTree: A Resource For Exploring The Function... CGD \n",
+ "freq 1 3 \n",
"\n",
- " fs_url url \\\n",
- "count 1752 1752 \n",
- "unique 1752 1752 \n",
- "top https://fairsharing.org/bsg-d001750 http://www.bmrb.wisc.edu/ \n",
- "freq 1 1 \n",
+ " fs_url \\\n",
+ "count 1752 \n",
+ "unique 1752 \n",
+ "top https://fairsharing.org/10.25504/FAIRsharing.5... \n",
+ "freq 1 \n",
"\n",
- " countries subjects \n",
- "count 1749 1690 \n",
- "unique 178 834 \n",
- "top [United States] [Life Science] \n",
- "freq 588 367 "
+ " url countries subjects \n",
+ "count 1752 1749 1690 \n",
+ "unique 1752 178 834 \n",
+ "top https://idn.ceos.org [United States] [Life Science] \n",
+ "freq 1 588 367 "
]
},
- "execution_count": 3,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -266,9 +303,1096 @@
"fairsharing_df.describe()"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**re3data**"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id | \n",
+ " url | \n",
+ " official_name | \n",
+ " english_name | \n",
+ " description | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 4 | \n",
+ " 10|re3data_____::3f2e20af26ead0432f5470d8b739638d | \n",
+ " http://planttfdb.cbi.pku.edu.cn/ | \n",
+ " Plant Transcription Factor Database | \n",
+ " PlantTFDB | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ['Life Sciences', 'Basic Biological and Medica... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 7 | \n",
+ " 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc | \n",
+ " https://spdf.gsfc.nasa.gov/ | \n",
+ " Space Physics Data Facility | \n",
+ " NASA's Space Physics Data Facility SPDF | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ['Natural Sciences', 'Astrophysics and Astrono... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 13 | \n",
+ " 10|re3data_____::59521daca59ac29b811343cc4cd370cf | \n",
+ " http://card.westgis.ac.cn/ | \n",
+ " Cold and Arid Regions Science Data Center at L... | \n",
+ " CARD WDC for Glaciology and Geocryology World ... | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ['Natural Sciences', 'Geosciences (including G... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 14 | \n",
+ " 10|re3data_____::ec1ba1674c852466c266acb64c618d15 | \n",
+ " https://www.psycharchives.org/ | \n",
+ " Psycharchives | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ['Humanities and Social Sciences', 'Psychology... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 19 | \n",
+ " 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 | \n",
+ " https://www.ihfc-iugg.org/products/global-heat... | \n",
+ " The Global Heat Flow Database of the Internati... | \n",
+ " International Heat-flow Database | \n",
+ " NaN | \n",
+ " 0.0 | \n",
+ " 0.0 | \n",
+ " ['Natural Sciences', 'Geology and Palaeontolog... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id \\\n",
+ "0 4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d \n",
+ "1 7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc \n",
+ "2 13 10|re3data_____::59521daca59ac29b811343cc4cd370cf \n",
+ "3 14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 \n",
+ "4 19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 \n",
+ "\n",
+ " url \\\n",
+ "0 http://planttfdb.cbi.pku.edu.cn/ \n",
+ "1 https://spdf.gsfc.nasa.gov/ \n",
+ "2 http://card.westgis.ac.cn/ \n",
+ "3 https://www.psycharchives.org/ \n",
+ "4 https://www.ihfc-iugg.org/products/global-heat... \n",
+ "\n",
+ " official_name \\\n",
+ "0 Plant Transcription Factor Database \n",
+ "1 Space Physics Data Facility \n",
+ "2 Cold and Arid Regions Science Data Center at L... \n",
+ "3 Psycharchives \n",
+ "4 The Global Heat Flow Database of the Internati... \n",
+ "\n",
+ " english_name description latitude \\\n",
+ "0 PlantTFDB NaN 0.0 \n",
+ "1 NASA's Space Physics Data Facility SPDF NaN 0.0 \n",
+ "2 CARD WDC for Glaciology and Geocryology World ... NaN 0.0 \n",
+ "3 NaN NaN 0.0 \n",
+ "4 International Heat-flow Database NaN 0.0 \n",
+ "\n",
+ " longitude subjects \n",
+ "0 0.0 ['Life Sciences', 'Basic Biological and Medica... \n",
+ "1 0.0 ['Natural Sciences', 'Astrophysics and Astrono... \n",
+ "2 0.0 ['Natural Sciences', 'Geosciences (including G... \n",
+ "3 0.0 ['Humanities and Social Sciences', 'Psychology... \n",
+ "4 0.0 ['Natural Sciences', 'Geology and Palaeontolog... "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The CSV holds several sources; keep only the rows whose id mentions re3data.\n",
+ "re3data_df = (\n",
+ "    pd.read_csv('../data/raw/re3data_opendoar.csv')\n",
+ "    .loc[lambda frame: frame.id.str.contains('re3data')]\n",
+ "    .reset_index()  # no drop=True: the CSV row number survives as an 'index' column\n",
+ ")\n",
+ "re3data_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id | \n",
+ " url | \n",
+ " official_name | \n",
+ " english_name | \n",
+ " description | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2693.000000 | \n",
+ " 2693 | \n",
+ " 2673 | \n",
+ " 2693 | \n",
+ " 2034 | \n",
+ " 38 | \n",
+ " 2693.000000 | \n",
+ " 2693.000000 | \n",
+ " 2693 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 2693 | \n",
+ " 2661 | \n",
+ " 2668 | \n",
+ " 2010 | \n",
+ " 38 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1427 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " 10|re3data_____::fc8141eebc533cb225498718479f4e66 | \n",
+ " http://wdcpc.org/ | \n",
+ " European Climate Assessment & Dataset project | \n",
+ " ECA&D | \n",
+ " The Atmospheric Science Data Center (ASDC) at ... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " ['Humanities and Social Sciences', 'Life Scien... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 209 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 4443.650947 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.114497 | \n",
+ " 0.067998 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2518.294468 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 4.585469 | \n",
+ " 2.447173 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 4.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 2266.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4506.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 6660.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 8705.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 234.000000 | \n",
+ " 123.000000 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id \\\n",
+ "count 2693.000000 2693 \n",
+ "unique NaN 2693 \n",
+ "top NaN 10|re3data_____::fc8141eebc533cb225498718479f4e66 \n",
+ "freq NaN 1 \n",
+ "mean 4443.650947 NaN \n",
+ "std 2518.294468 NaN \n",
+ "min 4.000000 NaN \n",
+ "25% 2266.000000 NaN \n",
+ "50% 4506.000000 NaN \n",
+ "75% 6660.000000 NaN \n",
+ "max 8705.000000 NaN \n",
+ "\n",
+ " url official_name \\\n",
+ "count 2673 2693 \n",
+ "unique 2661 2668 \n",
+ "top http://wdcpc.org/ European Climate Assessment & Dataset project \n",
+ "freq 2 2 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " english_name description \\\n",
+ "count 2034 38 \n",
+ "unique 2010 38 \n",
+ "top ECA&D The Atmospheric Science Data Center (ASDC) at ... \n",
+ "freq 2 1 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " latitude longitude \\\n",
+ "count 2693.000000 2693.000000 \n",
+ "unique NaN NaN \n",
+ "top NaN NaN \n",
+ "freq NaN NaN \n",
+ "mean 0.114497 0.067998 \n",
+ "std 4.585469 2.447173 \n",
+ "min 0.000000 0.000000 \n",
+ "25% 0.000000 0.000000 \n",
+ "50% 0.000000 0.000000 \n",
+ "75% 0.000000 0.000000 \n",
+ "max 234.000000 123.000000 \n",
+ "\n",
+ " subjects \n",
+ "count 2693 \n",
+ "unique 1427 \n",
+ "top ['Humanities and Social Sciences', 'Life Scien... \n",
+ "freq 209 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# include='all' reports the object columns alongside the numeric ones.\n",
+ "# NOTE(review): in the saved output latitude peaks at 234, outside [-90, 90] - confirm whether such rows need cleaning.\n",
+ "re3data_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**OpenDOAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id | \n",
+ " url | \n",
+ " official_name | \n",
+ " english_name | \n",
+ " description | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 10|opendoar____::e833e042f509c996b1b25324d56659fb | \n",
+ " http://www.bilbao.net/bld | \n",
+ " BLD - Bilboko Liburutegi Digitala | \n",
+ " BLD - Bilboko Liburutegi Digitala | \n",
+ " BLD is a repository of digital documents, desi... | \n",
+ " 43.256699 | \n",
+ " -2.924100 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 10|opendoar____::f621585df244e9596dc70a39b579efb1 | \n",
+ " https://researchdirect.westernsydney.edu.au/ | \n",
+ " Western Sydney ResearchDirect | \n",
+ " Western Sydney ResearchDirect | \n",
+ " NaN | \n",
+ " 0.000000 | \n",
+ " 0.000000 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 | \n",
+ " http://redress.lancs.ac.uk/Learning_Space/ | \n",
+ " Learning Space Catalogue | \n",
+ " NaN | \n",
+ " This repository is a Social Science e-Science ... | \n",
+ " 54.010760 | \n",
+ " -2.784990 | \n",
+ " ['Social Sciences General', 'Science General',... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e | \n",
+ " http://digitallibrary.usc.edu/search/controlle... | \n",
+ " USC Digital Library | \n",
+ " USC Digital Library | \n",
+ " This is an institutional repository providing ... | \n",
+ " 34.052200 | \n",
+ " -118.242996 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 | \n",
+ " http://www.ufgd.edu.br:8080/jspui/ | \n",
+ " Repositório de Divulgação das Produções Cientí... | \n",
+ " Repositório de Divulgação das Produções Cientí... | \n",
+ " This site provides access to the research outp... | \n",
+ " -22.221800 | \n",
+ " -54.806400 | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id \\\n",
+ "0 0 10|opendoar____::e833e042f509c996b1b25324d56659fb \n",
+ "1 1 10|opendoar____::f621585df244e9596dc70a39b579efb1 \n",
+ "2 2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 \n",
+ "3 3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e \n",
+ "4 5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 \n",
+ "\n",
+ " url \\\n",
+ "0 http://www.bilbao.net/bld \n",
+ "1 https://researchdirect.westernsydney.edu.au/ \n",
+ "2 http://redress.lancs.ac.uk/Learning_Space/ \n",
+ "3 http://digitallibrary.usc.edu/search/controlle... \n",
+ "4 http://www.ufgd.edu.br:8080/jspui/ \n",
+ "\n",
+ " official_name \\\n",
+ "0 BLD - Bilboko Liburutegi Digitala \n",
+ "1 Western Sydney ResearchDirect \n",
+ "2 Learning Space Catalogue \n",
+ "3 USC Digital Library \n",
+ "4 Repositório de Divulgação das Produções Cientí... \n",
+ "\n",
+ " english_name \\\n",
+ "0 BLD - Bilboko Liburutegi Digitala \n",
+ "1 Western Sydney ResearchDirect \n",
+ "2 NaN \n",
+ "3 USC Digital Library \n",
+ "4 Repositório de Divulgação das Produções Cientí... \n",
+ "\n",
+ " description latitude longitude \\\n",
+ "0 BLD is a repository of digital documents, desi... 43.256699 -2.924100 \n",
+ "1 NaN 0.000000 0.000000 \n",
+ "2 This repository is a Social Science e-Science ... 54.010760 -2.784990 \n",
+ "3 This is an institutional repository providing ... 34.052200 -118.242996 \n",
+ "4 This site provides access to the research outp... -22.221800 -54.806400 \n",
+ "\n",
+ " subjects \n",
+ "0 [] \n",
+ "1 [] \n",
+ "2 ['Social Sciences General', 'Science General',... \n",
+ "3 [] \n",
+ "4 [] "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Same CSV as the re3data load; keep only the rows whose id mentions opendoar.\n",
+ "opendoar_df = (\n",
+ "    pd.read_csv('../data/raw/re3data_opendoar.csv')\n",
+ "    .loc[lambda frame: frame.id.str.contains('opendoar')]\n",
+ "    .reset_index()  # no drop=True: the CSV row number survives as an 'index' column\n",
+ ")\n",
+ "opendoar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " index | \n",
+ " id | \n",
+ " url | \n",
+ " official_name | \n",
+ " english_name | \n",
+ " description | \n",
+ " latitude | \n",
+ " longitude | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 6014.000000 | \n",
+ " 6014 | \n",
+ " 6013 | \n",
+ " 6014 | \n",
+ " 5500 | \n",
+ " 5776 | \n",
+ " 6014.000000 | \n",
+ " 6014.000000 | \n",
+ " 6014 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 6014 | \n",
+ " 5953 | \n",
+ " 5946 | \n",
+ " 5413 | \n",
+ " 4920 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 201 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " 10|opendoar____::17256f049f1e3fede17c7a313f7657f4 | \n",
+ " http://harp.lib.hiroshima-u.ac.jp/ | \n",
+ " Hiroshima Associated Repository Portal | \n",
+ " AURA | \n",
+ " This site provides access to the research outp... | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " [] | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 98 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 5273 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 4312.407549 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 38.649393 | \n",
+ " 7.810948 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2510.699848 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 788.406173 | \n",
+ " 71.689788 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 0.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " -79.029999 | \n",
+ " -683.103027 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 2129.250000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 4.644632 | \n",
+ " -49.273300 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4297.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 37.930449 | \n",
+ " 4.788870 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 6476.750000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 47.294400 | \n",
+ " 30.685501 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 8706.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 61138.800781 | \n",
+ " 178.438995 | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " index id \\\n",
+ "count 6014.000000 6014 \n",
+ "unique NaN 6014 \n",
+ "top NaN 10|opendoar____::17256f049f1e3fede17c7a313f7657f4 \n",
+ "freq NaN 1 \n",
+ "mean 4312.407549 NaN \n",
+ "std 2510.699848 NaN \n",
+ "min 0.000000 NaN \n",
+ "25% 2129.250000 NaN \n",
+ "50% 4297.000000 NaN \n",
+ "75% 6476.750000 NaN \n",
+ "max 8706.000000 NaN \n",
+ "\n",
+ " url \\\n",
+ "count 6013 \n",
+ "unique 5953 \n",
+ "top http://harp.lib.hiroshima-u.ac.jp/ \n",
+ "freq 3 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN \n",
+ "\n",
+ " official_name english_name \\\n",
+ "count 6014 5500 \n",
+ "unique 5946 5413 \n",
+ "top Hiroshima Associated Repository Portal AURA \n",
+ "freq 3 4 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN \n",
+ "\n",
+ " description latitude \\\n",
+ "count 5776 6014.000000 \n",
+ "unique 4920 NaN \n",
+ "top This site provides access to the research outp... NaN \n",
+ "freq 98 NaN \n",
+ "mean NaN 38.649393 \n",
+ "std NaN 788.406173 \n",
+ "min NaN -79.029999 \n",
+ "25% NaN 4.644632 \n",
+ "50% NaN 37.930449 \n",
+ "75% NaN 47.294400 \n",
+ "max NaN 61138.800781 \n",
+ "\n",
+ " longitude subjects \n",
+ "count 6014.000000 6014 \n",
+ "unique NaN 201 \n",
+ "top NaN [] \n",
+ "freq NaN 5273 \n",
+ "mean 7.810948 NaN \n",
+ "std 71.689788 NaN \n",
+ "min -683.103027 NaN \n",
+ "25% -49.273300 NaN \n",
+ "50% 4.788870 NaN \n",
+ "75% 30.685501 NaN \n",
+ "max 178.438995 NaN "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# include='all' reports the object columns alongside the numeric ones.\n",
+ "# NOTE(review): the saved output shows latitude up to 61138.8 and longitude down to -683.1,\n",
+ "# which are not valid coordinates - confirm whether such rows need cleaning.\n",
+ "opendoar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic cleaning"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**re3data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Rows whose coordinates are exactly (0.0, 0.0) get both fields set to NaN.\n",
+ "re3data_df.loc[\n",
+ "    re3data_df.latitude.eq(0.0) & re3data_df.longitude.eq(0.0),\n",
+ "    ['latitude', 'longitude'],\n",
+ "] = np.nan"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 ['Life Sciences', 'Basic Biological and Medica...\n",
+ "1 ['Natural Sciences', 'Astrophysics and Astrono...\n",
+ "2 ['Natural Sciences', 'Geosciences (including G...\n",
+ "3 ['Humanities and Social Sciences', 'Psychology...\n",
+ "4 ['Natural Sciences', 'Geology and Palaeontolog...\n",
+ " ... \n",
+ "2688 ['Life Sciences', 'Basic Biological and Medica...\n",
+ "2689 ['Natural Sciences', 'Atmospheric Science and ...\n",
+ "2690 ['Natural Sciences', 'Atmospheric Science and ...\n",
+ "2691 ['Natural Sciences', 'Atmospheric Science and ...\n",
+ "2692 ['Life Sciences', 'Plant Sciences', 'Plant Gen...\n",
+ "Name: subjects, Length: 2693, dtype: object"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The saved output shows quoted items: each value is still a string holding a\n",
+ "# list literal at this point (it is parsed in the next cell).\n",
+ "re3data_df.subjects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parse every list literal into a real Python list.\n",
+ "re3data_df['subjects'] = re3data_df['subjects'].apply(ast.literal_eval)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def merge_lists(lists):\n",
+ "    \"\"\"Concatenate an iterable of lists into one flat list, preserving order.\n",
+ "\n",
+ "    Args:\n",
+ "        lists: Iterable whose items are lists (for example a pandas Series of\n",
+ "            lists, as passed by ``groupby(...).apply``).\n",
+ "\n",
+ "    Returns:\n",
+ "        A new list holding every element of every inner list, in order.\n",
+ "    \"\"\"\n",
+ "    # One pass instead of rebuilding the accumulator with ``res + l`` each time,\n",
+ "    # which copied the whole result on every iteration.\n",
+ "    return [item for sublist in lists for item in sublist]\n",
+ "\n",
+ "# explode: one row per subject label; str.split: break compound labels on ',' or ' and '.\n",
+ "# The row-wise apply drops the NaN padding added by expand=True, and the final\n",
+ "# groupby on the original row index merges the pieces back into one list per row.\n",
+ "# NOTE(review): the split pattern does not strip whitespace around the pieces - confirm labels stay clean.\n",
+ "re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n",
+ " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n",
+ " .reset_index()\\\n",
+ " .groupby('index')[0].apply(lambda x: merge_lists(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "index\n",
+ "0 [Life Sciences, Basic Biological, Medical Rese...\n",
+ "1 [Natural Sciences, Astrophysics, Astronomy, Ph...\n",
+ "2 [Natural Sciences, Geosciences (including Geog...\n",
+ "3 [Humanities, Social Sciences, Psychology, Soci...\n",
+ "4 [Natural Sciences, Geology, Palaeontology, Geo...\n",
+ " ... \n",
+ "2688 [Life Sciences, Basic Biological, Medical Rese...\n",
+ "2689 [Natural Sciences, Atmospheric Science, Oceano...\n",
+ "2690 [Natural Sciences, Atmospheric Science, Oceano...\n",
+ "2691 [Natural Sciences, Atmospheric Science, Oceano...\n",
+ "2692 [Life Sciences, Plant Sciences, Plant Genetics...\n",
+ "Name: 0, Length: 2693, dtype: object"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect the result: one flat list per row, with compound labels split apart.\n",
+ "re3data_cleaned_subjects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Index-aligned join; the cleaned Series is named 0, so the new column is labelled 0.\n",
+ "re3data_df = re3data_df.join(re3data_cleaned_subjects)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Swap the raw subjects column for the cleaned one (joined under the label 0).\n",
+ "re3data_df = (\n",
+ "    re3data_df\n",
+ "    .drop(columns=['subjects'])\n",
+ "    .rename(columns={0: 'subjects'})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**OpenDOAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0 []\n",
+ "1 []\n",
+ "2 ['Social Sciences General', 'Science General',...\n",
+ "3 []\n",
+ "4 []\n",
+ " ... \n",
+ "6009 ['Multidisciplinary']\n",
+ "6010 []\n",
+ "6011 ['Business and Economics']\n",
+ "6012 ['Earth and Planetary Sciences', 'Ecology and ...\n",
+ "6013 []\n",
+ "Name: subjects, Length: 6014, dtype: object"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# The saved output shows quoted items: each value is still a string holding a\n",
+ "# list literal at this point (it is parsed in the next cell).\n",
+ "opendoar_df.subjects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Parse every list literal into a real Python list.\n",
+ "opendoar_df['subjects'] = opendoar_df['subjects'].apply(ast.literal_eval)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# explode: one row per subject label; str.split: break compound labels on ',' or ' and '.\n",
+ "# The row-wise apply drops the NaN padding added by expand=True, and the final\n",
+ "# groupby on the original row index merges the pieces back into one list per row.\n",
+ "# NOTE(review): the split pattern does not strip whitespace around the pieces - confirm labels stay clean.\n",
+ "opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n",
+ " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n",
+ " .reset_index()\\\n",
+ " .groupby('index')[0].apply(lambda x: merge_lists(x))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "index\n",
+ "0 []\n",
+ "1 []\n",
+ "2 [Social Sciences General, Science General, Com...\n",
+ "3 []\n",
+ "4 []\n",
+ " ... \n",
+ "6009 [Multidisciplinary]\n",
+ "6010 []\n",
+ "6011 [Business, Economics]\n",
+ "6012 [Earth, Planetary Sciences, Ecology, Environme...\n",
+ "6013 []\n",
+ "Name: 0, Length: 6014, dtype: object"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect the result: one flat list per row, with compound labels split apart.\n",
+ "opendoar_cleaned_subjects"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 20,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Index-aligned join; the cleaned Series is named 0, so the new column is labelled 0.\n",
+ "opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 21,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Swap the raw subjects column for the cleaned one (joined under the label 0).\n",
+ "opendoar_df = (\n",
+ "    opendoar_df\n",
+ "    .drop(columns=['subjects'])\n",
+ "    .rename(columns={0: 'subjects'})\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Subjects analysis"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One row per (repository, subject) pair for each of the three datasets.\n",
+ "fairsharing_subjects, re3data_subjects, opendoar_subjects = (\n",
+ "    frame.explode('subjects')\n",
+ "    for frame in (fairsharing_df, re3data_df, opendoar_df)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
"metadata": {},
"outputs": [
{
@@ -357,12 +1481,11 @@
"data": {
"application/vnd.plotly.v1+json": {
"config": {
- "linkText": "Export to plot.ly",
- "plotlyServerURL": "https://plot.ly",
- "showLink": false
+ "plotlyServerURL": "https://plot.ly"
},
"data": [
{
+ "name": "FAIRsharing",
"type": "bar",
"x": [
"Life Science",
@@ -988,2455 +2111,11 @@
1,
1
]
- }
- ],
- "layout": {
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "#2a3f5f"
- },
- "error_y": {
- "color": "#2a3f5f"
- },
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "baxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "heatmapgl": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmapgl"
- }
- ],
- "histogram": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "#EBF0F8"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "#C8D4E3"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowcolor": "#2a3f5f",
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "#8e0152"
- ],
- [
- 0.1,
- "#c51b7d"
- ],
- [
- 0.2,
- "#de77ae"
- ],
- [
- 0.3,
- "#f1b6da"
- ],
- [
- 0.4,
- "#fde0ef"
- ],
- [
- 0.5,
- "#f7f7f7"
- ],
- [
- 0.6,
- "#e6f5d0"
- ],
- [
- 0.7,
- "#b8e186"
- ],
- [
- 0.8,
- "#7fbc41"
- ],
- [
- 0.9,
- "#4d9221"
- ],
- [
- 1,
- "#276419"
- ]
- ],
- "sequential": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ]
- },
- "colorway": [
- "#636efa",
- "#EF553B",
- "#00cc96",
- "#ab63fa",
- "#FFA15A",
- "#19d3f3",
- "#FF6692",
- "#B6E880",
- "#FF97FF",
- "#FECB52"
- ],
- "font": {
- "color": "#2a3f5f"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "#E5ECF6",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "white"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "#E5ECF6",
- "polar": {
- "angularaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "radialaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "yaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "zaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- }
- },
- "shapedefaults": {
- "line": {
- "color": "#2a3f5f"
- }
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "baxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "caxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- }
- }
},
- "title": {
- "text": "Fairsharing subject coverage"
- },
- "xaxis": {
- "tickangle": 45,
- "tickfont": {
- "size": 12
- }
- }
- }
- },
- "text/html": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "fairsharing_subjects = fairsharing_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n",
- "\n",
- "data = [\n",
- " go.Bar(\n",
- " x=fairsharing_subjects.index,\n",
- " y=fairsharing_subjects['url']\n",
- " )\n",
- "]\n",
- "\n",
- "layout = go.Layout(\n",
- " title='Fairsharing subject coverage',\n",
- " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
- ")\n",
- "fig = go.Figure(data=data, layout=layout)\n",
- "plotly.offline.iplot(fig)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 5,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.plotly.v1+json": {
- "config": {
- "linkText": "Export to plot.ly",
- "plotlyServerURL": "https://plot.ly",
- "showLink": false
- },
- "data": [
- {
- "type": "bar",
- "x": [
- "United States",
- "United Kingdom",
- "Germany",
- "France",
- "Switzerland",
- "China",
- "Netherlands",
- "Italy",
- "Canada",
- "Belgium",
- "Spain",
- "Japan",
- "Sweden",
- "Czech Republic",
- "Norway",
- "Denmark",
- "European Union",
- "Austria",
- "Finland",
- "Republic of Ireland",
- "Australia",
- "Israel",
- "Portugal",
- "Hungary",
- "Greece",
- "Malta",
- "Lithuania",
- "Slovakia",
- "Iceland",
- "Luxembourg",
- "Montenegro",
- "Croatia",
- "Worldwide",
- "India",
- "Poland",
- "Singapore",
- "South Korea",
- "Russia",
- "South Africa",
- "Taiwan",
- "Brazil",
- "New Zealand",
- "Mexico",
- "Saudi Arabia",
- "Bulgaria",
- "Hong Kong",
- "Argentina",
- "Turkey",
- "Cyprus",
- "Morocco",
- "Uganda",
- "Estonia",
- "Romania",
- "Thailand",
- "Pakistan",
- "Costa Rica",
- "Uruguay",
- "United Arab Emirates",
- "Togo",
- "Antarctica",
- "Panama",
- "Honduras",
- "Benin",
- "Cameroon",
- "Chile",
- "Colombia",
- "Egypt",
- "El Salvador",
- "Ethiopia",
- "Faroe Islands",
- "Greenland",
- "Indonesia",
- "Nigeria",
- "Kenya",
- "Latvia",
- "Madagascar",
- "Malawi",
- "Mali",
- "Mauritania",
- "Mozambique",
- "Nicaragua",
- "Niger",
- "Zimbabwe"
- ],
- "y": [
- 686,
- 248,
- 192,
- 162,
- 114,
- 99,
- 96,
- 91,
- 86,
- 83,
- 83,
- 80,
- 76,
- 71,
- 69,
- 67,
- 66,
- 64,
- 63,
- 62,
- 62,
- 61,
- 60,
- 59,
- 58,
- 53,
- 52,
- 52,
- 52,
- 52,
- 51,
- 51,
- 49,
- 32,
- 11,
- 10,
- 10,
- 9,
- 9,
- 8,
- 8,
- 8,
- 8,
- 6,
- 3,
- 3,
- 3,
- 3,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 2,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1,
- 1
- ]
- }
- ],
- "layout": {
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "#2a3f5f"
- },
- "error_y": {
- "color": "#2a3f5f"
- },
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "baxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "heatmapgl": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmapgl"
- }
- ],
- "histogram": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "#EBF0F8"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "#C8D4E3"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowcolor": "#2a3f5f",
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "#8e0152"
- ],
- [
- 0.1,
- "#c51b7d"
- ],
- [
- 0.2,
- "#de77ae"
- ],
- [
- 0.3,
- "#f1b6da"
- ],
- [
- 0.4,
- "#fde0ef"
- ],
- [
- 0.5,
- "#f7f7f7"
- ],
- [
- 0.6,
- "#e6f5d0"
- ],
- [
- 0.7,
- "#b8e186"
- ],
- [
- 0.8,
- "#7fbc41"
- ],
- [
- 0.9,
- "#4d9221"
- ],
- [
- 1,
- "#276419"
- ]
- ],
- "sequential": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ]
- },
- "colorway": [
- "#636efa",
- "#EF553B",
- "#00cc96",
- "#ab63fa",
- "#FFA15A",
- "#19d3f3",
- "#FF6692",
- "#B6E880",
- "#FF97FF",
- "#FECB52"
- ],
- "font": {
- "color": "#2a3f5f"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "#E5ECF6",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "white"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "#E5ECF6",
- "polar": {
- "angularaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "radialaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "yaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "zaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- }
- },
- "shapedefaults": {
- "line": {
- "color": "#2a3f5f"
- }
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "baxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "caxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- }
- }
- },
- "title": {
- "text": "Fairsharing country coverage"
- },
- "xaxis": {
- "tickangle": 45,
- "tickfont": {
- "size": 12
- }
- }
- }
- },
- "text/html": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "fairsharing_countries = fairsharing_df.explode('countries').groupby('countries')[['url']].count().sort_values('url', ascending=False)\n",
- "\n",
- "data = [\n",
- " go.Bar(\n",
- " x=fairsharing_countries.index,\n",
- " y=fairsharing_countries['url']\n",
- " )\n",
- "]\n",
- "\n",
- "layout = go.Layout(\n",
- " title='Fairsharing country coverage',\n",
- " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
- ")\n",
- "fig = go.Figure(data=data, layout=layout)\n",
- "plotly.offline.iplot(fig)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# re3data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 6,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " url | \n",
- " official_name | \n",
- " english_name | \n",
- " description | \n",
- " latitude | \n",
- " longitude | \n",
- " subjects | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 4 | \n",
- " 10|re3data_____::3f2e20af26ead0432f5470d8b739638d | \n",
- " http://planttfdb.cbi.pku.edu.cn/ | \n",
- " Plant Transcription Factor Database | \n",
- " PlantTFDB | \n",
- " NaN | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ['Life Sciences', 'Basic Biological and Medica... | \n",
- "
\n",
- " \n",
- " 7 | \n",
- " 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc | \n",
- " https://spdf.gsfc.nasa.gov/ | \n",
- " Space Physics Data Facility | \n",
- " NASA's Space Physics Data Facility SPDF | \n",
- " NaN | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ['Natural Sciences', 'Astrophysics and Astrono... | \n",
- "
\n",
- " \n",
- " 13 | \n",
- " 10|re3data_____::59521daca59ac29b811343cc4cd370cf | \n",
- " http://card.westgis.ac.cn/ | \n",
- " Cold and Arid Regions Science Data Center at L... | \n",
- " CARD WDC for Glaciology and Geocryology World ... | \n",
- " NaN | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ['Natural Sciences', 'Geosciences (including G... | \n",
- "
\n",
- " \n",
- " 14 | \n",
- " 10|re3data_____::ec1ba1674c852466c266acb64c618d15 | \n",
- " https://www.psycharchives.org/ | \n",
- " Psycharchives | \n",
- " NaN | \n",
- " NaN | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ['Humanities and Social Sciences', 'Psychology... | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 | \n",
- " https://www.ihfc-iugg.org/products/global-heat... | \n",
- " The Global Heat Flow Database of the Internati... | \n",
- " International Heat-flow Database | \n",
- " NaN | \n",
- " 0.0 | \n",
- " 0.0 | \n",
- " ['Natural Sciences', 'Geology and Palaeontolog... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id \\\n",
- "4 10|re3data_____::3f2e20af26ead0432f5470d8b739638d \n",
- "7 10|re3data_____::e1db3f9d2fa6c8d8067bc471ab50bdfc \n",
- "13 10|re3data_____::59521daca59ac29b811343cc4cd370cf \n",
- "14 10|re3data_____::ec1ba1674c852466c266acb64c618d15 \n",
- "19 10|re3data_____::2ada591fb1bc9aee72a6d3e0c1ae8a76 \n",
- "\n",
- " url \\\n",
- "4 http://planttfdb.cbi.pku.edu.cn/ \n",
- "7 https://spdf.gsfc.nasa.gov/ \n",
- "13 http://card.westgis.ac.cn/ \n",
- "14 https://www.psycharchives.org/ \n",
- "19 https://www.ihfc-iugg.org/products/global-heat... \n",
- "\n",
- " official_name \\\n",
- "4 Plant Transcription Factor Database \n",
- "7 Space Physics Data Facility \n",
- "13 Cold and Arid Regions Science Data Center at L... \n",
- "14 Psycharchives \n",
- "19 The Global Heat Flow Database of the Internati... \n",
- "\n",
- " english_name description latitude \\\n",
- "4 PlantTFDB NaN 0.0 \n",
- "7 NASA's Space Physics Data Facility SPDF NaN 0.0 \n",
- "13 CARD WDC for Glaciology and Geocryology World ... NaN 0.0 \n",
- "14 NaN NaN 0.0 \n",
- "19 International Heat-flow Database NaN 0.0 \n",
- "\n",
- " longitude subjects \n",
- "4 0.0 ['Life Sciences', 'Basic Biological and Medica... \n",
- "7 0.0 ['Natural Sciences', 'Astrophysics and Astrono... \n",
- "13 0.0 ['Natural Sciences', 'Geosciences (including G... \n",
- "14 0.0 ['Humanities and Social Sciences', 'Psychology... \n",
- "19 0.0 ['Natural Sciences', 'Geology and Palaeontolog... "
- ]
- },
- "execution_count": 6,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n",
- "re3data_df = re3data_df[re3data_df.id.str.contains('re3data')]\n",
- "re3data_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 7,
- "metadata": {},
- "outputs": [],
- "source": [
- "re3data_df.loc[(re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0), ['latitude', 'longitude']] = [np.nan, np.nan]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 8,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "4 ['Life Sciences', 'Basic Biological and Medica...\n",
- "7 ['Natural Sciences', 'Astrophysics and Astrono...\n",
- "13 ['Natural Sciences', 'Geosciences (including G...\n",
- "14 ['Humanities and Social Sciences', 'Psychology...\n",
- "19 ['Natural Sciences', 'Geology and Palaeontolog...\n",
- " ... \n",
- "8693 ['Life Sciences', 'Basic Biological and Medica...\n",
- "8695 ['Natural Sciences', 'Atmospheric Science and ...\n",
- "8697 ['Natural Sciences', 'Atmospheric Science and ...\n",
- "8699 ['Natural Sciences', 'Atmospheric Science and ...\n",
- "8705 ['Life Sciences', 'Plant Sciences', 'Plant Gen...\n",
- "Name: subjects, Length: 2693, dtype: object"
- ]
- },
- "execution_count": 8,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "re3data_df.subjects"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 9,
- "metadata": {},
- "outputs": [],
- "source": [
- "re3data_df['subjects'] = re3data_df.subjects.apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 10,
- "metadata": {},
- "outputs": [],
- "source": [
- "def merge_lists(lists):\n",
- " res = []\n",
- " for l in lists:\n",
- " res = res + l\n",
- " return res\n",
- "\n",
- "re3data_cleaned_subjects = re3data_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n",
- " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n",
- " .reset_index()\\\n",
- " .groupby('index')[0].apply(lambda x: merge_lists(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "index\n",
- "4 [Life Sciences, Basic Biological, Medical Rese...\n",
- "7 [Natural Sciences, Astrophysics, Astronomy, Ph...\n",
- "13 [Natural Sciences, Geosciences (including Geog...\n",
- "14 [Humanities, Social Sciences, Psychology, Soci...\n",
- "19 [Natural Sciences, Geology, Palaeontology, Geo...\n",
- " ... \n",
- "8693 [Life Sciences, Basic Biological, Medical Rese...\n",
- "8695 [Natural Sciences, Atmospheric Science, Oceano...\n",
- "8697 [Natural Sciences, Atmospheric Science, Oceano...\n",
- "8699 [Natural Sciences, Atmospheric Science, Oceano...\n",
- "8705 [Life Sciences, Plant Sciences, Plant Genetics...\n",
- "Name: 0, Length: 2693, dtype: object"
- ]
- },
- "execution_count": 11,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "re3data_cleaned_subjects"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {},
- "outputs": [],
- "source": [
- "re3data_df = re3data_df.join(re3data_cleaned_subjects)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 13,
- "metadata": {},
- "outputs": [],
- "source": [
- "re3data_df.drop(columns=['subjects'], inplace=True)\n",
- "re3data_df.rename(columns={0:'subjects'}, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " url | \n",
- " official_name | \n",
- " english_name | \n",
- " description | \n",
- " latitude | \n",
- " longitude | \n",
- " subjects | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 2693 | \n",
- " 2673 | \n",
- " 2693 | \n",
- " 2034 | \n",
- " 38 | \n",
- " 5.000000 | \n",
- " 5.000000 | \n",
- " 2693 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 2693 | \n",
- " 2661 | \n",
- " 2668 | \n",
- " 2010 | \n",
- " 38 | \n",
- " NaN | \n",
- " NaN | \n",
- " 1427 | \n",
- "
\n",
- " \n",
- " top | \n",
- " 10|re3data_____::e59f89142e8d47d32523c53a9137f07b | \n",
- " http://iubio.bio.indiana.edu/ | \n",
- " IUBio-Archive | \n",
- " Research Data Repository | \n",
- " IUBio Archive is an archive of biology data an... | \n",
- " NaN | \n",
- " NaN | \n",
- " [Humanities, Social Sciences, Life Sciences, N... | \n",
- "
\n",
- " \n",
- " freq | \n",
- " 1 | \n",
- " 2 | \n",
- " 2 | \n",
- " 2 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " 209 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 61.668113 | \n",
- " 36.623678 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " std | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 96.984457 | \n",
- " 48.547521 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " min | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 12.123000 | \n",
- " 12.123000 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 12.123000 | \n",
- " 12.123400 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 12.123400 | \n",
- " 12.123400 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 37.971163 | \n",
- " 23.748590 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " max | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 234.000000 | \n",
- " 123.000000 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id \\\n",
- "count 2693 \n",
- "unique 2693 \n",
- "top 10|re3data_____::e59f89142e8d47d32523c53a9137f07b \n",
- "freq 1 \n",
- "mean NaN \n",
- "std NaN \n",
- "min NaN \n",
- "25% NaN \n",
- "50% NaN \n",
- "75% NaN \n",
- "max NaN \n",
- "\n",
- " url official_name \\\n",
- "count 2673 2693 \n",
- "unique 2661 2668 \n",
- "top http://iubio.bio.indiana.edu/ IUBio-Archive \n",
- "freq 2 2 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
- "\n",
- " english_name \\\n",
- "count 2034 \n",
- "unique 2010 \n",
- "top Research Data Repository \n",
- "freq 2 \n",
- "mean NaN \n",
- "std NaN \n",
- "min NaN \n",
- "25% NaN \n",
- "50% NaN \n",
- "75% NaN \n",
- "max NaN \n",
- "\n",
- " description latitude \\\n",
- "count 38 5.000000 \n",
- "unique 38 NaN \n",
- "top IUBio Archive is an archive of biology data an... NaN \n",
- "freq 1 NaN \n",
- "mean NaN 61.668113 \n",
- "std NaN 96.984457 \n",
- "min NaN 12.123000 \n",
- "25% NaN 12.123000 \n",
- "50% NaN 12.123400 \n",
- "75% NaN 37.971163 \n",
- "max NaN 234.000000 \n",
- "\n",
- " longitude subjects \n",
- "count 5.000000 2693 \n",
- "unique NaN 1427 \n",
- "top NaN [Humanities, Social Sciences, Life Sciences, N... \n",
- "freq NaN 209 \n",
- "mean 36.623678 NaN \n",
- "std 48.547521 NaN \n",
- "min 12.123000 NaN \n",
- "25% 12.123400 NaN \n",
- "50% 12.123400 NaN \n",
- "75% 23.748590 NaN \n",
- "max 123.000000 NaN "
- ]
- },
- "execution_count": 14,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "re3data_df.describe(include='all')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 15,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.plotly.v1+json": {
- "config": {
- "linkText": "Export to plot.ly",
- "plotlyServerURL": "https://plot.ly",
- "showLink": false
- },
- "data": [
{
+ "name": "re3data",
"type": "bar",
+ "visible": "legendonly",
"x": [
"Life Sciences",
"Natural Sciences",
@@ -4267,1391 +2946,11 @@
1,
1
]
- }
- ],
- "layout": {
- "template": {
- "data": {
- "bar": [
- {
- "error_x": {
- "color": "#2a3f5f"
- },
- "error_y": {
- "color": "#2a3f5f"
- },
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- }
- },
- "type": "bar"
- }
- ],
- "barpolar": [
- {
- "marker": {
- "line": {
- "color": "#E5ECF6",
- "width": 0.5
- }
- },
- "type": "barpolar"
- }
- ],
- "carpet": [
- {
- "aaxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "baxis": {
- "endlinecolor": "#2a3f5f",
- "gridcolor": "white",
- "linecolor": "white",
- "minorgridcolor": "white",
- "startlinecolor": "#2a3f5f"
- },
- "type": "carpet"
- }
- ],
- "choropleth": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "choropleth"
- }
- ],
- "contour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "contour"
- }
- ],
- "contourcarpet": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "contourcarpet"
- }
- ],
- "heatmap": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmap"
- }
- ],
- "heatmapgl": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "heatmapgl"
- }
- ],
- "histogram": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "histogram"
- }
- ],
- "histogram2d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2d"
- }
- ],
- "histogram2dcontour": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "histogram2dcontour"
- }
- ],
- "mesh3d": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "type": "mesh3d"
- }
- ],
- "parcoords": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "parcoords"
- }
- ],
- "pie": [
- {
- "automargin": true,
- "type": "pie"
- }
- ],
- "scatter": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter"
- }
- ],
- "scatter3d": [
- {
- "line": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatter3d"
- }
- ],
- "scattercarpet": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattercarpet"
- }
- ],
- "scattergeo": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergeo"
- }
- ],
- "scattergl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattergl"
- }
- ],
- "scattermapbox": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scattermapbox"
- }
- ],
- "scatterpolar": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolar"
- }
- ],
- "scatterpolargl": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterpolargl"
- }
- ],
- "scatterternary": [
- {
- "marker": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "type": "scatterternary"
- }
- ],
- "surface": [
- {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- },
- "colorscale": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "type": "surface"
- }
- ],
- "table": [
- {
- "cells": {
- "fill": {
- "color": "#EBF0F8"
- },
- "line": {
- "color": "white"
- }
- },
- "header": {
- "fill": {
- "color": "#C8D4E3"
- },
- "line": {
- "color": "white"
- }
- },
- "type": "table"
- }
- ]
- },
- "layout": {
- "annotationdefaults": {
- "arrowcolor": "#2a3f5f",
- "arrowhead": 0,
- "arrowwidth": 1
- },
- "autotypenumbers": "strict",
- "coloraxis": {
- "colorbar": {
- "outlinewidth": 0,
- "ticks": ""
- }
- },
- "colorscale": {
- "diverging": [
- [
- 0,
- "#8e0152"
- ],
- [
- 0.1,
- "#c51b7d"
- ],
- [
- 0.2,
- "#de77ae"
- ],
- [
- 0.3,
- "#f1b6da"
- ],
- [
- 0.4,
- "#fde0ef"
- ],
- [
- 0.5,
- "#f7f7f7"
- ],
- [
- 0.6,
- "#e6f5d0"
- ],
- [
- 0.7,
- "#b8e186"
- ],
- [
- 0.8,
- "#7fbc41"
- ],
- [
- 0.9,
- "#4d9221"
- ],
- [
- 1,
- "#276419"
- ]
- ],
- "sequential": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ],
- "sequentialminus": [
- [
- 0,
- "#0d0887"
- ],
- [
- 0.1111111111111111,
- "#46039f"
- ],
- [
- 0.2222222222222222,
- "#7201a8"
- ],
- [
- 0.3333333333333333,
- "#9c179e"
- ],
- [
- 0.4444444444444444,
- "#bd3786"
- ],
- [
- 0.5555555555555556,
- "#d8576b"
- ],
- [
- 0.6666666666666666,
- "#ed7953"
- ],
- [
- 0.7777777777777778,
- "#fb9f3a"
- ],
- [
- 0.8888888888888888,
- "#fdca26"
- ],
- [
- 1,
- "#f0f921"
- ]
- ]
- },
- "colorway": [
- "#636efa",
- "#EF553B",
- "#00cc96",
- "#ab63fa",
- "#FFA15A",
- "#19d3f3",
- "#FF6692",
- "#B6E880",
- "#FF97FF",
- "#FECB52"
- ],
- "font": {
- "color": "#2a3f5f"
- },
- "geo": {
- "bgcolor": "white",
- "lakecolor": "white",
- "landcolor": "#E5ECF6",
- "showlakes": true,
- "showland": true,
- "subunitcolor": "white"
- },
- "hoverlabel": {
- "align": "left"
- },
- "hovermode": "closest",
- "mapbox": {
- "style": "light"
- },
- "paper_bgcolor": "white",
- "plot_bgcolor": "#E5ECF6",
- "polar": {
- "angularaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "radialaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "scene": {
- "xaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "yaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- },
- "zaxis": {
- "backgroundcolor": "#E5ECF6",
- "gridcolor": "white",
- "gridwidth": 2,
- "linecolor": "white",
- "showbackground": true,
- "ticks": "",
- "zerolinecolor": "white"
- }
- },
- "shapedefaults": {
- "line": {
- "color": "#2a3f5f"
- }
- },
- "ternary": {
- "aaxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "baxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- },
- "bgcolor": "#E5ECF6",
- "caxis": {
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": ""
- }
- },
- "title": {
- "x": 0.05
- },
- "xaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- },
- "yaxis": {
- "automargin": true,
- "gridcolor": "white",
- "linecolor": "white",
- "ticks": "",
- "title": {
- "standoff": 15
- },
- "zerolinecolor": "white",
- "zerolinewidth": 2
- }
- }
},
- "title": {
- "text": "re3data subject coverage"
- },
- "xaxis": {
- "tickangle": 45,
- "tickfont": {
- "size": 12
- }
- }
- }
- },
- "text/html": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
- }
- ],
- "source": [
- "re3data_subjects = re3data_df.explode('subjects').groupby('subjects')[['url']].count().sort_values('url', ascending=False)\n",
- "\n",
- "data = [\n",
- " go.Bar(\n",
- " x=re3data_subjects.index,\n",
- " y=re3data_subjects['url']\n",
- " )\n",
- "]\n",
- "\n",
- "layout = go.Layout(\n",
- " title='re3data subject coverage',\n",
- " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
- ")\n",
- "fig = go.Figure(data=data, layout=layout)\n",
- "plotly.offline.iplot(fig)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "# OpenDOAR"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 82,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " url | \n",
- " official_name | \n",
- " english_name | \n",
- " description | \n",
- " latitude | \n",
- " longitude | \n",
- " subjects | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 10|opendoar____::e833e042f509c996b1b25324d56659fb | \n",
- " http://www.bilbao.net/bld | \n",
- " BLD - Bilboko Liburutegi Digitala | \n",
- " BLD - Bilboko Liburutegi Digitala | \n",
- " BLD is a repository of digital documents, desi... | \n",
- " 43.256699 | \n",
- " -2.924100 | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 1 | \n",
- " 10|opendoar____::f621585df244e9596dc70a39b579efb1 | \n",
- " https://researchdirect.westernsydney.edu.au/ | \n",
- " Western Sydney ResearchDirect | \n",
- " Western Sydney ResearchDirect | \n",
- " NaN | \n",
- " 0.000000 | \n",
- " 0.000000 | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 2 | \n",
- " 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 | \n",
- " http://redress.lancs.ac.uk/Learning_Space/ | \n",
- " Learning Space Catalogue | \n",
- " NaN | \n",
- " This repository is a Social Science e-Science ... | \n",
- " 54.010760 | \n",
- " -2.784990 | \n",
- " ['Social Sciences General', 'Science General',... | \n",
- "
\n",
- " \n",
- " 3 | \n",
- " 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e | \n",
- " http://digitallibrary.usc.edu/search/controlle... | \n",
- " USC Digital Library | \n",
- " USC Digital Library | \n",
- " This is an institutional repository providing ... | \n",
- " 34.052200 | \n",
- " -118.242996 | \n",
- " [] | \n",
- "
\n",
- " \n",
- " 5 | \n",
- " 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 | \n",
- " http://www.ufgd.edu.br:8080/jspui/ | \n",
- " Repositório de Divulgação das Produções Cientí... | \n",
- " Repositório de Divulgação das Produções Cientí... | \n",
- " This site provides access to the research outp... | \n",
- " -22.221800 | \n",
- " -54.806400 | \n",
- " [] | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id \\\n",
- "0 10|opendoar____::e833e042f509c996b1b25324d56659fb \n",
- "1 10|opendoar____::f621585df244e9596dc70a39b579efb1 \n",
- "2 10|opendoar____::437d7d1d97917cd627a34a6a0fb41136 \n",
- "3 10|opendoar____::d840cc5d906c3e9c84374c8919d2074e \n",
- "5 10|opendoar____::4ba3c163cd1efd4c14e3a415fa0a3010 \n",
- "\n",
- " url \\\n",
- "0 http://www.bilbao.net/bld \n",
- "1 https://researchdirect.westernsydney.edu.au/ \n",
- "2 http://redress.lancs.ac.uk/Learning_Space/ \n",
- "3 http://digitallibrary.usc.edu/search/controlle... \n",
- "5 http://www.ufgd.edu.br:8080/jspui/ \n",
- "\n",
- " official_name \\\n",
- "0 BLD - Bilboko Liburutegi Digitala \n",
- "1 Western Sydney ResearchDirect \n",
- "2 Learning Space Catalogue \n",
- "3 USC Digital Library \n",
- "5 Repositório de Divulgação das Produções Cientí... \n",
- "\n",
- " english_name \\\n",
- "0 BLD - Bilboko Liburutegi Digitala \n",
- "1 Western Sydney ResearchDirect \n",
- "2 NaN \n",
- "3 USC Digital Library \n",
- "5 Repositório de Divulgação das Produções Cientí... \n",
- "\n",
- " description latitude longitude \\\n",
- "0 BLD is a repository of digital documents, desi... 43.256699 -2.924100 \n",
- "1 NaN 0.000000 0.000000 \n",
- "2 This repository is a Social Science e-Science ... 54.010760 -2.784990 \n",
- "3 This is an institutional repository providing ... 34.052200 -118.242996 \n",
- "5 This site provides access to the research outp... -22.221800 -54.806400 \n",
- "\n",
- " subjects \n",
- "0 [] \n",
- "1 [] \n",
- "2 ['Social Sciences General', 'Science General',... \n",
- "3 [] \n",
- "5 [] "
- ]
- },
- "execution_count": 82,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')\n",
- "opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar')]\n",
- "opendoar_df.head()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 84,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 []\n",
- "1 []\n",
- "2 ['Social Sciences General', 'Science General',...\n",
- "3 []\n",
- "5 []\n",
- " ... \n",
- "8701 ['Multidisciplinary']\n",
- "8702 []\n",
- "8703 ['Business and Economics']\n",
- "8704 ['Earth and Planetary Sciences', 'Ecology and ...\n",
- "8706 []\n",
- "Name: subjects, Length: 6014, dtype: object"
- ]
- },
- "execution_count": 84,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "opendoar_df.subjects"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 85,
- "metadata": {},
- "outputs": [],
- "source": [
- "opendoar_df['subjects'] = opendoar_df.subjects.apply(lambda x: ast.literal_eval(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 86,
- "metadata": {},
- "outputs": [],
- "source": [
- "opendoar_cleaned_subjects = opendoar_df.explode('subjects').subjects.str.split(',| and ', expand=True)\\\n",
- " .apply(lambda row: row.dropna().tolist(), axis=1)\\\n",
- " .reset_index()\\\n",
- " .groupby('index')[0].apply(lambda x: merge_lists(x))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 87,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "index\n",
- "0 []\n",
- "1 []\n",
- "2 [Social Sciences General, Science General, Com...\n",
- "3 []\n",
- "5 []\n",
- " ... \n",
- "8701 [Multidisciplinary]\n",
- "8702 []\n",
- "8703 [Business, Economics]\n",
- "8704 [Earth, Planetary Sciences, Ecology, Environme...\n",
- "8706 []\n",
- "Name: 0, Length: 6014, dtype: object"
- ]
- },
- "execution_count": 87,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "opendoar_cleaned_subjects"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 88,
- "metadata": {},
- "outputs": [],
- "source": [
- "opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 89,
- "metadata": {},
- "outputs": [],
- "source": [
- "opendoar_df.drop(columns=['subjects'], inplace=True)\n",
- "opendoar_df.rename(columns={0: 'subjects'}, inplace=True)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 90,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " id | \n",
- " url | \n",
- " official_name | \n",
- " english_name | \n",
- " description | \n",
- " latitude | \n",
- " longitude | \n",
- " subjects | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " count | \n",
- " 6014 | \n",
- " 6013 | \n",
- " 6014 | \n",
- " 5500 | \n",
- " 5776 | \n",
- " 6014.000000 | \n",
- " 6014.000000 | \n",
- " 6014 | \n",
- "
\n",
- " \n",
- " unique | \n",
- " 6014 | \n",
- " 5953 | \n",
- " 5946 | \n",
- " 5413 | \n",
- " 4920 | \n",
- " NaN | \n",
- " NaN | \n",
- " 201 | \n",
- "
\n",
- " \n",
- " top | \n",
- " 10|opendoar____::a2557a7b2e94197ff767970b67041697 | \n",
- " http://harp.lib.hiroshima-u.ac.jp/ | \n",
- " Hiroshima Associated Repository Portal | \n",
- " AURA | \n",
- " This site provides access to the research outp... | \n",
- " NaN | \n",
- " NaN | \n",
- " [] | \n",
- "
\n",
- " \n",
- " freq | \n",
- " 1 | \n",
- " 3 | \n",
- " 3 | \n",
- " 4 | \n",
- " 98 | \n",
- " NaN | \n",
- " NaN | \n",
- " 5273 | \n",
- "
\n",
- " \n",
- " mean | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 38.649393 | \n",
- " 7.810948 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " std | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 788.406173 | \n",
- " 71.689788 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " min | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " -79.029999 | \n",
- " -683.103027 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 25% | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 4.644632 | \n",
- " -49.273300 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 50% | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 37.930449 | \n",
- " 4.788870 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 75% | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 47.294400 | \n",
- " 30.685501 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " max | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 61138.800781 | \n",
- " 178.438995 | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " id \\\n",
- "count 6014 \n",
- "unique 6014 \n",
- "top 10|opendoar____::a2557a7b2e94197ff767970b67041697 \n",
- "freq 1 \n",
- "mean NaN \n",
- "std NaN \n",
- "min NaN \n",
- "25% NaN \n",
- "50% NaN \n",
- "75% NaN \n",
- "max NaN \n",
- "\n",
- " url \\\n",
- "count 6013 \n",
- "unique 5953 \n",
- "top http://harp.lib.hiroshima-u.ac.jp/ \n",
- "freq 3 \n",
- "mean NaN \n",
- "std NaN \n",
- "min NaN \n",
- "25% NaN \n",
- "50% NaN \n",
- "75% NaN \n",
- "max NaN \n",
- "\n",
- " official_name english_name \\\n",
- "count 6014 5500 \n",
- "unique 5946 5413 \n",
- "top Hiroshima Associated Repository Portal AURA \n",
- "freq 3 4 \n",
- "mean NaN NaN \n",
- "std NaN NaN \n",
- "min NaN NaN \n",
- "25% NaN NaN \n",
- "50% NaN NaN \n",
- "75% NaN NaN \n",
- "max NaN NaN \n",
- "\n",
- " description latitude \\\n",
- "count 5776 6014.000000 \n",
- "unique 4920 NaN \n",
- "top This site provides access to the research outp... NaN \n",
- "freq 98 NaN \n",
- "mean NaN 38.649393 \n",
- "std NaN 788.406173 \n",
- "min NaN -79.029999 \n",
- "25% NaN 4.644632 \n",
- "50% NaN 37.930449 \n",
- "75% NaN 47.294400 \n",
- "max NaN 61138.800781 \n",
- "\n",
- " longitude subjects \n",
- "count 6014.000000 6014 \n",
- "unique NaN 201 \n",
- "top NaN [] \n",
- "freq NaN 5273 \n",
- "mean 7.810948 NaN \n",
- "std 71.689788 NaN \n",
- "min -683.103027 NaN \n",
- "25% -49.273300 NaN \n",
- "50% 4.788870 NaN \n",
- "75% 30.685501 NaN \n",
- "max 178.438995 NaN "
- ]
- },
- "execution_count": 90,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "opendoar_df.describe(include='all')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 91,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "application/vnd.plotly.v1+json": {
- "config": {
- "linkText": "Export to plot.ly",
- "plotlyServerURL": "https://plot.ly",
- "showLink": false
- },
- "data": [
{
+ "name": "OpenDOAR",
"type": "bar",
+ "visible": "legendonly",
"x": [
"Multidisciplinary",
"Medicine",
@@ -6592,7 +3891,7 @@
}
},
"title": {
- "text": "OpenDOAR subject coverage"
+ "text": "Subject coverage"
},
"xaxis": {
"tickangle": 45,
@@ -6603,9 +3902,9 @@
}
},
"text/html": [
- ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Tally records per continent for each dataset.\n",
+ "# NOTE(review): count() skips rows whose url is missing; confirm that is intended.\n",
+ "data1 = fairsharing_countries.groupby('continent')[['url']].count()\n",
+ "data2 = opendoar_df.groupby('continent')[['url']].count()\n",
+ "\n",
+ "# One filled radar trace per dataset, built from the same template.\n",
+ "plot = [\n",
+ "    go.Scatterpolar(r=counts.url, theta=counts.index, fill='toself', name=label)\n",
+ "    for label, counts in (('FAIRsharing', data1), ('OpenDOAR', data2))\n",
+ "]\n",
+ "\n",
+ "# Title the figure so it can be read on its own.\n",
+ "layout = go.Layout(\n",
+ "    title='Continent coverage',\n",
+ "    polar=dict(radialaxis=dict(visible=True))\n",
+ ")\n",
+ "\n",
+ "go.Figure(plot, layout).show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
{
"cell_type": "code",
"execution_count": null,