diff --git a/notebooks/01.1-exploration-re3data.ipynb b/notebooks/01.1-exploration-re3data.ipynb
new file mode 100644
index 0000000..b79a854
--- /dev/null
+++ b/notebooks/01.1-exploration-re3data.ipynb
@@ -0,0 +1,1330 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "import csv\n",
+ "import json\n",
+ "import reverse_geocoder as rg\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "import pycountry_convert\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib_venn import venn2, venn2_circles\n",
+ "\n",
+ "import plotly\n",
+ "from plotly.offline import iplot, init_notebook_mode\n",
+ "import plotly.graph_objs as go\n",
+ "import plotly.express as px\n",
+ "\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**re3data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " r3d100000001 | \n",
+ " Odum Institute Archive Dataverse | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider] | \n",
+ " [FAIR, Middle East, crime, demography, economy... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " r3d100000002 | \n",
+ " Access to Archival Databases | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 102 History... | \n",
+ " [dataProvider] | \n",
+ " [US History] | \n",
+ " [[The U.S. National Archives and Records Admin... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " r3d100000004 | \n",
+ " Datenbank Gesprochenes Deutsch | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 104 Linguis... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [Australian German, FOLK, German dialects, Pfe... | \n",
+ " [[Institut für Deutsche Sprache, Archiv für Ge... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " r3d100000005 | \n",
+ " UNC Dataverse | \n",
+ " [institutional] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, census, demographic survey, demography,... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " r3d100000006 | \n",
+ " Archaeology Data Service | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 101 Ancient... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, archaeology, cultural heritage, prehist... | \n",
+ " [[Arts and Humanities Research Council, [AHRC]... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
+ "1 r3d100000002 Access to Archival Databases [disciplinary] \n",
+ "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
+ "3 r3d100000005 UNC Dataverse [institutional] \n",
+ "4 r3d100000006 Archaeology Data Service [disciplinary] \n",
+ "\n",
+ " subject \\\n",
+ "0 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "1 [1 Humanities and Social Sciences, 102 History... \n",
+ "2 [1 Humanities and Social Sciences, 104 Linguis... \n",
+ "3 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "4 [1 Humanities and Social Sciences, 101 Ancient... \n",
+ "\n",
+ " provider_type \\\n",
+ "0 [dataProvider] \n",
+ "1 [dataProvider] \n",
+ "2 [dataProvider, serviceProvider] \n",
+ "3 [dataProvider, serviceProvider] \n",
+ "4 [dataProvider, serviceProvider] \n",
+ "\n",
+ " keyword \\\n",
+ "0 [FAIR, Middle East, crime, demography, economy... \n",
+ "1 [US History] \n",
+ "2 [Australian German, FOLK, German dialects, Pfe... \n",
+ "3 [FAIR, census, demographic survey, demography,... \n",
+ "4 [FAIR, archaeology, cultural heritage, prehist... \n",
+ "\n",
+ " institution \n",
+ "0 [[Odum Institute for Research in Social Scienc... \n",
+ "1 [[The U.S. National Archives and Records Admin... \n",
+ "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
+ "3 [[Odum Institute for Research in Social Scienc... \n",
+ "4 [[Arts and Humanities Research Council, [AHRC]... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load only the columns used in this notebook. The list-valued columns are\n",
+ "# parsed with ast.literal_eval so each cell holds a Python list, not its string form.\n",
+ "re3data_df = pd.read_csv(\n",
+ "    '../data/raw/re3data.tsv',\n",
+ "    delimiter='\\t',\n",
+ "    usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'],\n",
+ "    converters={\n",
+ "        'subject': ast.literal_eval,\n",
+ "        'keyword': ast.literal_eval,\n",
+ "        'type': ast.literal_eval,\n",
+ "        'provider_type': ast.literal_eval,\n",
+ "        'institution': ast.literal_eval,\n",
+ "    },\n",
+ ")\n",
+ "re3data_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
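+ "**Note:** `serviceProvider` entries are filtered out here. `provider_type` is exploded to one row per value and the `serviceProvider` rows are dropped, so a repository that is both a data and a service provider stays in the table as `dataProvider`."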
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One row per provider_type value, then drop the 'serviceProvider' rows.\n",
+ "# Repositories listing both roles keep their 'dataProvider' row; a missing\n",
+ "# provider_type (NaN after explode) is not equal to 'serviceProvider' and is kept too.\n",
+ "re3data_df = (\n",
+ "    re3data_df\n",
+ "    .explode('provider_type')\n",
+ "    .loc[lambda exploded: exploded['provider_type'].ne('serviceProvider')]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2459 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 2466 | \n",
+ " 2463 | \n",
+ " 9 | \n",
+ " 1282 | \n",
+ " 1 | \n",
+ " 2248 | \n",
+ " 2447 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " r3d100011987 | \n",
+ " Landmap | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 2 Life Scie... | \n",
+ " dataProvider | \n",
+ " [multidisciplinary] | \n",
+ " [[National Center for Biotechnology Informatio... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1573 | \n",
+ " 200 | \n",
+ " 2459 | \n",
+ " 181 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "count 2467 2467 2467 \n",
+ "unique 2466 2463 9 \n",
+ "top r3d100011987 Landmap [disciplinary] \n",
+ "freq 2 2 1573 \n",
+ "\n",
+ " subject provider_type \\\n",
+ "count 2467 2459 \n",
+ "unique 1282 1 \n",
+ "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n",
+ "freq 200 2459 \n",
+ "\n",
+ " keyword institution \n",
+ "count 2467 2467 \n",
+ "unique 2248 2447 \n",
+ "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n",
+ "freq 181 6 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "re3data_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**openDOAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " utrecht university repository | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[university of utrecht, [universiteit utrecht... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 115 | \n",
+ " dspace at indian institute of management kozhi... | \n",
+ " institutional | \n",
+ " [ecology and environment, social sciences gene... | \n",
+ " [[indian institute of management kozhikode, [i... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41 | \n",
+ " caltech engineering and science online | \n",
+ " institutional | \n",
+ " [biology and biochemistry, chemistry and chemi... | \n",
+ " [[california institute of technology, [caltech... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 119 | \n",
+ " dcu online research access service | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[dublin city university, [dcu], ie, [], , htt... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 129 | \n",
+ " earth-prints repository | \n",
+ " disciplinary | \n",
+ " [earth and planetary sciences] | \n",
+ " [[istituto nazionale di geofisica e vulcanolog... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name \\\n",
+ "0 101 utrecht university repository \n",
+ "1 115 dspace at indian institute of management kozhi... \n",
+ "2 41 caltech engineering and science online \n",
+ "3 119 dcu online research access service \n",
+ "4 129 earth-prints repository \n",
+ "\n",
+ " type subject \\\n",
+ "0 institutional [multidisciplinary] \n",
+ "1 institutional [ecology and environment, social sciences gene... \n",
+ "2 institutional [biology and biochemistry, chemistry and chemi... \n",
+ "3 institutional [multidisciplinary] \n",
+ "4 disciplinary [earth and planetary sciences] \n",
+ "\n",
+ " institution \n",
+ "0 [[university of utrecht, [universiteit utrecht... \n",
+ "1 [[indian institute of management kozhikode, [i... \n",
+ "2 [[california institute of technology, [caltech... \n",
+ "3 [[dublin city university, [dcu], ie, [], , htt... \n",
+ "4 [[istituto nazionale di geofisica e vulcanolog... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Load only the columns used in this notebook. `opendoar_id`, `subject` and\n",
+ "# `institution` are parsed with ast.literal_eval; `type` is left as a plain string.\n",
+ "opendoar_df = pd.read_csv(\n",
+ "    '../data/raw/openDoar.tsv',\n",
+ "    delimiter='\\t',\n",
+ "    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],\n",
+ "    converters={\n",
+ "        'opendoar_id': ast.literal_eval,\n",
+ "        'subject': ast.literal_eval,\n",
+ "        'institution': ast.literal_eval,\n",
+ "    },\n",
+ ")\n",
+ "opendoar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5707.000000 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5670 | \n",
+ " 4 | \n",
+ " 820 | \n",
+ " 5098 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " arch | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[rijksuniversiteit groningen, [rug], nl, [], ... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 5067 | \n",
+ " 3212 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 4008.118801 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2869.948770 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 2.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1823.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 3361.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 5095.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 10175.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name type subject \\\n",
+ "count 5707.000000 5707 5707 5707 \n",
+ "unique NaN 5670 4 820 \n",
+ "top NaN arch institutional [multidisciplinary] \n",
+ "freq NaN 3 5067 3212 \n",
+ "mean 4008.118801 NaN NaN NaN \n",
+ "std 2869.948770 NaN NaN NaN \n",
+ "min 2.000000 NaN NaN NaN \n",
+ "25% 1823.000000 NaN NaN NaN \n",
+ "50% 3361.000000 NaN NaN NaN \n",
+ "75% 5095.000000 NaN NaN NaN \n",
+ "max 10175.000000 NaN NaN NaN \n",
+ "\n",
+ " institution \n",
+ "count 5707 \n",
+ "unique 5098 \n",
+ "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n",
+ "freq 26 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "opendoar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**ROAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 921 | \n",
+ " http://alcme.oclc.org/ndltd/index.html | \n",
+ " Networked Digital Library of Theses and Disser... | \n",
+ " us | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1489 | \n",
+ " http://prensahistorica.mcu.es/prensahistorica/... | \n",
+ " Virtual Library of Historical Press | \n",
+ " es | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 606 | \n",
+ " http://hal.archives-ouvertes.fr/ | \n",
+ " HAL: Hyper Article en Ligne | \n",
+ " fr | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page \\\n",
+ "0 921 http://alcme.oclc.org/ndltd/index.html \n",
+ "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n",
+ "2 606 http://hal.archives-ouvertes.fr/ \n",
+ "3 606 NaN \n",
+ "4 606 NaN \n",
+ "\n",
+ " title location_country subjects \n",
+ "0 Networked Digital Library of Theses and Disser... us NaN \n",
+ "1 Virtual Library of Historical Press es NaN \n",
+ "2 HAL: Hyper Article en Ligne fr NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Raw ROAR export, restricted to the columns explored below. The preview shows\n",
+ "# rows that repeat an `eprintid` with NaN in the descriptive columns.\n",
+ "roar_df = pd.read_csv(\n",
+ "    '../data/raw/export_roar_CSV.csv',\n",
+ "    usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'],\n",
+ ")\n",
+ "roar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
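+ "# NOTE(review): left disabled. With keep=False this call would drop every row of a duplicate group,\n",
+ "# including its first occurrence - confirm that is intended before re-enabling it. The describe() output\n",
+ "# further down (execution 44) predates the reload above (execution 45), so re-run it before relying on it.\n",
+ "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)"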
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 106 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " Faculty Scholarship at The Claremont Colleges | \n",
+ " us | \n",
+ " AS | \n",
+ "
\n",
+ " \n",
+ " 107 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BF | \n",
+ "
\n",
+ " \n",
+ " 108 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BL | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " CC | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " GN | \n",
+ "
\n",
+ " \n",
+ " 111 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " 112 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " HB | \n",
+ "
\n",
+ " \n",
+ " 113 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " JA | \n",
+ "
\n",
+ " \n",
+ " 114 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " LB | \n",
+ "
\n",
+ " \n",
+ " 115 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NX | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " PQ | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " QA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n",
+ "107 2303 NaN NaN \n",
+ "108 2303 NaN NaN \n",
+ "109 2303 NaN NaN \n",
+ "110 2303 NaN NaN \n",
+ "111 2303 NaN NaN \n",
+ "112 2303 NaN NaN \n",
+ "113 2303 NaN NaN \n",
+ "114 2303 NaN NaN \n",
+ "115 2303 NaN NaN \n",
+ "116 2303 NaN NaN \n",
+ "117 2303 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "106 us AS \n",
+ "107 NaN BF \n",
+ "108 NaN BL \n",
+ "109 NaN CC \n",
+ "110 NaN GN \n",
+ "111 NaN H1 \n",
+ "112 NaN HB \n",
+ "113 NaN JA \n",
+ "114 NaN LB \n",
+ "115 NaN NX \n",
+ "116 NaN PQ \n",
+ "117 NaN QA "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect a single repository: its additional subject codes sit on rows that repeat the eprintid.\n",
+ "roar_df.loc[roar_df['eprintid'] == 2303]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5314.000000 | \n",
+ " 5263 | \n",
+ " 5268 | \n",
+ " 5024 | \n",
+ " 1225 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5156 | \n",
+ " 5027 | \n",
+ " 134 | \n",
+ " 123 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " http://ir.lib.isu.edu.tw/ | \n",
+ " Repositorio Institucional | \n",
+ " us | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 877 | \n",
+ " 147 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 6389.464434 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 5159.573937 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1490.250000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4990.500000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 10452.750000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 17302.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "count 5314.000000 5263 5268 \n",
+ "unique NaN 5156 5027 \n",
+ "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n",
+ "freq NaN 3 7 \n",
+ "mean 6389.464434 NaN NaN \n",
+ "std 5159.573937 NaN NaN \n",
+ "min 1.000000 NaN NaN \n",
+ "25% 1490.250000 NaN NaN \n",
+ "50% 4990.500000 NaN NaN \n",
+ "75% 10452.750000 NaN NaN \n",
+ "max 17302.000000 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "count 5024 1225 \n",
+ "unique 134 123 \n",
+ "top us H1 \n",
+ "freq 877 147 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**FAIRsharing**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " GenBank | \n",
+ " GenBank | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.9... | \n",
+ " https://www.ncbi.nlm.nih.gov/genbank/ | \n",
+ " European Union,Japan,United States | \n",
+ " Bioinformatics,Data Management,Data Submission... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " GlycoNAVI | \n",
+ " GlycoNAVI | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.w... | \n",
+ " https://glyconavi.org/ | \n",
+ " Japan | \n",
+ " Chemistry,Glycomics,Life Science,Organic Chemi... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ADHDgene | \n",
+ " ADHDgene | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.m... | \n",
+ " http://adhd.psych.ac.cn/ | \n",
+ " China | \n",
+ " Biomedical Science,Genetics | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Allele frequency resource for research and tea... | \n",
+ " ALFRED | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.y... | \n",
+ " http://alfred.med.yale.edu | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Animal Transcription Factor Database | \n",
+ " AnimalTFDB | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.e... | \n",
+ " http://bioinfo.life.hust.edu.cn/AnimalTFDB/ | \n",
+ " China | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "0 GenBank GenBank \n",
+ "1 GlycoNAVI GlycoNAVI \n",
+ "2 ADHDgene ADHDgene \n",
+ "3 Allele frequency resource for research and tea... ALFRED \n",
+ "4 Animal Transcription Factor Database AnimalTFDB \n",
+ "\n",
+ " fs_url \\\n",
+ "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
+ "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
+ "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
+ "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
+ "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
+ "\n",
+ " url \\\n",
+ "0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
+ "1 https://glyconavi.org/ \n",
+ "2 http://adhd.psych.ac.cn/ \n",
+ "3 http://alfred.med.yale.edu \n",
+ "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
+ "\n",
+ " countries \\\n",
+ "0 European Union,Japan,United States \n",
+ "1 Japan \n",
+ "2 China \n",
+ "3 United States \n",
+ "4 China \n",
+ "\n",
+ " subjects \n",
+ "0 Bioinformatics,Data Management,Data Submission... \n",
+ "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
+ "2 Biomedical Science,Genetics \n",
+ "3 Life Science \n",
+ "4 Life Science "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Pipe-delimited export: header=0 consumes the file's own header row and\n",
+ "# `names` supplies the column labels used in this notebook instead.\n",
+ "fairsharing_df = pd.read_csv(\n",
+ "    '../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
+ "    delimiter='|',\n",
+ "    header=0,\n",
+ "    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],\n",
+ ")\n",
+ "fairsharing_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1749 | \n",
+ " 1690 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 1752 | \n",
+ " 1741 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 178 | \n",
+ " 834 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " CiteAb | \n",
+ " CGD | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.1... | \n",
+ " http://www.plexdb.org/ | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 588 | \n",
+ " 367 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "count 1752 1752 \n",
+ "unique 1752 1741 \n",
+ "top CiteAb CGD \n",
+ "freq 1 3 \n",
+ "\n",
+ " fs_url \\\n",
+ "count 1752 \n",
+ "unique 1752 \n",
+ "top https://fairsharing.org/10.25504/FAIRsharing.1... \n",
+ "freq 1 \n",
+ "\n",
+ " url countries subjects \n",
+ "count 1752 1749 1690 \n",
+ "unique 1752 178 834 \n",
+ "top http://www.plexdb.org/ United States Life Science \n",
+ "freq 1 588 367 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fairsharing_df.describe(include='all')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/01.2-exploration-opendoar.ipynb b/notebooks/01.2-exploration-opendoar.ipynb
new file mode 100644
index 0000000..b79a854
--- /dev/null
+++ b/notebooks/01.2-exploration-opendoar.ipynb
@@ -0,0 +1,1330 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "import csv\n",
+ "import json\n",
+ "import reverse_geocoder as rg\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "import pycountry_convert\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib_venn import venn2, venn2_circles\n",
+ "\n",
+ "import plotly\n",
+ "from plotly.offline import iplot, init_notebook_mode\n",
+ "import plotly.graph_objs as go\n",
+ "import plotly.express as px\n",
+ "\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**re3data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " r3d100000001 | \n",
+ " Odum Institute Archive Dataverse | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider] | \n",
+ " [FAIR, Middle East, crime, demography, economy... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " r3d100000002 | \n",
+ " Access to Archival Databases | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 102 History... | \n",
+ " [dataProvider] | \n",
+ " [US History] | \n",
+ " [[The U.S. National Archives and Records Admin... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " r3d100000004 | \n",
+ " Datenbank Gesprochenes Deutsch | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 104 Linguis... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [Australian German, FOLK, German dialects, Pfe... | \n",
+ " [[Institut für Deutsche Sprache, Archiv für Ge... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " r3d100000005 | \n",
+ " UNC Dataverse | \n",
+ " [institutional] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, census, demographic survey, demography,... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " r3d100000006 | \n",
+ " Archaeology Data Service | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 101 Ancient... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, archaeology, cultural heritage, prehist... | \n",
+ " [[Arts and Humanities Research Council, [AHRC]... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
+ "1 r3d100000002 Access to Archival Databases [disciplinary] \n",
+ "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
+ "3 r3d100000005 UNC Dataverse [institutional] \n",
+ "4 r3d100000006 Archaeology Data Service [disciplinary] \n",
+ "\n",
+ " subject \\\n",
+ "0 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "1 [1 Humanities and Social Sciences, 102 History... \n",
+ "2 [1 Humanities and Social Sciences, 104 Linguis... \n",
+ "3 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "4 [1 Humanities and Social Sciences, 101 Ancient... \n",
+ "\n",
+ " provider_type \\\n",
+ "0 [dataProvider] \n",
+ "1 [dataProvider] \n",
+ "2 [dataProvider, serviceProvider] \n",
+ "3 [dataProvider, serviceProvider] \n",
+ "4 [dataProvider, serviceProvider] \n",
+ "\n",
+ " keyword \\\n",
+ "0 [FAIR, Middle East, crime, demography, economy... \n",
+ "1 [US History] \n",
+ "2 [Australian German, FOLK, German dialects, Pfe... \n",
+ "3 [FAIR, census, demographic survey, demography,... \n",
+ "4 [FAIR, archaeology, cultural heritage, prehist... \n",
+ "\n",
+ " institution \n",
+ "0 [[Odum Institute for Research in Social Scienc... \n",
+ "1 [[The U.S. National Archives and Records Admin... \n",
+ "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
+ "3 [[Odum Institute for Research in Social Scienc... \n",
+ "4 [[Arts and Humanities Research Council, [AHRC]... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns whose cells are run through ast.literal_eval while the TSV is loaded.\n",
+ "re3data_literal_cols = ['subject', 'keyword', 'additional_name', 'repository_id',\n",
+ "                        'type', 'content_type', 'provider_type', 'institution']\n",
+ "re3data_df = pd.read_csv(\n",
+ "    '../data/raw/re3data.tsv',\n",
+ "    sep='\\t',\n",
+ "    converters=dict.fromkeys(re3data_literal_cols, ast.literal_eval),\n",
+ "    usecols=['re3data_id', 'repository_name', 'subject', 'keyword',\n",
+ "             'type', 'provider_type', 'institution'],\n",
+ ")\n",
+ "# Preview the first rows of the loaded frame.\n",
+ "re3data_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
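+ "**Note:** `provider_type` is exploded to one row per value and the `serviceProvider` rows are dropped, so repositories listed only as service providers are filtered out here."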
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "provider_rows = re3data_df.explode('provider_type')  # one row per provider_type entry\n",
+ "re3data_df = provider_rows.loc[provider_rows['provider_type'].ne('serviceProvider')]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2459 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 2466 | \n",
+ " 2463 | \n",
+ " 9 | \n",
+ " 1282 | \n",
+ " 1 | \n",
+ " 2248 | \n",
+ " 2447 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " r3d100011987 | \n",
+ " Landmap | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 2 Life Scie... | \n",
+ " dataProvider | \n",
+ " [multidisciplinary] | \n",
+ " [[National Center for Biotechnology Informatio... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1573 | \n",
+ " 200 | \n",
+ " 2459 | \n",
+ " 181 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "count 2467 2467 2467 \n",
+ "unique 2466 2463 9 \n",
+ "top r3d100011987 Landmap [disciplinary] \n",
+ "freq 2 2 1573 \n",
+ "\n",
+ " subject provider_type \\\n",
+ "count 2467 2459 \n",
+ "unique 1282 1 \n",
+ "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n",
+ "freq 200 2459 \n",
+ "\n",
+ " keyword institution \n",
+ "count 2467 2467 \n",
+ "unique 2248 2447 \n",
+ "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n",
+ "freq 181 6 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "re3data_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**openDOAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " utrecht university repository | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[university of utrecht, [universiteit utrecht... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 115 | \n",
+ " dspace at indian institute of management kozhi... | \n",
+ " institutional | \n",
+ " [ecology and environment, social sciences gene... | \n",
+ " [[indian institute of management kozhikode, [i... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41 | \n",
+ " caltech engineering and science online | \n",
+ " institutional | \n",
+ " [biology and biochemistry, chemistry and chemi... | \n",
+ " [[california institute of technology, [caltech... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 119 | \n",
+ " dcu online research access service | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[dublin city university, [dcu], ie, [], , htt... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 129 | \n",
+ " earth-prints repository | \n",
+ " disciplinary | \n",
+ " [earth and planetary sciences] | \n",
+ " [[istituto nazionale di geofisica e vulcanolog... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name \\\n",
+ "0 101 utrecht university repository \n",
+ "1 115 dspace at indian institute of management kozhi... \n",
+ "2 41 caltech engineering and science online \n",
+ "3 119 dcu online research access service \n",
+ "4 129 earth-prints repository \n",
+ "\n",
+ " type subject \\\n",
+ "0 institutional [multidisciplinary] \n",
+ "1 institutional [ecology and environment, social sciences gene... \n",
+ "2 institutional [biology and biochemistry, chemistry and chemi... \n",
+ "3 institutional [multidisciplinary] \n",
+ "4 disciplinary [earth and planetary sciences] \n",
+ "\n",
+ " institution \n",
+ "0 [[university of utrecht, [universiteit utrecht... \n",
+ "1 [[indian institute of management kozhikode, [i... \n",
+ "2 [[california institute of technology, [caltech... \n",
+ "3 [[dublin city university, [dcu], ie, [], , htt... \n",
+ "4 [[istituto nazionale di geofisica e vulcanolog... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "opendoar_literal_cols = ['subject', 'additional_name', 'opendoar_id', 'content_type', 'institution']\n",
+ "opendoar_df = pd.read_csv(\n",
+ "    '../data/raw/openDoar.tsv',\n",
+ "    sep='\\t',\n",
+ "    converters=dict.fromkeys(opendoar_literal_cols, ast.literal_eval),  # parse literal-encoded cells\n",
+ "    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],\n",
+ ")\n",
+ "# Preview the first rows of the loaded frame.\n",
+ "opendoar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5707.000000 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5670 | \n",
+ " 4 | \n",
+ " 820 | \n",
+ " 5098 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " arch | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[rijksuniversiteit groningen, [rug], nl, [], ... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 5067 | \n",
+ " 3212 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 4008.118801 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2869.948770 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 2.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1823.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 3361.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 5095.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 10175.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name type subject \\\n",
+ "count 5707.000000 5707 5707 5707 \n",
+ "unique NaN 5670 4 820 \n",
+ "top NaN arch institutional [multidisciplinary] \n",
+ "freq NaN 3 5067 3212 \n",
+ "mean 4008.118801 NaN NaN NaN \n",
+ "std 2869.948770 NaN NaN NaN \n",
+ "min 2.000000 NaN NaN NaN \n",
+ "25% 1823.000000 NaN NaN NaN \n",
+ "50% 3361.000000 NaN NaN NaN \n",
+ "75% 5095.000000 NaN NaN NaN \n",
+ "max 10175.000000 NaN NaN NaN \n",
+ "\n",
+ " institution \n",
+ "count 5707 \n",
+ "unique 5098 \n",
+ "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n",
+ "freq 26 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "opendoar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**ROAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 921 | \n",
+ " http://alcme.oclc.org/ndltd/index.html | \n",
+ " Networked Digital Library of Theses and Disser... | \n",
+ " us | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1489 | \n",
+ " http://prensahistorica.mcu.es/prensahistorica/... | \n",
+ " Virtual Library of Historical Press | \n",
+ " es | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 606 | \n",
+ " http://hal.archives-ouvertes.fr/ | \n",
+ " HAL: Hyper Article en Ligne | \n",
+ " fr | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page \\\n",
+ "0 921 http://alcme.oclc.org/ndltd/index.html \n",
+ "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n",
+ "2 606 http://hal.archives-ouvertes.fr/ \n",
+ "3 606 NaN \n",
+ "4 606 NaN \n",
+ "\n",
+ " title location_country subjects \n",
+ "0 Networked Digital Library of Theses and Disser... us NaN \n",
+ "1 Virtual Library of Historical Press es NaN \n",
+ "2 HAL: Hyper Article en Ligne fr NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roar_columns = ['eprintid', 'home_page', 'title', 'location_country', 'subjects']\n",
+ "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', usecols=roar_columns)\n",
+ "roar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 106 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " Faculty Scholarship at The Claremont Colleges | \n",
+ " us | \n",
+ " AS | \n",
+ "
\n",
+ " \n",
+ " 107 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BF | \n",
+ "
\n",
+ " \n",
+ " 108 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BL | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " CC | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " GN | \n",
+ "
\n",
+ " \n",
+ " 111 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " 112 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " HB | \n",
+ "
\n",
+ " \n",
+ " 113 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " JA | \n",
+ "
\n",
+ " \n",
+ " 114 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " LB | \n",
+ "
\n",
+ " \n",
+ " 115 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NX | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " PQ | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " QA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n",
+ "107 2303 NaN NaN \n",
+ "108 2303 NaN NaN \n",
+ "109 2303 NaN NaN \n",
+ "110 2303 NaN NaN \n",
+ "111 2303 NaN NaN \n",
+ "112 2303 NaN NaN \n",
+ "113 2303 NaN NaN \n",
+ "114 2303 NaN NaN \n",
+ "115 2303 NaN NaN \n",
+ "116 2303 NaN NaN \n",
+ "117 2303 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "106 us AS \n",
+ "107 NaN BF \n",
+ "108 NaN BL \n",
+ "109 NaN CC \n",
+ "110 NaN GN \n",
+ "111 NaN H1 \n",
+ "112 NaN HB \n",
+ "113 NaN JA \n",
+ "114 NaN LB \n",
+ "115 NaN NX \n",
+ "116 NaN PQ \n",
+ "117 NaN QA "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roar_df.loc[roar_df['eprintid'].eq(2303)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5314.000000 | \n",
+ " 5263 | \n",
+ " 5268 | \n",
+ " 5024 | \n",
+ " 1225 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5156 | \n",
+ " 5027 | \n",
+ " 134 | \n",
+ " 123 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " http://ir.lib.isu.edu.tw/ | \n",
+ " Repositorio Institucional | \n",
+ " us | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 877 | \n",
+ " 147 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 6389.464434 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 5159.573937 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1490.250000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4990.500000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 10452.750000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 17302.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "count 5314.000000 5263 5268 \n",
+ "unique NaN 5156 5027 \n",
+ "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n",
+ "freq NaN 3 7 \n",
+ "mean 6389.464434 NaN NaN \n",
+ "std 5159.573937 NaN NaN \n",
+ "min 1.000000 NaN NaN \n",
+ "25% 1490.250000 NaN NaN \n",
+ "50% 4990.500000 NaN NaN \n",
+ "75% 10452.750000 NaN NaN \n",
+ "max 17302.000000 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "count 5024 1225 \n",
+ "unique 134 123 \n",
+ "top us H1 \n",
+ "freq 877 147 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**FAIRsharing**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " GenBank | \n",
+ " GenBank | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.9... | \n",
+ " https://www.ncbi.nlm.nih.gov/genbank/ | \n",
+ " European Union,Japan,United States | \n",
+ " Bioinformatics,Data Management,Data Submission... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " GlycoNAVI | \n",
+ " GlycoNAVI | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.w... | \n",
+ " https://glyconavi.org/ | \n",
+ " Japan | \n",
+ " Chemistry,Glycomics,Life Science,Organic Chemi... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ADHDgene | \n",
+ " ADHDgene | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.m... | \n",
+ " http://adhd.psych.ac.cn/ | \n",
+ " China | \n",
+ " Biomedical Science,Genetics | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Allele frequency resource for research and tea... | \n",
+ " ALFRED | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.y... | \n",
+ " http://alfred.med.yale.edu | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Animal Transcription Factor Database | \n",
+ " AnimalTFDB | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.e... | \n",
+ " http://bioinfo.life.hust.edu.cn/AnimalTFDB/ | \n",
+ " China | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "0 GenBank GenBank \n",
+ "1 GlycoNAVI GlycoNAVI \n",
+ "2 ADHDgene ADHDgene \n",
+ "3 Allele frequency resource for research and tea... ALFRED \n",
+ "4 Animal Transcription Factor Database AnimalTFDB \n",
+ "\n",
+ " fs_url \\\n",
+ "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
+ "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
+ "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
+ "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
+ "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
+ "\n",
+ " url \\\n",
+ "0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
+ "1 https://glyconavi.org/ \n",
+ "2 http://adhd.psych.ac.cn/ \n",
+ "3 http://alfred.med.yale.edu \n",
+ "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
+ "\n",
+ " countries \\\n",
+ "0 European Union,Japan,United States \n",
+ "1 Japan \n",
+ "2 China \n",
+ "3 United States \n",
+ "4 China \n",
+ "\n",
+ " subjects \n",
+ "0 Bioinformatics,Data Management,Data Submission... \n",
+ "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
+ "2 Biomedical Science,Genetics \n",
+ "3 Life Science \n",
+ "4 Life Science "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fairsharing_columns = ['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects']\n",
+ "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
+ "                             sep='|', header=0, names=fairsharing_columns)\n",
+ "fairsharing_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1749 | \n",
+ " 1690 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 1752 | \n",
+ " 1741 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 178 | \n",
+ " 834 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " CiteAb | \n",
+ " CGD | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.1... | \n",
+ " http://www.plexdb.org/ | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 588 | \n",
+ " 367 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "count 1752 1752 \n",
+ "unique 1752 1741 \n",
+ "top CiteAb CGD \n",
+ "freq 1 3 \n",
+ "\n",
+ " fs_url \\\n",
+ "count 1752 \n",
+ "unique 1752 \n",
+ "top https://fairsharing.org/10.25504/FAIRsharing.1... \n",
+ "freq 1 \n",
+ "\n",
+ " url countries subjects \n",
+ "count 1752 1749 1690 \n",
+ "unique 1752 178 834 \n",
+ "top http://www.plexdb.org/ United States Life Science \n",
+ "freq 1 588 367 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fairsharing_df.describe(include='all')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/01.3-exploration-roar.ipynb b/notebooks/01.3-exploration-roar.ipynb
new file mode 100644
index 0000000..b79a854
--- /dev/null
+++ b/notebooks/01.3-exploration-roar.ipynb
@@ -0,0 +1,1330 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "import csv\n",
+ "import json\n",
+ "import reverse_geocoder as rg\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "import pycountry_convert\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib_venn import venn2, venn2_circles\n",
+ "\n",
+ "import plotly\n",
+ "from plotly.offline import iplot, init_notebook_mode\n",
+ "import plotly.graph_objs as go\n",
+ "import plotly.express as px\n",
+ "\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**re3data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " r3d100000001 | \n",
+ " Odum Institute Archive Dataverse | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider] | \n",
+ " [FAIR, Middle East, crime, demography, economy... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " r3d100000002 | \n",
+ " Access to Archival Databases | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 102 History... | \n",
+ " [dataProvider] | \n",
+ " [US History] | \n",
+ " [[The U.S. National Archives and Records Admin... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " r3d100000004 | \n",
+ " Datenbank Gesprochenes Deutsch | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 104 Linguis... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [Australian German, FOLK, German dialects, Pfe... | \n",
+ " [[Institut für Deutsche Sprache, Archiv für Ge... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " r3d100000005 | \n",
+ " UNC Dataverse | \n",
+ " [institutional] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, census, demographic survey, demography,... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " r3d100000006 | \n",
+ " Archaeology Data Service | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 101 Ancient... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, archaeology, cultural heritage, prehist... | \n",
+ " [[Arts and Humanities Research Council, [AHRC]... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
+ "1 r3d100000002 Access to Archival Databases [disciplinary] \n",
+ "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
+ "3 r3d100000005 UNC Dataverse [institutional] \n",
+ "4 r3d100000006 Archaeology Data Service [disciplinary] \n",
+ "\n",
+ " subject \\\n",
+ "0 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "1 [1 Humanities and Social Sciences, 102 History... \n",
+ "2 [1 Humanities and Social Sciences, 104 Linguis... \n",
+ "3 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "4 [1 Humanities and Social Sciences, 101 Ancient... \n",
+ "\n",
+ " provider_type \\\n",
+ "0 [dataProvider] \n",
+ "1 [dataProvider] \n",
+ "2 [dataProvider, serviceProvider] \n",
+ "3 [dataProvider, serviceProvider] \n",
+ "4 [dataProvider, serviceProvider] \n",
+ "\n",
+ " keyword \\\n",
+ "0 [FAIR, Middle East, crime, demography, economy... \n",
+ "1 [US History] \n",
+ "2 [Australian German, FOLK, German dialects, Pfe... \n",
+ "3 [FAIR, census, demographic survey, demography,... \n",
+ "4 [FAIR, archaeology, cultural heritage, prehist... \n",
+ "\n",
+ " institution \n",
+ "0 [[Odum Institute for Research in Social Scienc... \n",
+ "1 [[The U.S. National Archives and Records Admin... \n",
+ "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
+ "3 [[Odum Institute for Research in Social Scienc... \n",
+ "4 [[Arts and Humanities Research Council, [AHRC]... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns parsed from their string form with ast.literal_eval while reading.\n",
+ "re3data_literal_columns = ['subject', 'keyword', 'additional_name', 'repository_id',\n",
+ "                           'type', 'content_type', 'provider_type', 'institution']\n",
+ "re3data_df = pd.read_csv(\n",
+ "    '../data/raw/re3data.tsv',\n",
+ "    delimiter='\\t',\n",
+ "    converters=dict.fromkeys(re3data_literal_columns, ast.literal_eval),\n",
+ "    usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'],\n",
+ ")\n",
+ "re3data_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
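+ "**Note:** service providers are filtered out here. Each repository is expanded to one row per `provider_type` value, and the rows whose value is `serviceProvider` are dropped."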
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# One row per (repository, provider type); rows typed 'serviceProvider' are then dropped.\n",
+ "# Rows with a missing provider type are kept, exactly as with a plain != comparison.\n",
+ "re3data_df = (\n",
+ "    re3data_df\n",
+ "    .explode('provider_type')\n",
+ "    .loc[lambda rows: rows['provider_type'].ne('serviceProvider')]\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2459 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 2466 | \n",
+ " 2463 | \n",
+ " 9 | \n",
+ " 1282 | \n",
+ " 1 | \n",
+ " 2248 | \n",
+ " 2447 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " r3d100011987 | \n",
+ " Landmap | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 2 Life Scie... | \n",
+ " dataProvider | \n",
+ " [multidisciplinary] | \n",
+ " [[National Center for Biotechnology Informatio... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1573 | \n",
+ " 200 | \n",
+ " 2459 | \n",
+ " 181 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "count 2467 2467 2467 \n",
+ "unique 2466 2463 9 \n",
+ "top r3d100011987 Landmap [disciplinary] \n",
+ "freq 2 2 1573 \n",
+ "\n",
+ " subject provider_type \\\n",
+ "count 2467 2459 \n",
+ "unique 1282 1 \n",
+ "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n",
+ "freq 200 2459 \n",
+ "\n",
+ " keyword institution \n",
+ "count 2467 2467 \n",
+ "unique 2248 2447 \n",
+ "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n",
+ "freq 181 6 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "re3data_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**openDOAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " utrecht university repository | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[university of utrecht, [universiteit utrecht... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 115 | \n",
+ " dspace at indian institute of management kozhi... | \n",
+ " institutional | \n",
+ " [ecology and environment, social sciences gene... | \n",
+ " [[indian institute of management kozhikode, [i... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41 | \n",
+ " caltech engineering and science online | \n",
+ " institutional | \n",
+ " [biology and biochemistry, chemistry and chemi... | \n",
+ " [[california institute of technology, [caltech... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 119 | \n",
+ " dcu online research access service | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[dublin city university, [dcu], ie, [], , htt... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 129 | \n",
+ " earth-prints repository | \n",
+ " disciplinary | \n",
+ " [earth and planetary sciences] | \n",
+ " [[istituto nazionale di geofisica e vulcanolog... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name \\\n",
+ "0 101 utrecht university repository \n",
+ "1 115 dspace at indian institute of management kozhi... \n",
+ "2 41 caltech engineering and science online \n",
+ "3 119 dcu online research access service \n",
+ "4 129 earth-prints repository \n",
+ "\n",
+ " type subject \\\n",
+ "0 institutional [multidisciplinary] \n",
+ "1 institutional [ecology and environment, social sciences gene... \n",
+ "2 institutional [biology and biochemistry, chemistry and chemi... \n",
+ "3 institutional [multidisciplinary] \n",
+ "4 disciplinary [earth and planetary sciences] \n",
+ "\n",
+ " institution \n",
+ "0 [[university of utrecht, [universiteit utrecht... \n",
+ "1 [[indian institute of management kozhikode, [i... \n",
+ "2 [[california institute of technology, [caltech... \n",
+ "3 [[dublin city university, [dcu], ie, [], , htt... \n",
+ "4 [[istituto nazionale di geofisica e vulcanolog... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Columns parsed from their string form with ast.literal_eval while reading.\n",
+ "opendoar_literal_columns = ['subject', 'additional_name', 'opendoar_id', 'content_type', 'institution']\n",
+ "opendoar_df = pd.read_csv(\n",
+ "    '../data/raw/openDoar.tsv',\n",
+ "    delimiter='\\t',\n",
+ "    converters=dict.fromkeys(opendoar_literal_columns, ast.literal_eval),\n",
+ "    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],\n",
+ ")\n",
+ "opendoar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5707.000000 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5670 | \n",
+ " 4 | \n",
+ " 820 | \n",
+ " 5098 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " arch | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[rijksuniversiteit groningen, [rug], nl, [], ... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 5067 | \n",
+ " 3212 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 4008.118801 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2869.948770 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 2.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1823.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 3361.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 5095.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 10175.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name type subject \\\n",
+ "count 5707.000000 5707 5707 5707 \n",
+ "unique NaN 5670 4 820 \n",
+ "top NaN arch institutional [multidisciplinary] \n",
+ "freq NaN 3 5067 3212 \n",
+ "mean 4008.118801 NaN NaN NaN \n",
+ "std 2869.948770 NaN NaN NaN \n",
+ "min 2.000000 NaN NaN NaN \n",
+ "25% 1823.000000 NaN NaN NaN \n",
+ "50% 3361.000000 NaN NaN NaN \n",
+ "75% 5095.000000 NaN NaN NaN \n",
+ "max 10175.000000 NaN NaN NaN \n",
+ "\n",
+ " institution \n",
+ "count 5707 \n",
+ "unique 5098 \n",
+ "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n",
+ "freq 26 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "opendoar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**ROAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 921 | \n",
+ " http://alcme.oclc.org/ndltd/index.html | \n",
+ " Networked Digital Library of Theses and Disser... | \n",
+ " us | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1489 | \n",
+ " http://prensahistorica.mcu.es/prensahistorica/... | \n",
+ " Virtual Library of Historical Press | \n",
+ " es | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 606 | \n",
+ " http://hal.archives-ouvertes.fr/ | \n",
+ " HAL: Hyper Article en Ligne | \n",
+ " fr | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page \\\n",
+ "0 921 http://alcme.oclc.org/ndltd/index.html \n",
+ "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n",
+ "2 606 http://hal.archives-ouvertes.fr/ \n",
+ "3 606 NaN \n",
+ "4 606 NaN \n",
+ "\n",
+ " title location_country subjects \n",
+ "0 Networked Digital Library of Theses and Disser... us NaN \n",
+ "1 Virtual Library of Historical Press es NaN \n",
+ "2 HAL: Hyper Article en Ligne fr NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Read only the ROAR export columns used in this exploration.\n",
+ "roar_columns = ['eprintid', 'home_page', 'title', 'location_country', 'subjects']\n",
+ "roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv', usecols=roar_columns)\n",
+ "roar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
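+ "# Left disabled. Note that keep=False removes every row that has a duplicate (no copy is kept).\n",
+ "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)"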
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 106 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " Faculty Scholarship at The Claremont Colleges | \n",
+ " us | \n",
+ " AS | \n",
+ "
\n",
+ " \n",
+ " 107 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BF | \n",
+ "
\n",
+ " \n",
+ " 108 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BL | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " CC | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " GN | \n",
+ "
\n",
+ " \n",
+ " 111 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " 112 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " HB | \n",
+ "
\n",
+ " \n",
+ " 113 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " JA | \n",
+ "
\n",
+ " \n",
+ " 114 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " LB | \n",
+ "
\n",
+ " \n",
+ " 115 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NX | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " PQ | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " QA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n",
+ "107 2303 NaN NaN \n",
+ "108 2303 NaN NaN \n",
+ "109 2303 NaN NaN \n",
+ "110 2303 NaN NaN \n",
+ "111 2303 NaN NaN \n",
+ "112 2303 NaN NaN \n",
+ "113 2303 NaN NaN \n",
+ "114 2303 NaN NaN \n",
+ "115 2303 NaN NaN \n",
+ "116 2303 NaN NaN \n",
+ "117 2303 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "106 us AS \n",
+ "107 NaN BF \n",
+ "108 NaN BL \n",
+ "109 NaN CC \n",
+ "110 NaN GN \n",
+ "111 NaN H1 \n",
+ "112 NaN HB \n",
+ "113 NaN JA \n",
+ "114 NaN LB \n",
+ "115 NaN NX \n",
+ "116 NaN PQ \n",
+ "117 NaN QA "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Inspect one repository whose record spans several rows of the export.\n",
+ "roar_df.loc[roar_df['eprintid'] == 2303]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5314.000000 | \n",
+ " 5263 | \n",
+ " 5268 | \n",
+ " 5024 | \n",
+ " 1225 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5156 | \n",
+ " 5027 | \n",
+ " 134 | \n",
+ " 123 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " http://ir.lib.isu.edu.tw/ | \n",
+ " Repositorio Institucional | \n",
+ " us | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 877 | \n",
+ " 147 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 6389.464434 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 5159.573937 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1490.250000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4990.500000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 10452.750000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 17302.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "count 5314.000000 5263 5268 \n",
+ "unique NaN 5156 5027 \n",
+ "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n",
+ "freq NaN 3 7 \n",
+ "mean 6389.464434 NaN NaN \n",
+ "std 5159.573937 NaN NaN \n",
+ "min 1.000000 NaN NaN \n",
+ "25% 1490.250000 NaN NaN \n",
+ "50% 4990.500000 NaN NaN \n",
+ "75% 10452.750000 NaN NaN \n",
+ "max 17302.000000 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "count 5024 1225 \n",
+ "unique 134 123 \n",
+ "top us H1 \n",
+ "freq 877 147 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**FAIRsharing**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " GenBank | \n",
+ " GenBank | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.9... | \n",
+ " https://www.ncbi.nlm.nih.gov/genbank/ | \n",
+ " European Union,Japan,United States | \n",
+ " Bioinformatics,Data Management,Data Submission... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " GlycoNAVI | \n",
+ " GlycoNAVI | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.w... | \n",
+ " https://glyconavi.org/ | \n",
+ " Japan | \n",
+ " Chemistry,Glycomics,Life Science,Organic Chemi... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ADHDgene | \n",
+ " ADHDgene | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.m... | \n",
+ " http://adhd.psych.ac.cn/ | \n",
+ " China | \n",
+ " Biomedical Science,Genetics | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Allele frequency resource for research and tea... | \n",
+ " ALFRED | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.y... | \n",
+ " http://alfred.med.yale.edu | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Animal Transcription Factor Database | \n",
+ " AnimalTFDB | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.e... | \n",
+ " http://bioinfo.life.hust.edu.cn/AnimalTFDB/ | \n",
+ " China | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "0 GenBank GenBank \n",
+ "1 GlycoNAVI GlycoNAVI \n",
+ "2 ADHDgene ADHDgene \n",
+ "3 Allele frequency resource for research and tea... ALFRED \n",
+ "4 Animal Transcription Factor Database AnimalTFDB \n",
+ "\n",
+ " fs_url \\\n",
+ "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
+ "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
+ "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
+ "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
+ "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
+ "\n",
+ " url \\\n",
+ "0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
+ "1 https://glyconavi.org/ \n",
+ "2 http://adhd.psych.ac.cn/ \n",
+ "3 http://alfred.med.yale.edu \n",
+ "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
+ "\n",
+ " countries \\\n",
+ "0 European Union,Japan,United States \n",
+ "1 Japan \n",
+ "2 China \n",
+ "3 United States \n",
+ "4 China \n",
+ "\n",
+ " subjects \n",
+ "0 Bioinformatics,Data Management,Data Submission... \n",
+ "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
+ "2 Biomedical Science,Genetics \n",
+ "3 Life Science \n",
+ "4 Life Science "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# header=0 marks the file's first row as a header; the names below replace it.\n",
+ "fairsharing_columns = ['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects']\n",
+ "fairsharing_df = pd.read_csv(\n",
+ "    '../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
+ "    delimiter='|',\n",
+ "    header=0,\n",
+ "    names=fairsharing_columns,\n",
+ ")\n",
+ "fairsharing_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1749 | \n",
+ " 1690 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 1752 | \n",
+ " 1741 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 178 | \n",
+ " 834 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " CiteAb | \n",
+ " CGD | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.1... | \n",
+ " http://www.plexdb.org/ | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 588 | \n",
+ " 367 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "count 1752 1752 \n",
+ "unique 1752 1741 \n",
+ "top CiteAb CGD \n",
+ "freq 1 3 \n",
+ "\n",
+ " fs_url \\\n",
+ "count 1752 \n",
+ "unique 1752 \n",
+ "top https://fairsharing.org/10.25504/FAIRsharing.1... \n",
+ "freq 1 \n",
+ "\n",
+ " url countries subjects \n",
+ "count 1752 1749 1690 \n",
+ "unique 1752 178 834 \n",
+ "top http://www.plexdb.org/ United States Life Science \n",
+ "freq 1 588 367 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fairsharing_df.describe(include='all')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/01.4-exploration-fairsharing.ipynb b/notebooks/01.4-exploration-fairsharing.ipynb
new file mode 100644
index 0000000..b79a854
--- /dev/null
+++ b/notebooks/01.4-exploration-fairsharing.ipynb
@@ -0,0 +1,1330 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import ast\n",
+ "import csv\n",
+ "import json\n",
+ "import reverse_geocoder as rg\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "\n",
+ "import pycountry_convert\n",
+ "\n",
+ "import matplotlib.pyplot as plt\n",
+ "from matplotlib_venn import venn2, venn2_circles\n",
+ "\n",
+ "import plotly\n",
+ "from plotly.offline import iplot, init_notebook_mode\n",
+ "import plotly.graph_objs as go\n",
+ "import plotly.express as px\n",
+ "\n",
+ "pd.set_option('display.max_columns', None)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Loading datasets"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**re3data**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " r3d100000001 | \n",
+ " Odum Institute Archive Dataverse | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider] | \n",
+ " [FAIR, Middle East, crime, demography, economy... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " r3d100000002 | \n",
+ " Access to Archival Databases | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 102 History... | \n",
+ " [dataProvider] | \n",
+ " [US History] | \n",
+ " [[The U.S. National Archives and Records Admin... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " r3d100000004 | \n",
+ " Datenbank Gesprochenes Deutsch | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 104 Linguis... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [Australian German, FOLK, German dialects, Pfe... | \n",
+ " [[Institut für Deutsche Sprache, Archiv für Ge... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " r3d100000005 | \n",
+ " UNC Dataverse | \n",
+ " [institutional] | \n",
+ " [1 Humanities and Social Sciences, 111 Social ... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, census, demographic survey, demography,... | \n",
+ " [[Odum Institute for Research in Social Scienc... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " r3d100000006 | \n",
+ " Archaeology Data Service | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 101 Ancient... | \n",
+ " [dataProvider, serviceProvider] | \n",
+ " [FAIR, archaeology, cultural heritage, prehist... | \n",
+ " [[Arts and Humanities Research Council, [AHRC]... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
+ "1 r3d100000002 Access to Archival Databases [disciplinary] \n",
+ "2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
+ "3 r3d100000005 UNC Dataverse [institutional] \n",
+ "4 r3d100000006 Archaeology Data Service [disciplinary] \n",
+ "\n",
+ " subject \\\n",
+ "0 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "1 [1 Humanities and Social Sciences, 102 History... \n",
+ "2 [1 Humanities and Social Sciences, 104 Linguis... \n",
+ "3 [1 Humanities and Social Sciences, 111 Social ... \n",
+ "4 [1 Humanities and Social Sciences, 101 Ancient... \n",
+ "\n",
+ " provider_type \\\n",
+ "0 [dataProvider] \n",
+ "1 [dataProvider] \n",
+ "2 [dataProvider, serviceProvider] \n",
+ "3 [dataProvider, serviceProvider] \n",
+ "4 [dataProvider, serviceProvider] \n",
+ "\n",
+ " keyword \\\n",
+ "0 [FAIR, Middle East, crime, demography, economy... \n",
+ "1 [US History] \n",
+ "2 [Australian German, FOLK, German dialects, Pfe... \n",
+ "3 [FAIR, census, demographic survey, demography,... \n",
+ "4 [FAIR, archaeology, cultural heritage, prehist... \n",
+ "\n",
+ " institution \n",
+ "0 [[Odum Institute for Research in Social Scienc... \n",
+ "1 [[The U.S. National Archives and Records Admin... \n",
+ "2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
+ "3 [[Odum Institute for Research in Social Scienc... \n",
+ "4 [[Arts and Humanities Research Council, [AHRC]... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Read only the columns used in this notebook. The list-valued columns are\n",
+    "# serialized as Python literals in the TSV, so they are parsed with\n",
+    "# ast.literal_eval. Converters are declared only for columns that are loaded.\n",
+    "re3data_list_columns = ['subject', 'keyword', 'type', 'provider_type', 'institution']\n",
+    "re3data_df = pd.read_csv(\n",
+    "    '../data/raw/re3data.tsv',\n",
+    "    delimiter='\\t',\n",
+    "    usecols=['re3data_id', 'repository_name'] + re3data_list_columns,\n",
+    "    converters={column: ast.literal_eval for column in re3data_list_columns},\n",
+    ")\n",
+    "re3data_df.head()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
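+    "**Note:** `provider_type` is exploded to one row per value and rows equal to `serviceProvider` are dropped, so repositories listed only as service providers are excluded from here on."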
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+    "# One row per provider_type value, then drop the 'serviceProvider' rows.\n",
+    "# Rows whose provider_type is NaN after the explode are kept.\n",
+    "re3data_df = (\n",
+    "    re3data_df\n",
+    "    .explode('provider_type')\n",
+    "    .loc[lambda rows: rows['provider_type'].ne('serviceProvider')]\n",
+    ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " re3data_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " provider_type | \n",
+ " keyword | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ " 2459 | \n",
+ " 2467 | \n",
+ " 2467 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 2466 | \n",
+ " 2463 | \n",
+ " 9 | \n",
+ " 1282 | \n",
+ " 1 | \n",
+ " 2248 | \n",
+ " 2447 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " r3d100011987 | \n",
+ " Landmap | \n",
+ " [disciplinary] | \n",
+ " [1 Humanities and Social Sciences, 2 Life Scie... | \n",
+ " dataProvider | \n",
+ " [multidisciplinary] | \n",
+ " [[National Center for Biotechnology Informatio... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1573 | \n",
+ " 200 | \n",
+ " 2459 | \n",
+ " 181 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " re3data_id repository_name type \\\n",
+ "count 2467 2467 2467 \n",
+ "unique 2466 2463 9 \n",
+ "top r3d100011987 Landmap [disciplinary] \n",
+ "freq 2 2 1573 \n",
+ "\n",
+ " subject provider_type \\\n",
+ "count 2467 2459 \n",
+ "unique 1282 1 \n",
+ "top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n",
+ "freq 200 2459 \n",
+ "\n",
+ " keyword institution \n",
+ "count 2467 2467 \n",
+ "unique 2248 2447 \n",
+ "top [multidisciplinary] [[National Center for Biotechnology Informatio... \n",
+ "freq 181 6 "
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "re3data_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**openDOAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 101 | \n",
+ " utrecht university repository | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[university of utrecht, [universiteit utrecht... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 115 | \n",
+ " dspace at indian institute of management kozhi... | \n",
+ " institutional | \n",
+ " [ecology and environment, social sciences gene... | \n",
+ " [[indian institute of management kozhikode, [i... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 41 | \n",
+ " caltech engineering and science online | \n",
+ " institutional | \n",
+ " [biology and biochemistry, chemistry and chemi... | \n",
+ " [[california institute of technology, [caltech... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 119 | \n",
+ " dcu online research access service | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[dublin city university, [dcu], ie, [], , htt... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 129 | \n",
+ " earth-prints repository | \n",
+ " disciplinary | \n",
+ " [earth and planetary sciences] | \n",
+ " [[istituto nazionale di geofisica e vulcanolog... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name \\\n",
+ "0 101 utrecht university repository \n",
+ "1 115 dspace at indian institute of management kozhi... \n",
+ "2 41 caltech engineering and science online \n",
+ "3 119 dcu online research access service \n",
+ "4 129 earth-prints repository \n",
+ "\n",
+ " type subject \\\n",
+ "0 institutional [multidisciplinary] \n",
+ "1 institutional [ecology and environment, social sciences gene... \n",
+ "2 institutional [biology and biochemistry, chemistry and chemi... \n",
+ "3 institutional [multidisciplinary] \n",
+ "4 disciplinary [earth and planetary sciences] \n",
+ "\n",
+ " institution \n",
+ "0 [[university of utrecht, [universiteit utrecht... \n",
+ "1 [[indian institute of management kozhikode, [i... \n",
+ "2 [[california institute of technology, [caltech... \n",
+ "3 [[dublin city university, [dcu], ie, [], , htt... \n",
+ "4 [[istituto nazionale di geofisica e vulcanolog... "
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Read only the columns used in this notebook. subject and institution hold\n",
+    "# list literals, and opendoar_id is evaluated to a number; all three go through\n",
+    "# ast.literal_eval. Converters are declared only for columns that are loaded.\n",
+    "opendoar_literal_columns = ['opendoar_id', 'subject', 'institution']\n",
+    "opendoar_df = pd.read_csv(\n",
+    "    '../data/raw/openDoar.tsv',\n",
+    "    delimiter='\\t',\n",
+    "    usecols=opendoar_literal_columns + ['repository_name', 'type'],\n",
+    "    converters={column: ast.literal_eval for column in opendoar_literal_columns},\n",
+    ")\n",
+    "opendoar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " opendoar_id | \n",
+ " repository_name | \n",
+ " type | \n",
+ " subject | \n",
+ " institution | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5707.000000 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ " 5707 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5670 | \n",
+ " 4 | \n",
+ " 820 | \n",
+ " 5098 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " arch | \n",
+ " institutional | \n",
+ " [multidisciplinary] | \n",
+ " [[rijksuniversiteit groningen, [rug], nl, [], ... | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 5067 | \n",
+ " 3212 | \n",
+ " 26 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 4008.118801 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 2869.948770 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 2.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1823.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 3361.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 5095.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 10175.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " opendoar_id repository_name type subject \\\n",
+ "count 5707.000000 5707 5707 5707 \n",
+ "unique NaN 5670 4 820 \n",
+ "top NaN arch institutional [multidisciplinary] \n",
+ "freq NaN 3 5067 3212 \n",
+ "mean 4008.118801 NaN NaN NaN \n",
+ "std 2869.948770 NaN NaN NaN \n",
+ "min 2.000000 NaN NaN NaN \n",
+ "25% 1823.000000 NaN NaN NaN \n",
+ "50% 3361.000000 NaN NaN NaN \n",
+ "75% 5095.000000 NaN NaN NaN \n",
+ "max 10175.000000 NaN NaN NaN \n",
+ "\n",
+ " institution \n",
+ "count 5707 \n",
+ "unique 5098 \n",
+ "top [[rijksuniversiteit groningen, [rug], nl, [], ... \n",
+ "freq 26 \n",
+ "mean NaN \n",
+ "std NaN \n",
+ "min NaN \n",
+ "25% NaN \n",
+ "50% NaN \n",
+ "75% NaN \n",
+ "max NaN "
+ ]
+ },
+ "execution_count": 7,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "opendoar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**ROAR**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 45,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 921 | \n",
+ " http://alcme.oclc.org/ndltd/index.html | \n",
+ " Networked Digital Library of Theses and Disser... | \n",
+ " us | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1489 | \n",
+ " http://prensahistorica.mcu.es/prensahistorica/... | \n",
+ " Virtual Library of Historical Press | \n",
+ " es | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 606 | \n",
+ " http://hal.archives-ouvertes.fr/ | \n",
+ " HAL: Hyper Article en Ligne | \n",
+ " fr | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 606 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page \\\n",
+ "0 921 http://alcme.oclc.org/ndltd/index.html \n",
+ "1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n",
+ "2 606 http://hal.archives-ouvertes.fr/ \n",
+ "3 606 NaN \n",
+ "4 606 NaN \n",
+ "\n",
+ " title location_country subjects \n",
+ "0 Networked Digital Library of Theses and Disser... us NaN \n",
+ "1 Virtual Library of Historical Press es NaN \n",
+ "2 HAL: Hyper Article en Ligne fr NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN "
+ ]
+ },
+ "execution_count": 45,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# ROAR export. In the rows displayed by this notebook a single record can span\n",
+    "# several rows that repeat eprintid while the other columns are NaN.\n",
+    "roar_df = pd.read_csv(\n",
+    "    '../data/raw/export_roar_CSV.csv',\n",
+    "    usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'],\n",
+    ")\n",
+    "roar_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 43,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 47,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 106 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " Faculty Scholarship at The Claremont Colleges | \n",
+ " us | \n",
+ " AS | \n",
+ "
\n",
+ " \n",
+ " 107 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BF | \n",
+ "
\n",
+ " \n",
+ " 108 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " BL | \n",
+ "
\n",
+ " \n",
+ " 109 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " CC | \n",
+ "
\n",
+ " \n",
+ " 110 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " GN | \n",
+ "
\n",
+ " \n",
+ " 111 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " 112 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " HB | \n",
+ "
\n",
+ " \n",
+ " 113 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " JA | \n",
+ "
\n",
+ " \n",
+ " 114 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " LB | \n",
+ "
\n",
+ " \n",
+ " 115 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NX | \n",
+ "
\n",
+ " \n",
+ " 116 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " PQ | \n",
+ "
\n",
+ " \n",
+ " 117 | \n",
+ " 2303 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " QA | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "106 2303 NaN Faculty Scholarship at The Claremont Colleges \n",
+ "107 2303 NaN NaN \n",
+ "108 2303 NaN NaN \n",
+ "109 2303 NaN NaN \n",
+ "110 2303 NaN NaN \n",
+ "111 2303 NaN NaN \n",
+ "112 2303 NaN NaN \n",
+ "113 2303 NaN NaN \n",
+ "114 2303 NaN NaN \n",
+ "115 2303 NaN NaN \n",
+ "116 2303 NaN NaN \n",
+ "117 2303 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "106 us AS \n",
+ "107 NaN BF \n",
+ "108 NaN BL \n",
+ "109 NaN CC \n",
+ "110 NaN GN \n",
+ "111 NaN H1 \n",
+ "112 NaN HB \n",
+ "113 NaN JA \n",
+ "114 NaN LB \n",
+ "115 NaN NX \n",
+ "116 NaN PQ \n",
+ "117 NaN QA "
+ ]
+ },
+ "execution_count": 47,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Inspect one multi-row ROAR record: every row sharing eprintid 2303.\n",
+    "roar_df.loc[roar_df['eprintid'].eq(2303)]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 44,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " eprintid | \n",
+ " home_page | \n",
+ " title | \n",
+ " location_country | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 5314.000000 | \n",
+ " 5263 | \n",
+ " 5268 | \n",
+ " 5024 | \n",
+ " 1225 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " NaN | \n",
+ " 5156 | \n",
+ " 5027 | \n",
+ " 134 | \n",
+ " 123 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " NaN | \n",
+ " http://ir.lib.isu.edu.tw/ | \n",
+ " Repositorio Institucional | \n",
+ " us | \n",
+ " H1 | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " NaN | \n",
+ " 3 | \n",
+ " 7 | \n",
+ " 877 | \n",
+ " 147 | \n",
+ "
\n",
+ " \n",
+ " mean | \n",
+ " 6389.464434 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " std | \n",
+ " 5159.573937 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " min | \n",
+ " 1.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 25% | \n",
+ " 1490.250000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 50% | \n",
+ " 4990.500000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 75% | \n",
+ " 10452.750000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " max | \n",
+ " 17302.000000 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " eprintid home_page title \\\n",
+ "count 5314.000000 5263 5268 \n",
+ "unique NaN 5156 5027 \n",
+ "top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n",
+ "freq NaN 3 7 \n",
+ "mean 6389.464434 NaN NaN \n",
+ "std 5159.573937 NaN NaN \n",
+ "min 1.000000 NaN NaN \n",
+ "25% 1490.250000 NaN NaN \n",
+ "50% 4990.500000 NaN NaN \n",
+ "75% 10452.750000 NaN NaN \n",
+ "max 17302.000000 NaN NaN \n",
+ "\n",
+ " location_country subjects \n",
+ "count 5024 1225 \n",
+ "unique 134 123 \n",
+ "top us H1 \n",
+ "freq 877 147 \n",
+ "mean NaN NaN \n",
+ "std NaN NaN \n",
+ "min NaN NaN \n",
+ "25% NaN NaN \n",
+ "50% NaN NaN \n",
+ "75% NaN NaN \n",
+ "max NaN NaN "
+ ]
+ },
+ "execution_count": 44,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "roar_df.describe(include='all')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**FAIRsharing**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " GenBank | \n",
+ " GenBank | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.9... | \n",
+ " https://www.ncbi.nlm.nih.gov/genbank/ | \n",
+ " European Union,Japan,United States | \n",
+ " Bioinformatics,Data Management,Data Submission... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " GlycoNAVI | \n",
+ " GlycoNAVI | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.w... | \n",
+ " https://glyconavi.org/ | \n",
+ " Japan | \n",
+ " Chemistry,Glycomics,Life Science,Organic Chemi... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " ADHDgene | \n",
+ " ADHDgene | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.m... | \n",
+ " http://adhd.psych.ac.cn/ | \n",
+ " China | \n",
+ " Biomedical Science,Genetics | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " Allele frequency resource for research and tea... | \n",
+ " ALFRED | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.y... | \n",
+ " http://alfred.med.yale.edu | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " Animal Transcription Factor Database | \n",
+ " AnimalTFDB | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.e... | \n",
+ " http://bioinfo.life.hust.edu.cn/AnimalTFDB/ | \n",
+ " China | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "0 GenBank GenBank \n",
+ "1 GlycoNAVI GlycoNAVI \n",
+ "2 ADHDgene ADHDgene \n",
+ "3 Allele frequency resource for research and tea... ALFRED \n",
+ "4 Animal Transcription Factor Database AnimalTFDB \n",
+ "\n",
+ " fs_url \\\n",
+ "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
+ "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
+ "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
+ "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
+ "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
+ "\n",
+ " url \\\n",
+ "0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
+ "1 https://glyconavi.org/ \n",
+ "2 http://adhd.psych.ac.cn/ \n",
+ "3 http://alfred.med.yale.edu \n",
+ "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
+ "\n",
+ " countries \\\n",
+ "0 European Union,Japan,United States \n",
+ "1 Japan \n",
+ "2 China \n",
+ "3 United States \n",
+ "4 China \n",
+ "\n",
+ " subjects \n",
+ "0 Bioinformatics,Data Management,Data Submission... \n",
+ "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
+ "2 Biomedical Science,Genetics \n",
+ "3 Life Science \n",
+ "4 Life Science "
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+    "# Pipe-delimited export. header=0 together with names= replaces the file's own\n",
+    "# header row with the column names given here.\n",
+    "fairsharing_df = pd.read_csv(\n",
+    "    '../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
+    "    delimiter='|',\n",
+    "    header=0,\n",
+    "    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],\n",
+    ")\n",
+    "fairsharing_df.head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " full_name | \n",
+ " short_name | \n",
+ " fs_url | \n",
+ " url | \n",
+ " countries | \n",
+ " subjects | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " count | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 1749 | \n",
+ " 1690 | \n",
+ "
\n",
+ " \n",
+ " unique | \n",
+ " 1752 | \n",
+ " 1741 | \n",
+ " 1752 | \n",
+ " 1752 | \n",
+ " 178 | \n",
+ " 834 | \n",
+ "
\n",
+ " \n",
+ " top | \n",
+ " CiteAb | \n",
+ " CGD | \n",
+ " https://fairsharing.org/10.25504/FAIRsharing.1... | \n",
+ " http://www.plexdb.org/ | \n",
+ " United States | \n",
+ " Life Science | \n",
+ "
\n",
+ " \n",
+ " freq | \n",
+ " 1 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 588 | \n",
+ " 367 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " full_name short_name \\\n",
+ "count 1752 1752 \n",
+ "unique 1752 1741 \n",
+ "top CiteAb CGD \n",
+ "freq 1 3 \n",
+ "\n",
+ " fs_url \\\n",
+ "count 1752 \n",
+ "unique 1752 \n",
+ "top https://fairsharing.org/10.25504/FAIRsharing.1... \n",
+ "freq 1 \n",
+ "\n",
+ " url countries subjects \n",
+ "count 1752 1749 1690 \n",
+ "unique 1752 178 834 \n",
+ "top http://www.plexdb.org/ United States Life Science \n",
+ "freq 1 588 367 "
+ ]
+ },
+ "execution_count": 12,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "fairsharing_df.describe(include='all')"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}
diff --git a/notebooks/01.1-explorative.ipynb b/notebooks/02-subjects&geographic.ipynb
similarity index 100%
rename from notebooks/01.1-explorative.ipynb
rename to notebooks/02-subjects&geographic.ipynb
diff --git a/notebooks/Untitled.ipynb b/notebooks/Untitled.ipynb
deleted file mode 100644
index 34f374f..0000000
--- a/notebooks/Untitled.ipynb
+++ /dev/null
@@ -1,32 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- }
- ],
- "metadata": {
- "kernelspec": {
- "display_name": "Python 3",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.8.3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 4
-}