1331 lines
46 KiB
Plaintext
1331 lines
46 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import ast\n",
|
|
"import csv\n",
|
|
"import json\n",
|
|
"import reverse_geocoder as rg\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"import pycountry_convert\n",
|
|
"\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"from matplotlib_venn import venn2, venn2_circles\n",
|
|
"\n",
|
|
"import plotly\n",
|
|
"from plotly.offline import iplot, init_notebook_mode\n",
|
|
"import plotly.graph_objs as go\n",
|
|
"import plotly.express as px\n",
|
|
"\n",
|
|
"pd.set_option('display.max_columns', None)"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## Loading datasets"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**re3data**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>re3data_id</th>\n",
|
|
" <th>repository_name</th>\n",
|
|
" <th>type</th>\n",
|
|
" <th>subject</th>\n",
|
|
" <th>provider_type</th>\n",
|
|
" <th>keyword</th>\n",
|
|
" <th>institution</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>r3d100000001</td>\n",
|
|
" <td>Odum Institute Archive Dataverse</td>\n",
|
|
" <td>[disciplinary]</td>\n",
|
|
" <td>[1 Humanities and Social Sciences, 111 Social ...</td>\n",
|
|
" <td>[dataProvider]</td>\n",
|
|
" <td>[FAIR, Middle East, crime, demography, economy...</td>\n",
|
|
" <td>[[Odum Institute for Research in Social Scienc...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>r3d100000002</td>\n",
|
|
" <td>Access to Archival Databases</td>\n",
|
|
" <td>[disciplinary]</td>\n",
|
|
" <td>[1 Humanities and Social Sciences, 102 History...</td>\n",
|
|
" <td>[dataProvider]</td>\n",
|
|
" <td>[US History]</td>\n",
|
|
" <td>[[The U.S. National Archives and Records Admin...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>r3d100000004</td>\n",
|
|
" <td>Datenbank Gesprochenes Deutsch</td>\n",
|
|
" <td>[disciplinary]</td>\n",
|
|
" <td>[1 Humanities and Social Sciences, 104 Linguis...</td>\n",
|
|
" <td>[dataProvider, serviceProvider]</td>\n",
|
|
" <td>[Australian German, FOLK, German dialects, Pfe...</td>\n",
|
|
" <td>[[Institut für Deutsche Sprache, Archiv für Ge...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>r3d100000005</td>\n",
|
|
" <td>UNC Dataverse</td>\n",
|
|
" <td>[institutional]</td>\n",
|
|
" <td>[1 Humanities and Social Sciences, 111 Social ...</td>\n",
|
|
" <td>[dataProvider, serviceProvider]</td>\n",
|
|
" <td>[FAIR, census, demographic survey, demography,...</td>\n",
|
|
" <td>[[Odum Institute for Research in Social Scienc...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>r3d100000006</td>\n",
|
|
" <td>Archaeology Data Service</td>\n",
|
|
" <td>[disciplinary]</td>\n",
|
|
" <td>[1 Humanities and Social Sciences, 101 Ancient...</td>\n",
|
|
" <td>[dataProvider, serviceProvider]</td>\n",
|
|
" <td>[FAIR, archaeology, cultural heritage, prehist...</td>\n",
|
|
" <td>[[Arts and Humanities Research Council, [AHRC]...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" re3data_id repository_name type \\\n",
|
|
"0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
|
|
"1 r3d100000002 Access to Archival Databases [disciplinary] \n",
|
|
"2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
|
|
"3 r3d100000005 UNC Dataverse [institutional] \n",
|
|
"4 r3d100000006 Archaeology Data Service [disciplinary] \n",
|
|
"\n",
|
|
" subject \\\n",
|
|
"0 [1 Humanities and Social Sciences, 111 Social ... \n",
|
|
"1 [1 Humanities and Social Sciences, 102 History... \n",
|
|
"2 [1 Humanities and Social Sciences, 104 Linguis... \n",
|
|
"3 [1 Humanities and Social Sciences, 111 Social ... \n",
|
|
"4 [1 Humanities and Social Sciences, 101 Ancient... \n",
|
|
"\n",
|
|
" provider_type \\\n",
|
|
"0 [dataProvider] \n",
|
|
"1 [dataProvider] \n",
|
|
"2 [dataProvider, serviceProvider] \n",
|
|
"3 [dataProvider, serviceProvider] \n",
|
|
"4 [dataProvider, serviceProvider] \n",
|
|
"\n",
|
|
" keyword \\\n",
|
|
"0 [FAIR, Middle East, crime, demography, economy... \n",
|
|
"1 [US History] \n",
|
|
"2 [Australian German, FOLK, German dialects, Pfe... \n",
|
|
"3 [FAIR, census, demographic survey, demography,... \n",
|
|
"4 [FAIR, archaeology, cultural heritage, prehist... \n",
|
|
"\n",
|
|
" institution \n",
|
|
"0 [[Odum Institute for Research in Social Scienc... \n",
|
|
"1 [[The U.S. National Archives and Records Admin... \n",
|
|
"2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
|
|
"3 [[Odum Institute for Research in Social Scienc... \n",
|
|
"4 [[Arts and Humanities Research Council, [AHRC]... "
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\\t', \n",
|
|
" converters={'subject': ast.literal_eval,\n",
|
|
" 'keyword': ast.literal_eval,\n",
|
|
" 'additional_name': ast.literal_eval,\n",
|
|
" 'repository_id': ast.literal_eval,\n",
|
|
" 'type': ast.literal_eval,\n",
|
|
" 'content_type': ast.literal_eval,\n",
|
|
" 'provider_type': ast.literal_eval,\n",
|
|
" 'institution': ast.literal_eval\n",
|
|
" },\n",
|
|
" usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'])\n",
|
|
"re3data_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
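"**Note:** the next cell filters service providers out. `provider_type` is exploded to one row per value and the rows equal to `serviceProvider` are dropped, so a repository listed as both `dataProvider` and `serviceProvider` is kept through its `dataProvider` row, and only service-only repositories disappear. Rows with no `provider_type` value are kept as well."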
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"re3data_df = re3data_df.explode('provider_type')\n",
|
|
"re3data_df = re3data_df[re3data_df.provider_type != 'serviceProvider']"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 5,
|
|
"metadata": {
|
|
"scrolled": false
|
|
},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>re3data_id</th>\n",
|
|
" <th>repository_name</th>\n",
|
|
" <th>type</th>\n",
|
|
" <th>subject</th>\n",
|
|
" <th>provider_type</th>\n",
|
|
" <th>keyword</th>\n",
|
|
" <th>institution</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>2467</td>\n",
|
|
" <td>2467</td>\n",
|
|
" <td>2467</td>\n",
|
|
" <td>2467</td>\n",
|
|
" <td>2459</td>\n",
|
|
" <td>2467</td>\n",
|
|
" <td>2467</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>2466</td>\n",
|
|
" <td>2463</td>\n",
|
|
" <td>9</td>\n",
|
|
" <td>1282</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>2248</td>\n",
|
|
" <td>2447</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>r3d100011987</td>\n",
|
|
" <td>Landmap</td>\n",
|
|
" <td>[disciplinary]</td>\n",
|
|
" <td>[1 Humanities and Social Sciences, 2 Life Scie...</td>\n",
|
|
" <td>dataProvider</td>\n",
|
|
" <td>[multidisciplinary]</td>\n",
|
|
" <td>[[National Center for Biotechnology Informatio...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>2</td>\n",
|
|
" <td>2</td>\n",
|
|
" <td>1573</td>\n",
|
|
" <td>200</td>\n",
|
|
" <td>2459</td>\n",
|
|
" <td>181</td>\n",
|
|
" <td>6</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" re3data_id repository_name type \\\n",
|
|
"count 2467 2467 2467 \n",
|
|
"unique 2466 2463 9 \n",
|
|
"top r3d100011987 Landmap [disciplinary] \n",
|
|
"freq 2 2 1573 \n",
|
|
"\n",
|
|
" subject provider_type \\\n",
|
|
"count 2467 2459 \n",
|
|
"unique 1282 1 \n",
|
|
"top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n",
|
|
"freq 200 2459 \n",
|
|
"\n",
|
|
" keyword institution \n",
|
|
"count 2467 2467 \n",
|
|
"unique 2248 2447 \n",
|
|
"top [multidisciplinary] [[National Center for Biotechnology Informatio... \n",
|
|
"freq 181 6 "
|
|
]
|
|
},
|
|
"execution_count": 5,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"re3data_df.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**OpenDOAR**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>opendoar_id</th>\n",
|
|
" <th>repository_name</th>\n",
|
|
" <th>type</th>\n",
|
|
" <th>subject</th>\n",
|
|
" <th>institution</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>101</td>\n",
|
|
" <td>utrecht university repository</td>\n",
|
|
" <td>institutional</td>\n",
|
|
" <td>[multidisciplinary]</td>\n",
|
|
" <td>[[university of utrecht, [universiteit utrecht...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>115</td>\n",
|
|
" <td>dspace at indian institute of management kozhi...</td>\n",
|
|
" <td>institutional</td>\n",
|
|
" <td>[ecology and environment, social sciences gene...</td>\n",
|
|
" <td>[[indian institute of management kozhikode, [i...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>41</td>\n",
|
|
" <td>caltech engineering and science online</td>\n",
|
|
" <td>institutional</td>\n",
|
|
" <td>[biology and biochemistry, chemistry and chemi...</td>\n",
|
|
" <td>[[california institute of technology, [caltech...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>119</td>\n",
|
|
" <td>dcu online research access service</td>\n",
|
|
" <td>institutional</td>\n",
|
|
" <td>[multidisciplinary]</td>\n",
|
|
" <td>[[dublin city university, [dcu], ie, [], , htt...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>129</td>\n",
|
|
" <td>earth-prints repository</td>\n",
|
|
" <td>disciplinary</td>\n",
|
|
" <td>[earth and planetary sciences]</td>\n",
|
|
" <td>[[istituto nazionale di geofisica e vulcanolog...</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" opendoar_id repository_name \\\n",
|
|
"0 101 utrecht university repository \n",
|
|
"1 115 dspace at indian institute of management kozhi... \n",
|
|
"2 41 caltech engineering and science online \n",
|
|
"3 119 dcu online research access service \n",
|
|
"4 129 earth-prints repository \n",
|
|
"\n",
|
|
" type subject \\\n",
|
|
"0 institutional [multidisciplinary] \n",
|
|
"1 institutional [ecology and environment, social sciences gene... \n",
|
|
"2 institutional [biology and biochemistry, chemistry and chemi... \n",
|
|
"3 institutional [multidisciplinary] \n",
|
|
"4 disciplinary [earth and planetary sciences] \n",
|
|
"\n",
|
|
" institution \n",
|
|
"0 [[university of utrecht, [universiteit utrecht... \n",
|
|
"1 [[indian institute of management kozhikode, [i... \n",
|
|
"2 [[california institute of technology, [caltech... \n",
|
|
"3 [[dublin city university, [dcu], ie, [], , htt... \n",
|
|
"4 [[istituto nazionale di geofisica e vulcanolog... "
|
|
]
|
|
},
|
|
"execution_count": 6,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\\t',\n",
|
|
" converters={'subject': ast.literal_eval,\n",
|
|
" 'additional_name': ast.literal_eval,\n",
|
|
" 'opendoar_id': ast.literal_eval,\n",
|
|
" 'content_type': ast.literal_eval,\n",
|
|
" 'institution': ast.literal_eval\n",
|
|
" },\n",
|
|
" usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'])\n",
|
|
"opendoar_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>opendoar_id</th>\n",
|
|
" <th>repository_name</th>\n",
|
|
" <th>type</th>\n",
|
|
" <th>subject</th>\n",
|
|
" <th>institution</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>5707.000000</td>\n",
|
|
" <td>5707</td>\n",
|
|
" <td>5707</td>\n",
|
|
" <td>5707</td>\n",
|
|
" <td>5707</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5670</td>\n",
|
|
" <td>4</td>\n",
|
|
" <td>820</td>\n",
|
|
" <td>5098</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>arch</td>\n",
|
|
" <td>institutional</td>\n",
|
|
" <td>[multidisciplinary]</td>\n",
|
|
" <td>[[rijksuniversiteit groningen, [rug], nl, [], ...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>5067</td>\n",
|
|
" <td>3212</td>\n",
|
|
" <td>26</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>4008.118801</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>2869.948770</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>2.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1823.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>3361.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>5095.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>10175.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" opendoar_id repository_name type subject \\\n",
|
|
"count 5707.000000 5707 5707 5707 \n",
|
|
"unique NaN 5670 4 820 \n",
|
|
"top NaN arch institutional [multidisciplinary] \n",
|
|
"freq NaN 3 5067 3212 \n",
|
|
"mean 4008.118801 NaN NaN NaN \n",
|
|
"std 2869.948770 NaN NaN NaN \n",
|
|
"min 2.000000 NaN NaN NaN \n",
|
|
"25% 1823.000000 NaN NaN NaN \n",
|
|
"50% 3361.000000 NaN NaN NaN \n",
|
|
"75% 5095.000000 NaN NaN NaN \n",
|
|
"max 10175.000000 NaN NaN NaN \n",
|
|
"\n",
|
|
" institution \n",
|
|
"count 5707 \n",
|
|
"unique 5098 \n",
|
|
"top [[rijksuniversiteit groningen, [rug], nl, [], ... \n",
|
|
"freq 26 \n",
|
|
"mean NaN \n",
|
|
"std NaN \n",
|
|
"min NaN \n",
|
|
"25% NaN \n",
|
|
"50% NaN \n",
|
|
"75% NaN \n",
|
|
"max NaN "
|
|
]
|
|
},
|
|
"execution_count": 7,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"opendoar_df.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**ROAR**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 45,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>eprintid</th>\n",
|
|
" <th>home_page</th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>location_country</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>921</td>\n",
|
|
" <td>http://alcme.oclc.org/ndltd/index.html</td>\n",
|
|
" <td>Networked Digital Library of Theses and Disser...</td>\n",
|
|
" <td>us</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>1489</td>\n",
|
|
" <td>http://prensahistorica.mcu.es/prensahistorica/...</td>\n",
|
|
" <td>Virtual Library of Historical Press</td>\n",
|
|
" <td>es</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>606</td>\n",
|
|
" <td>http://hal.archives-ouvertes.fr/</td>\n",
|
|
" <td>HAL: Hyper Article en Ligne</td>\n",
|
|
" <td>fr</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>606</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>606</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" eprintid home_page \\\n",
|
|
"0 921 http://alcme.oclc.org/ndltd/index.html \n",
|
|
"1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n",
|
|
"2 606 http://hal.archives-ouvertes.fr/ \n",
|
|
"3 606 NaN \n",
|
|
"4 606 NaN \n",
|
|
"\n",
|
|
" title location_country subjects \n",
|
|
"0 Networked Digital Library of Theses and Disser... us NaN \n",
|
|
"1 Virtual Library of Historical Press es NaN \n",
|
|
"2 HAL: Hyper Article en Ligne fr NaN \n",
|
|
"3 NaN NaN NaN \n",
|
|
"4 NaN NaN NaN "
|
|
]
|
|
},
|
|
"execution_count": 45,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"roar_df = pd.read_csv('../data/raw/export_roar_CSV.csv',\n",
|
|
" usecols=['eprintid', 'home_page', 'title', 'location_country', 'subjects'])\n",
|
|
"roar_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 43,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
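"# NOTE(review): kept disabled. The ROAR export spreads a repository over several rows (extra rows repeat `eprintid` and carry only e.g. a `subjects` code). With keep=False, drop_duplicates removes every row whose four values occur more than once, so subject-only rows shared between repositories would all be lost. Confirm the intended de-duplication (probably one record per `eprintid`) before re-enabling.\n",
"# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)"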
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 47,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>eprintid</th>\n",
|
|
" <th>home_page</th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>location_country</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>106</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>Faculty Scholarship at The Claremont Colleges</td>\n",
|
|
" <td>us</td>\n",
|
|
" <td>AS</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>107</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>BF</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>108</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>BL</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>109</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>CC</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>110</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>GN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>111</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>H1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>112</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>HB</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>113</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>JA</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>114</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>LB</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>115</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NX</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>116</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>PQ</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>117</th>\n",
|
|
" <td>2303</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>QA</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" eprintid home_page title \\\n",
|
|
"106 2303 NaN Faculty Scholarship at The Claremont Colleges \n",
|
|
"107 2303 NaN NaN \n",
|
|
"108 2303 NaN NaN \n",
|
|
"109 2303 NaN NaN \n",
|
|
"110 2303 NaN NaN \n",
|
|
"111 2303 NaN NaN \n",
|
|
"112 2303 NaN NaN \n",
|
|
"113 2303 NaN NaN \n",
|
|
"114 2303 NaN NaN \n",
|
|
"115 2303 NaN NaN \n",
|
|
"116 2303 NaN NaN \n",
|
|
"117 2303 NaN NaN \n",
|
|
"\n",
|
|
" location_country subjects \n",
|
|
"106 us AS \n",
|
|
"107 NaN BF \n",
|
|
"108 NaN BL \n",
|
|
"109 NaN CC \n",
|
|
"110 NaN GN \n",
|
|
"111 NaN H1 \n",
|
|
"112 NaN HB \n",
|
|
"113 NaN JA \n",
|
|
"114 NaN LB \n",
|
|
"115 NaN NX \n",
|
|
"116 NaN PQ \n",
|
|
"117 NaN QA "
|
|
]
|
|
},
|
|
"execution_count": 47,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"roar_df[roar_df.eprintid == 2303]"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 44,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>eprintid</th>\n",
|
|
" <th>home_page</th>\n",
|
|
" <th>title</th>\n",
|
|
" <th>location_country</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>5314.000000</td>\n",
|
|
" <td>5263</td>\n",
|
|
" <td>5268</td>\n",
|
|
" <td>5024</td>\n",
|
|
" <td>1225</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>5156</td>\n",
|
|
" <td>5027</td>\n",
|
|
" <td>134</td>\n",
|
|
" <td>123</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>http://ir.lib.isu.edu.tw/</td>\n",
|
|
" <td>Repositorio Institucional</td>\n",
|
|
" <td>us</td>\n",
|
|
" <td>H1</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>7</td>\n",
|
|
" <td>877</td>\n",
|
|
" <td>147</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>mean</th>\n",
|
|
" <td>6389.464434</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>std</th>\n",
|
|
" <td>5159.573937</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>min</th>\n",
|
|
" <td>1.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>25%</th>\n",
|
|
" <td>1490.250000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>50%</th>\n",
|
|
" <td>4990.500000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>75%</th>\n",
|
|
" <td>10452.750000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>max</th>\n",
|
|
" <td>17302.000000</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" <td>NaN</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" eprintid home_page title \\\n",
|
|
"count 5314.000000 5263 5268 \n",
|
|
"unique NaN 5156 5027 \n",
|
|
"top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n",
|
|
"freq NaN 3 7 \n",
|
|
"mean 6389.464434 NaN NaN \n",
|
|
"std 5159.573937 NaN NaN \n",
|
|
"min 1.000000 NaN NaN \n",
|
|
"25% 1490.250000 NaN NaN \n",
|
|
"50% 4990.500000 NaN NaN \n",
|
|
"75% 10452.750000 NaN NaN \n",
|
|
"max 17302.000000 NaN NaN \n",
|
|
"\n",
|
|
" location_country subjects \n",
|
|
"count 5024 1225 \n",
|
|
"unique 134 123 \n",
|
|
"top us H1 \n",
|
|
"freq 877 147 \n",
|
|
"mean NaN NaN \n",
|
|
"std NaN NaN \n",
|
|
"min NaN NaN \n",
|
|
"25% NaN NaN \n",
|
|
"50% NaN NaN \n",
|
|
"75% NaN NaN \n",
|
|
"max NaN NaN "
|
|
]
|
|
},
|
|
"execution_count": 44,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"roar_df.describe(include='all')"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"**FAIRsharing**"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>full_name</th>\n",
|
|
" <th>short_name</th>\n",
|
|
" <th>fs_url</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>countries</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>GenBank</td>\n",
|
|
" <td>GenBank</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.9...</td>\n",
|
|
" <td>https://www.ncbi.nlm.nih.gov/genbank/</td>\n",
|
|
" <td>European Union,Japan,United States</td>\n",
|
|
" <td>Bioinformatics,Data Management,Data Submission...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>GlycoNAVI</td>\n",
|
|
" <td>GlycoNAVI</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
|
|
" <td>https://glyconavi.org/</td>\n",
|
|
" <td>Japan</td>\n",
|
|
" <td>Chemistry,Glycomics,Life Science,Organic Chemi...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>ADHDgene</td>\n",
|
|
" <td>ADHDgene</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.m...</td>\n",
|
|
" <td>http://adhd.psych.ac.cn/</td>\n",
|
|
" <td>China</td>\n",
|
|
" <td>Biomedical Science,Genetics</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Allele frequency resource for research and tea...</td>\n",
|
|
" <td>ALFRED</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.y...</td>\n",
|
|
" <td>http://alfred.med.yale.edu</td>\n",
|
|
" <td>United States</td>\n",
|
|
" <td>Life Science</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Animal Transcription Factor Database</td>\n",
|
|
" <td>AnimalTFDB</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
|
|
" <td>http://bioinfo.life.hust.edu.cn/AnimalTFDB/</td>\n",
|
|
" <td>China</td>\n",
|
|
" <td>Life Science</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" full_name short_name \\\n",
|
|
"0 GenBank GenBank \n",
|
|
"1 GlycoNAVI GlycoNAVI \n",
|
|
"2 ADHDgene ADHDgene \n",
|
|
"3 Allele frequency resource for research and tea... ALFRED \n",
|
|
"4 Animal Transcription Factor Database AnimalTFDB \n",
|
|
"\n",
|
|
" fs_url \\\n",
|
|
"0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
|
|
"1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
|
|
"2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
|
|
"3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
|
|
"4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
|
|
"\n",
|
|
" url \\\n",
|
|
"0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
|
|
"1 https://glyconavi.org/ \n",
|
|
"2 http://adhd.psych.ac.cn/ \n",
|
|
"3 http://alfred.med.yale.edu \n",
|
|
"4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
|
|
"\n",
|
|
" countries \\\n",
|
|
"0 European Union,Japan,United States \n",
|
|
"1 Japan \n",
|
|
"2 China \n",
|
|
"3 United States \n",
|
|
"4 China \n",
|
|
"\n",
|
|
" subjects \n",
|
|
"0 Bioinformatics,Data Management,Data Submission... \n",
|
|
"1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
|
|
"2 Biomedical Science,Genetics \n",
|
|
"3 Life Science \n",
|
|
"4 Life Science "
|
|
]
|
|
},
|
|
"execution_count": 11,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n",
|
|
" delimiter='|', header=0,\n",
|
|
" names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n",
|
|
"fairsharing_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>full_name</th>\n",
|
|
" <th>short_name</th>\n",
|
|
" <th>fs_url</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>countries</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1749</td>\n",
|
|
" <td>1690</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1741</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>178</td>\n",
|
|
" <td>834</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>CiteAb</td>\n",
|
|
" <td>CGD</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.1...</td>\n",
|
|
" <td>http://www.plexdb.org/</td>\n",
|
|
" <td>United States</td>\n",
|
|
" <td>Life Science</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>588</td>\n",
|
|
" <td>367</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" full_name short_name \\\n",
|
|
"count 1752 1752 \n",
|
|
"unique 1752 1741 \n",
|
|
"top CiteAb CGD \n",
|
|
"freq 1 3 \n",
|
|
"\n",
|
|
" fs_url \\\n",
|
|
"count 1752 \n",
|
|
"unique 1752 \n",
|
|
"top https://fairsharing.org/10.25504/FAIRsharing.1... \n",
|
|
"freq 1 \n",
|
|
"\n",
|
|
" url countries subjects \n",
|
|
"count 1752 1749 1690 \n",
|
|
"unique 1752 178 834 \n",
|
|
"top http://www.plexdb.org/ United States Life Science \n",
|
|
"freq 1 588 367 "
|
|
]
|
|
},
|
|
"execution_count": 12,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"fairsharing_df.describe(include='all')"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|