registries_analysis/notebooks/01.1-exploration-re3data.ipynb

1331 lines
46 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading datasets"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**re3data**"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>re3data_id</th>\n",
" <th>repository_name</th>\n",
" <th>type</th>\n",
" <th>subject</th>\n",
" <th>provider_type</th>\n",
" <th>keyword</th>\n",
" <th>institution</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>r3d100000001</td>\n",
" <td>Odum Institute Archive Dataverse</td>\n",
" <td>[disciplinary]</td>\n",
" <td>[1 Humanities and Social Sciences, 111 Social ...</td>\n",
" <td>[dataProvider]</td>\n",
" <td>[FAIR, Middle East, crime, demography, economy...</td>\n",
" <td>[[Odum Institute for Research in Social Scienc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>r3d100000002</td>\n",
" <td>Access to Archival Databases</td>\n",
" <td>[disciplinary]</td>\n",
" <td>[1 Humanities and Social Sciences, 102 History...</td>\n",
" <td>[dataProvider]</td>\n",
" <td>[US History]</td>\n",
" <td>[[The U.S. National Archives and Records Admin...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>r3d100000004</td>\n",
" <td>Datenbank Gesprochenes Deutsch</td>\n",
" <td>[disciplinary]</td>\n",
" <td>[1 Humanities and Social Sciences, 104 Linguis...</td>\n",
" <td>[dataProvider, serviceProvider]</td>\n",
" <td>[Australian German, FOLK, German dialects, Pfe...</td>\n",
" <td>[[Institut für Deutsche Sprache, Archiv für Ge...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>r3d100000005</td>\n",
" <td>UNC Dataverse</td>\n",
" <td>[institutional]</td>\n",
" <td>[1 Humanities and Social Sciences, 111 Social ...</td>\n",
" <td>[dataProvider, serviceProvider]</td>\n",
" <td>[FAIR, census, demographic survey, demography,...</td>\n",
" <td>[[Odum Institute for Research in Social Scienc...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>r3d100000006</td>\n",
" <td>Archaeology Data Service</td>\n",
" <td>[disciplinary]</td>\n",
" <td>[1 Humanities and Social Sciences, 101 Ancient...</td>\n",
" <td>[dataProvider, serviceProvider]</td>\n",
" <td>[FAIR, archaeology, cultural heritage, prehist...</td>\n",
" <td>[[Arts and Humanities Research Council, [AHRC]...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" re3data_id repository_name type \\\n",
"0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] \n",
"1 r3d100000002 Access to Archival Databases [disciplinary] \n",
"2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] \n",
"3 r3d100000005 UNC Dataverse [institutional] \n",
"4 r3d100000006 Archaeology Data Service [disciplinary] \n",
"\n",
" subject \\\n",
"0 [1 Humanities and Social Sciences, 111 Social ... \n",
"1 [1 Humanities and Social Sciences, 102 History... \n",
"2 [1 Humanities and Social Sciences, 104 Linguis... \n",
"3 [1 Humanities and Social Sciences, 111 Social ... \n",
"4 [1 Humanities and Social Sciences, 101 Ancient... \n",
"\n",
" provider_type \\\n",
"0 [dataProvider] \n",
"1 [dataProvider] \n",
"2 [dataProvider, serviceProvider] \n",
"3 [dataProvider, serviceProvider] \n",
"4 [dataProvider, serviceProvider] \n",
"\n",
" keyword \\\n",
"0 [FAIR, Middle East, crime, demography, economy... \n",
"1 [US History] \n",
"2 [Australian German, FOLK, German dialects, Pfe... \n",
"3 [FAIR, census, demographic survey, demography,... \n",
"4 [FAIR, archaeology, cultural heritage, prehist... \n",
"\n",
" institution \n",
"0 [[Odum Institute for Research in Social Scienc... \n",
"1 [[The U.S. National Archives and Records Admin... \n",
"2 [[Institut für Deutsche Sprache, Archiv für Ge... \n",
"3 [[Odum Institute for Research in Social Scienc... \n",
"4 [[Arts and Humanities Research Council, [AHRC]... "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the re3data registry export (tab-separated), restricted to the columns used below.\n",
"# ast.literal_eval parses the serialized values of the converter columns back into\n",
"# Python objects (the preview shows them as lists).\n",
"# NOTE(review): some converter keys are not listed in usecols; they are kept unchanged\n",
"# from the original call and appear to have no effect on the loaded frame.\n",
"re3data_df = pd.read_csv(\n",
"    '../data/raw/re3data.tsv',\n",
"    sep='\\t',\n",
"    usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type', 'provider_type', 'institution'],\n",
"    converters={\n",
"        column: ast.literal_eval\n",
"        for column in ('subject', 'keyword', 'additional_name', 'repository_id',\n",
"                       'type', 'content_type', 'provider_type', 'institution')\n",
"    },\n",
")\n",
"re3data_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Note:** rows whose provider type is `serviceProvider` are filtered out in the next cell."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Expand the list-valued provider_type column to one row per value, then keep only\n",
"# the rows whose provider type is not 'serviceProvider'.\n",
"# NOTE(review): rows with a missing provider_type also pass this filter (a missing\n",
"# value compares as != 'serviceProvider') - confirm that this is intended.\n",
"re3data_df = (\n",
"    re3data_df\n",
"    .explode('provider_type')\n",
"    .loc[lambda frame: frame['provider_type'] != 'serviceProvider']\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"scrolled": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>re3data_id</th>\n",
" <th>repository_name</th>\n",
" <th>type</th>\n",
" <th>subject</th>\n",
" <th>provider_type</th>\n",
" <th>keyword</th>\n",
" <th>institution</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>2467</td>\n",
" <td>2467</td>\n",
" <td>2467</td>\n",
" <td>2467</td>\n",
" <td>2459</td>\n",
" <td>2467</td>\n",
" <td>2467</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>2466</td>\n",
" <td>2463</td>\n",
" <td>9</td>\n",
" <td>1282</td>\n",
" <td>1</td>\n",
" <td>2248</td>\n",
" <td>2447</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>r3d100011987</td>\n",
" <td>Landmap</td>\n",
" <td>[disciplinary]</td>\n",
" <td>[1 Humanities and Social Sciences, 2 Life Scie...</td>\n",
" <td>dataProvider</td>\n",
" <td>[multidisciplinary]</td>\n",
" <td>[[National Center for Biotechnology Informatio...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>2</td>\n",
" <td>2</td>\n",
" <td>1573</td>\n",
" <td>200</td>\n",
" <td>2459</td>\n",
" <td>181</td>\n",
" <td>6</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" re3data_id repository_name type \\\n",
"count 2467 2467 2467 \n",
"unique 2466 2463 9 \n",
"top r3d100011987 Landmap [disciplinary] \n",
"freq 2 2 1573 \n",
"\n",
" subject provider_type \\\n",
"count 2467 2459 \n",
"unique 1282 1 \n",
"top [1 Humanities and Social Sciences, 2 Life Scie... dataProvider \n",
"freq 200 2459 \n",
"\n",
" keyword institution \n",
"count 2467 2467 \n",
"unique 2248 2447 \n",
"top [multidisciplinary] [[National Center for Biotechnology Informatio... \n",
"freq 181 6 "
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every re3data column; include='all' also covers the non-numeric ones.\n",
"re3data_df.describe(include='all')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**openDOAR**"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>opendoar_id</th>\n",
" <th>repository_name</th>\n",
" <th>type</th>\n",
" <th>subject</th>\n",
" <th>institution</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>101</td>\n",
" <td>utrecht university repository</td>\n",
" <td>institutional</td>\n",
" <td>[multidisciplinary]</td>\n",
" <td>[[university of utrecht, [universiteit utrecht...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>115</td>\n",
" <td>dspace at indian institute of management kozhi...</td>\n",
" <td>institutional</td>\n",
" <td>[ecology and environment, social sciences gene...</td>\n",
" <td>[[indian institute of management kozhikode, [i...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>41</td>\n",
" <td>caltech engineering and science online</td>\n",
" <td>institutional</td>\n",
" <td>[biology and biochemistry, chemistry and chemi...</td>\n",
" <td>[[california institute of technology, [caltech...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>119</td>\n",
" <td>dcu online research access service</td>\n",
" <td>institutional</td>\n",
" <td>[multidisciplinary]</td>\n",
" <td>[[dublin city university, [dcu], ie, [], , htt...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>129</td>\n",
" <td>earth-prints repository</td>\n",
" <td>disciplinary</td>\n",
" <td>[earth and planetary sciences]</td>\n",
" <td>[[istituto nazionale di geofisica e vulcanolog...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" opendoar_id repository_name \\\n",
"0 101 utrecht university repository \n",
"1 115 dspace at indian institute of management kozhi... \n",
"2 41 caltech engineering and science online \n",
"3 119 dcu online research access service \n",
"4 129 earth-prints repository \n",
"\n",
" type subject \\\n",
"0 institutional [multidisciplinary] \n",
"1 institutional [ecology and environment, social sciences gene... \n",
"2 institutional [biology and biochemistry, chemistry and chemi... \n",
"3 institutional [multidisciplinary] \n",
"4 disciplinary [earth and planetary sciences] \n",
"\n",
" institution \n",
"0 [[university of utrecht, [universiteit utrecht... \n",
"1 [[indian institute of management kozhikode, [i... \n",
"2 [[california institute of technology, [caltech... \n",
"3 [[dublin city university, [dcu], ie, [], , htt... \n",
"4 [[istituto nazionale di geofisica e vulcanolog... "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the openDOAR export (tab-separated), restricted to the columns used below.\n",
"# ast.literal_eval parses the serialized values of the converter columns back into\n",
"# Python objects.\n",
"# NOTE(review): some converter keys are not listed in usecols; they are kept unchanged\n",
"# from the original call and appear to have no effect on the loaded frame.\n",
"opendoar_df = pd.read_csv(\n",
"    '../data/raw/openDoar.tsv',\n",
"    sep='\\t',\n",
"    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],\n",
"    converters={\n",
"        column: ast.literal_eval\n",
"        for column in ('subject', 'additional_name', 'opendoar_id',\n",
"                       'content_type', 'institution')\n",
"    },\n",
")\n",
"opendoar_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>opendoar_id</th>\n",
" <th>repository_name</th>\n",
" <th>type</th>\n",
" <th>subject</th>\n",
" <th>institution</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>5707.000000</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" <td>5707</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>5670</td>\n",
" <td>4</td>\n",
" <td>820</td>\n",
" <td>5098</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>arch</td>\n",
" <td>institutional</td>\n",
" <td>[multidisciplinary]</td>\n",
" <td>[[rijksuniversiteit groningen, [rug], nl, [], ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>5067</td>\n",
" <td>3212</td>\n",
" <td>26</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>4008.118801</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>2869.948770</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>2.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1823.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>3361.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>5095.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>10175.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" opendoar_id repository_name type subject \\\n",
"count 5707.000000 5707 5707 5707 \n",
"unique NaN 5670 4 820 \n",
"top NaN arch institutional [multidisciplinary] \n",
"freq NaN 3 5067 3212 \n",
"mean 4008.118801 NaN NaN NaN \n",
"std 2869.948770 NaN NaN NaN \n",
"min 2.000000 NaN NaN NaN \n",
"25% 1823.000000 NaN NaN NaN \n",
"50% 3361.000000 NaN NaN NaN \n",
"75% 5095.000000 NaN NaN NaN \n",
"max 10175.000000 NaN NaN NaN \n",
"\n",
" institution \n",
"count 5707 \n",
"unique 5098 \n",
"top [[rijksuniversiteit groningen, [rug], nl, [], ... \n",
"freq 26 \n",
"mean NaN \n",
"std NaN \n",
"min NaN \n",
"25% NaN \n",
"50% NaN \n",
"75% NaN \n",
"max NaN "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every openDOAR column; include='all' also covers the non-numeric ones.\n",
"opendoar_df.describe(include='all')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**ROAR**"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>eprintid</th>\n",
" <th>home_page</th>\n",
" <th>title</th>\n",
" <th>location_country</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>921</td>\n",
" <td>http://alcme.oclc.org/ndltd/index.html</td>\n",
" <td>Networked Digital Library of Theses and Disser...</td>\n",
" <td>us</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1489</td>\n",
" <td>http://prensahistorica.mcu.es/prensahistorica/...</td>\n",
" <td>Virtual Library of Historical Press</td>\n",
" <td>es</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>606</td>\n",
" <td>http://hal.archives-ouvertes.fr/</td>\n",
" <td>HAL: Hyper Article en Ligne</td>\n",
" <td>fr</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>606</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>606</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" eprintid home_page \\\n",
"0 921 http://alcme.oclc.org/ndltd/index.html \n",
"1 1489 http://prensahistorica.mcu.es/prensahistorica/... \n",
"2 606 http://hal.archives-ouvertes.fr/ \n",
"3 606 NaN \n",
"4 606 NaN \n",
"\n",
" title location_country subjects \n",
"0 Networked Digital Library of Theses and Disser... us NaN \n",
"1 Virtual Library of Historical Press es NaN \n",
"2 HAL: Hyper Article en Ligne fr NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN "
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the ROAR CSV export, keeping only the columns used in this exploration.\n",
"# NOTE(review): the preview shows rows that repeat an eprintid with NaN in the other\n",
"# fields, so a single repository can span several rows in this frame.\n",
"roar_df = pd.read_csv(\n",
"    '../data/raw/export_roar_CSV.csv',\n",
"    usecols=[\n",
"        'eprintid',\n",
"        'home_page',\n",
"        'title',\n",
"        'location_country',\n",
"        'subjects',\n",
"    ],\n",
")\n",
"roar_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [],
"source": [
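"# NOTE(review): this cell is commented out and currently has no effect. If re-enabled,\n",
"# keep=False removes every row that has a duplicate on these columns (all copies, not\n",
"# only the extra ones), and inplace=True mutates roar_df directly.\n",
"# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)"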
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>eprintid</th>\n",
" <th>home_page</th>\n",
" <th>title</th>\n",
" <th>location_country</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>106</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>Faculty Scholarship at The Claremont Colleges</td>\n",
" <td>us</td>\n",
" <td>AS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>107</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>BF</td>\n",
" </tr>\n",
" <tr>\n",
" <th>108</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>BL</td>\n",
" </tr>\n",
" <tr>\n",
" <th>109</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>CC</td>\n",
" </tr>\n",
" <tr>\n",
" <th>110</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>GN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>111</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>H1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>112</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>HB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>113</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>JA</td>\n",
" </tr>\n",
" <tr>\n",
" <th>114</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>LB</td>\n",
" </tr>\n",
" <tr>\n",
" <th>115</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NX</td>\n",
" </tr>\n",
" <tr>\n",
" <th>116</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>PQ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>117</th>\n",
" <td>2303</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>QA</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" eprintid home_page title \\\n",
"106 2303 NaN Faculty Scholarship at The Claremont Colleges \n",
"107 2303 NaN NaN \n",
"108 2303 NaN NaN \n",
"109 2303 NaN NaN \n",
"110 2303 NaN NaN \n",
"111 2303 NaN NaN \n",
"112 2303 NaN NaN \n",
"113 2303 NaN NaN \n",
"114 2303 NaN NaN \n",
"115 2303 NaN NaN \n",
"116 2303 NaN NaN \n",
"117 2303 NaN NaN \n",
"\n",
" location_country subjects \n",
"106 us AS \n",
"107 NaN BF \n",
"108 NaN BL \n",
"109 NaN CC \n",
"110 NaN GN \n",
"111 NaN H1 \n",
"112 NaN HB \n",
"113 NaN JA \n",
"114 NaN LB \n",
"115 NaN NX \n",
"116 NaN PQ \n",
"117 NaN QA "
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Inspect one ROAR record (eprintid 2303) to see how a single record is laid out across rows.\n",
"roar_df.loc[roar_df['eprintid'] == 2303]"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>eprintid</th>\n",
" <th>home_page</th>\n",
" <th>title</th>\n",
" <th>location_country</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>5314.000000</td>\n",
" <td>5263</td>\n",
" <td>5268</td>\n",
" <td>5024</td>\n",
" <td>1225</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>NaN</td>\n",
" <td>5156</td>\n",
" <td>5027</td>\n",
" <td>134</td>\n",
" <td>123</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>NaN</td>\n",
" <td>http://ir.lib.isu.edu.tw/</td>\n",
" <td>Repositorio Institucional</td>\n",
" <td>us</td>\n",
" <td>H1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>NaN</td>\n",
" <td>3</td>\n",
" <td>7</td>\n",
" <td>877</td>\n",
" <td>147</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>6389.464434</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>5159.573937</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>1.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>1490.250000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>4990.500000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>10452.750000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>17302.000000</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" eprintid home_page title \\\n",
"count 5314.000000 5263 5268 \n",
"unique NaN 5156 5027 \n",
"top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional \n",
"freq NaN 3 7 \n",
"mean 6389.464434 NaN NaN \n",
"std 5159.573937 NaN NaN \n",
"min 1.000000 NaN NaN \n",
"25% 1490.250000 NaN NaN \n",
"50% 4990.500000 NaN NaN \n",
"75% 10452.750000 NaN NaN \n",
"max 17302.000000 NaN NaN \n",
"\n",
" location_country subjects \n",
"count 5024 1225 \n",
"unique 134 123 \n",
"top us H1 \n",
"freq 877 147 \n",
"mean NaN NaN \n",
"std NaN NaN \n",
"min NaN NaN \n",
"25% NaN NaN \n",
"50% NaN NaN \n",
"75% NaN NaN \n",
"max NaN NaN "
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every ROAR column; include='all' also covers the non-numeric ones.\n",
"roar_df.describe(include='all')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**FAIRsharing**"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>full_name</th>\n",
" <th>short_name</th>\n",
" <th>fs_url</th>\n",
" <th>url</th>\n",
" <th>countries</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GenBank</td>\n",
" <td>GenBank</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.9...</td>\n",
" <td>https://www.ncbi.nlm.nih.gov/genbank/</td>\n",
" <td>European Union,Japan,United States</td>\n",
" <td>Bioinformatics,Data Management,Data Submission...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GlycoNAVI</td>\n",
" <td>GlycoNAVI</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
" <td>https://glyconavi.org/</td>\n",
" <td>Japan</td>\n",
" <td>Chemistry,Glycomics,Life Science,Organic Chemi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ADHDgene</td>\n",
" <td>ADHDgene</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.m...</td>\n",
" <td>http://adhd.psych.ac.cn/</td>\n",
" <td>China</td>\n",
" <td>Biomedical Science,Genetics</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Allele frequency resource for research and tea...</td>\n",
" <td>ALFRED</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.y...</td>\n",
" <td>http://alfred.med.yale.edu</td>\n",
" <td>United States</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Animal Transcription Factor Database</td>\n",
" <td>AnimalTFDB</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
" <td>http://bioinfo.life.hust.edu.cn/AnimalTFDB/</td>\n",
" <td>China</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" full_name short_name \\\n",
"0 GenBank GenBank \n",
"1 GlycoNAVI GlycoNAVI \n",
"2 ADHDgene ADHDgene \n",
"3 Allele frequency resource for research and tea... ALFRED \n",
"4 Animal Transcription Factor Database AnimalTFDB \n",
"\n",
" fs_url \\\n",
"0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
"1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
"2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
"3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
"4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
"\n",
" url \\\n",
"0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
"1 https://glyconavi.org/ \n",
"2 http://adhd.psych.ac.cn/ \n",
"3 http://alfred.med.yale.edu \n",
"4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
"\n",
" countries \\\n",
"0 European Union,Japan,United States \n",
"1 Japan \n",
"2 China \n",
"3 United States \n",
"4 China \n",
"\n",
" subjects \n",
"0 Bioinformatics,Data Management,Data Submission... \n",
"1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
"2 Biomedical Science,Genetics \n",
"3 Life Science \n",
"4 Life Science "
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the FAIRsharing summary export (pipe-separated). header=0 marks the first line as\n",
"# the file's own header row, which is then replaced by the column names given in `names`.\n",
"fairsharing_df = pd.read_csv(\n",
"    '../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
"    sep='|',\n",
"    header=0,\n",
"    names=[\n",
"        'full_name',\n",
"        'short_name',\n",
"        'fs_url',\n",
"        'url',\n",
"        'countries',\n",
"        'subjects',\n",
"    ],\n",
")\n",
"fairsharing_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>full_name</th>\n",
" <th>short_name</th>\n",
" <th>fs_url</th>\n",
" <th>url</th>\n",
" <th>countries</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>1749</td>\n",
" <td>1690</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>1752</td>\n",
" <td>1741</td>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>178</td>\n",
" <td>834</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
" <td>CiteAb</td>\n",
" <td>CGD</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.1...</td>\n",
" <td>http://www.plexdb.org/</td>\n",
" <td>United States</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>588</td>\n",
" <td>367</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" full_name short_name \\\n",
"count 1752 1752 \n",
"unique 1752 1741 \n",
"top CiteAb CGD \n",
"freq 1 3 \n",
"\n",
" fs_url \\\n",
"count 1752 \n",
"unique 1752 \n",
"top https://fairsharing.org/10.25504/FAIRsharing.1... \n",
"freq 1 \n",
"\n",
" url countries subjects \n",
"count 1752 1749 1690 \n",
"unique 1752 178 834 \n",
"top http://www.plexdb.org/ United States Life Science \n",
"freq 1 588 367 "
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics for every FAIRsharing column; include='all' also covers the non-numeric ones.\n",
"fairsharing_df.describe(include='all')"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}