324 lines
10 KiB
Plaintext
324 lines
10 KiB
Plaintext
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 1,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import ast\n",
|
|
"import csv\n",
|
|
"import json\n",
|
|
"\n",
|
|
"import numpy as np\n",
|
|
"import pandas as pd\n",
|
|
"\n",
|
|
"import plotly\n",
|
|
"from plotly.offline import iplot, init_notebook_mode\n",
|
|
"import plotly.graph_objs as go\n",
|
|
"import plotly.express as px\n",
|
|
"\n",
|
|
"# Never truncate columns when a wide DataFrame is displayed.\n",
|
|
"pd.options.display.max_columns = None"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
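|
"## Loading datasets\n",
|
|
"\n",
|
|
"Read the pipe-delimited FAIRsharing summary file (`FAIRsharingDBrec_summary20210304.csv`) into a DataFrame, then inspect its summary statistics and missing values."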
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>full_name</th>\n",
|
|
" <th>short_name</th>\n",
|
|
" <th>fs_url</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>countries</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>0</th>\n",
|
|
" <td>GenBank</td>\n",
|
|
" <td>GenBank</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.9...</td>\n",
|
|
" <td>https://www.ncbi.nlm.nih.gov/genbank/</td>\n",
|
|
" <td>European Union,Japan,United States</td>\n",
|
|
" <td>Bioinformatics,Data Management,Data Submission...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>1</th>\n",
|
|
" <td>GlycoNAVI</td>\n",
|
|
" <td>GlycoNAVI</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
|
|
" <td>https://glyconavi.org/</td>\n",
|
|
" <td>Japan</td>\n",
|
|
" <td>Chemistry,Glycomics,Life Science,Organic Chemi...</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>2</th>\n",
|
|
" <td>ADHDgene</td>\n",
|
|
" <td>ADHDgene</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.m...</td>\n",
|
|
" <td>http://adhd.psych.ac.cn/</td>\n",
|
|
" <td>China</td>\n",
|
|
" <td>Biomedical Science,Genetics</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>3</th>\n",
|
|
" <td>Allele frequency resource for research and tea...</td>\n",
|
|
" <td>ALFRED</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.y...</td>\n",
|
|
" <td>http://alfred.med.yale.edu</td>\n",
|
|
" <td>United States</td>\n",
|
|
" <td>Life Science</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>4</th>\n",
|
|
" <td>Animal Transcription Factor Database</td>\n",
|
|
" <td>AnimalTFDB</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
|
|
" <td>http://bioinfo.life.hust.edu.cn/AnimalTFDB/</td>\n",
|
|
" <td>China</td>\n",
|
|
" <td>Life Science</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" full_name short_name \\\n",
|
|
"0 GenBank GenBank \n",
|
|
"1 GlycoNAVI GlycoNAVI \n",
|
|
"2 ADHDgene ADHDgene \n",
|
|
"3 Allele frequency resource for research and tea... ALFRED \n",
|
|
"4 Animal Transcription Factor Database AnimalTFDB \n",
|
|
"\n",
|
|
" fs_url \\\n",
|
|
"0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
|
|
"1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
|
|
"2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
|
|
"3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
|
|
"4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
|
|
"\n",
|
|
" url \\\n",
|
|
"0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
|
|
"1 https://glyconavi.org/ \n",
|
|
"2 http://adhd.psych.ac.cn/ \n",
|
|
"3 http://alfred.med.yale.edu \n",
|
|
"4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
|
|
"\n",
|
|
" countries \\\n",
|
|
"0 European Union,Japan,United States \n",
|
|
"1 Japan \n",
|
|
"2 China \n",
|
|
"3 United States \n",
|
|
"4 China \n",
|
|
"\n",
|
|
" subjects \n",
|
|
"0 Bioinformatics,Data Management,Data Submission... \n",
|
|
"1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
|
|
"2 Biomedical Science,Genetics \n",
|
|
"3 Life Science \n",
|
|
"4 Life Science "
|
|
]
|
|
},
|
|
"execution_count": 2,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# The file is pipe-delimited. header=0 consumes the file's own header row so that\n",
|
|
"# the explicit column names passed via `names` replace it.\n",
|
|
"fairsharing_df = pd.read_csv(\n",
|
|
"    '../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
|
|
"    sep='|',\n",
|
|
"    header=0,\n",
|
|
"    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],\n",
|
|
")\n",
|
|
"fairsharing_df.head()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/html": [
|
|
"<div>\n",
|
|
"<style scoped>\n",
|
|
" .dataframe tbody tr th:only-of-type {\n",
|
|
" vertical-align: middle;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe tbody tr th {\n",
|
|
" vertical-align: top;\n",
|
|
" }\n",
|
|
"\n",
|
|
" .dataframe thead th {\n",
|
|
" text-align: right;\n",
|
|
" }\n",
|
|
"</style>\n",
|
|
"<table border=\"1\" class=\"dataframe\">\n",
|
|
" <thead>\n",
|
|
" <tr style=\"text-align: right;\">\n",
|
|
" <th></th>\n",
|
|
" <th>full_name</th>\n",
|
|
" <th>short_name</th>\n",
|
|
" <th>fs_url</th>\n",
|
|
" <th>url</th>\n",
|
|
" <th>countries</th>\n",
|
|
" <th>subjects</th>\n",
|
|
" </tr>\n",
|
|
" </thead>\n",
|
|
" <tbody>\n",
|
|
" <tr>\n",
|
|
" <th>count</th>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1749</td>\n",
|
|
" <td>1690</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>unique</th>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1741</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>1752</td>\n",
|
|
" <td>178</td>\n",
|
|
" <td>834</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>top</th>\n",
|
|
" <td>Brassica Information Portal</td>\n",
|
|
" <td>CGD</td>\n",
|
|
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
|
|
" <td>http://web.iodp.tamu.edu/LORE/</td>\n",
|
|
" <td>United States</td>\n",
|
|
" <td>Life Science</td>\n",
|
|
" </tr>\n",
|
|
" <tr>\n",
|
|
" <th>freq</th>\n",
|
|
" <td>1</td>\n",
|
|
" <td>3</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>1</td>\n",
|
|
" <td>588</td>\n",
|
|
" <td>367</td>\n",
|
|
" </tr>\n",
|
|
" </tbody>\n",
|
|
"</table>\n",
|
|
"</div>"
|
|
],
|
|
"text/plain": [
|
|
" full_name short_name \\\n",
|
|
"count 1752 1752 \n",
|
|
"unique 1752 1741 \n",
|
|
"top Brassica Information Portal CGD \n",
|
|
"freq 1 3 \n",
|
|
"\n",
|
|
" fs_url \\\n",
|
|
"count 1752 \n",
|
|
"unique 1752 \n",
|
|
"top https://fairsharing.org/10.25504/FAIRsharing.e... \n",
|
|
"freq 1 \n",
|
|
"\n",
|
|
" url countries subjects \n",
|
|
"count 1752 1749 1690 \n",
|
|
"unique 1752 178 834 \n",
|
|
"top http://web.iodp.tamu.edu/LORE/ United States Life Science \n",
|
|
"freq 1 588 367 "
|
|
]
|
|
},
|
|
"execution_count": 3,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Per-column summary; include=\"all\" keeps the text columns in the report.\n",
|
|
"fairsharing_df.describe(include=\"all\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"outputs": [
|
|
{
|
|
"data": {
|
|
"text/plain": [
|
|
"full_name 0\n",
|
|
"short_name 0\n",
|
|
"fs_url 0\n",
|
|
"url 0\n",
|
|
"countries 3\n",
|
|
"subjects 62\n",
|
|
"dtype: int64"
|
|
]
|
|
},
|
|
"execution_count": 4,
|
|
"metadata": {},
|
|
"output_type": "execute_result"
|
|
}
|
|
],
|
|
"source": [
|
|
"# Number of missing values in each column.\n",
|
|
"fairsharing_df.isnull().sum()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": []
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"codemirror_mode": {
|
|
"name": "ipython",
|
|
"version": 3
|
|
},
|
|
"file_extension": ".py",
|
|
"mimetype": "text/x-python",
|
|
"name": "python",
|
|
"nbconvert_exporter": "python",
|
|
"pygments_lexer": "ipython3",
|
|
"version": "3.8.3"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|