registries_analysis/notebooks/01.4-exploration-fairsharin...

330 lines
11 KiB
Plaintext
Raw Normal View History

2021-07-22 11:35:40 +02:00
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import ast\n",
"import csv\n",
"import json\n",
"import reverse_geocoder as rg\n",
"\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"import pycountry_convert\n",
"\n",
"import matplotlib.pyplot as plt\n",
"from matplotlib_venn import venn2, venn2_circles\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"pd.set_option('display.max_columns', None)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Loading datasets"
]
},
{
"cell_type": "code",
2021-07-23 12:41:17 +02:00
"execution_count": 2,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>full_name</th>\n",
" <th>short_name</th>\n",
" <th>fs_url</th>\n",
" <th>url</th>\n",
" <th>countries</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>GenBank</td>\n",
" <td>GenBank</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.9...</td>\n",
" <td>https://www.ncbi.nlm.nih.gov/genbank/</td>\n",
" <td>European Union,Japan,United States</td>\n",
" <td>Bioinformatics,Data Management,Data Submission...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GlycoNAVI</td>\n",
" <td>GlycoNAVI</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
" <td>https://glyconavi.org/</td>\n",
" <td>Japan</td>\n",
" <td>Chemistry,Glycomics,Life Science,Organic Chemi...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>ADHDgene</td>\n",
" <td>ADHDgene</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.m...</td>\n",
" <td>http://adhd.psych.ac.cn/</td>\n",
" <td>China</td>\n",
" <td>Biomedical Science,Genetics</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>Allele frequency resource for research and tea...</td>\n",
" <td>ALFRED</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.y...</td>\n",
" <td>http://alfred.med.yale.edu</td>\n",
" <td>United States</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Animal Transcription Factor Database</td>\n",
" <td>AnimalTFDB</td>\n",
" <td>https://fairsharing.org/10.25504/FAIRsharing.e...</td>\n",
" <td>http://bioinfo.life.hust.edu.cn/AnimalTFDB/</td>\n",
" <td>China</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" full_name short_name \\\n",
"0 GenBank GenBank \n",
"1 GlycoNAVI GlycoNAVI \n",
"2 ADHDgene ADHDgene \n",
"3 Allele frequency resource for research and tea... ALFRED \n",
"4 Animal Transcription Factor Database AnimalTFDB \n",
"\n",
" fs_url \\\n",
"0 https://fairsharing.org/10.25504/FAIRsharing.9... \n",
"1 https://fairsharing.org/10.25504/FAIRsharing.w... \n",
"2 https://fairsharing.org/10.25504/FAIRsharing.m... \n",
"3 https://fairsharing.org/10.25504/FAIRsharing.y... \n",
"4 https://fairsharing.org/10.25504/FAIRsharing.e... \n",
"\n",
" url \\\n",
"0 https://www.ncbi.nlm.nih.gov/genbank/ \n",
"1 https://glyconavi.org/ \n",
"2 http://adhd.psych.ac.cn/ \n",
"3 http://alfred.med.yale.edu \n",
"4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n",
"\n",
" countries \\\n",
"0 European Union,Japan,United States \n",
"1 Japan \n",
"2 China \n",
"3 United States \n",
"4 China \n",
"\n",
" subjects \n",
"0 Bioinformatics,Data Management,Data Submission... \n",
"1 Chemistry,Glycomics,Life Science,Organic Chemi... \n",
"2 Biomedical Science,Genetics \n",
"3 Life Science \n",
"4 Life Science "
]
},
2021-07-23 12:41:17 +02:00
"execution_count": 2,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read the pipe-separated FAIRsharing record summary. header=0 marks the file's\n",
"# first row as a header so that it is replaced by the explicit names below.\n",
"fairsharing_df = pd.read_csv(\n",
"    '../data/raw/FAIRsharingDBrec_summary20210304.csv',\n",
"    sep='|',\n",
"    header=0,\n",
"    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],\n",
")\n",
"fairsharing_df.head()"
]
},
{
"cell_type": "code",
2021-07-23 12:41:17 +02:00
"execution_count": 3,
2021-07-22 11:35:40 +02:00
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>full_name</th>\n",
" <th>short_name</th>\n",
" <th>fs_url</th>\n",
" <th>url</th>\n",
" <th>countries</th>\n",
" <th>subjects</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>1749</td>\n",
" <td>1690</td>\n",
" </tr>\n",
" <tr>\n",
" <th>unique</th>\n",
" <td>1752</td>\n",
" <td>1741</td>\n",
" <td>1752</td>\n",
" <td>1752</td>\n",
" <td>178</td>\n",
" <td>834</td>\n",
" </tr>\n",
" <tr>\n",
" <th>top</th>\n",
2021-07-23 12:41:17 +02:00
" <td>GBIF France IPT - GBIF France</td>\n",
2021-07-22 11:35:40 +02:00
" <td>CGD</td>\n",
2021-07-23 12:41:17 +02:00
" <td>https://fairsharing.org/10.25504/FAIRsharing.w...</td>\n",
" <td>http://www.ebi.ac.uk/merops/</td>\n",
2021-07-22 11:35:40 +02:00
" <td>United States</td>\n",
" <td>Life Science</td>\n",
" </tr>\n",
" <tr>\n",
" <th>freq</th>\n",
" <td>1</td>\n",
" <td>3</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>588</td>\n",
" <td>367</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
2021-07-23 12:41:17 +02:00
" full_name short_name \\\n",
"count 1752 1752 \n",
"unique 1752 1741 \n",
"top GBIF France IPT - GBIF France CGD \n",
"freq 1 3 \n",
2021-07-22 11:35:40 +02:00
"\n",
" fs_url \\\n",
"count 1752 \n",
"unique 1752 \n",
2021-07-23 12:41:17 +02:00
"top https://fairsharing.org/10.25504/FAIRsharing.w... \n",
2021-07-22 11:35:40 +02:00
"freq 1 \n",
"\n",
2021-07-23 12:41:17 +02:00
" url countries subjects \n",
"count 1752 1749 1690 \n",
"unique 1752 178 834 \n",
"top http://www.ebi.ac.uk/merops/ United States Life Science \n",
"freq 1 588 367 "
2021-07-22 11:35:40 +02:00
]
},
2021-07-23 12:41:17 +02:00
"execution_count": 3,
2021-07-22 11:35:40 +02:00
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# include='all' summarises every column, not only the numeric ones.\n",
"fairsharing_df.describe(include='all')"
]
2021-07-23 12:41:17 +02:00
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"full_name 0\n",
"short_name 0\n",
"fs_url 0\n",
"url 0\n",
"countries 3\n",
"subjects 62\n",
"dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of missing values in each column.\n",
"fairsharing_df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
2021-07-22 11:35:40 +02:00
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}