{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import ast\n", "import csv\n", "import json\n", "import reverse_geocoder as rg\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "import pycountry_convert\n", "\n", "import matplotlib.pyplot as plt\n", "from matplotlib_venn import venn2, venn2_circles\n", "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", "import plotly.graph_objs as go\n", "import plotly.express as px\n", "\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Loading datasets" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
full_nameshort_namefs_urlurlcountriessubjects
0GenBankGenBankhttps://fairsharing.org/10.25504/FAIRsharing.9...https://www.ncbi.nlm.nih.gov/genbank/European Union,Japan,United StatesBioinformatics,Data Management,Data Submission...
1GlycoNAVIGlycoNAVIhttps://fairsharing.org/10.25504/FAIRsharing.w...https://glyconavi.org/JapanChemistry,Glycomics,Life Science,Organic Chemi...
2ADHDgeneADHDgenehttps://fairsharing.org/10.25504/FAIRsharing.m...http://adhd.psych.ac.cn/ChinaBiomedical Science,Genetics
3Allele frequency resource for research and tea...ALFREDhttps://fairsharing.org/10.25504/FAIRsharing.y...http://alfred.med.yale.eduUnited StatesLife Science
4Animal Transcription Factor DatabaseAnimalTFDBhttps://fairsharing.org/10.25504/FAIRsharing.e...http://bioinfo.life.hust.edu.cn/AnimalTFDB/ChinaLife Science
\n", "
" ], "text/plain": [ " full_name short_name \\\n", "0 GenBank GenBank \n", "1 GlycoNAVI GlycoNAVI \n", "2 ADHDgene ADHDgene \n", "3 Allele frequency resource for research and tea... ALFRED \n", "4 Animal Transcription Factor Database AnimalTFDB \n", "\n", " fs_url \\\n", "0 https://fairsharing.org/10.25504/FAIRsharing.9... \n", "1 https://fairsharing.org/10.25504/FAIRsharing.w... \n", "2 https://fairsharing.org/10.25504/FAIRsharing.m... \n", "3 https://fairsharing.org/10.25504/FAIRsharing.y... \n", "4 https://fairsharing.org/10.25504/FAIRsharing.e... \n", "\n", " url \\\n", "0 https://www.ncbi.nlm.nih.gov/genbank/ \n", "1 https://glyconavi.org/ \n", "2 http://adhd.psych.ac.cn/ \n", "3 http://alfred.med.yale.edu \n", "4 http://bioinfo.life.hust.edu.cn/AnimalTFDB/ \n", "\n", " countries \\\n", "0 European Union,Japan,United States \n", "1 Japan \n", "2 China \n", "3 United States \n", "4 China \n", "\n", " subjects \n", "0 Bioinformatics,Data Management,Data Submission... \n", "1 Chemistry,Glycomics,Life Science,Organic Chemi... \n", "2 Biomedical Science,Genetics \n", "3 Life Science \n", "4 Life Science " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fairsharing_df = pd.read_csv('../data/raw/FAIRsharingDBrec_summary20210304.csv', \n", " delimiter='|', header=0,\n", " names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'])\n", "fairsharing_df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
full_nameshort_namefs_urlurlcountriessubjects
count175217521752175217491690
unique1752174117521752178834
topGBIF France IPT - GBIF FranceCGDhttps://fairsharing.org/10.25504/FAIRsharing.w...http://www.ebi.ac.uk/merops/United StatesLife Science
freq1311588367
\n", "
" ], "text/plain": [ " full_name short_name \\\n", "count 1752 1752 \n", "unique 1752 1741 \n", "top GBIF France IPT - GBIF France CGD \n", "freq 1 3 \n", "\n", " fs_url \\\n", "count 1752 \n", "unique 1752 \n", "top https://fairsharing.org/10.25504/FAIRsharing.w... \n", "freq 1 \n", "\n", " url countries subjects \n", "count 1752 1749 1690 \n", "unique 1752 178 834 \n", "top http://www.ebi.ac.uk/merops/ United States Life Science \n", "freq 1 588 367 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fairsharing_df.describe(include='all')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "full_name 0\n", "short_name 0\n", "fs_url 0\n", "url 0\n", "countries 3\n", "subjects 62\n", "dtype: int64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "fairsharing_df.isna().sum()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.3" } }, "nbformat": 4, "nbformat_minor": 4 }