registries_analysis/notebooks/01.4-exploration-fairsharin...

11 KiB

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Lift pandas' column-display cap so wide frames are shown without elided columns.
pd.options.display.max_columns = None

Loading datasets

In [2]:
# Column labels applied to the frame; because header=0 is passed together
# with `names`, the header row present in the file is replaced by these.
fairsharing_columns = ['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects']

# The FAIRsharing summary export is pipe-separated rather than comma-separated.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=fairsharing_columns,
)
fairsharing_df.head()
Out[2]:
full_name short_name fs_url url countries subjects
0 GenBank GenBank https://fairsharing.org/10.25504/FAIRsharing.9... https://www.ncbi.nlm.nih.gov/genbank/ European Union,Japan,United States Bioinformatics,Data Management,Data Submission...
1 GlycoNAVI GlycoNAVI https://fairsharing.org/10.25504/FAIRsharing.w... https://glyconavi.org/ Japan Chemistry,Glycomics,Life Science,Organic Chemi...
2 ADHDgene ADHDgene https://fairsharing.org/10.25504/FAIRsharing.m... http://adhd.psych.ac.cn/ China Biomedical Science,Genetics
3 Allele frequency resource for research and tea... ALFRED https://fairsharing.org/10.25504/FAIRsharing.y... http://alfred.med.yale.edu United States Life Science
4 Animal Transcription Factor Database AnimalTFDB https://fairsharing.org/10.25504/FAIRsharing.e... http://bioinfo.life.hust.edu.cn/AnimalTFDB/ China Life Science
In [3]:
# Summary statistics over every column (include='all' covers the
# non-numeric columns too: count, unique, top, freq).
fairsharing_summary = fairsharing_df.describe(include='all')
fairsharing_summary
Out[3]:
full_name short_name fs_url url countries subjects
count 1752 1752 1752 1752 1749 1690
unique 1752 1741 1752 1752 178 834
top GBIF France IPT - GBIF France CGD https://fairsharing.org/10.25504/FAIRsharing.w... http://www.ebi.ac.uk/merops/ United States Life Science
freq 1 3 1 1 588 367
In [4]:
# Number of missing values in each column.
missing_per_column = fairsharing_df.isna().sum()
missing_per_column
Out[4]:
full_name      0
short_name     0
fs_url         0
url            0
countries      3
subjects      62
dtype: int64