In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (no truncation of wide frames).
pd.options.display.max_columns = None

## Loading datasets

In [2]:
# Load the FAIRsharing dump (API export dated 09/2021 per the file name).
# The file is treated as JSON Lines: each non-empty line is one JSON record.
# The encoding is pinned so decoding does not depend on the platform locale.
with open('../data/raw/fairsharing_dump_api_09_2021.json', encoding='utf-8') as f:
    lines = f.read().splitlines()

# Decode each line exactly once and skip blank lines, which json.loads rejects.
records = [json.loads(line) for line in lines if line.strip()]

# Flatten nested objects into dotted column names
# (e.g. 'attributes.metadata.doi'), one row per record.
fairsharing_df = pd.json_normalize(records)

fairsharing_df.head()

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.metadata.citations,attributes.metadata.abbreviation,attributes.metadata.access-points,attributes.metadata.associated-tools,attributes.metadata.deprecation-date,attributes.metadata.deprecation-reason,attributes.metadata.tombstone
0,1723,fairsharing-records,2014-11-04T15:23:40.000Z,2021-09-30T11:39:06.829Z,10.25504/FAIRsharing.8t18te,Cell Image Library,ready,"[{'contact-name': 'David Orloff', 'contact-ema...",http://www.cellimagelibrary.org,1723,This library is a public and easily accessible...,[{'url': 'http://www.cellimagelibrary.org/page...,2010.0,"[{'name': 'live update', 'type': 'data release...","[biodbcore-000180, bsg-d000180]",Database,repository,"[Cell Biology, Life Science]","[Cell, Microscopy, Light microscopy, Electron ...",[All],[],[United States],FAIRsharing record for: Cell Image Library,,https://fairsharing.org/10.25504/FAIRsharing.8...,10.25504/FAIRsharing.8t18te,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: This librar...,"[{'id': 232, 'pubmed_id': 23203874, 'title': '...",[{'licence-name': 'Cell Image Library Data Pol...,,,,,,,
1,3101,fairsharing-records,2020-09-16T08:49:13.000Z,2021-09-30T11:36:45.452Z,,WHOI Ship Data-Grabber System,ready,,http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html,3101,The WHOI Ship DataGrabber system provides the ...,[{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o...,2004.0,[{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai...,"[biodbcore-001609, bsg-d001609]",Database,repository,"[Earth Science, Water Research, Oceanography]",[],[Not applicable],[subseafloor environments],[United States],FAIRsharing record for: WHOI Ship Data-Grabber...,,https://fairsharing.org/fairsharing_records/3101,,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The WHOI Sh...,[],"[{'licence-name': 'NDSF Data Archive Policy', ...",,,,,,,
2,2649,fairsharing-records,2018-08-07T20:23:32.000Z,2021-09-30T11:39:07.898Z,,Electron Microscope Public Image Archive,ready,"[{'contact-name': 'General contact', 'contact-...",https://www.ebi.ac.uk/pdbe/emdb/empiar/,2649,"EMPIAR, the Electron Microscopy Public Image A...",[{'url': 'https://www.ebi.ac.uk/support/EMPIAR...,2015.0,[{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi...,"[biodbcore-001140, bsg-d001140]",Database,repository,"[Bioinformatics, Biology]","[Protein image, Microscopy, Electron microscop...",[All],[],"[Greece, Czech Republic, United Kingdom, Icela...",FAIRsharing record for: Electron Microscope Pu...,EMPIAR,https://fairsharing.org/fairsharing_records/2649,,https://creativecommons.org/licenses/by-sa/4.0...,"This FAIRsharing record describes: EMPIAR, the...","[{'id': 2232, 'pubmed_id': 27067018, 'title': ...","[{'licence-name': 'EMBL-EBI Terms of Use', 'li...","[{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27...",EMPIAR,[{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi...,[{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi...,,,
3,2657,fairsharing-records,2018-08-13T15:12:11.000Z,2021-09-30T11:37:28.736Z,10.25504/FAIRsharing.tnByoG,ClinicalStudyDataRequest.com,ready,[{'contact-email': 'support@clinicalstudydatar...,https://clinicalstudydatarequest.com/,2657,ClinicalStudyDataRequest.com (CSDR) is a conso...,[{'url': 'https://clinicalstudydatarequest.com...,2014.0,[{'url': 'https://clinicalstudydatarequest.com...,"[biodbcore-001149, bsg-d001149]",Database,repository,"[Preclinical Studies, Biomedical Science]",[],[Homo sapiens],[],[Worldwide],FAIRsharing record for: ClinicalStudyDataReque...,CSDR,https://fairsharing.org/10.25504/FAIRsharing.t...,10.25504/FAIRsharing.tnByoG,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: ClinicalStu...,[],[{'licence-name': 'CSDR Data Sharing Agreement...,,CSDR,,,,,
4,2078,fairsharing-records,2014-11-04T15:23:40.000Z,2021-09-30T11:34:43.129Z,10.25504/FAIRsharing.3axym7,Germplasm Resources Information Network,ready,[{'contact-email': 'dbmu@ars-grin.gov'}],https://www.ars-grin.gov/,2078,GRIN provides National Genetic Resources Progr...,[{'url': 'https://www.ars-grin.gov/Pages/Colle...,2010.0,"[{'url': 'https://www.ars-grin.gov/', 'name': ...","[biodbcore-000546, bsg-d000546]",Database,repository,[Life Science],"[Cell, Cell culture, Germplasm]","[Bacteria, Metazoa, Viridiplantae]",[],[United States],FAIRsharing record for: Germplasm Resources In...,GRIN,https://fairsharing.org/10.25504/FAIRsharing.3...,10.25504/FAIRsharing.3axym7,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: GRIN provid...,[],[],,GRIN,,,,,


In [3]:
# Summary statistics for every column: include="all" reports count/unique/top/freq
# for object columns alongside mean/std/quantiles for numeric ones.
fairsharing_df.describe(include="all")

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.metadata.citations,attributes.metadata.abbreviation,attributes.metadata.access-points,attributes.metadata.associated-tools,attributes.metadata.deprecation-date,attributes.metadata.deprecation-reason,attributes.metadata.tombstone
count,1797.0,1797,1797,1797,1354,1797,1797,1678,1797,1797.0,1797,1608,1492.0,1565,1797,1797,1797,1797,1797,1797,1797,1797,1797,1638,1797,1354,1797,1797,1797,1797,326,1638,449,618,217,217,1
unique,1797.0,1,1162,1797,1354,1796,4,1576,1797,,1797,1594,,1563,1797,1,3,888,1163,378,384,185,1796,1626,1797,1354,1,1797,1109,1082,320,1626,444,615,55,86,1
top,1723.0,fairsharing-records,2014-11-04T15:23:40.000Z,2021-09-30T11:39:06.829Z,10.25504/FAIRsharing.8t18te,OmicsDB,ready,"[{'contact-name': 'Sam Hokin', 'contact-email'...",http://www.cellimagelibrary.org,,This library is a public and easily accessible...,[{'url': 'https://github.com/gbif/ipt/wiki/IPT...,,[{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea...,"[biodbcore-000180, bsg-d000180]",Database,repository,[Life Science],[],[All],[],[United States],FAIRsharing record for: OmicsDB,CGD,https://fairsharing.org/10.25504/FAIRsharing.8...,10.25504/FAIRsharing.8t18te,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: This librar...,[],[],"[{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31...",CGD,"[{'url': 'https://github.com/Ensembl', 'name':...",[{'url': 'http://www.h-invitational.jp/hinv/bl...,2021-9-17,This resource is no longer available at the st...,True
freq,1.0,1797,636,1,1,2,1540,6,1,,1,6,,2,1,1797,926,350,265,502,1193,594,2,3,1,1,1797,1,661,716,6,3,3,2,84,113,1
mean,,,,,,,,,,2446.100167,,,2007.636059,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,520.058757,,,10.953269,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,1547.0,,,1894.0,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,1996.0,,,2004.0,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,2445.0,,,2010.0,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,2897.0,,,2014.0,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Number of missing values in each column (sum of the boolean NA mask down the rows).
fairsharing_df.isna().sum(axis=0)

id                                           0
type                                         0
attributes.created-at                        0
attributes.updated-at                        0
attributes.metadata.doi                    443
attributes.metadata.name                     0
attributes.metadata.status                   0
attributes.metadata.contacts               119
attributes.metadata.homepage                 0
attributes.metadata.identifier               0
attributes.metadata.description              0
attributes.metadata.support-links          189
attributes.metadata.year-creation          305
attributes.metadata.data-processes         232
attributes.legacy-ids                        0
attributes.fairsharing-registry              0
attributes.record-type                       0
attributes.subjects                          0
attributes.domains                           0
attributes.taxonomies                        0
attributes.user-defined-tags                 0
attributes.co

In [5]:
# Number of records per record type, indexed by the record-type label.
fairsharing_df.groupby('attributes.record-type').size()

attributes.record-type
knowledgebase                   774
knowledgebase_and_repository     97
repository                      926
dtype: int64