In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (None removes the column cap).
pd.options.display.max_columns = None

## Loading datasets

In [2]:
# The dump is read line by line: each non-empty line is parsed as one JSON record.
# JSON is UTF-8 by specification, so the encoding is pinned instead of relying on
# the platform default (the records contain non-ASCII text such as contact names).
with open('../data/raw/fairsharing_dump_api_02_2022.json', encoding='utf-8') as f:
    # Parse each line exactly once; whitespace-only lines (e.g. a trailing empty
    # line at the end of the file) are skipped rather than fed to json.loads.
    records = [json.loads(line) for line in f if line.strip()]

# Flatten nested objects into dotted column names (e.g. 'attributes.metadata.doi').
fairsharing_df = pd.json_normalize(records)

fairsharing_df.head()

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.abbreviation,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.metadata.cross-references,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.url-for-logo,attributes.metadata.citations,attributes.metadata.associated-tools,attributes.metadata.deprecation-reason,attributes.metadata.data-access-condition.type,attributes.metadata.data-contact-information,attributes.metadata.data-deposition-condition.url,attributes.metadata.data-deposition-condition.type,attributes.metadata.deprecation-date,attributes.metadata.access-points,attributes.metadata.data-access-condition.url,attributes.metadata.resource-sustainability.url,attributes.metadata.resource-sustainability.name,attributes.metadata.data-preservation-policy.url,attributes.metadata.data-preservation-policy.name,attributes.metadata.data-access-for-pre-publication-review,attributes.metadata.data-versioning,attributes.metadata.data-curation.type,attributes.metadata.data-curation.url,attributes.metadata.citation-to-related-publications,attributes.metadata.tombstone
0,3226,fairsharing-records,2020-12-09T11:53:44.000Z,2022-02-08T10:42:36.452Z,10.25504/FAIRsharing.d6423b,WDC Sunspot Index and Long-term Solar Observat...,ready,"[{'contact-name': 'Frédéric Clette', 'contact-...",http://sidc.be/silso/home,3226,The WDC-SILSO is an activity of the Operationa...,WDC-SILSO,[{'url': 'http://www.sidc.be/silso/taxonomy/te...,2013.0,"[{'url': 'http://www.sidc.be/silso/datafiles',...",[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-001740, bsg-d001740]",Database,repository,"[Electromagnetism, Astrophysics and Astronomy,...","[Climate, Observation design]",[Not applicable],"[Climate change, earth observation, Electromag...",[Belgium],FAIRsharing record for: WDC Sunspot Index and ...,WDC-SILSO,https://fairsharing.org/10.25504/FAIRsharing.d...,10.25504/FAIRsharing.d6423b,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The WDC-SIL...,[],"[{'licence-name': 'SILSO legal notices', 'lice...",,,,,,,,,,,,,,,,,,,,,
1,2114,fairsharing-records,2014-11-04T15:23:40.000Z,2022-01-21T14:39:02.195Z,10.25504/FAIRsharing.p06nme,Biological Magnetic Resonance Data Bank,ready,"[{'contact-name': 'Helpdesk', 'contact-email':...",https://bmrb.io/,2114,"BMRB collects, annotates, archives, and dissem...",BMRB,"[{'url': 'https://bmrb.io/bmrb/news/', 'name':...",1988.0,[{'url': 'https://bmrb.io/data_library/rsync.s...,[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-000584, bsg-d000584]",Database,repository,[Structural Biology],"[Molecular structure, Protein structure, Pepti...",[All],[],[United States],FAIRsharing record for: Biological Magnetic Re...,BMRB,https://fairsharing.org/10.25504/FAIRsharing.p...,10.25504/FAIRsharing.p06nme,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: BMRB collec...,"[{'id': 552, 'pubmed_id': 18288446, 'title': '...",[{'licence-name': 'wwPDB Privacy and Usage Pol...,,"[{'doi': '10.1093/nar/gkm957', 'pubmed-id': 17...","[{'url': 'https://bmrb.io/validate/', 'name': ...",,open,yes,https://bmrb.io/deposit/,open,,,,,,,,,,,,,
2,3022,fairsharing-records,2020-06-17T10:25:30.000Z,2022-02-08T10:41:04.073Z,10.25504/FAIRsharing.8b7a2f,Fisheries and Oceans Canada Pacific Region Dat...,ready,"[{'contact-name': 'Peter Chandler', 'contact-e...",http://www.pac.dfo-mpo.gc.ca/science/oceans/da...,3022,The Institute of Ocean Sciences (IOS)/Ocean Sc...,,[{'url': 'DFO.PAC.SCI.IOSData-DonneesISO.SCI.P...,,[{'name': 'Users must contact the Senior Analy...,[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-001530, bsg-d001530]",Database,repository,"[Environmental Science, Meteorology, Earth Sci...",[Climate],[Not applicable],"[Salinity, Temperature]",[Canada],FAIRsharing record for: Fisheries and Oceans C...,,https://fairsharing.org/10.25504/FAIRsharing.8...,10.25504/FAIRsharing.8b7a2f,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The Institu...,[],[{'licence-name': 'Fisheries and Oceans Canada...,,,,,,,,,,,,,,,,,,,,,
3,2998,fairsharing-records,2020-05-21T07:42:30.000Z,2022-02-08T10:40:19.531Z,10.25504/FAIRsharing.e08886,Climate Prediction Center,ready,"[{'contact-name': 'Jon Hoopingarner', 'contact...",https://www.cpc.ncep.noaa.gov/,2998,The Climate Prediction Center (CPC) produces o...,CPC,[{'url': 'https://www.cpc.ncep.noaa.gov/commen...,1970.0,"[{'url': 'https://www.cpc.ncep.noaa.gov/', 'na...",[{'url': 'https://www.re3data.org/repository/r...,"[biodbcore-001504, bsg-d001504]",Database,repository,"[Hydrogeology, Geography, Meteorology, Geodesy...",[Climate],[Not applicable],"[Forecasting, weather]",[United States],FAIRsharing record for: Climate Prediction Center,CPC,https://fairsharing.org/10.25504/FAIRsharing.e...,10.25504/FAIRsharing.e08886,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The Climate...,[],[{'licence-name': 'National Weather Service Di...,,,,,,,,,,,,,,,,,,,,,
4,2301,fairsharing-records,2016-06-03T14:54:08.000Z,2021-11-24T13:17:51.201Z,10.25504/FAIRsharing.meh9wz,Acytostelium Gene Database,deprecated,[{'contact-name': 'Acytostelium genome consort...,http://cosmos.bot.kyoto-u.ac.jp/acytodb//cgi-b...,2301,Genome and transcriptome database of Acytostel...,,,2008.0,,,"[biodbcore-000775, bsg-d000775]",Database,repository,"[Genomics, Life Science, Transcriptomics]","[DNA sequence data, Gene model annotation]",[Acytostelium subglobosum],[],"[United Kingdom, Japan]",FAIRsharing record for: Acytostelium Gene Data...,,https://fairsharing.org/10.25504/FAIRsharing.m...,10.25504/FAIRsharing.meh9wz,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: Genome and ...,"[{'id': 1139, 'pubmed_id': 25758444, 'title': ...",[],,,,This resource is no longer available at the st...,,,,,2021-9-17,,,,,,,,,,,,


In [3]:
# Summary statistics for every column: include='all' adds the non-numeric
# columns (count / unique / top / freq) alongside the numeric ones.
fairsharing_df.describe(include='all')

Unnamed: 0,id,type,attributes.created-at,attributes.updated-at,attributes.metadata.doi,attributes.metadata.name,attributes.metadata.status,attributes.metadata.contacts,attributes.metadata.homepage,attributes.metadata.identifier,attributes.metadata.description,attributes.metadata.abbreviation,attributes.metadata.support-links,attributes.metadata.year-creation,attributes.metadata.data-processes,attributes.metadata.cross-references,attributes.legacy-ids,attributes.fairsharing-registry,attributes.record-type,attributes.subjects,attributes.domains,attributes.taxonomies,attributes.user-defined-tags,attributes.countries,attributes.name,attributes.abbreviation,attributes.url,attributes.doi,attributes.fairsharing-licence,attributes.description,attributes.publications,attributes.licence-links,attributes.url-for-logo,attributes.metadata.citations,attributes.metadata.associated-tools,attributes.metadata.deprecation-reason,attributes.metadata.data-access-condition.type,attributes.metadata.data-contact-information,attributes.metadata.data-deposition-condition.url,attributes.metadata.data-deposition-condition.type,attributes.metadata.deprecation-date,attributes.metadata.access-points,attributes.metadata.data-access-condition.url,attributes.metadata.resource-sustainability.url,attributes.metadata.resource-sustainability.name,attributes.metadata.data-preservation-policy.url,attributes.metadata.data-preservation-policy.name,attributes.metadata.data-access-for-pre-publication-review,attributes.metadata.data-versioning,attributes.metadata.data-curation.type,attributes.metadata.data-curation.url,attributes.metadata.citation-to-related-publications,attributes.metadata.tombstone
count,1853.0,1853,1853,1853,1601,1853,1853,1764,1853,1853.0,1853,1671,1663,1541.0,1626,790,1853,1853,1853,1853,1853,1853,1853,1853,1853,1671,1853,1601,1853,1853,1853,1853,18,621,632,363.0,42,47,22,33,238,465,19,2,2,3,3,10,17,22,8,35,1
unique,1853.0,1,1218,1853,1601,1851,4,1623,1853,,1853,1655,1646,,1625,790,1799,1,3,935,1205,385,395,194,1851,1655,1853,1601,1,1853,1135,1119,18,331,627,104.0,2,2,22,2,71,460,19,2,2,3,3,2,2,4,8,2,1
top,3226.0,fairsharing-records,2014-11-04T15:23:40.000Z,2022-02-08T10:42:36.452Z,10.25504/FAIRsharing.d6423b,iDog,ready,[],http://sidc.be/silso/home,,The WDC-SILSO is an activity of the Operationa...,CGD,[{'url': 'https://github.com/gbif/ipt/wiki/IPT...,,[{'url': 'https://site.uit.no/dataverseno/abou...,[{'url': 'https://www.re3data.org/repository/r...,[],Database,repository,[Life Science],[],[All],[],[United States],FAIRsharing record for: iDog,CGD,https://fairsharing.org/10.25504/FAIRsharing.d...,10.25504/FAIRsharing.d6423b,https://creativecommons.org/licenses/by-sa/4.0...,This FAIRsharing record describes: The WDC-SIL...,[],[],/rails/active_storage/blobs/redirect/eyJfcmFpb...,[],[],,open,yes,https://bmrb.io/deposit/,controlled,2021-9-17,[{'url': 'https://heidata.uni-heidelberg.de/oa...,https://arch.library.northwestern.edu/about?lo...,https://www.library.northwestern.edu/about/adm...,Commitment to Sustainability: Level 1,http://www.library.northwestern.edu/about/admi...,Digital Preservation Policy: Level 1,yes,yes,manual,https://www.gbif.org/tools/data-validator/about,yes,True
freq,1.0,1853,636,1,1,2,1564,40,1,,1,3,6,,2,1,55,1853,954,345,276,528,1258,607,2,3,1,1,1853,1,690,735,1,285,3,125.0,38,45,1,21,81,3,1,1,1,1,1,9,16,11,1,34,1
mean,,,,,,,,,,2481.862925,,,,2007.894873,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
std,,,,,,,,,,554.072492,,,,10.933713,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
min,,,,,,,,,,1120.0,,,,1894.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
25%,,,,,,,,,,2009.0,,,,2004.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
50%,,,,,,,,,,2473.0,,,,2010.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
75%,,,,,,,,,,2938.0,,,,2015.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [4]:
# Count the missing (NaN/None) entries in each column.
fairsharing_df.isna().sum(axis=0)

id                                                               0
type                                                             0
attributes.created-at                                            0
attributes.updated-at                                            0
attributes.metadata.doi                                        252
attributes.metadata.name                                         0
attributes.metadata.status                                       0
attributes.metadata.contacts                                    89
attributes.metadata.homepage                                     0
attributes.metadata.identifier                                   0
attributes.metadata.description                                  0
attributes.metadata.abbreviation                               182
attributes.metadata.support-links                              190
attributes.metadata.year-creation                              312
attributes.metadata.data-processes                            

In [5]:
# Number of records per record type (index sorted by the type label).
fairsharing_df.groupby('attributes.record-type').size()

attributes.record-type
knowledgebase                   787
knowledgebase_and_repository    112
repository                      954
dtype: int64