registries_analysis/notebooks/01.4-exploration-fairsharin...

67 KiB

In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Remove the cap on displayed columns so wide DataFrames render in full.
pd.options.display.max_columns = None

Loading datasets

In [2]:
# Location of the FAIRsharing dump; the file holds one JSON document per line.
FAIRSHARING_DUMP_PATH = '../data/raw/fairsharing_dump_api_09_2021.json'

# Read the dump one physical line at a time.
# - The encoding is pinned to UTF-8 so decoding does not depend on the platform locale.
# - Iterating the file splits on newlines only; str.splitlines() would also break on
#   characters such as U+2028, which may legally appear unescaped inside JSON strings.
# - Blank lines are skipped instead of being handed to json.loads, which would raise.
with open(FAIRSHARING_DUMP_PATH, encoding='utf-8') as f:
    lines = [raw_line.rstrip('\n') for raw_line in f if raw_line.strip()]

# Decode every record exactly once, then flatten nested objects into
# dot-separated columns (e.g. "attributes.metadata.name").
fairsharing_records = [json.loads(line) for line in lines]
fairsharing_df = pd.json_normalize(fairsharing_records)

fairsharing_df.head()
Out[2]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone
0 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te Cell Image Library ready [{'contact-name': 'David Orloff', 'contact-ema... http://www.cellimagelibrary.org 1723 This library is a public and easily accessible... [{'url': 'http://www.cellimagelibrary.org/page... 2010.0 [{'name': 'live update', 'type': 'data release... [biodbcore-000180, bsg-d000180] Database repository [Cell Biology, Life Science] [Cell, Microscopy, Light microscopy, Electron ... [All] [] [United States] FAIRsharing record for: Cell Image Library None https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [{'id': 232, 'pubmed_id': 23203874, 'title': '... [{'licence-name': 'Cell Image Library Data Pol... NaN NaN NaN NaN NaN NaN NaN
1 3101 fairsharing-records 2020-09-16T08:49:13.000Z 2021-09-30T11:36:45.452Z NaN WHOI Ship Data-Grabber System ready NaN http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html 3101 The WHOI Ship DataGrabber system provides the ... [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... 2004.0 [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... [biodbcore-001609, bsg-d001609] Database repository [Earth Science, Water Research, Oceanography] [] [Not applicable] [subseafloor environments] [United States] FAIRsharing record for: WHOI Ship Data-Grabber... None https://fairsharing.org/fairsharing_records/3101 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: The WHOI Sh... [] [{'licence-name': 'NDSF Data Archive Policy', ... NaN NaN NaN NaN NaN NaN NaN
2 2649 fairsharing-records 2018-08-07T20:23:32.000Z 2021-09-30T11:39:07.898Z NaN Electron Microscope Public Image Archive ready [{'contact-name': 'General contact', 'contact-... https://www.ebi.ac.uk/pdbe/emdb/empiar/ 2649 EMPIAR, the Electron Microscopy Public Image A... [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... 2015.0 [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [biodbcore-001140, bsg-d001140] Database repository [Bioinformatics, Biology] [Protein image, Microscopy, Electron microscop... [All] [] [Greece, Czech Republic, United Kingdom, Icela... FAIRsharing record for: Electron Microscope Pu... EMPIAR https://fairsharing.org/fairsharing_records/2649 None https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: EMPIAR, the... [{'id': 2232, 'pubmed_id': 27067018, 'title': ... [{'licence-name': 'EMBL-EBI Terms of Use', 'li... [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... EMPIAR [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... NaN NaN NaN
3 2657 fairsharing-records 2018-08-13T15:12:11.000Z 2021-09-30T11:37:28.736Z 10.25504/FAIRsharing.tnByoG ClinicalStudyDataRequest.com ready [{'contact-email': 'support@clinicalstudydatar... https://clinicalstudydatarequest.com/ 2657 ClinicalStudyDataRequest.com (CSDR) is a conso... [{'url': 'https://clinicalstudydatarequest.com... 2014.0 [{'url': 'https://clinicalstudydatarequest.com... [biodbcore-001149, bsg-d001149] Database repository [Preclinical Studies, Biomedical Science] [] [Homo sapiens] [] [Worldwide] FAIRsharing record for: ClinicalStudyDataReque... CSDR https://fairsharing.org/10.25504/FAIRsharing.t... 10.25504/FAIRsharing.tnByoG https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: ClinicalStu... [] [{'licence-name': 'CSDR Data Sharing Agreement... NaN CSDR NaN NaN NaN NaN NaN
4 2078 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:34:43.129Z 10.25504/FAIRsharing.3axym7 Germplasm Resources Information Network ready [{'contact-email': 'dbmu@ars-grin.gov'}] https://www.ars-grin.gov/ 2078 GRIN provides National Genetic Resources Progr... [{'url': 'https://www.ars-grin.gov/Pages/Colle... 2010.0 [{'url': 'https://www.ars-grin.gov/', 'name': ... [biodbcore-000546, bsg-d000546] Database repository [Life Science] [Cell, Cell culture, Germplasm] [Bacteria, Metazoa, Viridiplantae] [] [United States] FAIRsharing record for: Germplasm Resources In... GRIN https://fairsharing.org/10.25504/FAIRsharing.3... 10.25504/FAIRsharing.3axym7 https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: GRIN provid... [] [] NaN GRIN NaN NaN NaN NaN NaN
In [3]:
# Summary statistics for every column, numeric and non-numeric alike.
column_summary = fairsharing_df.describe(include='all')
column_summary
Out[3]:
id type attributes.created-at attributes.updated-at attributes.metadata.doi attributes.metadata.name attributes.metadata.status attributes.metadata.contacts attributes.metadata.homepage attributes.metadata.identifier attributes.metadata.description attributes.metadata.support-links attributes.metadata.year-creation attributes.metadata.data-processes attributes.legacy-ids attributes.fairsharing-registry attributes.record-type attributes.subjects attributes.domains attributes.taxonomies attributes.user-defined-tags attributes.countries attributes.name attributes.abbreviation attributes.url attributes.doi attributes.fairsharing-licence attributes.description attributes.publications attributes.licence-links attributes.metadata.citations attributes.metadata.abbreviation attributes.metadata.access-points attributes.metadata.associated-tools attributes.metadata.deprecation-date attributes.metadata.deprecation-reason attributes.metadata.tombstone
count 1797 1797 1797 1797 1354 1797 1797 1678 1797 1797.000000 1797 1608 1492.000000 1565 1797 1797 1797 1797 1797 1797 1797 1797 1797 1638 1797 1354 1797 1797 1797 1797 326 1638 449 618 217 217 1
unique 1797 1 1162 1797 1354 1796 4 1576 1797 NaN 1797 1594 NaN 1563 1797 1 3 888 1163 378 384 185 1796 1626 1797 1354 1 1797 1109 1082 320 1626 444 615 55 86 1
top 1723 fairsharing-records 2014-11-04T15:23:40.000Z 2021-09-30T11:39:06.829Z 10.25504/FAIRsharing.8t18te OmicsDB ready [{'contact-name': 'Sam Hokin', 'contact-email'... http://www.cellimagelibrary.org NaN This library is a public and easily accessible... [{'url': 'https://github.com/gbif/ipt/wiki/IPT... NaN [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... [biodbcore-000180, bsg-d000180] Database repository [Life Science] [] [All] [] [United States] FAIRsharing record for: OmicsDB CGD https://fairsharing.org/10.25504/FAIRsharing.8... 10.25504/FAIRsharing.8t18te https://creativecommons.org/licenses/by-sa/4.0... This FAIRsharing record describes: This librar... [] [] [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... CGD [{'url': 'https://github.com/Ensembl', 'name':... [{'url': 'http://www.h-invitational.jp/hinv/bl... 2021-9-17 This resource is no longer available at the st... True
freq 1 1797 636 1 1 2 1540 6 1 NaN 1 6 NaN 2 1 1797 926 350 265 502 1193 594 2 3 1 1 1797 1 661 716 6 3 3 2 84 113 1
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN 2446.100167 NaN NaN 2007.636059 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN NaN NaN NaN NaN NaN NaN NaN NaN 520.058757 NaN NaN 10.953269 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN NaN NaN NaN NaN NaN NaN NaN NaN 1547.000000 NaN NaN 1894.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN 1996.000000 NaN NaN 2004.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2445.000000 NaN NaN 2010.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN 2897.000000 NaN NaN 2014.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN NaN NaN NaN NaN NaN NaN NaN NaN 3346.000000 NaN NaN 2021.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [4]:
# Number of missing entries in each column.
missing_per_column = fairsharing_df.isnull().sum()
missing_per_column
Out[4]:
id                                           0
type                                         0
attributes.created-at                        0
attributes.updated-at                        0
attributes.metadata.doi                    443
attributes.metadata.name                     0
attributes.metadata.status                   0
attributes.metadata.contacts               119
attributes.metadata.homepage                 0
attributes.metadata.identifier               0
attributes.metadata.description              0
attributes.metadata.support-links          189
attributes.metadata.year-creation          305
attributes.metadata.data-processes         232
attributes.legacy-ids                        0
attributes.fairsharing-registry              0
attributes.record-type                       0
attributes.subjects                          0
attributes.domains                           0
attributes.taxonomies                        0
attributes.user-defined-tags                 0
attributes.countries                         0
attributes.name                              0
attributes.abbreviation                    159
attributes.url                               0
attributes.doi                             443
attributes.fairsharing-licence               0
attributes.description                       0
attributes.publications                      0
attributes.licence-links                     0
attributes.metadata.citations             1471
attributes.metadata.abbreviation           159
attributes.metadata.access-points         1348
attributes.metadata.associated-tools      1179
attributes.metadata.deprecation-date      1580
attributes.metadata.deprecation-reason    1580
attributes.metadata.tombstone             1796
dtype: int64
In [5]:
# How many records fall under each record type.
fairsharing_df.groupby('attributes.record-type').size()
Out[5]:
attributes.record-type
knowledgebase                   774
knowledgebase_and_repository     97
repository                      926
dtype: int64