67 KiB
67 KiB
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Display every column of wide DataFrames instead of eliding the middle ones.
pd.options.display.max_columns = None
Loading datasets
In [2]:
# Load the FAIRsharing API dump, which stores one JSON record per line, and
# flatten nested keys into dotted column names (e.g. 'attributes.metadata.name').
with open('../data/raw/fairsharing_dump_api_09_2021.json', encoding='utf-8') as f:
    # Iterate over the file rather than read().splitlines(): splitlines() also
    # breaks on characters such as U+2028 or form feed, which may legally occur
    # inside a JSON string and would cut a record in half. Blank lines are
    # skipped so a trailing newline cannot crash json.loads.
    # Each record is parsed exactly once.
    fairsharing_records = [json.loads(line) for line in f if line.strip()]

# json_normalize accepts the list of dicts directly; no intermediate
# single-column DataFrame is needed.
fairsharing_df = pd.json_normalize(fairsharing_records)
fairsharing_df.head()
Out[2]:
id | type | attributes.created-at | attributes.updated-at | attributes.metadata.doi | attributes.metadata.name | attributes.metadata.status | attributes.metadata.contacts | attributes.metadata.homepage | attributes.metadata.identifier | attributes.metadata.description | attributes.metadata.support-links | attributes.metadata.year-creation | attributes.metadata.data-processes | attributes.legacy-ids | attributes.fairsharing-registry | attributes.record-type | attributes.subjects | attributes.domains | attributes.taxonomies | attributes.user-defined-tags | attributes.countries | attributes.name | attributes.abbreviation | attributes.url | attributes.doi | attributes.fairsharing-licence | attributes.description | attributes.publications | attributes.licence-links | attributes.metadata.citations | attributes.metadata.abbreviation | attributes.metadata.access-points | attributes.metadata.associated-tools | attributes.metadata.deprecation-date | attributes.metadata.deprecation-reason | attributes.metadata.tombstone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1723 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2021-09-30T11:39:06.829Z | 10.25504/FAIRsharing.8t18te | Cell Image Library | ready | [{'contact-name': 'David Orloff', 'contact-ema... | http://www.cellimagelibrary.org | 1723 | This library is a public and easily accessible... | [{'url': 'http://www.cellimagelibrary.org/page... | 2010.0 | [{'name': 'live update', 'type': 'data release... | [biodbcore-000180, bsg-d000180] | Database | repository | [Cell Biology, Life Science] | [Cell, Microscopy, Light microscopy, Electron ... | [All] | [] | [United States] | FAIRsharing record for: Cell Image Library | None | https://fairsharing.org/10.25504/FAIRsharing.8... | 10.25504/FAIRsharing.8t18te | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: This librar... | [{'id': 232, 'pubmed_id': 23203874, 'title': '... | [{'licence-name': 'Cell Image Library Data Pol... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | 3101 | fairsharing-records | 2020-09-16T08:49:13.000Z | 2021-09-30T11:36:45.452Z | NaN | WHOI Ship Data-Grabber System | ready | NaN | http://4dgeo.whoi.edu/shipdata/SDG_shipdata.html | 3101 | The WHOI Ship DataGrabber system provides the ... | [{'url': 'http://4dgeo.whoi.edu/shipdata/SDG_o... | 2004.0 | [{'url': 'http://4dgeo.whoi.edu/sdg-bin/dv_mai... | [biodbcore-001609, bsg-d001609] | Database | repository | [Earth Science, Water Research, Oceanography] | [] | [Not applicable] | [subseafloor environments] | [United States] | FAIRsharing record for: WHOI Ship Data-Grabber... | None | https://fairsharing.org/fairsharing_records/3101 | None | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: The WHOI Sh... | [] | [{'licence-name': 'NDSF Data Archive Policy', ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | 2649 | fairsharing-records | 2018-08-07T20:23:32.000Z | 2021-09-30T11:39:07.898Z | NaN | Electron Microscope Public Image Archive | ready | [{'contact-name': 'General contact', 'contact-... | https://www.ebi.ac.uk/pdbe/emdb/empiar/ | 2649 | EMPIAR, the Electron Microscopy Public Image A... | [{'url': 'https://www.ebi.ac.uk/support/EMPIAR... | 2015.0 | [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | [biodbcore-001140, bsg-d001140] | Database | repository | [Bioinformatics, Biology] | [Protein image, Microscopy, Electron microscop... | [All] | [] | [Greece, Czech Republic, United Kingdom, Icela... | FAIRsharing record for: Electron Microscope Pu... | EMPIAR | https://fairsharing.org/fairsharing_records/2649 | None | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: EMPIAR, the... | [{'id': 2232, 'pubmed_id': 27067018, 'title': ... | [{'licence-name': 'EMBL-EBI Terms of Use', 'li... | [{'doi': '10.1038/nmeth.3806', 'pubmed-id': 27... | EMPIAR | [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | [{'url': 'https://www.ebi.ac.uk/pdbe/emdb/empi... | NaN | NaN | NaN |
3 | 2657 | fairsharing-records | 2018-08-13T15:12:11.000Z | 2021-09-30T11:37:28.736Z | 10.25504/FAIRsharing.tnByoG | ClinicalStudyDataRequest.com | ready | [{'contact-email': 'support@clinicalstudydatar... | https://clinicalstudydatarequest.com/ | 2657 | ClinicalStudyDataRequest.com (CSDR) is a conso... | [{'url': 'https://clinicalstudydatarequest.com... | 2014.0 | [{'url': 'https://clinicalstudydatarequest.com... | [biodbcore-001149, bsg-d001149] | Database | repository | [Preclinical Studies, Biomedical Science] | [] | [Homo sapiens] | [] | [Worldwide] | FAIRsharing record for: ClinicalStudyDataReque... | CSDR | https://fairsharing.org/10.25504/FAIRsharing.t... | 10.25504/FAIRsharing.tnByoG | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: ClinicalStu... | [] | [{'licence-name': 'CSDR Data Sharing Agreement... | NaN | CSDR | NaN | NaN | NaN | NaN | NaN |
4 | 2078 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2021-09-30T11:34:43.129Z | 10.25504/FAIRsharing.3axym7 | Germplasm Resources Information Network | ready | [{'contact-email': 'dbmu@ars-grin.gov'}] | https://www.ars-grin.gov/ | 2078 | GRIN provides National Genetic Resources Progr... | [{'url': 'https://www.ars-grin.gov/Pages/Colle... | 2010.0 | [{'url': 'https://www.ars-grin.gov/', 'name': ... | [biodbcore-000546, bsg-d000546] | Database | repository | [Life Science] | [Cell, Cell culture, Germplasm] | [Bacteria, Metazoa, Viridiplantae] | [] | [United States] | FAIRsharing record for: Germplasm Resources In... | GRIN | https://fairsharing.org/10.25504/FAIRsharing.3... | 10.25504/FAIRsharing.3axym7 | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: GRIN provid... | [] | [] | NaN | GRIN | NaN | NaN | NaN | NaN | NaN |
In [3]:
# Summary statistics for all columns: count/unique/top/freq for object columns
# and mean/std/quantiles for the numeric ones.
fairsharing_df.describe(include='all')
Out[3]:
id | type | attributes.created-at | attributes.updated-at | attributes.metadata.doi | attributes.metadata.name | attributes.metadata.status | attributes.metadata.contacts | attributes.metadata.homepage | attributes.metadata.identifier | attributes.metadata.description | attributes.metadata.support-links | attributes.metadata.year-creation | attributes.metadata.data-processes | attributes.legacy-ids | attributes.fairsharing-registry | attributes.record-type | attributes.subjects | attributes.domains | attributes.taxonomies | attributes.user-defined-tags | attributes.countries | attributes.name | attributes.abbreviation | attributes.url | attributes.doi | attributes.fairsharing-licence | attributes.description | attributes.publications | attributes.licence-links | attributes.metadata.citations | attributes.metadata.abbreviation | attributes.metadata.access-points | attributes.metadata.associated-tools | attributes.metadata.deprecation-date | attributes.metadata.deprecation-reason | attributes.metadata.tombstone | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1797 | 1797 | 1797 | 1797 | 1354 | 1797 | 1797 | 1678 | 1797 | 1797.000000 | 1797 | 1608 | 1492.000000 | 1565 | 1797 | 1797 | 1797 | 1797 | 1797 | 1797 | 1797 | 1797 | 1797 | 1638 | 1797 | 1354 | 1797 | 1797 | 1797 | 1797 | 326 | 1638 | 449 | 618 | 217 | 217 | 1 |
unique | 1797 | 1 | 1162 | 1797 | 1354 | 1796 | 4 | 1576 | 1797 | NaN | 1797 | 1594 | NaN | 1563 | 1797 | 1 | 3 | 888 | 1163 | 378 | 384 | 185 | 1796 | 1626 | 1797 | 1354 | 1 | 1797 | 1109 | 1082 | 320 | 1626 | 444 | 615 | 55 | 86 | 1 |
top | 1723 | fairsharing-records | 2014-11-04T15:23:40.000Z | 2021-09-30T11:39:06.829Z | 10.25504/FAIRsharing.8t18te | OmicsDB | ready | [{'contact-name': 'Sam Hokin', 'contact-email'... | http://www.cellimagelibrary.org | NaN | This library is a public and easily accessible... | [{'url': 'https://github.com/gbif/ipt/wiki/IPT... | NaN | [{'url': 'http://qf.iodp.tamu.edu/qfsearch/sea... | [biodbcore-000180, bsg-d000180] | Database | repository | [Life Science] | [] | [All] | [] | [United States] | FAIRsharing record for: OmicsDB | CGD | https://fairsharing.org/10.25504/FAIRsharing.8... | 10.25504/FAIRsharing.8t18te | https://creativecommons.org/licenses/by-sa/4.0... | This FAIRsharing record describes: This librar... | [] | [] | [{'doi': '10.1093/nar/gkz890', 'pubmed-id': 31... | CGD | [{'url': 'https://github.com/Ensembl', 'name':... | [{'url': 'http://www.h-invitational.jp/hinv/bl... | 2021-9-17 | This resource is no longer available at the st... | True |
freq | 1 | 1797 | 636 | 1 | 1 | 2 | 1540 | 6 | 1 | NaN | 1 | 6 | NaN | 2 | 1 | 1797 | 926 | 350 | 265 | 502 | 1193 | 594 | 2 | 3 | 1 | 1 | 1797 | 1 | 661 | 716 | 6 | 3 | 3 | 2 | 84 | 113 | 1 |
mean | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2446.100167 | NaN | NaN | 2007.636059 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
std | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 520.058757 | NaN | NaN | 10.953269 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
min | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1547.000000 | NaN | NaN | 1894.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
25% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1996.000000 | NaN | NaN | 2004.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
50% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2445.000000 | NaN | NaN | 2010.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
75% | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2897.000000 | NaN | NaN | 2014.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
max | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 3346.000000 | NaN | NaN | 2021.000000 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
In [4]:
# Number of missing values in each column.
fairsharing_df.isnull().sum(axis=0)
Out[4]:
id 0 type 0 attributes.created-at 0 attributes.updated-at 0 attributes.metadata.doi 443 attributes.metadata.name 0 attributes.metadata.status 0 attributes.metadata.contacts 119 attributes.metadata.homepage 0 attributes.metadata.identifier 0 attributes.metadata.description 0 attributes.metadata.support-links 189 attributes.metadata.year-creation 305 attributes.metadata.data-processes 232 attributes.legacy-ids 0 attributes.fairsharing-registry 0 attributes.record-type 0 attributes.subjects 0 attributes.domains 0 attributes.taxonomies 0 attributes.user-defined-tags 0 attributes.countries 0 attributes.name 0 attributes.abbreviation 159 attributes.url 0 attributes.doi 443 attributes.fairsharing-licence 0 attributes.description 0 attributes.publications 0 attributes.licence-links 0 attributes.metadata.citations 1471 attributes.metadata.abbreviation 159 attributes.metadata.access-points 1348 attributes.metadata.associated-tools 1179 attributes.metadata.deprecation-date 1580 attributes.metadata.deprecation-reason 1580 attributes.metadata.tombstone 1796 dtype: int64
In [5]:
# Number of records per record type. Grouping the frame directly avoids
# copying the column into a throwaway single-column DataFrame first; the
# result is the same Series indexed by 'attributes.record-type'.
fairsharing_df.groupby('attributes.record-type').size()
Out[5]:
attributes.record-type knowledgebase 774 knowledgebase_and_repository 97 repository 926 dtype: int64