registries_analysis/notebooks/01.1-exploration-re3data.ipynb

46 KiB

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

Loading datasets

re3data

In [3]:
# Load the re3data registry dump (tab-separated) and keep only the columns used below.
# The five converted columns are parsed with ast.literal_eval so they arrive as
# Python objects rather than raw strings (the preview shows lists / nested lists).
# Converters are declared only for columns that `usecols` actually loads; the
# earlier mapping also listed 'additional_name', 'repository_id' and
# 'content_type', none of which are read from the file.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    usecols=[
        're3data_id',
        'repository_name',
        'subject',
        'keyword',
        'type',
        'provider_type',
        'institution',
    ],
    converters={
        'subject': ast.literal_eval,
        'keyword': ast.literal_eval,
        'type': ast.literal_eval,
        'provider_type': ast.literal_eval,
        'institution': ast.literal_eval,
    },
)
re3data_df.head()
Out[3]:
re3data_id repository_name type subject provider_type keyword institution
0 r3d100000001 Odum Institute Archive Dataverse [disciplinary] [1 Humanities and Social Sciences, 111 Social ... [dataProvider] [FAIR, Middle East, crime, demography, economy... [[Odum Institute for Research in Social Scienc...
1 r3d100000002 Access to Archival Databases [disciplinary] [1 Humanities and Social Sciences, 102 History... [dataProvider] [US History] [[The U.S. National Archives and Records Admin...
2 r3d100000004 Datenbank Gesprochenes Deutsch [disciplinary] [1 Humanities and Social Sciences, 104 Linguis... [dataProvider, serviceProvider] [Australian German, FOLK, German dialects, Pfe... [[Institut für Deutsche Sprache, Archiv für Ge...
3 r3d100000005 UNC Dataverse [institutional] [1 Humanities and Social Sciences, 111 Social ... [dataProvider, serviceProvider] [FAIR, census, demographic survey, demography,... [[Odum Institute for Research in Social Scienc...
4 r3d100000006 Archaeology Data Service [disciplinary] [1 Humanities and Social Sciences, 101 Ancient... [dataProvider, serviceProvider] [FAIR, archaeology, cultural heritage, prehist... [[Arts and Humanities Research Council, [AHRC]...

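**Note:** service providers are filtered out here. `provider_type` is exploded to one row per value and the `serviceProvider` rows are dropped, so a repository listed as both `dataProvider` and `serviceProvider` is still kept through its `dataProvider` row.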

In [4]:
# One row per (repository, provider_type) value, then discard the rows whose
# provider type is 'serviceProvider'. Rows with any other value (including a
# missing provider type) are kept, and the exploded index is left as-is.
re3data_df = (
    re3data_df
    .explode('provider_type')
    .loc[lambda exploded: exploded['provider_type'] != 'serviceProvider']
)
In [5]:
# Summary of the filtered frame; include='all' reports every column, not only numeric ones.
# NOTE(review): in the saved output re3data_id has 2466 unique values over 2467 rows and
# provider_type has 2459 non-null values, so one id still appears twice and a few rows
# carry no provider type — confirm this is expected.
re3data_df.describe(include='all')
Out[5]:
re3data_id repository_name type subject provider_type keyword institution
count 2467 2467 2467 2467 2459 2467 2467
unique 2466 2463 9 1282 1 2248 2447
top r3d100011987 Landmap [disciplinary] [1 Humanities and Social Sciences, 2 Life Scie... dataProvider [multidisciplinary] [[National Center for Biotechnology Informatio...
freq 2 2 1573 200 2459 181 6

openDOAR

In [6]:
# Load the OpenDOAR registry dump (tab-separated) and keep only the columns used below.
# 'opendoar_id', 'subject' and 'institution' are parsed with ast.literal_eval;
# 'repository_name' and 'type' are read as plain strings.
# Converters are declared only for columns that `usecols` actually loads; the
# earlier mapping also listed 'additional_name' and 'content_type', which are
# never read from the file.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    delimiter='\t',
    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],
    converters={
        'opendoar_id': ast.literal_eval,
        'subject': ast.literal_eval,
        'institution': ast.literal_eval,
    },
)
opendoar_df.head()
Out[6]:
opendoar_id repository_name type subject institution
0 101 utrecht university repository institutional [multidisciplinary] [[university of utrecht, [universiteit utrecht...
1 115 dspace at indian institute of management kozhi... institutional [ecology and environment, social sciences gene... [[indian institute of management kozhikode, [i...
2 41 caltech engineering and science online institutional [biology and biochemistry, chemistry and chemi... [[california institute of technology, [caltech...
3 119 dcu online research access service institutional [multidisciplinary] [[dublin city university, [dcu], ie, [], , htt...
4 129 earth-prints repository disciplinary [earth and planetary sciences] [[istituto nazionale di geofisica e vulcanolog...
In [7]:
# Summary of every OpenDOAR column; include='all' adds count/unique/top/freq for the
# non-numeric columns next to the numeric statistics of opendoar_id.
opendoar_df.describe(include='all')
Out[7]:
opendoar_id repository_name type subject institution
count 5707.000000 5707 5707 5707 5707
unique NaN 5670 4 820 5098
top NaN arch institutional [multidisciplinary] [[rijksuniversiteit groningen, [rug], nl, [], ...
freq NaN 3 5067 3212 26
mean 4008.118801 NaN NaN NaN NaN
std 2869.948770 NaN NaN NaN NaN
min 2.000000 NaN NaN NaN NaN
25% 1823.000000 NaN NaN NaN NaN
50% 3361.000000 NaN NaN NaN NaN
75% 5095.000000 NaN NaN NaN NaN
max 10175.000000 NaN NaN NaN NaN

ROAR

In [45]:
# Load the ROAR CSV export, restricted to the identifier, URL, title, country and
# subject columns.
# NOTE(review): the preview shows several rows sharing one eprintid, with the
# descriptive fields filled only on the first of them — presumably a multi-row
# export layout; verify before aggregating per repository.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    usecols=[
        'eprintid',
        'home_page',
        'title',
        'location_country',
        'subjects',
    ],
)
roar_df.head()
Out[45]:
eprintid home_page title location_country subjects
0 921 http://alcme.oclc.org/ndltd/index.html Networked Digital Library of Theses and Disser... us NaN
1 1489 http://prensahistorica.mcu.es/prensahistorica/... Virtual Library of Historical Press es NaN
2 606 http://hal.archives-ouvertes.fr/ HAL: Hyper Article en Ligne fr NaN
3 606 NaN NaN NaN NaN
4 606 NaN NaN NaN NaN
In [43]:
# Disabled de-duplication step, kept for reference.
# NOTE(review): keep=False drops every row that has a duplicate on these four columns,
# not just the extra copies. The roar_df preview shows rows for the same eprintid that
# are NaN in home_page/title/location_country, so re-enabling this would presumably
# remove those rows wholesale — confirm the intended handling first.
# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)
In [47]:
# Inspect every row recorded for a single repository (eprintid 2303).
roar_df.loc[roar_df['eprintid'].eq(2303)]
Out[47]:
eprintid home_page title location_country subjects
106 2303 NaN Faculty Scholarship at The Claremont Colleges us AS
107 2303 NaN NaN NaN BF
108 2303 NaN NaN NaN BL
109 2303 NaN NaN NaN CC
110 2303 NaN NaN NaN GN
111 2303 NaN NaN NaN H1
112 2303 NaN NaN NaN HB
113 2303 NaN NaN NaN JA
114 2303 NaN NaN NaN LB
115 2303 NaN NaN NaN NX
116 2303 NaN NaN NaN PQ
117 2303 NaN NaN NaN QA
In [44]:
# Summary of every ROAR column (include='all' covers the non-numeric columns too).
# NOTE(review): this cell's saved execution count (44) is lower than that of the cell
# that loads roar_df (45), so the stored output may predate the current frame —
# re-run the notebook top to bottom before relying on these numbers.
roar_df.describe(include='all')
Out[44]:
eprintid home_page title location_country subjects
count 5314.000000 5263 5268 5024 1225
unique NaN 5156 5027 134 123
top NaN http://ir.lib.isu.edu.tw/ Repositorio Institucional us H1
freq NaN 3 7 877 147
mean 6389.464434 NaN NaN NaN NaN
std 5159.573937 NaN NaN NaN NaN
min 1.000000 NaN NaN NaN NaN
25% 1490.250000 NaN NaN NaN NaN
50% 4990.500000 NaN NaN NaN NaN
75% 10452.750000 NaN NaN NaN NaN
max 17302.000000 NaN NaN NaN NaN

FAIRsharing

In [11]:
# Load the pipe-delimited FAIRsharing summary export. header=0 consumes the file's
# own first line as the header row, and `names` replaces it with the labels below.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    delimiter='|',
    header=0,
    names=[
        'full_name',
        'short_name',
        'fs_url',
        'url',
        'countries',
        'subjects',
    ],
)
fairsharing_df.head()
Out[11]:
full_name short_name fs_url url countries subjects
0 GenBank GenBank https://fairsharing.org/10.25504/FAIRsharing.9... https://www.ncbi.nlm.nih.gov/genbank/ European Union,Japan,United States Bioinformatics,Data Management,Data Submission...
1 GlycoNAVI GlycoNAVI https://fairsharing.org/10.25504/FAIRsharing.w... https://glyconavi.org/ Japan Chemistry,Glycomics,Life Science,Organic Chemi...
2 ADHDgene ADHDgene https://fairsharing.org/10.25504/FAIRsharing.m... http://adhd.psych.ac.cn/ China Biomedical Science,Genetics
3 Allele frequency resource for research and tea... ALFRED https://fairsharing.org/10.25504/FAIRsharing.y... http://alfred.med.yale.edu United States Life Science
4 Animal Transcription Factor Database AnimalTFDB https://fairsharing.org/10.25504/FAIRsharing.e... http://bioinfo.life.hust.edu.cn/AnimalTFDB/ China Life Science
In [12]:
# Summary of every FAIRsharing column: count, unique, top and freq per column.
fairsharing_df.describe(include='all')
Out[12]:
full_name short_name fs_url url countries subjects
count 1752 1752 1752 1752 1749 1690
unique 1752 1741 1752 1752 178 834
top CiteAb CGD https://fairsharing.org/10.25504/FAIRsharing.1... http://www.plexdb.org/ United States Life Science
freq 1 3 1 1 588 367