In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Lift the column display limit so wide DataFrames are shown without truncated columns.
pd.set_option('display.max_columns', None)

## Loading datasets

**re3data**

In [3]:
# Load the re3data registry export (tab-separated), keeping only the columns
# needed below. Every column named in `converters` is parsed with
# ast.literal_eval while reading; the preview shows those columns as lists.
# NOTE(review): 'additional_name', 'repository_id' and 'content_type' have a
# converter but are not listed in usecols, so those entries presumably have no
# effect — confirm before removing them.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    converters=dict.fromkeys(
        ['subject', 'keyword', 'additional_name', 'repository_id',
         'type', 'content_type', 'provider_type', 'institution'],
        ast.literal_eval,
    ),
    usecols=['re3data_id', 'repository_name', 'subject', 'keyword',
             'type', 'provider_type', 'institution'],
)
re3data_df.head()

Unnamed: 0,re3data_id,repository_name,type,subject,provider_type,keyword,institution
0,r3d100000001,Odum Institute Archive Dataverse,[disciplinary],"[1 Humanities and Social Sciences, 111 Social ...",[dataProvider],"[FAIR, Middle East, crime, demography, economy...",[[Odum Institute for Research in Social Scienc...
1,r3d100000002,Access to Archival Databases,[disciplinary],"[1 Humanities and Social Sciences, 102 History...",[dataProvider],[US History],[[The U.S. National Archives and Records Admin...
2,r3d100000004,Datenbank Gesprochenes Deutsch,[disciplinary],"[1 Humanities and Social Sciences, 104 Linguis...","[dataProvider, serviceProvider]","[Australian German, FOLK, German dialects, Pfe...","[[Institut für Deutsche Sprache, Archiv für Ge..."
3,r3d100000005,UNC Dataverse,[institutional],"[1 Humanities and Social Sciences, 111 Social ...","[dataProvider, serviceProvider]","[FAIR, census, demographic survey, demography,...",[[Odum Institute for Research in Social Scienc...
4,r3d100000006,Archaeology Data Service,[disciplinary],"[1 Humanities and Social Sciences, 101 Ancient...","[dataProvider, serviceProvider]","[FAIR, archaeology, cultural heritage, prehist...","[[Arts and Humanities Research Council, [AHRC]..."


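**Note: service providers are filtered out here.** Each repository is split into one row per `provider_type` value and the `serviceProvider` rows are dropped, so repositories that are both data and service providers are kept as `dataProvider` only.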

In [4]:
# Give every provider_type value its own row, then discard the rows whose
# value is 'serviceProvider'. Rows with a missing provider_type pass the
# inequality test and are therefore kept. The index is not reset, so the
# labels repeated by explode() remain as they are.
re3data_df = (
    re3data_df
    .explode('provider_type')
    .loc[lambda frame: frame['provider_type'] != 'serviceProvider']
)

In [5]:
# Summary of every column (include='all' also covers the non-numeric ones),
# used here to check the result of the provider_type filter.
re3data_df.describe(include='all')

Unnamed: 0,re3data_id,repository_name,type,subject,provider_type,keyword,institution
count,2467,2467,2467,2467,2459,2467,2467
unique,2466,2463,9,1282,1,2248,2447
top,r3d100011987,Landmap,[disciplinary],"[1 Humanities and Social Sciences, 2 Life Scie...",dataProvider,[multidisciplinary],[[National Center for Biotechnology Informatio...
freq,2,2,1573,200,2459,181,6


**openDOAR**

In [6]:
# Load the openDOAR export (tab-separated), keeping only the columns needed
# below. Every column named in `converters` is parsed with ast.literal_eval
# while reading; 'type' has no converter and stays a plain string.
# NOTE(review): 'additional_name' and 'content_type' have a converter but are
# not listed in usecols, so those entries presumably have no effect — confirm
# before removing them.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    converters=dict.fromkeys(
        ['subject', 'additional_name', 'opendoar_id',
         'content_type', 'institution'],
        ast.literal_eval,
    ),
    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],
)
opendoar_df.head()

Unnamed: 0,opendoar_id,repository_name,type,subject,institution
0,101,utrecht university repository,institutional,[multidisciplinary],"[[university of utrecht, [universiteit utrecht..."
1,115,dspace at indian institute of management kozhi...,institutional,"[ecology and environment, social sciences gene...","[[indian institute of management kozhikode, [i..."
2,41,caltech engineering and science online,institutional,"[biology and biochemistry, chemistry and chemi...","[[california institute of technology, [caltech..."
3,119,dcu online research access service,institutional,[multidisciplinary],"[[dublin city university, [dcu], ie, [], , htt..."
4,129,earth-prints repository,disciplinary,[earth and planetary sciences],[[istituto nazionale di geofisica e vulcanolog...


In [7]:
# Summary of every column; include='all' reports counts/uniques for the
# non-numeric columns alongside the numeric statistics for opendoar_id.
opendoar_df.describe(include='all')

Unnamed: 0,opendoar_id,repository_name,type,subject,institution
count,5707.0,5707,5707,5707,5707
unique,,5670,4,820,5098
top,,arch,institutional,[multidisciplinary],"[[rijksuniversiteit groningen, [rug], nl, [], ..."
freq,,3,5067,3212,26
mean,4008.118801,,,,
std,2869.94877,,,,
min,2.0,,,,
25%,1823.0,,,,
50%,3361.0,,,,
75%,5095.0,,,,


**ROAR**

In [45]:
# Load the ROAR export (comma-separated), restricted to the identifier, URL,
# title, country and subject columns.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    usecols=[
        'eprintid',
        'home_page',
        'title',
        'location_country',
        'subjects',
    ],
)
roar_df.head()

Unnamed: 0,eprintid,home_page,title,location_country,subjects
0,921,http://alcme.oclc.org/ndltd/index.html,Networked Digital Library of Theses and Disser...,us,
1,1489,http://prensahistorica.mcu.es/prensahistorica/...,Virtual Library of Historical Press,es,
2,606,http://hal.archives-ouvertes.fr/,HAL: Hyper Article en Ligne,fr,
3,606,,,,
4,606,,,,


In [43]:
# Disabled de-duplication step, kept for reference only (nothing runs in this cell).
# NOTE(review): keep=False would drop every row that shares these four values with
# another row, not just the repeats — confirm that is intended before re-enabling.
# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)

In [47]:
# Inspect a single repository (eprintid 2303) to see how its record is laid out:
# the export spreads one repository over several rows, one subject code per row.
roar_df.loc[roar_df['eprintid'] == 2303]

Unnamed: 0,eprintid,home_page,title,location_country,subjects
106,2303,,Faculty Scholarship at The Claremont Colleges,us,AS
107,2303,,,,BF
108,2303,,,,BL
109,2303,,,,CC
110,2303,,,,GN
111,2303,,,,H1
112,2303,,,,HB
113,2303,,,,JA
114,2303,,,,LB
115,2303,,,,NX


In [44]:
# Summary of every column (include='all' also covers the non-numeric ones);
# the per-column counts show how many rows actually carry each field.
roar_df.describe(include='all')

Unnamed: 0,eprintid,home_page,title,location_country,subjects
count,5314.0,5263,5268,5024,1225
unique,,5156,5027,134,123
top,,http://ir.lib.isu.edu.tw/,Repositorio Institucional,us,H1
freq,,3,7,877,147
mean,6389.464434,,,,
std,5159.573937,,,,
min,1.0,,,,
25%,1490.25,,,,
50%,4990.5,,,,
75%,10452.75,,,,


**FAIRsharing**

In [11]:
# Load the FAIRsharing database summary (pipe-delimited). header=0 treats the
# file's first row as the header, and names= replaces it with the labels below.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],
)
fairsharing_df.head()

Unnamed: 0,full_name,short_name,fs_url,url,countries,subjects
0,GenBank,GenBank,https://fairsharing.org/10.25504/FAIRsharing.9...,https://www.ncbi.nlm.nih.gov/genbank/,"European Union,Japan,United States","Bioinformatics,Data Management,Data Submission..."
1,GlycoNAVI,GlycoNAVI,https://fairsharing.org/10.25504/FAIRsharing.w...,https://glyconavi.org/,Japan,"Chemistry,Glycomics,Life Science,Organic Chemi..."
2,ADHDgene,ADHDgene,https://fairsharing.org/10.25504/FAIRsharing.m...,http://adhd.psych.ac.cn/,China,"Biomedical Science,Genetics"
3,Allele frequency resource for research and tea...,ALFRED,https://fairsharing.org/10.25504/FAIRsharing.y...,http://alfred.med.yale.edu,United States,Life Science
4,Animal Transcription Factor Database,AnimalTFDB,https://fairsharing.org/10.25504/FAIRsharing.e...,http://bioinfo.life.hust.edu.cn/AnimalTFDB/,China,Life Science


In [12]:
# Summary of every column (include='all' also covers the non-numeric ones).
fairsharing_df.describe(include='all')

Unnamed: 0,full_name,short_name,fs_url,url,countries,subjects
count,1752,1752,1752,1752,1749,1690
unique,1752,1741,1752,1752,178,834
top,CiteAb,CGD,https://fairsharing.org/10.25504/FAIRsharing.1...,http://www.plexdb.org/,United States,Life Science
freq,1,3,1,1,588,367
