In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Display every column of wide DataFrames; None lifts pandas' column-count limit.
pd.set_option('display.max_columns', None)

## Loading datasets

In [24]:
# Read the tab-separated OpenDOAR export. The listed columns are parsed with
# ast.literal_eval, i.e. their text is evaluated as a Python literal
# instead of being kept as a raw string.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    converters=dict.fromkeys(
        ['subject', 'additional_name', 'opendoar_id', 'content_type', 'institution'],
        ast.literal_eval,
    ),
)
# Preview the first rows as the cell output.
opendoar_df.head()

Unnamed: 0,openaire_id,opendoar_id,repository_name,additional_name,repository_url,description,type,update_date,start_date,subject,content_type,institution,metadata_policy,data_policy,submission_policy,content_policy,software,api
0,opendoar____::38b3eff8baf56627478ec76a704e9b52,101,utrecht university repository,[],http://dspace.library.uu.nl,this site is a university repository providing...,institutional,2021-04-16 15:22:03,2006-01-13 12:55:13,[multidisciplinary],"[journal_articles, conference_and_workshop_pap...","[[university of utrecht, [universiteit utrecht...",True,True,False,True,dspace,True
1,opendoar____::2b44928ae11fb9384c4cf38708677c48,115,dspace at indian institute of management kozhi...,[dspace@iimk],http://dspace.iimk.ac.in/,this site is a subject based university reposi...,institutional,2021-02-18 17:36:43,2006-01-04 11:54:34,"[ecology and environment, social sciences gene...","[journal_articles, conference_and_workshop_pap...","[[indian institute of management kozhikode, [i...",True,True,True,True,dspace 4.1,True
2,opendoar____::3416a75f4cea9109507cacd8e2f2aefc,41,caltech engineering and science online,[],http://calteches.library.caltech.edu/,the caltech archives holds approximately 220 c...,institutional,2021-02-18 17:36:28,2006-01-04 14:47:04,"[biology and biochemistry, chemistry and chemi...","[journal_articles, conference_and_workshop_pap...","[[california institute of technology, [caltech...",True,True,True,True,eprints 3.1.3,True
3,opendoar____::07e1cd7dca89a1678042477183b7ac3f,119,dcu online research access service,[doras],http://doras.dcu.ie/,this site is an institutional repository provi...,institutional,2021-02-18 17:36:44,2006-01-04 11:15:19,[multidisciplinary],"[journal_articles, conference_and_workshop_pap...","[[dublin city university, [dcu], ie, [], , htt...",True,True,True,True,eprints 3.0.5,True
4,opendoar____::d1f491a404d6854880943e5c3cd9ca25,129,earth-prints repository,[],http://www.earth-prints.org/,a subject based repository providing open acce...,disciplinary,2021-04-19 08:28:38,2006-01-30 16:43:11,[earth and planetary sciences],"[journal_articles, conference_and_workshop_pap...",[[istituto nazionale di geofisica e vulcanolog...,True,True,True,True,dspace 5.8.1-snapshot,True


In [25]:
# Show the column labels of the loaded frame.
opendoar_df.columns

Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',
       'repository_url', 'description', 'type', 'update_date', 'start_date',
       'subject', 'content_type', 'institution', 'metadata_policy',
       'data_policy', 'submission_policy', 'content_policy', 'software',
       'api'],
      dtype='object')

In [28]:
def empty_list_is_nan(cell):
    """Turn an empty list into NaN and pass any other value through unchanged.

    Parameters
    ----------
    cell : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a ``list`` with no elements; otherwise the
        very same ``cell`` object.
    """
    # Only real lists are inspected; other empty containers or strings
    # (e.g. ``()`` or ``''``) are returned untouched.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
    
# Apply empty_list_is_nan to every cell so that empty lists become NaN and are
# counted as missing by isna()/count().
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when it exists and fall back to applymap on older pandas.
if hasattr(pd.DataFrame, 'map'):
    opendoar_df = opendoar_df.map(empty_list_is_nan)
else:
    opendoar_df = opendoar_df.applymap(empty_list_is_nan)

In [29]:
# Summary statistics for all columns; include='all' adds the non-numeric
# columns (count/unique/top/freq) alongside the numeric ones.
opendoar_df.describe(include='all')

Unnamed: 0,openaire_id,opendoar_id,repository_name,additional_name,repository_url,description,type,update_date,start_date,subject,content_type,institution,metadata_policy,data_policy,submission_policy,content_policy,software,api
count,5707,5707.0,5707,2138,5707,5425,5707,5707,5707,5542,5563,5707,5707,5707,5707,5707,5707,5707
unique,5707,,5670,2096,5670,4622,4,2501,5538,819,476,5098,2,2,2,2,321,2
top,opendoar____::3cf166c6b73f030b4f67eeaeba301103,,hiroshima associated repository portal,[],http://harp.lib.hiroshima-u.ac.jp/,this site provides access to the research outp...,institutional,2020-09-18 12:53:48,2020-09-18 12:53:48,[multidisciplinary],[theses_and_dissertations],"[[rijksuniversiteit groningen, [rug], nl, [], ...",False,False,False,False,dspace,true
freq,1,,3,4,3,95,5067,82,82,3212,460,26,4116,4101,5016,4075,800,4374
mean,,4008.118801,,,,,,,,,,,,,,,,
std,,2869.94877,,,,,,,,,,,,,,,,
min,,2.0,,,,,,,,,,,,,,,,
25%,,1823.0,,,,,,,,,,,,,,,,
50%,,3361.0,,,,,,,,,,,,,,,,
75%,,5095.0,,,,,,,,,,,,,,,,


In [30]:
# Count the missing (NaN) values in each column.
opendoar_df.isna().sum()

openaire_id             0
opendoar_id             0
repository_name         0
additional_name      3569
repository_url          0
description           282
type                    0
update_date             0
start_date              0
subject               165
content_type          144
institution             0
metadata_policy         0
data_policy             0
submission_policy       0
content_policy          0
software                0
api                     0
dtype: int64