In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (no column truncation).
pd.options.display.max_columns = None

## Loading datasets

In [2]:
# Read the OpenDOAR export (tab-separated). The listed columns are passed
# through ast.literal_eval so their text is parsed into Python objects.
# NOTE(review): 'repository_metadata.content_subjects_phrases' is not among the
# columns shown by `opendoar_df.columns` in this notebook, so that converter
# presumably never fires -- confirm whether
# 'repository_metadata.content_subjects' was intended.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    converters=dict.fromkeys(
        [
            'repository_metadata.content_subjects_phrases',
            'repository_metadata.alternativename',
            'repository_metadata.content_types',
            'organization',
        ],
        ast.literal_eval,
    ),
    # Read repository ids as strings instead of letting pandas infer a dtype.
    dtype={'system_metadata.id': str},
)

opendoar_df.head()

Unnamed: 0,system_metadata.id,repository_metadata.name,repository_metadata.alternativename,repository_metadata.url,repository_metadata.description,repository_metadata.type,repository_metadata.content_languages,system_metadata.date_modified,system_metadata.date_created,repository_metadata.content_subjects,repository_metadata.content_types,organization,policy_urls,repository_metadata.software,repository_metadata.oai_url,system_metadata.publicly_visible,repository_metadata.repository_status,repository_metadata.fulltext_record_count,repository_metadata.metadata_record_count
0,175,"{""name"": ""hku theses online"", ""language"": ""en""}",[],http://hub.hku.hk/handle/10722/1057,this is an institutional repository providing ...,institutional,"[""zh"", ""en""]",2021-03-25 10:16:18,2005-12-21 12:44:08,"[""multidisciplinary""]","[bibliographic_references, theses_and_disserta...","[{'name': 'university of hong kong', 'alternat...",[],"{""name"": ""dspace"", ""version"": ""cris-5.3.1-snap...",,yes,fully_functional,,11850.0
1,64,"{""name"": ""research support scheme - central eu...",[],http://rss.archives.ceu.hu/,this is an institutional repository collecting...,institutional,"[""cs"", ""en"", ""hu"", ""ru""]",2021-03-25 09:48:31,2006-01-04 14:59:30,"[""multidisciplinary""]",[unpub_reports_and_working_papers],"[{'name': 'central european university', 'alte...",[],"{""name"": ""eprints"", ""version"": ""2.2.1""}",http://rss.archives.ceu.hu/perl/oai2,yes,fully_functional,,164.0
2,151,"{""name"": ""cadmus, eui research repository"", ""l...",[],http://cadmus.eui.eu/,cadmus is the name of the eui research reposit...,institutional,"[""nl"", ""en"", ""fr"", ""de"", ""it""]",2021-09-13 13:35:36,2006-01-04 12:07:07,"[""history and archaeology"", ""multidisciplinary...","[journal_articles, theses_and_dissertations, u...","[{'name': 'european university institute', 'al...","[{""policy_url"": ""https://www.eui.eu/research/e...","{""name"": ""dspace"", ""version"": ""5.2""}",http://cadmus.eui.eu/oai/request,yes,fully_functional,3867.0,24869.0
3,105,"{""name"": ""document server@uhasselt"", ""language...",[],https://doclib.uhasselt.be/dspace/,this site is a university repository providing...,institutional,"[""nl"", ""en"", ""fr"", ""de""]",2021-04-16 15:23:52,2006-01-24 15:46:44,"[""multidisciplinary""]","[journal_articles, conference_and_workshop_pap...","[{'name': 'uhasselt', 'alternativeName': 'hass...",[],"{""name"": ""dspace"", ""version"": ""1.7.2""}",http://doclib.uhasselt.be/dspace-oai/request,yes,fully_functional,0.0,27376.0
4,101,"{""name"": ""utrecht university repository"", ""lan...",[],http://dspace.library.uu.nl,this site is a university repository providing...,institutional,"[""nl"", ""en""]",2021-04-16 15:22:03,2006-01-13 12:55:13,"[""multidisciplinary""]","[journal_articles, conference_and_workshop_pap...","[{'name': 'university of utrecht', 'alternativ...",[],"{""name"": ""dspace"", ""version"": """"}",https://dspace.library.uu.nl/oai/request,yes,fully_functional,1686.0,185637.0


In [3]:
# List the column labels of the loaded frame.
opendoar_df.columns

Index(['system_metadata.id', 'repository_metadata.name',
       'repository_metadata.alternativename', 'repository_metadata.url',
       'repository_metadata.description', 'repository_metadata.type',
       'repository_metadata.content_languages',
       'system_metadata.date_modified', 'system_metadata.date_created',
       'repository_metadata.content_subjects',
       'repository_metadata.content_types', 'organization', 'policy_urls',
       'repository_metadata.software', 'repository_metadata.oai_url',
       'system_metadata.publicly_visible',
       'repository_metadata.repository_status',
       'repository_metadata.fulltext_record_count',
       'repository_metadata.metadata_record_count'],
      dtype='object')

In [4]:
def empty_list_is_nan(cell):
    """Map an empty list to NaN and pass every other value through.

    Parameters
    ----------
    cell : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a ``list`` of length 0; otherwise
        ``cell`` itself, unchanged (non-empty lists and non-list values alike).
    """
    # Only genuine list instances are considered; empty strings, tuples,
    # dicts, etc. are returned as-is.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
    
# Replace empty-list cells with NaN across the whole frame so they are
# counted as missing values.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map;
# prefer the new name when this pandas provides it and fall back otherwise.
if hasattr(pd.DataFrame, 'map'):
    opendoar_df = opendoar_df.map(empty_list_is_nan)
else:
    opendoar_df = opendoar_df.applymap(empty_list_is_nan)

In [5]:
# Summary statistics for every column; include='all' covers the object
# columns as well as the numeric ones.
opendoar_df.describe(include='all')

Unnamed: 0,system_metadata.id,repository_metadata.name,repository_metadata.alternativename,repository_metadata.url,repository_metadata.description,repository_metadata.type,repository_metadata.content_languages,system_metadata.date_modified,system_metadata.date_created,repository_metadata.content_subjects,repository_metadata.content_types,organization,policy_urls,repository_metadata.software,repository_metadata.oai_url,system_metadata.publicly_visible,repository_metadata.repository_status,repository_metadata.fulltext_record_count,repository_metadata.metadata_record_count
count,5742.0,5742,2147,5742,5421,5742,5742,5742,5742,5742,5598,5742,5742,5742,4402,5742,5595,2299.0,4197.0
unique,5742.0,5713,2107,5705,4619,4,330,2372,5573,821,477,5201,642,321,4370,1,7,,
top,175.0,"{""name"": ""hiroshima associated repository port...",[{'acronym': 'aura'}],http://harp.lib.hiroshima-u.ac.jp/,this site provides access to the research outp...,institutional,"[""en""]",2020-09-18 12:53:48,2020-09-18 12:53:48,"[""multidisciplinary""]",[theses_and_dissertations],"[{'name': 'rijksuniversiteit groningen', 'alte...",[],"{""name"": ""dspace"", ""version"": """"}",https://kidoks.bsz-bw.de/oai,yes,fully_functional,,
freq,1.0,3,4,3,95,5096,1917,82,82,3227,465,26,5098,822,3,5742,5276,,
mean,,,,,,,,,,,,,,,,,,5010.186,176054.6
std,,,,,,,,,,,,,,,,,,42062.95,6600825.0
min,,,,,,,,,,,,,,,,,,0.0,0.0
25%,,,,,,,,,,,,,,,,,,0.0,895.0
50%,,,,,,,,,,,,,,,,,,422.0,4026.0
75%,,,,,,,,,,,,,,,,,,2930.5,16304.0


In [6]:
# Count missing values per column.
opendoar_df.isna().sum()

system_metadata.id                              0
repository_metadata.name                        0
repository_metadata.alternativename          3595
repository_metadata.url                         0
repository_metadata.description               321
repository_metadata.type                        0
repository_metadata.content_languages           0
system_metadata.date_modified                   0
system_metadata.date_created                    0
repository_metadata.content_subjects            0
repository_metadata.content_types             144
organization                                    0
policy_urls                                     0
repository_metadata.software                    0
repository_metadata.oai_url                  1340
system_metadata.publicly_visible                0
repository_metadata.repository_status         147
repository_metadata.fulltext_record_count    3443
repository_metadata.metadata_record_count    1545
dtype: int64

In [7]:
# Count how many rows list each content type: explode the per-row lists into
# one row per entry, then take the size of each group.
(
    opendoar_df['repository_metadata.content_types']
    .explode()
    .to_frame()
    .groupby('repository_metadata.content_types')
    .size()
)

repository_metadata.content_types
bibliographic_references             865
books_chapters_and_sections         2194
conference_and_workshop_papers      1981
datasets                             401
journal_articles                    4030
learning_objects                     789
other_special_item_types            1759
patents                              182
software                              92
theses_and_dissertations            3319
unpub_reports_and_working_papers    1904
dtype: int64