In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

## Loading datasets

In [2]:
# Load the OpenDOAR export (tab-separated).
# - The columns listed under `converters` are run through ast.literal_eval,
#   so each cell is parsed from its Python-literal text into a Python object.
# - `system_metadata.id` is read as a string rather than a number.
# NOTE(review): `repository_metadata.content_languages` and `policy_urls` are
# not listed here, and the summaries further down show them as the text '[]'
# with zero NaNs -- they look like list columns left unparsed. Confirm whether
# they should also be converted.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    dtype={'system_metadata.id': str},
    converters=dict.fromkeys(
        [
            'repository_metadata.content_subjects',
            'repository_metadata.alternativename',
            'repository_metadata.content_types',
            'organization',
        ],
        ast.literal_eval,
    ),
)

opendoar_df.head()

Unnamed: 0,system_metadata.id,repository_metadata.name,repository_metadata.alternativename,repository_metadata.url,repository_metadata.description,repository_metadata.type,repository_metadata.content_languages,system_metadata.date_modified,system_metadata.date_created,repository_metadata.content_subjects,repository_metadata.content_types,organization,policy_urls,repository_metadata.software,repository_metadata.oai_url,system_metadata.publicly_visible,repository_metadata.repository_status,repository_metadata.fulltext_record_count,repository_metadata.metadata_record_count
0,134,"{""name"": ""eldorado - repository of the tu dort...",[{'name': 'eldorado - ressourcen aus und für l...,https://eldorado.tu-dortmund.de,,institutional,[],2022-01-12 15:34:54,2005-12-19 14:57:52,"[arts, humanities, science, mathematics, socia...","[journal_articles, conference_and_workshop_pap...","[{'name': 'technische universität dortmund', '...",[],"{""name"": ""dspace"", ""version"": """"}",https://eldorado.tu-dortmund.de/oai/request,yes,,9629.0,20963.0
1,58,"{""name"": ""archive ouverte en sciences de linfo...",[{'acronym': '@rchivesic'}],https://archivesic.ccsd.cnrs.fr,,institutional,[],2022-01-12 15:34:53,2006-01-13 12:48:32,"[arts, science, technology, engineering, mathe...","[journal_articles, conference_and_workshop_pap...",[{'name': 'centre pour la communication scient...,[],"{""name"": ""hal"", ""version"": """"}",https://api.archives-ouvertes.fr/oai/archivesic,yes,,55492.0,1137498.0
2,93,"{""name"": ""digitalcommons@the texas medical cen...",[],http://digitalcommons.library.tmc.edu/,,institutional,[],2022-01-12 15:34:53,2006-02-14 11:16:12,[health and medicine],"[journal_articles, theses_and_dissertations]","[{'name': 'texas medical center', 'alternative...",[],"{""name"": ""other"", ""version"": """"}",http://digitalcommons.library.tmc.edu/do/oai/,yes,,2658.0,7268.0
3,68,"{""name"": ""cognitive sciences eprint archive"", ...",[{'acronym': 'cogprints'}],http://cogprints.org/,,disciplinary,[],2022-01-12 15:34:53,2006-01-04 15:01:23,"[humanities, health and medicine, science, soc...","[journal_articles, conference_and_workshop_pap...","[{'name': 'university of southampton', 'altern...",[],"{""name"": ""eprints"", ""version"": """"}",http://cogprints.org/cgi/oai2,yes,,2895.0,4277.0
4,84,"{""name"": ""digital commons@carleton college"", ""...",[],http://digitalcommons.carleton.edu/,,institutional,[],2022-01-12 15:34:53,2006-01-04 16:07:58,"[humanities, science, social sciences]","[journal_articles, unpub_reports_and_working_p...","[{'name': 'carleton college', 'alternativeName...",[],"{""name"": ""other"", ""version"": """"}",,yes,,,42.0


In [3]:
# Display the column labels of the loaded frame.
opendoar_df.columns

Index(['system_metadata.id', 'repository_metadata.name',
       'repository_metadata.alternativename', 'repository_metadata.url',
       'repository_metadata.description', 'repository_metadata.type',
       'repository_metadata.content_languages',
       'system_metadata.date_modified', 'system_metadata.date_created',
       'repository_metadata.content_subjects',
       'repository_metadata.content_types', 'organization', 'policy_urls',
       'repository_metadata.software', 'repository_metadata.oai_url',
       'system_metadata.publicly_visible',
       'repository_metadata.repository_status',
       'repository_metadata.fulltext_record_count',
       'repository_metadata.metadata_record_count'],
      dtype='object')

In [4]:
def empty_list_is_nan(cell):
    """Return NaN for an empty list; return any other value unchanged.

    Only real ``list`` instances are inspected. Strings, tuples, numbers and
    non-empty lists are handed back exactly as received.
    """
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
    
# Replace every empty-list cell with NaN so that missing-value checks count it.
# DataFrame.applymap is deprecated since pandas 2.1, where it was renamed to
# DataFrame.map with the same element-wise behaviour. Use the new name when the
# installed pandas provides it and fall back to applymap on older versions.
# The check is made on the class so a column named 'map' cannot shadow it.
opendoar_df = (
    opendoar_df.map(empty_list_is_nan)
    if hasattr(pd.DataFrame, 'map')
    else opendoar_df.applymap(empty_list_is_nan)
)

In [5]:
# Summary statistics for every column; include='all' covers the non-numeric ones too.
opendoar_df.describe(include='all')

Unnamed: 0,system_metadata.id,repository_metadata.name,repository_metadata.alternativename,repository_metadata.url,repository_metadata.description,repository_metadata.type,repository_metadata.content_languages,system_metadata.date_modified,system_metadata.date_created,repository_metadata.content_subjects,repository_metadata.content_types,organization,policy_urls,repository_metadata.software,repository_metadata.oai_url,system_metadata.publicly_visible,repository_metadata.repository_status,repository_metadata.fulltext_record_count,repository_metadata.metadata_record_count
count,5811.0,5811,2155,5810,0.0,5810,5811,5811,5811,5644,5667,5811,5811,5811,4447,5811,0.0,2292.0,4184.0
unique,5811.0,5780,2115,5772,,4,1,171,5643,236,476,5212,678,32,4415,1,,,
top,134.0,"{""name"": ""arch"", ""language"": ""en""}",[{'acronym': 'aura'}],http://harp.lib.hiroshima-u.ac.jp/,,institutional,[],2022-01-12 15:35:47,2020-09-18 12:53:48,"[science, technology, engineering, mathematics...",[theses_and_dissertations],"[{'name': 'rijksuniversiteit groningen', 'alte...",[],"{""name"": ""dspace"", ""version"": """"}",https://api.figshare.com/v2/oai,yes,,,
freq,1.0,3,4,3,,5161,5811,73,81,3321,469,26,5131,2273,3,5811,,,
mean,,,,,,,,,,,,,,,,,,5022.89,176555.6
std,,,,,,,,,,,,,,,,,,42126.48,6611068.0
min,,,,,,,,,,,,,,,,,,0.0,0.0
25%,,,,,,,,,,,,,,,,,,0.0,893.75
50%,,,,,,,,,,,,,,,,,,422.5,4012.5
75%,,,,,,,,,,,,,,,,,,2931.5,16293.5


In [6]:
# Number of missing (NaN) values in each column.
# NOTE(review): columns that still hold the raw text '[]' (it appears as the top
# value for content_languages and policy_urls in the describe() output) report
# 0 here, because only real list objects were mapped to NaN -- confirm this is intended.
opendoar_df.isna().sum()

system_metadata.id                              0
repository_metadata.name                        0
repository_metadata.alternativename          3656
repository_metadata.url                         1
repository_metadata.description              5811
repository_metadata.type                        1
repository_metadata.content_languages           0
system_metadata.date_modified                   0
system_metadata.date_created                    0
repository_metadata.content_subjects          167
repository_metadata.content_types             144
organization                                    0
policy_urls                                     0
repository_metadata.software                    0
repository_metadata.oai_url                  1364
system_metadata.publicly_visible                0
repository_metadata.repository_status        5811
repository_metadata.fulltext_record_count    3519
repository_metadata.metadata_record_count    1627
dtype: int64

In [7]:
# Count how often each content type occurs across repositories.
# explode() gives every list element its own row, so a repository that lists
# several types contributes one row per type. groupby() then drops the NaN
# rows (its default) and size() counts the rows in each group.
(
    opendoar_df['repository_metadata.content_types']
    .explode()
    .to_frame()
    .groupby('repository_metadata.content_types')
    .size()
)

repository_metadata.content_types
bibliographic_references             858
books_chapters_and_sections         2246
conference_and_workshop_papers      2037
datasets                             427
journal_articles                    4069
learning_objects                     807
other_special_item_types            1800
patents                              200
software                             105
theses_and_dissertations            3377
unpub_reports_and_working_papers    1953
dtype: int64