registries_analysis/notebooks/01.2-exploration-opendoar.i...

40 KiB

In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Render every column of wide frames instead of eliding the middle ones (None = no limit).
pd.options.display.max_columns = None

Loading datasets

In [2]:
# Load the raw OpenDOAR export (tab-separated).
# The listed columns are parsed with ast.literal_eval while reading; the id column is
# read as text so pandas does not infer a numeric dtype for it.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    converters=dict.fromkeys(
        [
            'repository_metadata.content_subjects',
            'repository_metadata.alternativename',
            'repository_metadata.content_types',
            'organization',
        ],
        ast.literal_eval,
    ),
    dtype={'system_metadata.id': str},
)

opendoar_df.head()
Out[2]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
0 175 {"name": "hku theses online", "language": "en"} [] http://hub.hku.hk/handle/10722/1057 this is an institutional repository providing ... institutional ["zh", "en"] 2021-03-25 10:16:18 2005-12-21 12:44:08 ["multidisciplinary"] [bibliographic_references, theses_and_disserta... [{'name': 'university of hong kong', 'alternat... [] {"name": "dspace", "version": "cris-5.3.1-snap... NaN yes fully_functional NaN 11850.0
1 64 {"name": "research support scheme - central eu... [] http://rss.archives.ceu.hu/ this is an institutional repository collecting... institutional ["cs", "en", "hu", "ru"] 2021-03-25 09:48:31 2006-01-04 14:59:30 ["multidisciplinary"] [unpub_reports_and_working_papers] [{'name': 'central european university', 'alte... [] {"name": "eprints", "version": "2.2.1"} http://rss.archives.ceu.hu/perl/oai2 yes fully_functional NaN 164.0
2 151 {"name": "cadmus, eui research repository", "l... [] http://cadmus.eui.eu/ cadmus is the name of the eui research reposit... institutional ["nl", "en", "fr", "de", "it"] 2021-09-13 13:35:36 2006-01-04 12:07:07 ["history and archaeology", "multidisciplinary... [journal_articles, theses_and_dissertations, u... [{'name': 'european university institute', 'al... [{"policy_url": "https://www.eui.eu/research/e... {"name": "dspace", "version": "5.2"} http://cadmus.eui.eu/oai/request yes fully_functional 3867.0 24869.0
3 105 {"name": "document server@uhasselt", "language... [] https://doclib.uhasselt.be/dspace/ this site is a university repository providing... institutional ["nl", "en", "fr", "de"] 2021-04-16 15:23:52 2006-01-24 15:46:44 ["multidisciplinary"] [journal_articles, conference_and_workshop_pap... [{'name': 'uhasselt', 'alternativeName': 'hass... [] {"name": "dspace", "version": "1.7.2"} http://doclib.uhasselt.be/dspace-oai/request yes fully_functional 0.0 27376.0
4 101 {"name": "utrecht university repository", "lan... [] http://dspace.library.uu.nl this site is a university repository providing... institutional ["nl", "en"] 2021-04-16 15:22:03 2006-01-13 12:55:13 ["multidisciplinary"] [journal_articles, conference_and_workshop_pap... [{'name': 'university of utrecht', 'alternativ... [] {"name": "dspace", "version": ""} https://dspace.library.uu.nl/oai/request yes fully_functional 1686.0 185637.0
In [3]:
# List the column labels of the loaded frame.
opendoar_df.columns
Out[3]:
Index(['system_metadata.id', 'repository_metadata.name',
       'repository_metadata.alternativename', 'repository_metadata.url',
       'repository_metadata.description', 'repository_metadata.type',
       'repository_metadata.content_languages',
       'system_metadata.date_modified', 'system_metadata.date_created',
       'repository_metadata.content_subjects',
       'repository_metadata.content_types', 'organization', 'policy_urls',
       'repository_metadata.software', 'repository_metadata.oai_url',
       'system_metadata.publicly_visible',
       'repository_metadata.repository_status',
       'repository_metadata.fulltext_record_count',
       'repository_metadata.metadata_record_count'],
      dtype='object')
In [4]:
def empty_list_is_nan(cell):
    """Map an empty list to NaN and pass every other value through untouched.

    Parameters
    ----------
    cell : object
        A single value, e.g. one cell of a DataFrame.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a ``list`` of length zero; otherwise
        ``cell`` itself (the same object, not a copy).
    """
    # Only real lists are inspected: empty strings, tuples, dicts, None, ...
    # are returned as they came in.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
    
# Replace empty lists with NaN in every cell so they are counted as missing values.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# pick whichever element-wise method the installed pandas provides.
_elementwise = opendoar_df.map if hasattr(pd.DataFrame, 'map') else opendoar_df.applymap
opendoar_df = _elementwise(empty_list_is_nan)
In [5]:
# include='all' summarises every column, not only the numeric ones:
# count/unique/top/freq for object columns, mean/std/quantiles for numeric ones.
opendoar_df.describe(include='all')
Out[5]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
count 5742 5742 2147 5742 5421 5742 5742 5742 5742 5742 5598 5742 5742 5742 4402 5742 5595 2.299000e+03 4.197000e+03
unique 5742 5713 2107 5705 4619 4 330 2372 5573 821 477 5201 642 321 4370 1 7 NaN NaN
top 175 {"name": "hiroshima associated repository port... [{'acronym': 'aura'}] http://harp.lib.hiroshima-u.ac.jp/ this site provides access to the research outp... institutional ["en"] 2020-09-18 12:53:48 2020-09-18 12:53:48 ["multidisciplinary"] [theses_and_dissertations] [{'name': 'rijksuniversiteit groningen', 'alte... [] {"name": "dspace", "version": ""} https://kidoks.bsz-bw.de/oai yes fully_functional NaN NaN
freq 1 3 4 3 95 5096 1917 82 82 3227 465 26 5098 822 3 5742 5276 NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.010186e+03 1.760546e+05
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.206295e+04 6.600825e+06
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 0.000000e+00
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 8.950000e+02
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.220000e+02 4.026000e+03
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.930500e+03 1.630400e+04
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.817531e+06 4.200000e+08
In [6]:
# Number of missing values per column. Empty lists were replaced with NaN by the
# `empty_list_is_nan` cell, so they are included in these counts.
# NOTE(review): columns that hold plain strings are unaffected by that replacement
# (e.g. `policy_urls`, whose most frequent value is the text '[]') - confirm whether
# those should be treated as missing too.
opendoar_df.isna().sum()
Out[6]:
system_metadata.id                              0
repository_metadata.name                        0
repository_metadata.alternativename          3595
repository_metadata.url                         0
repository_metadata.description               321
repository_metadata.type                        0
repository_metadata.content_languages           0
system_metadata.date_modified                   0
system_metadata.date_created                    0
repository_metadata.content_subjects            0
repository_metadata.content_types             144
organization                                    0
policy_urls                                     0
repository_metadata.software                    0
repository_metadata.oai_url                  1340
system_metadata.publicly_visible                0
repository_metadata.repository_status         147
repository_metadata.fulltext_record_count    3443
repository_metadata.metadata_record_count    1545
dtype: int64
In [7]:
# One row per (repository, content type) pair, then count the rows for each content type.
(
    opendoar_df['repository_metadata.content_types']
    .explode()
    .to_frame()
    .groupby('repository_metadata.content_types')
    .size()
)
Out[7]:
repository_metadata.content_types
bibliographic_references             865
books_chapters_and_sections         2194
conference_and_workshop_papers      1981
datasets                             401
journal_articles                    4030
learning_objects                     789
other_special_item_types            1759
patents                              182
software                              92
theses_and_dissertations            3319
unpub_reports_and_working_papers    1904
dtype: int64