registries_analysis/notebooks/01.2-exploration-opendoar.i...

39 KiB

In [1]:
import ast
import csv
import json

import numpy as np
import pandas as pd

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Show every column when a DataFrame is rendered (None removes the column limit).
pd.options.display.max_columns = None

Loading datasets

In [2]:
# Read the tab-separated OpenDOAR dump.
# - The columns listed under `converters` are parsed with ast.literal_eval while
#   reading, so their cells become Python objects instead of raw strings.
# - 'system_metadata.id' is forced to str.
# NOTE(review): 'repository_metadata.content_languages' and 'policy_urls' also look
# list-like in the preview but are not converted here, so they remain plain strings
# (e.g. the text "[]") -- confirm whether that is intended.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    dtype={'system_metadata.id': str},
    converters={
        column: ast.literal_eval
        for column in (
            'repository_metadata.content_subjects',
            'repository_metadata.alternativename',
            'repository_metadata.content_types',
            'organization',
        )
    },
)

opendoar_df.head()
Out[2]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
0 134 {"name": "eldorado - repository of the tu dort... [{'name': 'eldorado - ressourcen aus und für l... https://eldorado.tu-dortmund.de NaN institutional [] 2022-01-12 15:34:54 2005-12-19 14:57:52 [arts, humanities, science, mathematics, socia... [journal_articles, conference_and_workshop_pap... [{'name': 'technische universität dortmund', '... [] {"name": "dspace", "version": ""} https://eldorado.tu-dortmund.de/oai/request yes NaN 9629.0 20963.0
1 58 {"name": "archive ouverte en sciences de linfo... [{'acronym': '@rchivesic'}] https://archivesic.ccsd.cnrs.fr NaN institutional [] 2022-01-12 15:34:53 2006-01-13 12:48:32 [arts, science, technology, engineering, mathe... [journal_articles, conference_and_workshop_pap... [{'name': 'centre pour la communication scient... [] {"name": "hal", "version": ""} https://api.archives-ouvertes.fr/oai/archivesic yes NaN 55492.0 1137498.0
2 93 {"name": "digitalcommons@the texas medical cen... [] http://digitalcommons.library.tmc.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-02-14 11:16:12 [health and medicine] [journal_articles, theses_and_dissertations] [{'name': 'texas medical center', 'alternative... [] {"name": "other", "version": ""} http://digitalcommons.library.tmc.edu/do/oai/ yes NaN 2658.0 7268.0
3 68 {"name": "cognitive sciences eprint archive", ... [{'acronym': 'cogprints'}] http://cogprints.org/ NaN disciplinary [] 2022-01-12 15:34:53 2006-01-04 15:01:23 [humanities, health and medicine, science, soc... [journal_articles, conference_and_workshop_pap... [{'name': 'university of southampton', 'altern... [] {"name": "eprints", "version": ""} http://cogprints.org/cgi/oai2 yes NaN 2895.0 4277.0
4 84 {"name": "digital commons@carleton college", "... [] http://digitalcommons.carleton.edu/ NaN institutional [] 2022-01-12 15:34:53 2006-01-04 16:07:58 [humanities, science, social sciences] [journal_articles, unpub_reports_and_working_p... [{'name': 'carleton college', 'alternativeName... [] {"name": "other", "version": ""} NaN yes NaN NaN 42.0
In [3]:
# Display the column labels of the loaded frame.
opendoar_df.columns
Out[3]:
Index(['system_metadata.id', 'repository_metadata.name',
       'repository_metadata.alternativename', 'repository_metadata.url',
       'repository_metadata.description', 'repository_metadata.type',
       'repository_metadata.content_languages',
       'system_metadata.date_modified', 'system_metadata.date_created',
       'repository_metadata.content_subjects',
       'repository_metadata.content_types', 'organization', 'policy_urls',
       'repository_metadata.software', 'repository_metadata.oai_url',
       'system_metadata.publicly_visible',
       'repository_metadata.repository_status',
       'repository_metadata.fulltext_record_count',
       'repository_metadata.metadata_record_count'],
      dtype='object')
In [4]:
def empty_list_is_nan(cell):
    """Replace an empty list with NaN; return any other value untouched.

    Parameters
    ----------
    cell : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a list of length zero, otherwise ``cell``
        itself (non-empty lists and non-list values pass through as-is).
    """
    # Only list instances are inspected; strings such as "[]" are not lists
    # and are therefore returned unchanged.
    if not isinstance(cell, list):
        return cell
    return cell if len(cell) > 0 else np.nan
    
# Apply empty_list_is_nan to every cell so empty lists are counted as missing.
# DataFrame.applymap was deprecated in pandas 2.1 in favour of DataFrame.map;
# use the new method when the installed pandas has it and fall back to
# applymap on older versions so the cell runs on both.
opendoar_df = (
    opendoar_df.map(empty_list_is_nan)
    if hasattr(pd.DataFrame, 'map')
    else opendoar_df.applymap(empty_list_is_nan)
)
In [5]:
# Summary statistics for all columns; include='all' also covers the
# non-numeric (object) columns, not just the numeric ones.
opendoar_df.describe(include='all')
Out[5]:
system_metadata.id repository_metadata.name repository_metadata.alternativename repository_metadata.url repository_metadata.description repository_metadata.type repository_metadata.content_languages system_metadata.date_modified system_metadata.date_created repository_metadata.content_subjects repository_metadata.content_types organization policy_urls repository_metadata.software repository_metadata.oai_url system_metadata.publicly_visible repository_metadata.repository_status repository_metadata.fulltext_record_count repository_metadata.metadata_record_count
count 5811 5811 2155 5810 0.0 5810 5811 5811 5811 5644 5667 5811 5811 5811 4447 5811 0.0 2.292000e+03 4.184000e+03
unique 5811 5780 2115 5772 NaN 4 1 171 5643 236 476 5212 678 32 4415 1 NaN NaN NaN
top 134 {"name": "arch", "language": "en"} [{'acronym': 'aura'}] http://harp.lib.hiroshima-u.ac.jp/ NaN institutional [] 2022-01-12 15:35:47 2020-09-18 12:53:48 [science, technology, engineering, mathematics... [theses_and_dissertations] [{'name': 'rijksuniversiteit groningen', 'alte... [] {"name": "dspace", "version": ""} https://api.figshare.com/v2/oai yes NaN NaN NaN
freq 1 3 4 3 NaN 5161 5811 73 81 3321 469 26 5131 2273 3 5811 NaN NaN NaN
mean NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 5.022890e+03 1.765556e+05
std NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.212648e+04 6.611068e+06
min NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 0.000000e+00
25% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0.000000e+00 8.937500e+02
50% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 4.225000e+02 4.012500e+03
75% NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 2.931500e+03 1.629350e+04
max NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 1.817531e+06 4.200000e+08
In [6]:
# Number of missing (NaN) cells in each column.
opendoar_df.isna().sum()
Out[6]:
system_metadata.id                              0
repository_metadata.name                        0
repository_metadata.alternativename          3656
repository_metadata.url                         1
repository_metadata.description              5811
repository_metadata.type                        1
repository_metadata.content_languages           0
system_metadata.date_modified                   0
system_metadata.date_created                    0
repository_metadata.content_subjects          167
repository_metadata.content_types             144
organization                                    0
policy_urls                                     0
repository_metadata.software                    0
repository_metadata.oai_url                  1364
system_metadata.publicly_visible                0
repository_metadata.repository_status        5811
repository_metadata.fulltext_record_count    3519
repository_metadata.metadata_record_count    1627
dtype: int64
In [7]:
# Count how many rows mention each content type: explode the list-valued column
# into one row per entry, then take the size of each group.
(
    opendoar_df['repository_metadata.content_types']
    .explode()
    .to_frame()
    .groupby('repository_metadata.content_types')
    .size()
)
Out[7]:
repository_metadata.content_types
bibliographic_references             858
books_chapters_and_sections         2246
conference_and_workshop_papers      2037
datasets                             427
journal_articles                    4069
learning_objects                     807
other_special_item_types            1800
patents                              200
software                             105
theses_and_dissertations            3377
unpub_reports_and_working_papers    1953
dtype: int64