registries_analysis/notebooks/01.2-exploration-opendoar.i...

31 KiB

In [1]:
import ast
import csv
import json
import reverse_geocoder as rg

import numpy as np
import pandas as pd

import pycountry_convert

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px

# Lift the column-count limit so wide frames are displayed with every column
# instead of having the middle ones elided.
pd.set_option('display.max_columns', None)

Loading datasets

In [24]:
# Read the tab-separated OpenDOAR export. The columns listed below are run
# through ast.literal_eval while loading, so their cells arrive as Python
# objects rather than as raw strings.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        ['subject', 'additional_name', 'opendoar_id', 'content_type', 'institution'],
        ast.literal_eval,
    ),
)
# Preview the first rows as the cell output.
opendoar_df.head()
Out[24]:
openaire_id opendoar_id repository_name additional_name repository_url description type update_date start_date subject content_type institution metadata_policy data_policy submission_policy content_policy software api
0 opendoar____::38b3eff8baf56627478ec76a704e9b52 101 utrecht university repository [] http://dspace.library.uu.nl this site is a university repository providing... institutional 2021-04-16 15:22:03 2006-01-13 12:55:13 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [[university of utrecht, [universiteit utrecht... True True False True dspace true
1 opendoar____::2b44928ae11fb9384c4cf38708677c48 115 dspace at indian institute of management kozhi... [dspace@iimk] http://dspace.iimk.ac.in/ this site is a subject based university reposi... institutional 2021-02-18 17:36:43 2006-01-04 11:54:34 [ecology and environment, social sciences gene... [journal_articles, conference_and_workshop_pap... [[indian institute of management kozhikode, [i... True True True True dspace 4.1 true
2 opendoar____::3416a75f4cea9109507cacd8e2f2aefc 41 caltech engineering and science online [] http://calteches.library.caltech.edu/ the caltech archives holds approximately 220 c... institutional 2021-02-18 17:36:28 2006-01-04 14:47:04 [biology and biochemistry, chemistry and chemi... [journal_articles, conference_and_workshop_pap... [[california institute of technology, [caltech... True True True True eprints 3.1.3 true
3 opendoar____::07e1cd7dca89a1678042477183b7ac3f 119 dcu online research access service [doras] http://doras.dcu.ie/ this site is an institutional repository provi... institutional 2021-02-18 17:36:44 2006-01-04 11:15:19 [multidisciplinary] [journal_articles, conference_and_workshop_pap... [[dublin city university, [dcu], ie, [], , htt... True True True True eprints 3.0.5 true
4 opendoar____::d1f491a404d6854880943e5c3cd9ca25 129 earth-prints repository [] http://www.earth-prints.org/ a subject based repository providing open acce... disciplinary 2021-04-19 08:28:38 2006-01-30 16:43:11 [earth and planetary sciences] [journal_articles, conference_and_workshop_pap... [[istituto nazionale di geofisica e vulcanolog... True True True True dspace 5.8.1-snapshot true
In [25]:
# List the column labels of the loaded frame.
opendoar_df.columns
Out[25]:
Index(['openaire_id', 'opendoar_id', 'repository_name', 'additional_name',
       'repository_url', 'description', 'type', 'update_date', 'start_date',
       'subject', 'content_type', 'institution', 'metadata_policy',
       'data_policy', 'submission_policy', 'content_policy', 'software',
       'api'],
      dtype='object')
In [28]:
def empty_list_is_nan(cell):
    """Turn an empty list into NaN and leave every other value as it is.

    Parameters
    ----------
    cell : object
        A single cell value of any type.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a ``list`` of length zero; otherwise the
        very same ``cell`` object that was passed in.
    """
    # Guard clause: anything that is not a list, or is a list with content,
    # passes through untouched.
    if not isinstance(cell, list) or len(cell) != 0:
        return cell
    return np.nan
    
# Run empty_list_is_nan over every cell of the frame and rebind the name to
# the result, so that empty lists become NaN and are counted as missing.
# NOTE(review): DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map - switch once the minimum supported pandas version allows it.
opendoar_df = opendoar_df.applymap(empty_list_is_nan)
In [29]:
# Summary statistics for every column, not only the numeric ones.
opendoar_df.describe(include='all')
Out[29]:
openaire_id opendoar_id repository_name additional_name repository_url description type update_date start_date subject content_type institution metadata_policy data_policy submission_policy content_policy software api
count 5707 5707.000000 5707 2138 5707 5425 5707 5707 5707 5542 5563 5707 5707 5707 5707 5707 5707 5707
unique 5707 NaN 5670 2096 5670 4622 4 2501 5538 819 476 5098 2 2 2 2 321 2
top opendoar____::3cf166c6b73f030b4f67eeaeba301103 NaN hiroshima associated repository portal [] http://harp.lib.hiroshima-u.ac.jp/ this site provides access to the research outp... institutional 2020-09-18 12:53:48 2020-09-18 12:53:48 [multidisciplinary] [theses_and_dissertations] [[rijksuniversiteit groningen, [rug], nl, [], ... False False False False dspace true
freq 1 NaN 3 4 3 95 5067 82 82 3212 460 26 4116 4101 5016 4075 800 4374
mean NaN 4008.118801 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
std NaN 2869.948770 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
min NaN 2.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
25% NaN 1823.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
50% NaN 3361.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
75% NaN 5095.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
max NaN 10175.000000 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
In [30]:
# Number of missing (NaN) values in each column.
opendoar_df.isna().sum()
Out[30]:
openaire_id             0
opendoar_id             0
repository_name         0
additional_name      3569
repository_url          0
description           282
type                    0
update_date             0
start_date              0
subject               165
content_type          144
institution             0
metadata_policy         0
data_policy             0
submission_policy       0
content_policy          0
software                0
api                     0
dtype: int64