38 KiB
38 KiB
Information to check
- names
- description
- url
- subjects & keywords
- content type
- repo type
- policies
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('display.max_columns', None)
Loading dataset
In [2]:
# Read the raw re3data export (tab-separated). The columns listed below are
# run through ast.literal_eval while parsing, so their cells come back as
# Python objects rather than raw strings (presumably lists -- confirm against
# the file that produced this TSV).
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'subject',
            'keyword',
            'additional_name',
            'repository_id',
            'type',
            'content_type',
            'provider_type',
            'institution',
        )
    },
)
re3data_df.head()
Out[2]:
In [3]:
re3data_df.columns
Out[3]:
In [4]:
def empty_list_is_nan(cell):
    """Map an empty list to NaN and pass every other value through unchanged.

    Args:
        cell: A single value of any type.

    Returns:
        ``np.nan`` when ``cell`` is a ``list`` with no elements; otherwise
        ``cell`` itself, untouched.
    """
    # Only genuine list instances of length zero are replaced; other empty
    # containers (tuples, strings, ...) are returned as-is.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
# Apply empty_list_is_nan to every cell so that empty lists become NaN.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when the installed pandas provides it and fall back to
# applymap on older versions. The check is made on the class so that a column
# that happens to be called "map" cannot be mistaken for the method.
if hasattr(pd.DataFrame, 'map'):
    re3data_df = re3data_df.map(empty_list_is_nan)
else:
    re3data_df = re3data_df.applymap(empty_list_is_nan)
In [5]:
# include='all' summarises every column, not only the numeric ones.
re3data_df.describe(include='all')
Out[5]:
In [6]:
# Number of missing (NaN) values in each column.
re3data_df.isna().sum()
Out[6]:
In [7]:
# explode() gives each list element its own row; unique() then returns the
# distinct values found in the content_type column.
re3data_df['content_type'].explode().unique()
Out[7]:
In [8]:
# explode() gives each list element its own row; unique() then returns the
# distinct values found in the provider_type column.
re3data_df['provider_type'].explode().unique()
Out[8]: