30 KiB
30 KiB
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Lift the column display limit so wide DataFrames are shown without truncation.
pd.set_option('display.max_columns', None)
Loading datasets
In [24]:
# Read the tab-separated OpenDOAR export. The listed columns are run through
# ast.literal_eval so their text is parsed back into Python objects at load time.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    converters=dict.fromkeys(
        ['subject', 'additional_name', 'opendoar_id', 'content_type', 'institution'],
        ast.literal_eval,
    ),
)
# Preview the first rows to sanity-check the parse.
opendoar_df.head()
Out[24]:
In [25]:
# List the column labels of the loaded frame.
opendoar_df.columns
Out[25]:
In [28]:
def empty_list_is_nan(cell):
    """Map an empty list to NaN and pass every other value through untouched.

    Parameters
    ----------
    cell : object
        A single cell value.

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is a ``list`` of length zero; otherwise the
        very same ``cell`` object (non-empty lists and non-list values alike).
    """
    # Only genuine, zero-length lists are treated as missing.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
# Replace every empty-list cell with NaN so pandas treats it as missing.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map;
# use the new name when it exists and fall back to applymap on older versions.
if hasattr(pd.DataFrame, 'map'):
    opendoar_df = opendoar_df.map(empty_list_is_nan)
else:
    opendoar_df = opendoar_df.applymap(empty_list_is_nan)
In [29]:
# Summary statistics for every column, regardless of dtype (include='all').
opendoar_df.describe(include='all')
Out[29]:
In [30]:
# Count missing values per column; after the empty-list-to-NaN conversion above,
# cells that held empty lists are counted as missing too.
opendoar_df.isna().sum()
Out[30]: