39 KiB
39 KiB
In [1]:
import ast
import csv
import json
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Show every column when a DataFrame is displayed (None removes the column cap).
pd.options.display.max_columns = None
Loading datasets
In [2]:
# Read the tab-separated OpenDOAR export. The listed columns are run through
# ast.literal_eval so their text is parsed back into Python objects
# (presumably lists/dicts -- confirm against the raw file).
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    delimiter='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'repository_metadata.content_subjects',
            'repository_metadata.alternativename',
            'repository_metadata.content_types',
            'organization',
        )
    },
    # Read the identifier column as text rather than letting pandas infer a type.
    dtype={'system_metadata.id': str},
)
opendoar_df.head()
Out[2]:
In [3]:
# List the column labels of the loaded frame.
opendoar_df.columns
Out[3]:
In [4]:
def empty_list_is_nan(cell):
    """Map an empty list to NaN and pass every other value through unchanged.

    Args:
        cell: A single cell value of any type.

    Returns:
        ``np.nan`` when ``cell`` is a list of length zero; otherwise ``cell``
        itself, untouched.
    """
    # Only real lists are inspected; empty strings, tuples, dicts, etc. are
    # deliberately left as they are.
    if isinstance(cell, list) and len(cell) == 0:
        return np.nan
    return cell
# Apply empty_list_is_nan to every cell of the frame.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map,
# so use whichever the installed pandas provides. The lookup is on the class,
# not the instance, so a column that happens to be called "map" cannot shadow it.
_elementwise = pd.DataFrame.map if hasattr(pd.DataFrame, 'map') else pd.DataFrame.applymap
opendoar_df = _elementwise(opendoar_df, empty_list_is_nan)
In [5]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
opendoar_df.describe(include='all')
Out[5]:
In [6]:
# Count missing values per column.
opendoar_df.isna().sum()
Out[6]:
In [7]:
# Count how many rows carry each content type: explode puts each list element
# on its own row, then the rows are grouped by value and sized.
(
    opendoar_df['repository_metadata.content_types']
    .explode()
    .to_frame()
    .groupby('repository_metadata.content_types')
    .size()
)
Out[7]: