39 KiB
39 KiB
Information to check
- names
- description
- url
- subjects & keywords
- content type
- repo type
- policies
In [1]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import pycountry_convert
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Lift the column cap so wide DataFrames are displayed without truncation.
pd.options.display.max_columns = None
Loading dataset
In [20]:
# Load the tab-separated re3data export. The listed columns are run through
# ast.literal_eval so their cell text is parsed into Python objects
# (presumably lists -- confirm against the TSV).
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    converters=dict.fromkeys(
        (
            'subject',
            'keyword',
            'additional_name',
            'repository_id',
            'type',
            'content_type',
            'provider_type',
            'institution',
        ),
        ast.literal_eval,
    ),
)
re3data_df.head()
Out[20]:
In [14]:
# List the column labels of the loaded frame.
re3data_df.columns
Out[14]:
In [3]:
# Summary statistics for every column; include='all' covers non-numeric columns too.
re3data_df.describe(include='all')
Out[3]:
In [10]:
# Count of missing (NaN) values in each column.
re3data_df.isna().sum()
Out[10]:
In [18]:
# Distinct content types: explode() puts each element of a list-like cell on
# its own row, then unique() de-duplicates.
re3data_df['content_type'].explode().unique()
Out[18]:
In [19]:
# Distinct provider types: explode() puts each element of a list-like cell on
# its own row, then unique() de-duplicates.
re3data_df['provider_type'].explode().unique()
Out[19]: