46 KiB
46 KiB
In [1]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import pycountry_convert
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('display.max_columns', None)
Loading datasets
re3data
In [3]:
# Load the re3data export (tab-separated), restricted to the columns in
# `usecols`. Columns named in `converters` are parsed with ast.literal_eval
# (presumably they hold serialized Python lists -- verify against the file).
# NOTE(review): 'additional_name', 'repository_id' and 'content_type' have a
# converter but are not listed in `usecols`, so they are never loaded.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    sep='\t',
    usecols=['re3data_id', 'repository_name', 'subject', 'keyword', 'type',
             'provider_type', 'institution'],
    converters=dict.fromkeys(
        ['subject', 'keyword', 'additional_name', 'repository_id', 'type',
         'content_type', 'provider_type', 'institution'],
        ast.literal_eval,
    ),
)
re3data_df.head()
Out[3]:
Note: service providers are filtered out here. Rows whose `provider_type` is `serviceProvider` are dropped from the re3data table.
In [4]:
# Expand `provider_type` to one row per value, then keep only the rows whose
# value is not 'serviceProvider'.
re3data_df = (
    re3data_df
    .explode('provider_type')
    .loc[lambda frame: frame['provider_type'].ne('serviceProvider')]
)
In [5]:
# Summary statistics for re3data_df; include='all' covers every column,
# not only the numeric ones.
re3data_df.describe(include='all')
Out[5]:
openDOAR
In [6]:
# Load the OpenDOAR export (tab-separated), restricted to the columns in
# `usecols`. Columns named in `converters` are parsed with ast.literal_eval
# (presumably they hold serialized Python literals -- verify against the file).
# NOTE(review): 'additional_name' and 'content_type' have a converter but are
# not listed in `usecols`, so they are never loaded.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    sep='\t',
    usecols=['opendoar_id', 'repository_name', 'subject', 'type', 'institution'],
    converters=dict.fromkeys(
        ['subject', 'additional_name', 'opendoar_id', 'content_type',
         'institution'],
        ast.literal_eval,
    ),
)
opendoar_df.head()
Out[6]:
In [7]:
# Summary statistics for opendoar_df; include='all' covers every column,
# not only the numeric ones.
opendoar_df.describe(include='all')
Out[7]:
ROAR
In [45]:
# Load the ROAR CSV export (default comma separator), restricted to the five
# columns named in `usecols`.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    usecols=[
        'eprintid',
        'home_page',
        'title',
        'location_country',
        'subjects',
    ],
)
roar_df.head()
Out[45]:
In [43]:
# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)
In [47]:
# Show the ROAR row(s) whose eprintid equals 2303.
# NOTE(review): 2303 looks like an ad-hoc inspection value -- confirm whether
# this cell is still needed.
roar_df.loc[roar_df['eprintid'].eq(2303)]
Out[47]:
In [44]:
# Summary statistics for roar_df; include='all' covers every column,
# not only the numeric ones.
roar_df.describe(include='all')
Out[44]:
FAIRsharing
In [11]:
# Load the FAIRsharing summary file (pipe-delimited). Passing header=0
# together with `names` replaces the file's own header row with the column
# names given here.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=[
        'full_name',
        'short_name',
        'fs_url',
        'url',
        'countries',
        'subjects',
    ],
)
fairsharing_df.head()
Out[11]:
In [12]:
# Summary statistics for fairsharing_df; include='all' covers every column,
# not only the numeric ones.
fairsharing_df.describe(include='all')
Out[12]: