3.7 MiB
3.7 MiB
In [1]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
FAIRsharing¶
In [2]:
# Load the pipe-delimited FAIRsharing summary export. header=0 consumes the
# file's own header row so that the explicit column names below replace it.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    delimiter='|',
    header=0,
    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],
)

# Turn the comma-separated text columns into lists of strings.
for list_column in ('subjects', 'countries'):
    fairsharing_df[list_column] = fairsharing_df[list_column].str.split(pat=',')

fairsharing_df.head()
Out[2]:
In [3]:
# Descriptive summary of the FAIRsharing table (pandas' default column selection).
fairsharing_df.describe()
Out[3]:
In [4]:
# Records per subject: explode gives one row per (record, subject) pair, then
# the non-null `url` values are counted per subject and sorted largest first.
fairsharing_subjects = (
    fairsharing_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

# One bar per subject; bar height is the record count.
data = [go.Bar(x=fairsharing_subjects.index, y=fairsharing_subjects['url'])]
layout = go.Layout(
    title='Fairsharing subject coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [5]:
# Records per country: explode gives one row per (record, country) pair, then
# the non-null `url` values are counted per country and sorted largest first.
fairsharing_countries = (
    fairsharing_df
    .explode('countries')
    .groupby('countries')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

# One bar per country; bar height is the record count.
data = [go.Bar(x=fairsharing_countries.index, y=fairsharing_countries['url'])]
layout = go.Layout(
    title='Fairsharing country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
re3data¶
In [6]:
# Load the combined re3data/OpenDOAR export and keep only the rows whose `id`
# mentions re3data.
#   * na=False  - a missing `id` counts as "no match" instead of putting NaN in
#                 the boolean mask (which boolean indexing rejects).
#   * .copy()   - later cells assign into this frame; copying detaches it from
#                 the unfiltered frame so those writes do not raise
#                 SettingWithCopyWarning.
re3data_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
re3data_df = re3data_df[re3data_df.id.str.contains('re3data', na=False)].copy()
re3data_df.head()
Out[6]:
In [7]:
# Rows located at exactly (0.0, 0.0) get both coordinates blanked out
# (presumably a placeholder for "location unknown" - confirm against the data source).
re3data_df.loc[
    (re3data_df.latitude == 0.0) & (re3data_df.longitude == 0.0),
    ['latitude', 'longitude'],
] = np.nan
In [8]:
# Inspect the `subjects` column as loaded.
re3data_df.subjects
Out[8]:
In [9]:
# Parse the Python-literal text stored in `subjects` into real Python objects
# (presumably lists, since the column is exploded afterwards).
# Only string values are parsed: ast.literal_eval raises ValueError on anything
# that is not a string/AST node, so values that are already parsed (cell run a
# second time) or missing (NaN) are passed through unchanged.
re3data_df['subjects'] = re3data_df.subjects.apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
In [10]:
def merge_lists(lists):
    """Concatenate a sequence of lists into a single flat list.

    Parameters
    ----------
    lists : iterable of iterables
        The lists to concatenate, in order. Any iterable of iterables is
        accepted (for example a pandas Series whose values are lists).

    Returns
    -------
    list
        A new list holding the elements of every input list in their
        original order. The inputs are not modified.
    """
    merged = []
    for sublist in lists:
        # extend() appends in place; `merged = merged + sublist` would copy the
        # whole accumulator on every iteration (quadratic in total length).
        merged.extend(sublist)
    return merged
# Build one cleaned subject list per record:
#   1. explode: one row per subject entry (the original index label is repeated);
#   2. split each entry on "," or " and " into separate columns;
#   3. collapse every row to the list of its non-null pieces;
#   4. group by the original index label and concatenate the pieces.
# The resulting Series is named 0 and indexed by the original row labels.
re3data_cleaned_subjects = (
    re3data_df
    .explode('subjects')
    .subjects
    .str.split(',| and ', expand=True)
    .apply(lambda pieces: pieces.dropna().tolist(), axis=1)
    .reset_index()
    .groupby('index')[0]
    .apply(merge_lists)
)
In [11]:
# Inspect the cleaned per-record subject lists.
re3data_cleaned_subjects
Out[11]:
In [12]:
# Attach the cleaned subjects to the table. DataFrame.join matches rows on
# index labels; the joined Series is named 0, so it arrives as a column labelled 0.
re3data_df = re3data_df.join(re3data_cleaned_subjects)
In [13]:
# Replace the original `subjects` column with the cleaned one, which is
# currently stored under the column label 0.
re3data_df = (
    re3data_df
    .drop(columns=['subjects'])
    .rename(columns={0: 'subjects'})
)
In [14]:
# Descriptive summary of every column (include='all' covers non-numeric columns too).
re3data_df.describe(include='all')
Out[14]:
In [15]:
# Records per subject: explode gives one row per (record, subject) pair, then
# the non-null `url` values are counted per subject and sorted largest first.
re3data_subjects = (
    re3data_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

# One bar per subject; bar height is the record count.
data = [go.Bar(x=re3data_subjects.index, y=re3data_subjects['url'])]
layout = go.Layout(
    title='re3data subject coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
OpenDOAR¶
In [82]:
# Load the combined re3data/OpenDOAR export and keep only the rows whose `id`
# mentions opendoar.
#   * na=False  - a missing `id` counts as "no match" instead of putting NaN in
#                 the boolean mask (which boolean indexing rejects).
#   * .copy()   - later cells assign into this frame; copying detaches it from
#                 the unfiltered frame so those writes do not raise
#                 SettingWithCopyWarning.
opendoar_df = pd.read_csv('../data/raw/re3data_opendoar.csv')
opendoar_df = opendoar_df[opendoar_df.id.str.contains('opendoar', na=False)].copy()
opendoar_df.head()
Out[82]:
In [84]:
# Inspect the `subjects` column as loaded.
opendoar_df.subjects
Out[84]:
In [85]:
# Parse the Python-literal text stored in `subjects` into real Python objects
# (presumably lists, since the column is exploded afterwards).
# Only string values are parsed: ast.literal_eval raises ValueError on anything
# that is not a string/AST node, so values that are already parsed (cell run a
# second time) or missing (NaN) are passed through unchanged.
opendoar_df['subjects'] = opendoar_df.subjects.apply(
    lambda value: ast.literal_eval(value) if isinstance(value, str) else value
)
In [86]:
# Build one cleaned subject list per record:
#   1. explode: one row per subject entry (the original index label is repeated);
#   2. split each entry on "," or " and " into separate columns;
#   3. collapse every row to the list of its non-null pieces;
#   4. group by the original index label and concatenate the pieces.
# The resulting Series is named 0 and indexed by the original row labels.
opendoar_cleaned_subjects = (
    opendoar_df
    .explode('subjects')
    .subjects
    .str.split(',| and ', expand=True)
    .apply(lambda pieces: pieces.dropna().tolist(), axis=1)
    .reset_index()
    .groupby('index')[0]
    .apply(merge_lists)
)
In [87]:
# Inspect the cleaned per-record subject lists.
opendoar_cleaned_subjects
Out[87]:
In [88]:
# Attach the cleaned subjects to the table. DataFrame.join matches rows on
# index labels; the joined Series is named 0, so it arrives as a column labelled 0.
opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)
In [89]:
# Replace the original `subjects` column with the cleaned one, which is
# currently stored under the column label 0.
opendoar_df = (
    opendoar_df
    .drop(columns=['subjects'])
    .rename(columns={0: 'subjects'})
)
In [90]:
# Descriptive summary of every column (include='all' covers non-numeric columns too).
opendoar_df.describe(include='all')
Out[90]:
In [91]:
# Records per subject: explode gives one row per (record, subject) pair, then
# the non-null `url` values are counted per subject and sorted largest first.
opendoar_subjects = (
    opendoar_df
    .explode('subjects')
    .groupby('subjects')[['url']]
    .count()
    .sort_values('url', ascending=False)
)

# One bar per subject; bar height is the record count.
data = [go.Bar(x=opendoar_subjects.index, y=opendoar_subjects['url'])]
layout = go.Layout(
    title='OpenDOAR subject coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)
In [92]:
# Reverse-geocode every OpenDOAR record from its (latitude, longitude) pair.
opendoar_coordinates = opendoar_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist()

# rg.search returns one result per input coordinate, in input order. Reuse
# opendoar_df's index for the result frame: opendoar_df keeps the row labels of
# the filtered CSV, so a default 0..n-1 index here would make any index-based
# join back onto opendoar_df pair country codes with the wrong records.
reverse_geocoding = pd.DataFrame(rg.search(opendoar_coordinates), index=opendoar_df.index)

# Cast the returned coordinates to floats.
reverse_geocoding['lat'] = reverse_geocoding['lat'].astype('float')
reverse_geocoding['lon'] = reverse_geocoding['lon'].astype('float')
reverse_geocoding
Out[92]:
In [94]:
# Add the reverse-geocoded country code (`cc`) to the table.
# NOTE(review): DataFrame.join matches rows on index labels, so reverse_geocoding
# must carry the same index as opendoar_df for each code to land on its own record.
opendoar_df = opendoar_df.join(reverse_geocoding[['cc']])
In [96]:
# Rows located at exactly (0.0, 0.0) get their coordinates and the derived
# country code blanked out (presumably a placeholder for "location unknown" -
# confirm against the data source).
opendoar_df.loc[
    (opendoar_df.latitude == 0.0) & (opendoar_df.longitude == 0.0),
    ['latitude', 'longitude', 'cc'],
] = np.nan
In [103]:
# Records per country code: count the non-null `id` values in each `cc` group
# and sort largest first.
opendoar_countries = (
    opendoar_df
    .groupby('cc')[['id']]
    .count()
    .sort_values('id', ascending=False)
)

# One bar per country code; bar height is the record count.
data = [go.Bar(x=opendoar_countries.index, y=opendoar_countries['id'])]
layout = go.Layout(
    title='OpenDOAR country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
fig = go.Figure(data=data, layout=layout)
iplot(fig)