3.6 MiB
3.6 MiB
In [1]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import pycountry_convert
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
In [1]:
def country_to_countrycode(country):
    """Convert a country name to its two-letter (alpha-2) country code.

    Parameters
    ----------
    country : str or missing value
        Name handed to ``pycountry_convert.country_name_to_country_alpha2``.

    Returns
    -------
    str or float
        The code returned by ``pycountry_convert``, or ``np.nan`` when
        ``country`` is missing (per ``pd.isna``) or the lookup fails.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha2(country)
    except Exception:
        # Deliberately best-effort: any failed lookup becomes NaN. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # working, so a long .map() over the frame can still be interrupted.
        return np.nan
def countrycode_to_continent(country_code):
    """Convert a two-letter (alpha-2) country code to a continent code.

    Parameters
    ----------
    country_code : str or missing value
        Code handed to ``pycountry_convert.country_alpha2_to_continent_code``.

    Returns
    -------
    str or float
        The continent code returned by ``pycountry_convert``, or ``np.nan``
        when ``country_code`` is missing (per ``pd.isna``) or the lookup fails.
    """
    if pd.isna(country_code):
        return np.nan
    try:
        return pycountry_convert.country_alpha2_to_continent_code(country_code)
    except Exception:
        # Deliberately best-effort: any failed lookup becomes NaN. Catching
        # Exception (not a bare except) keeps KeyboardInterrupt/SystemExit
        # working, so a long .map() over the frame can still be interrupted.
        return np.nan
Loading datasets¶
FAIRsharing
In [3]:
# Read the pipe-delimited FAIRsharing summary. header=0 together with `names`
# replaces the file's own header row with the explicit column names below.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    sep='|',
    header=0,
    names=['full_name', 'short_name', 'fs_url', 'url', 'countries', 'subjects'],
)
# Turn the comma-separated 'subjects' and 'countries' strings into lists.
fairsharing_df = fairsharing_df.assign(
    subjects=fairsharing_df['subjects'].str.split(pat=','),
    countries=fairsharing_df['countries'].str.split(pat=','),
)
fairsharing_df.head()
Out[3]:
In [4]:
# Summary statistics for the FAIRsharing frame (describe() with default settings).
fairsharing_df.describe()
Out[4]:
re3data
In [42]:
# The combined export holds both registries; keep the rows whose id contains
# 're3data'. reset_index() (without drop) renumbers the rows and keeps the old
# row labels as an 'index' column.
re3data_df = (
    pd.read_csv('../data/raw/re3data_opendoar.csv')
    .loc[lambda records: records['id'].str.contains('re3data')]
    .reset_index()
)
re3data_df.head()
Out[42]:
In [6]:
# Summary statistics over every column, not only the numeric ones.
re3data_df.describe(include='all')
Out[6]:
OpenDOAR
In [7]:
# Same combined export as for re3data; keep the rows whose id contains
# 'opendoar'. reset_index() (without drop) renumbers the rows and keeps the old
# row labels as an 'index' column.
opendoar_df = (
    pd.read_csv('../data/raw/re3data_opendoar.csv')
    .loc[lambda records: records['id'].str.contains('opendoar')]
    .reset_index()
)
opendoar_df.head()
Out[7]:
In [8]:
# Summary statistics over every column, not only the numeric ones.
opendoar_df.describe(include='all')
Out[8]:
Basic cleaning¶
re3data
In [9]:
# Inspect the raw 'subjects' column before any parsing.
re3data_df.subjects
Out[9]:
In [10]:
# Parse each 'subjects' cell from its Python-literal text form into an object
# (presumably a list of subject strings -- confirm against the raw CSV).
re3data_df['subjects'] = re3data_df['subjects'].apply(ast.literal_eval)
In [11]:
def merge_lists(lists):
    """Concatenate an iterable of lists into one new flat list.

    Parameters
    ----------
    lists : iterable of list
        The lists to concatenate, in order (e.g. a pandas Series of lists).

    Returns
    -------
    list
        A new list with the items of every input list in order; empty when
        ``lists`` is empty. The inputs are not modified.
    """
    merged = []
    for chunk in lists:
        # extend() appends in place; rebuilding the list with `+` on every
        # iteration copied all previous items again (quadratic overall).
        merged.extend(chunk)
    return merged
# Build one flat list of subject parts per repository:
#   1. explode: one row per raw subject entry (the original row label repeats);
#   2. split each entry on ',' or ' and ' into as many columns as needed;
#   3. collapse each row back to a list, dropping the padding NaNs;
#   4. group the rows by their original label and concatenate their lists.
# The result is a Series named 0, indexed by the original row label.
re3data_cleaned_subjects = (
    re3data_df.explode('subjects')['subjects']
    .str.split(',| and ', expand=True)
    .apply(lambda parts: parts.dropna().tolist(), axis=1)
    .reset_index()
    .groupby('index')[0]
    .apply(merge_lists)
)
In [12]:
# Inspect the cleaned per-repository subject lists.
re3data_cleaned_subjects
Out[12]:
In [13]:
# Index-aligned join; the cleaned lists arrive as a column named 0 because the
# grouped Series was selected with [0].
re3data_df = re3data_df.join(re3data_cleaned_subjects)
In [14]:
# Swap the raw 'subjects' column for the cleaned one, which is named 0 at this point.
re3data_df = (
    re3data_df
    .drop(columns=['subjects'])
    .rename(columns={0: 'subjects'})
)
OpenDOAR
In [15]:
# Inspect the raw 'subjects' column before any parsing.
opendoar_df.subjects
Out[15]:
In [16]:
# Parse each 'subjects' cell from its Python-literal text form into an object
# (presumably a list of subject strings -- confirm against the raw CSV).
opendoar_df['subjects'] = opendoar_df['subjects'].apply(ast.literal_eval)
In [17]:
# Build one flat list of subject parts per repository:
#   1. explode: one row per raw subject entry (the original row label repeats);
#   2. split each entry on ',' or ' and ' into as many columns as needed;
#   3. collapse each row back to a list, dropping the padding NaNs;
#   4. group the rows by their original label and concatenate their lists.
# The result is a Series named 0, indexed by the original row label.
opendoar_cleaned_subjects = (
    opendoar_df.explode('subjects')['subjects']
    .str.split(',| and ', expand=True)
    .apply(lambda parts: parts.dropna().tolist(), axis=1)
    .reset_index()
    .groupby('index')[0]
    .apply(merge_lists)
)
In [18]:
# Inspect the cleaned per-repository subject lists.
opendoar_cleaned_subjects
Out[18]:
In [19]:
# Index-aligned join; the cleaned lists arrive as a column named 0 because the
# grouped Series was selected with [0].
opendoar_df = opendoar_df.join(opendoar_cleaned_subjects)
In [20]:
# Swap the raw 'subjects' column for the cleaned one, which is named 0 at this point.
opendoar_df = (
    opendoar_df
    .drop(columns=['subjects'])
    .rename(columns={0: 'subjects'})
)
Subjects analysis¶
In [21]:
# Long format: one row per (record, subject) pair for each of the three sources.
fairsharing_subjects = fairsharing_df.explode(column='subjects')
re3data_subjects = re3data_df.explode(column='subjects')
opendoar_subjects = opendoar_df.explode(column='subjects')
In [22]:
# Number of records (rows with a non-null 'url') per subject, largest first.
data1 = fairsharing_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data2 = re3data_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
data3 = opendoar_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)

# One bar trace per source; re3data and OpenDOAR start hidden and can be
# switched on from the legend.
plot = [
    go.Bar(
        x=data1.index,
        y=data1['url'],
        name='FAIRsharing'
    ),
    go.Bar(
        x=data2.index,
        y=data2['url'],
        name='re3data',
        visible='legendonly'
    ),
    go.Bar(
        x=data3.index,
        y=data3['url'],
        name='OpenDOAR',
        visible='legendonly'
    )
]
layout = go.Layout(
    title='Subject coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# Figure.show() returns None, so build the figure first and show it afterwards;
# otherwise `fig` would be bound to None instead of the figure.
fig = go.Figure(plot, layout)
fig.show()
In [23]:
# Number of distinct values in 'subjects' (unique() also counts NaN, if present, as one value).
len(fairsharing_subjects.subjects.unique())
Out[23]:
In [24]:
# Number of distinct values in 'subjects' (unique() also counts NaN, if present, as one value).
len(re3data_subjects.subjects.unique())
Out[24]:
In [25]:
# Number of distinct values in 'subjects' (unique() also counts NaN, if present, as one value).
len(opendoar_subjects.subjects.unique())
Out[25]:
In [26]:
# List the distinct OpenDOAR subject values themselves.
opendoar_subjects.subjects.unique()
Out[26]:
Geographic analysis¶
FAIRsharing
In [2]:
# One row per (record, country), then derive the country code from the name
# and the continent from the country code; failed lookups stay NaN.
fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries['countries'].map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries['countrycode'].map(countrycode_to_continent)
In [3]:
# Country names for which no country code was derived.
fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()
In [29]:
# Country names for which no continent was derived.
fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()
Out[29]:
Manually fixing exceptions
In [30]:
# Hand-written corrections, keyed on the value in 'countries':
# (name to match, columns to overwrite, replacement values).
# Antarctica gets a country code but its continent is left as NaN.
for fix_name, fix_columns, fix_values in [
    ('Republic of Ireland', ['countries', 'countrycode', 'continent'], ['Ireland', 'IE', 'EU']),
    ('Antarctica', ['countrycode', 'continent'], ['AQ', np.nan]),
    ('European Union', ['countrycode', 'continent'], ['EU', 'EU']),
]:
    fairsharing_countries.loc[fairsharing_countries['countries'] == fix_name, fix_columns] = fix_values
In [31]:
# Inspect the rows whose country code is 'AQ' (set for 'Antarctica' by the manual fix).
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[31]:
re3data
In [32]:
# Per-column non-null counts, restricted to rows that have a latitude.
re3data_df[re3data_df.latitude.notna()].count()
Out[32]:
Location data is almost entirely absent in re3data.
In [33]:
# Reverse-geocode every (latitude, longitude) pair of re3data_df and tabulate
# the results returned by rg.search.
reverse_geocoding = pd.DataFrame(
    rg.search(re3data_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist())
)
# Cast the returned 'lat'/'lon' columns to float and derive the continent from
# the returned country code ('cc').
reverse_geocoding = reverse_geocoding.assign(
    lat=reverse_geocoding['lat'].astype(float),
    lon=reverse_geocoding['lon'].astype(float),
    continent=reverse_geocoding['cc'].map(countrycode_to_continent),
)
reverse_geocoding
Out[33]:
In [34]:
# Index-aligned join of the reverse-geocoding columns. Rows line up because
# re3data_df was renumbered with reset_index() at load time and
# reverse_geocoding was built from a plain list -- assumes no step in between
# changed re3data_df's index (TODO confirm).
re3data_df = re3data_df.join(reverse_geocoding)
Manual fix of null lat/lon: rows whose coordinates are exactly (0.0, 0.0) are treated as having no location.
In [35]:
# Rows located at exactly (0.0, 0.0) are treated as having no location: blank
# the coordinates together with the country code and continent derived from them.
re3data_df.loc[
    re3data_df['latitude'].eq(0.0) & re3data_df['longitude'].eq(0.0),
    ['latitude', 'longitude', 'cc', 'continent'],
] = np.nan
OpenDOAR
In [36]:
# Reverse-geocode every (latitude, longitude) pair of opendoar_df and tabulate
# the results returned by rg.search.
reverse_geocoding = pd.DataFrame(
    rg.search(opendoar_df[['latitude', 'longitude']].apply(tuple, axis=1).tolist())
)
# Cast the returned 'lat'/'lon' columns to float and derive the continent from
# the returned country code ('cc').
reverse_geocoding = reverse_geocoding.assign(
    lat=reverse_geocoding['lat'].astype(float),
    lon=reverse_geocoding['lon'].astype(float),
    continent=reverse_geocoding['cc'].map(countrycode_to_continent),
)
reverse_geocoding
Out[36]:
In [37]:
# Index-aligned join of the reverse-geocoding columns. Rows line up because
# opendoar_df was renumbered with reset_index() at load time and
# reverse_geocoding was built from a plain list -- assumes no step in between
# changed opendoar_df's index (TODO confirm).
opendoar_df = opendoar_df.join(reverse_geocoding)
Manual fix of null lat/lon: rows whose coordinates are exactly (0.0, 0.0) are treated as having no location.
In [38]:
# Rows located at exactly (0.0, 0.0) are treated as having no location: blank
# the coordinates together with the country code and continent derived from them.
opendoar_df.loc[
    opendoar_df['latitude'].eq(0.0) & opendoar_df['longitude'].eq(0.0),
    ['latitude', 'longitude', 'cc', 'continent'],
] = np.nan
Country intersection
In [39]:
# Overlap between the country codes present in FAIRsharing and in OpenDOAR
# (missing codes are dropped before building the sets).
venn2(
    [
        set(fairsharing_countries['countrycode'].dropna()),
        set(opendoar_df['cc'].dropna()),
    ],
    set_labels=('FAIRsharing', 'OpenDOAR'),
)
plt.show()
Country coverage
In [40]:
# Records per country code, largest first. FAIRsharing counts rows with a
# non-null 'url'; OpenDOAR counts rows with a non-null 'id'.
data1 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values(by='url', ascending=False)
data2 = opendoar_df.groupby('cc')[['id']].count().sort_values(by='id', ascending=False)

# One bar trace per source.
plot = [
    go.Bar(name='FAIRsharing', x=data1.index, y=data1['url']),
    go.Bar(name='OpenDOAR', x=data2.index, y=data2['id']),
]
layout = go.Layout(
    title='Country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
go.Figure(data=plot, layout=layout).show()
Continental coverage
In [41]:
# Records (rows with a non-null 'url') per continent code for each source.
data1 = fairsharing_countries.groupby('continent')[['url']].count()
data2 = opendoar_df.groupby('continent')[['url']].count()

# Radar chart: one closed, filled polygon per source, one spoke per continent.
plot = [
    go.Scatterpolar(name='FAIRsharing', r=data1['url'], theta=data1.index, fill='toself'),
    go.Scatterpolar(name='OpenDOAR', r=data2['url'], theta=data2.index, fill='toself'),
]
layout = go.Layout(polar={'radialaxis': {'visible': True}})
go.Figure(data=plot, layout=layout).show()