401 KiB
401 KiB
In [1]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import pycountry_convert
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
# Do not truncate wide DataFrames when they are displayed: show every column.
pd.set_option('display.max_columns', None)
In [2]:
def country_to_countrycode(country):
    """Convert a country name to its alpha-3 country code.

    Parameters
    ----------
    country : str or missing value
        Country name, passed to
        ``pycountry_convert.country_name_to_country_alpha3``.

    Returns
    -------
    str or float
        The alpha-3 code, or ``np.nan`` when ``country`` is missing or the
        conversion fails.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha3(country)
    except Exception:
        # Best-effort lookup: names that cannot be converted become NaN so
        # they can be inspected and fixed later. Catching ``Exception``
        # instead of using a bare ``except`` lets KeyboardInterrupt and
        # SystemExit propagate.
        return np.nan
def countrycode_iso2_to_countrycode_iso3(country):
    """Convert an alpha-2 country code to the matching alpha-3 code.

    The conversion goes through the country name: alpha-2 -> name -> alpha-3.

    Parameters
    ----------
    country : str or missing value
        Alpha-2 country code (callers in this notebook upper-case it first).

    Returns
    -------
    str or float
        The alpha-3 code, or ``np.nan`` when ``country`` is missing or either
        conversion step fails.
    """
    if pd.isna(country):
        return np.nan
    try:
        country_name = pycountry_convert.country_alpha2_to_country_name(country)
        return pycountry_convert.country_name_to_country_alpha3(country_name)
    except Exception:
        # Best-effort lookup: unknown codes become NaN. ``Exception`` (not a
        # bare ``except``) keeps KeyboardInterrupt/SystemExit propagating.
        return np.nan
def countrycode_to_continent(country_code):
    """Map an alpha-3 country code to a continent code.

    The lookup goes alpha-3 -> alpha-2 -> continent code.

    Parameters
    ----------
    country_code : str or missing value
        Alpha-3 country code.

    Returns
    -------
    str or float
        The continent code, or ``np.nan`` when ``country_code`` is missing or
        either lookup step fails.
    """
    if pd.isna(country_code):
        return np.nan
    try:
        alpha2_code = pycountry_convert.country_alpha3_to_country_alpha2(country_code)
        return pycountry_convert.country_alpha2_to_continent_code(alpha2_code)
    except Exception:
        # Best-effort lookup: codes without a continent become NaN.
        # ``Exception`` (not a bare ``except``) keeps
        # KeyboardInterrupt/SystemExit propagating.
        return np.nan
Loading datasets¶
re3data
In [3]:
# Load the re3data export. Columns listed in `converters` are parsed with
# `ast.literal_eval`; only the columns in `usecols` are kept.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        [
            'subject',
            'keyword',
            'additional_name',
            'repository_id',
            'type',
            'content_type',
            'provider_type',
            'institution',
        ],
        ast.literal_eval,
    ),
    usecols=[
        're3data_id',
        'repository_name',
        'subject',
        'keyword',
        'type',
        'provider_type',
        'institution',
    ],
)
re3data_df.head()
Out[3]:
Note: repositories whose provider type is `serviceProvider` are filtered out here.
In [4]:
# One row per provider type, then drop the rows whose provider type is
# 'serviceProvider'.
re3data_df = (
    re3data_df
    .explode('provider_type')
    .loc[lambda frame: frame['provider_type'] != 'serviceProvider']
)
In [5]:
# Summary statistics for every re3data column, not only the numeric ones.
re3data_df.describe(include='all')
Out[5]:
OpenDOAR
In [6]:
# Load the OpenDOAR export. Columns listed in `converters` are parsed with
# `ast.literal_eval`; only the columns in `usecols` are kept.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    delimiter='\t',
    converters=dict.fromkeys(
        [
            'subject',
            'additional_name',
            'opendoar_id',
            'content_type',
            'institution',
        ],
        ast.literal_eval,
    ),
    usecols=[
        'opendoar_id',
        'repository_name',
        'subject',
        'type',
        'institution',
    ],
)
opendoar_df.head()
Out[6]:
In [7]:
# Summary statistics for every OpenDOAR column, not only the numeric ones.
opendoar_df.describe(include='all')
Out[7]:
ROAR
In [45]:
# Load the ROAR CSV export, keeping only the columns used in this notebook.
roar_df = pd.read_csv(
    '../data/raw/export_roar_CSV.csv',
    usecols=[
        'eprintid',
        'home_page',
        'title',
        'location_country',
        'subjects',
    ],
)
roar_df.head()
Out[45]:
In [43]:
# roar_df.drop_duplicates(subset=['home_page', 'title' , 'location_country', 'subjects'], keep=False, inplace=True)
In [47]:
# Spot-check the ROAR record whose eprintid is 2303.
# NOTE(review): looks like a leftover inspection cell; confirm it is still needed.
roar_df.loc[roar_df['eprintid'] == 2303]
Out[47]:
In [44]:
# Summary statistics for every ROAR column, not only the numeric ones.
roar_df.describe(include='all')
Out[44]:
FAIRsharing
In [11]:
# Load the pipe-delimited FAIRsharing summary. The file's own header row
# (row 0) is replaced by the column names given in `names`.
fairsharing_df = pd.read_csv(
    '../data/raw/FAIRsharingDBrec_summary20210304.csv',
    delimiter='|',
    header=0,
    names=[
        'full_name',
        'short_name',
        'fs_url',
        'url',
        'countries',
        'subjects',
    ],
)
fairsharing_df.head()
Out[11]:
In [12]:
# Summary statistics for every FAIRsharing column, not only the numeric ones.
fairsharing_df.describe(include='all')
Out[12]:
Subjects analysis¶
re3data
In [13]:
# One row per (repository, subject): unpack the list-valued `subject` column.
re3data_subjects = re3data_df.explode(column='subject')
In [37]:
# Count rows (non-null re3data_id) per subject, most frequent first.
data = re3data_subjects.groupby('subject')[['re3data_id']].count().sort_values('re3data_id', ascending=False)

# One bar trace per tier. A tier contains the subject labels that start with
# exactly `tier` digits followed by whitespace.
plot = []
for tier in [1, 2, 3, 5]:
    # Raw string: `\d` and `\s` are regex escapes, not Python string escapes.
    # The mask is computed once and reused for both axes.
    tier_rows = data[data.index.str.contains(r'^\d{%s}\s' % tier, regex=True)]
    plot.append(
        go.Bar(
            x=tier_rows.index,
            y=tier_rows['re3data_id'],
            name='re3data tier %s-digits' % tier
        )
    )

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# `Figure.show()` returns None, so keep the figure in `fig` and show it separately.
fig = go.Figure(plot, layout)
fig.show()
OpenDOAR
In [15]:
# One row per (repository, subject): unpack the list-valued `subject` column.
opendoar_subjects = opendoar_df.explode(column='subject')
In [16]:
# Count rows (non-null opendoar_id) per subject, most frequent first.
data = opendoar_subjects.groupby('subject')[['opendoar_id']].count().sort_values('opendoar_id', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['opendoar_id'],
    )
]
layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# `Figure.show()` returns None, so keep the figure in `fig` and show it separately.
fig = go.Figure(plot, layout)
fig.show()
ROAR
In [17]:
# Distinct raw values found in the ROAR `subjects` column.
roar_df.subjects.unique()
Out[17]:
In [39]:
# Show the ROAR records whose `subjects` value is exactly 'HD28'.
roar_df.loc[roar_df['subjects'] == 'HD28']
Out[39]:
FAIRsharing
In [19]:
# `subjects` is loaded as one comma-separated string per record and is only
# split into lists further down the notebook. `explode` leaves plain strings
# untouched, so split here first to get one row per subject. Values that are
# already lists (or missing) pass through unchanged, so the cell gives the
# same result whether or not the column has been split already.
fairsharing_subjects = fairsharing_df.assign(
    subjects=fairsharing_df.subjects.map(
        lambda value: value.split(',') if isinstance(value, str) else value
    )
).explode('subjects')
In [20]:
# Count rows (non-null url) per subject, most frequent first.
data = fairsharing_subjects.groupby('subjects')[['url']].count().sort_values('url', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['url'],
        name='FAIRsharing'
    )
]
layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# `Figure.show()` returns None, so keep the figure in `fig` and show it separately.
fig = go.Figure(plot, layout)
fig.show()
Geographic analysis¶
re3data
In [21]:
# One row per (repository, institution); rows without an institution are dropped
# and the index is renumbered so the positional join below lines up.
re3data_institutions = re3data_df.explode('institution')[['re3data_id', 'institution']]
re3data_institutions = re3data_institutions.loc[
    re3data_institutions['institution'].notna()
].reset_index(drop=True)

# Spread each institution record into one named column per field.
re3data_institution_fields = pd.DataFrame(
    re3data_institutions['institution'].to_list(),
    columns=[
        'org_name',
        'org_other_names',
        'org_country',
        'org_3',
        'org_noprofit',
        'org_url',
        'org_ids',
        'org_date1',
        'org_date2',
        'org_contact',
    ],
)
re3data_institutions = re3data_institutions.join(re3data_institution_fields)
re3data_institutions.head()
Out[21]:
In [22]:
# Derive the continent code of each institution from its country code.
re3data_institutions['org_continent'] = re3data_institutions['org_country'].map(countrycode_to_continent)
In [23]:
# Country codes for which no continent could be determined.
re3data_institutions.loc[re3data_institutions['org_continent'].isna(), 'org_country'].unique()
Out[23]:
AAA is used for international collaborations; we skip this. EEC is used for the EU commission; we fix the continent manually.
In [24]:
# 'EEC' is not resolved by the continent lookup; assign Europe ('EU') by hand.
is_eec = re3data_institutions['org_country'] == 'EEC'
re3data_institutions.loc[is_eec, 'org_continent'] = 'EU'
OpenDOAR
In [25]:
# One row per (repository, institution); rows without an institution are dropped
# and the index is renumbered so the positional join below lines up.
opendoar_institutions = opendoar_df.explode('institution')[['opendoar_id', 'institution']]
opendoar_institutions = opendoar_institutions.loc[
    opendoar_institutions['institution'].notna()
].reset_index(drop=True)

# Spread each institution record into one named column per field.
opendoar_institution_fields = pd.DataFrame(
    opendoar_institutions['institution'].to_list(),
    columns=[
        'org_name',
        'org_other_names',
        'org_country',
        'org_3',
        'org_noprofit',
        'org_url',
        'org_ids',
        'org_date1',
        'org_date2',
        'org_contact',
    ],
)
opendoar_institutions = opendoar_institutions.join(opendoar_institution_fields)

# Upper-case the country codes, then convert alpha-2 to alpha-3; missing values are skipped.
opendoar_country_upper = opendoar_institutions['org_country'].map(str.upper, na_action='ignore')
opendoar_institutions['org_country'] = opendoar_country_upper.map(
    countrycode_iso2_to_countrycode_iso3, na_action='ignore'
)
opendoar_institutions.head()
Out[25]:
In [26]:
# Derive the continent code of each institution from its alpha-3 country code.
opendoar_institutions['org_continent'] = opendoar_institutions['org_country'].map(countrycode_to_continent)
In [27]:
# Country codes for which no continent could be determined.
opendoar_institutions.loc[opendoar_institutions['org_continent'].isna(), 'org_country'].unique()
Out[27]:
In [28]:
# Assign the continent of 'UMI' rows by hand.
# NOTE(review): 'NA' is presumably the North America continent code; confirm.
is_umi = opendoar_institutions['org_country'] == 'UMI'
opendoar_institutions.loc[is_umi, 'org_continent'] = 'NA'
opendoar_institutions[is_umi]
Out[28]:
ROAR
In [29]:
# Normalise ROAR countries: upper-case the alpha-2 codes, convert them to
# alpha-3, then derive the continent code.
# NOTE(review): this overwrites `location_country` in place, so running the
# cell a second time feeds alpha-3 codes into the alpha-2 converter; reload
# `roar_df` before re-running.
roar_country_upper = roar_df['location_country'].map(str.upper, na_action='ignore')
roar_df['location_country'] = roar_country_upper.map(countrycode_iso2_to_countrycode_iso3)
roar_df['continent'] = roar_df['location_country'].map(countrycode_to_continent)
FAIRsharing
In [30]:
def _split_on_commas(value):
    """Split a comma-separated string into a list; pass anything else through."""
    return value.split(',') if isinstance(value, str) else value


# Turn the comma-separated `subjects` and `countries` fields into lists.
# Only strings are split, so re-running this cell leaves already-split lists
# untouched (`.str.split` would turn them into NaN and wipe both columns).
fairsharing_df['subjects'] = fairsharing_df.subjects.map(_split_on_commas)
fairsharing_df['countries'] = fairsharing_df.countries.map(_split_on_commas)

# One row per (record, country), then map names -> alpha-3 codes -> continent codes.
fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)
In [31]:
# Country names that could not be converted to a country code.
fairsharing_countries.loc[fairsharing_countries['countrycode'].isna(), 'countries'].unique()
Out[31]:
In [32]:
# Country names for which no continent could be determined.
fairsharing_countries.loc[fairsharing_countries['continent'].isna(), 'countries'].unique()
Out[32]:
Manually fix some rows
In [33]:
# 'Republic of Ireland' is not resolved by the name lookup: rename it and fill
# in the codes by hand. `countrycode` holds alpha-3 codes (that is what
# `country_to_countrycode` returns), so Ireland must be 'IRL' rather than the
# alpha-2 'IE'; otherwise it shows up as a separate bar in the per-country counts.
is_ireland = fairsharing_countries.countries == 'Republic of Ireland'
fairsharing_countries.loc[is_ireland, ['countries', 'countrycode', 'continent']] = ['Ireland', 'IRL', 'EU']

# The European Union is not a country; 'EU' is used here as an ad-hoc code.
# NOTE(review): re3data labels the EU as 'EEC' in this notebook; decide whether
# the two should share one code in the country plot.
is_european_union = fairsharing_countries.countries == 'European Union'
fairsharing_countries.loc[is_european_union, ['countrycode', 'continent']] = ['EU', 'EU']
Make Antarctica disappear from the continental analysis (only one repo)
In [34]:
# Blank out the continent so Antarctica drops out of the per-continent counts.
# `countrycode` holds alpha-3 codes, so use 'ATA' rather than the alpha-2 'AQ'.
is_antarctica = fairsharing_countries.countries == 'Antarctica'
fairsharing_countries.loc[is_antarctica, ['countrycode', 'continent']] = ['ATA', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'ATA']
Out[34]:
Country coverage¶
In [35]:
# Per-country row counts for each registry, most frequent first.
data1 = re3data_institutions.groupby('org_country')[['re3data_id']].count().sort_values('re3data_id', ascending=False)
data2 = opendoar_institutions.groupby('org_country')[['opendoar_id']].count().sort_values('opendoar_id', ascending=False)
data3 = roar_df.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)

# (legend name, counts, extra trace options). Only re3data is drawn initially;
# the other registries start hidden and can be toggled from the legend.
country_series = [
    ('re3data', data1['re3data_id'], {}),
    ('openDOAR', data2['opendoar_id'], {'visible': 'legendonly'}),
    ('ROAR', data3['eprintid'], {'visible': 'legendonly'}),
    ('FAIRsharing', data4['url'], {'visible': 'legendonly'}),
]
plot = [
    go.Bar(x=counts.index, y=counts, name=label, **options)
    for label, counts, options in country_series
]
layout = go.Layout(
    title='Country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
go.Figure(plot, layout).show()
Continental coverage¶
In [36]:
# Per-continent row counts for each registry.
data1 = re3data_institutions.groupby('org_continent')[['re3data_id']].count()
data2 = opendoar_institutions.groupby('org_continent')[['opendoar_id']].count()
data3 = roar_df.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['url']].count()

# (legend name, counts): one filled radar trace per registry, with continents
# on the angular axis and counts on the radial axis.
continent_series = [
    ('re3data', data1['re3data_id']),
    ('OpenDOAR', data2['opendoar_id']),
    ('ROAR', data3['eprintid']),
    ('FAIRsharing', data4['url']),
]
plot = [
    go.Scatterpolar(r=counts, theta=counts.index, fill='toself', name=label)
    for label, counts in continent_series
]
layout = go.Layout(polar={'radialaxis': {'visible': True}})
go.Figure(plot, layout).show()