3.9 MiB
3.9 MiB
In [1]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import pycountry_convert
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('display.max_columns', None)
In [2]:
def country_to_countrycode(country):
    """Map a country name to its ISO-3166 alpha-3 code.

    Returns NaN for missing input or for names pycountry_convert
    cannot resolve.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha3(country)
    except Exception:
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit;
        # catch Exception so only lookup failures map to NaN.
        return np.nan
def countrycode_iso2_to_countrycode_iso3(country):
    """Convert an ISO-3166 alpha-2 code to its alpha-3 code (via the
    country name), returning NaN for missing or unresolvable input.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha3(
            pycountry_convert.country_alpha2_to_country_name(country))
    except Exception:
        # Narrowed from a bare `except:` — only conversion failures
        # should fall back to NaN, not interpreter-level signals.
        return np.nan
def countrycode_to_continent(country_code):
    """Map an ISO-3166 alpha-3 country code to its two-letter continent
    code (e.g. 'EU', 'NA'), returning NaN for missing or unknown codes.
    """
    if pd.isna(country_code):
        return np.nan
    try:
        return pycountry_convert.country_alpha2_to_continent_code(
            pycountry_convert.country_alpha3_to_country_alpha2(country_code))
    except Exception:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # are not silently swallowed.
        return np.nan
Loading datasets¶
re3data
In [3]:
# Load the re3data dump. The listed columns were serialised as Python
# literals (lists/dicts) in the TSV, so parse them back with ast.literal_eval.
literal_columns = ['subject', 'keyword', 'additionalName',
                   'repositoryIdentifier', 'type', 'contentType',
                   'providerType', 'institution']
re3data_df = pd.read_csv('../data/raw/re3data.tsv', delimiter='\t',
                         converters={col: ast.literal_eval
                                     for col in literal_columns})
re3data_df.head()
Out[3]:
In [4]:
# Column-by-column summary statistics for a first look at the re3data dump.
re3data_df.describe(include='all')
Out[4]:
openDOAR
In [48]:
# Load the OpenDOAR dump. Literal-encoded columns are parsed back with
# ast.literal_eval; the system id is read as a string rather than a number.
literal_columns = ['repository_metadata.content_subjects',
                   'repository_metadata.alternativename',
                   'repository_metadata.content_types',
                   'organization']
opendoar_df = pd.read_csv('../data/raw/openDoar.tsv', delimiter='\t',
                          converters={col: ast.literal_eval
                                      for col in literal_columns},
                          dtype={'system_metadata.id': str})
opendoar_df.head()
Out[48]:
In [6]:
# Column-by-column summary statistics for the OpenDOAR dump.
opendoar_df.describe(include='all')
Out[6]:
ROAR
In [9]:
# Load ROAR with every column as string, then collapse the duplicate rows
# that share an eprintid by aggregating each column into a set of values.
roar_df = (pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
           .groupby('eprintid')
           .aggregate(set))
def value_or_list(cell_set):
    """Collapse a set of aggregated cell values.

    Drops NaN, then returns NaN if nothing remains, the bare value if a
    single value remains, and a list otherwise.
    """
    values = set(cell_set)
    values.discard(np.nan)
    if not values:
        return np.nan
    if len(values) == 1:
        return next(iter(values))
    return list(values)
# Collapse each aggregated set back to a scalar (or list of distinct values)
# and restore eprintid as a regular column.
roar_df = roar_df.applymap(value_or_list).reset_index()
roar_df.head()
Out[9]:
In [12]:
# Column-by-column summary statistics for the collapsed ROAR frame.
roar_df.describe(include='all')
Out[12]:
FAIRsharing
In [57]:
# Load the FAIRsharing dump: each line of the file is one JSON record.
# Parse each line once and flatten the nested structure into columns.
# (The original cell parsed every line twice — one `.apply(json.loads)`
# whose result was discarded — and built a throwaway one-column frame.)
with open('../data/raw/fairsharing_dump_api_09_2021.json') as f:
    records = [json.loads(line) for line in f.read().splitlines()]
fairsharing_df = pd.json_normalize(records)
fairsharing_df.head()
Out[57]:
In [58]:
# Column-by-column summary statistics for the FAIRsharing dump.
fairsharing_df.describe(include='all')
Out[58]:
Subjects analysis¶
re3data
In [38]:
# One row per (repository, subject) pair.
re3data_subjects = re3data_df[['orgIdentifier', 'subject']].explode('subject')

def _subject_name(entry):
    # Each subject entry is a dict carrying a 'name' key; NaN stays NaN.
    return np.nan if entry is np.nan else entry['name']

re3data_subjects['subject'] = re3data_subjects['subject'].apply(_subject_name)
re3data_subjects
Out[38]:
In [42]:
# Repositories per subject label, one bar trace per DFG classification tier.
# Fixes vs. original: the bare mid-cell `data` expression was a no-op in
# Jupyter (only the last expression displays) and is removed; regex patterns
# are raw strings (non-raw '\d'/'\s' raise invalid-escape warnings); each
# tier's boolean mask is computed once instead of twice.
data = re3data_subjects.groupby('subject')[['orgIdentifier']].count().sort_values('subject', ascending=False)
plot = []
for tier in [1, 2, 3, 5]:
    # Subject labels start with a tier-dependent number of digits.
    mask = data.index.str.contains(r'^\d{%s}\s' % tier, regex=True)
    plot.append(go.Bar(x=data[mask].index,
                       y=data[mask]['orgIdentifier'],
                       name='re3data tier %s-digits' % tier))
layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# .show() returns None, so the original `fig = ...show()` bound None to fig.
go.Figure(plot, layout).show()
OpenDOAR
In [49]:
# One row per (repository, subject) pair.
opendoar_subjects = opendoar_df.explode('repository_metadata.content_subjects')
In [50]:
# Count repositories per subject, most frequent first, and plot as bars.
data = (opendoar_subjects
        .groupby('repository_metadata.content_subjects')[['system_metadata.id']]
        .count()
        .sort_values('system_metadata.id', ascending=False))
plot = [go.Bar(x=data.index, y=data['system_metadata.id'])]
layout = go.Layout(title='Subject coverage OpenDOAR',
                   xaxis=dict(tickangle=45, tickfont=dict(size=12)))
fig = go.Figure(plot, layout).show()
ROAR
In [55]:
# One row per (record, subject) pair.
roar_subjects = roar_df.explode('subjects')
In [56]:
# Count ROAR records per subject, most frequent first, and plot as bars.
data = roar_subjects.groupby('subjects')[['eprintid']].count().sort_values('eprintid', ascending=False)
plot = [
    go.Bar(
        x=data.index,
        y=data['eprintid'],
    )
]
layout = go.Layout(
    # Fixed copy-paste bug: this chart plots ROAR data but the title
    # previously said 'Subject coverage OpenDOAR'.
    title='Subject coverage ROAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
fig = go.Figure(plot, layout).show()
FAIRsharing
In [59]:
# One row per (record, subject) pair.
fairsharing_subjects = fairsharing_df.explode('attributes.subjects')
In [61]:
# Count FAIRsharing records per subject, most frequent first, and plot as bars.
data = (fairsharing_subjects
        .groupby('attributes.subjects')[['id']]
        .count()
        .sort_values('id', ascending=False))
plot = [go.Bar(x=data.index, y=data['id'], name='FAIRsharing')]
layout = go.Layout(title='Subject coverage FAIRsharing',
                   xaxis=dict(tickangle=45, tickfont=dict(size=12)))
fig = go.Figure(plot, layout).show()
Geographic analysis¶
re3data
In [21]:
# One row per (repository, institution) pair, dropping repositories with no
# institution. NOTE(review): this cell reads a 're3data_id' column while the
# subjects cell used 'orgIdentifier' — presumably the frame was reloaded with
# different column names between runs (execution counts are non-sequential);
# confirm against the load cell.
re3data_institutions = re3data_df.explode('institution')[['re3data_id', 'institution']]
re3data_institutions = re3data_institutions[~re3data_institutions.institution.isna()].reset_index(drop=True)
# Each institution cell is assumed to be a fixed-length sequence whose
# positions map to the columns below — TODO confirm the tuple layout
# (in particular 'org_3', 'org_date1', 'org_date2') against the raw data.
re3data_institutions = re3data_institutions.join(pd.DataFrame(re3data_institutions.institution.to_list(), columns=['org_name', 'org_other_names',
                                                                                                                   'org_country', 'org_3', 'org_noprofit',
                                                                                                                   'org_url', 'org_ids', 'org_date1',
                                                                                                                   'org_date2', 'org_contact']))
re3data_institutions.head()
Out[21]:
In [22]:
# Derive each institution's continent from its country code; codes that
# pycountry_convert cannot resolve become NaN.
re3data_institutions['org_continent'] = re3data_institutions.org_country.map(countrycode_to_continent)
In [23]:
# Inspect which country codes failed the continent lookup.
re3data_institutions[re3data_institutions.org_continent.isna()].org_country.unique()
Out[23]:
AAA is used for international collaborations; we skip this. EEC is used for the EU commission; we fix the continent manually.
In [24]:
# EEC (European Commission) is not an ISO country code; assign Europe manually.
re3data_institutions.loc[re3data_institutions.org_country == 'EEC', 'org_continent'] = 'EU'
OpenDOAR
In [25]:
# One row per (repository, institution) pair, dropping repositories with no
# institution. NOTE(review): this cell reads 'opendoar_id' and 'institution'
# columns, but the load cell above parsed 'organization' and keyed on
# 'system_metadata.id' — presumably the frame was reloaded under different
# names between runs (execution counts are non-sequential); confirm.
opendoar_institutions = opendoar_df.explode('institution')[['opendoar_id', 'institution']]
opendoar_institutions = opendoar_institutions[~opendoar_institutions.institution.isna()].reset_index(drop=True)
# Positional mapping of the institution tuple to named columns — TODO confirm
# the layout against the raw data (same assumption as for re3data).
opendoar_institutions = opendoar_institutions.join(pd.DataFrame(opendoar_institutions.institution.to_list(), columns=['org_name', 'org_other_names',
                                                                                                                      'org_country', 'org_3', 'org_noprofit',
                                                                                                                      'org_url', 'org_ids', 'org_date1',
                                                                                                                      'org_date2', 'org_contact']))
# OpenDOAR stores ISO alpha-2 codes (possibly lower-case): upper-case them,
# then convert to alpha-3 so they align with the re3data codes.
opendoar_institutions['org_country'] = opendoar_institutions.org_country.map(str.upper, na_action='ignore')
opendoar_institutions['org_country'] = opendoar_institutions.org_country.map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
opendoar_institutions.head()
Out[25]:
In [26]:
# Derive each institution's continent from its alpha-3 country code.
opendoar_institutions['org_continent'] = opendoar_institutions.org_country.map(countrycode_to_continent)
In [27]:
# Inspect which country codes failed the continent lookup.
opendoar_institutions[opendoar_institutions.org_continent.isna()].org_country.unique()
Out[27]:
In [28]:
# UMI (U.S. Minor Outlying Islands) has no continent mapping in
# pycountry_convert; assign North America ('NA' here is the continent code,
# not a missing value) and display the affected rows.
opendoar_institutions.loc[opendoar_institutions.org_country == 'UMI', 'org_continent'] = 'NA'
opendoar_institutions[opendoar_institutions.org_country == 'UMI']
Out[28]:
ROAR
In [29]:
# Normalise ROAR country codes: upper-case the alpha-2 codes, convert them to
# alpha-3, then derive the continent.
# NOTE(review): value_or_list may have left lists in multi-valued cells; those
# would not survive str.upper / the converter — confirm location_country is
# single-valued. Also, the second map lacks na_action='ignore' (unlike the
# OpenDOAR cell), relying on the converter's own NaN handling.
roar_df['location_country'] = roar_df.location_country.map(str.upper, na_action='ignore')
roar_df['location_country'] = roar_df.location_country.map(countrycode_iso2_to_countrycode_iso3)
roar_df['continent'] = roar_df.location_country.map(countrycode_to_continent)
FAIRsharing
In [30]:
# Split comma-separated subject/country strings, then explode to one row per
# (record, country) and derive code + continent.
# NOTE(review): this cell reads 'subjects'/'countries', but the JSON load cell
# produces 'attributes.*' columns via json_normalize — presumably these cells
# ran against a different load of fairsharing_df (execution counts are
# non-sequential); confirm which schema is current.
fairsharing_df['subjects'] = fairsharing_df.subjects.str.split(pat=',')
fairsharing_df['countries'] = fairsharing_df.countries.str.split(pat=',')
fairsharing_countries = fairsharing_df.explode('countries')
fairsharing_countries['countrycode'] = fairsharing_countries.countries.map(country_to_countrycode)
fairsharing_countries['continent'] = fairsharing_countries.countrycode.map(countrycode_to_continent)
In [31]:
# Inspect which country names failed the country-code lookup.
fairsharing_countries[fairsharing_countries.countrycode.isna()].countries.unique()
Out[31]:
In [32]:
# Inspect which country names failed the continent lookup.
fairsharing_countries[fairsharing_countries.continent.isna()].countries.unique()
Out[32]:
Fix manually some rows
In [33]:
# 'Republic of Ireland' is not a name pycountry_convert resolves; normalise it
# to 'Ireland'. The countrycode column holds ISO alpha-3 codes (it is built
# with country_name_to_country_alpha3), so use 'IRL' — the original assigned
# the alpha-2 'IE', which would not merge with the existing 'IRL' rows in the
# country-level counts.
fairsharing_countries.loc[fairsharing_countries.countries == 'Republic of Ireland', ['countries', 'countrycode', 'continent']] = ['Ireland', 'IRL', 'EU']
# The European Union is not a country; tag it with the synthetic code 'EU'.
fairsharing_countries.loc[fairsharing_countries.countries == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']
Make Antarctica disappear (only one repo)
In [34]:
# Antarctica hosts a single repository; blank its continent so it drops out of
# the continent-level charts (the country code 'AQ' is kept), then display
# the affected rows.
fairsharing_countries.loc[fairsharing_countries.countries == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[34]:
Country coverage¶
In [35]:
# Per-country repository counts for all four registries; re3data is shown by
# default, the others start hidden ('legendonly') and can be toggled.
data1 = re3data_institutions.groupby('org_country')[['re3data_id']].count().sort_values('re3data_id', ascending=False)
data2 = opendoar_institutions.groupby('org_country')[['opendoar_id']].count().sort_values('opendoar_id', ascending=False)
data3 = roar_df.groupby('location_country')[['eprintid']].count().sort_values('eprintid', ascending=False)
data4 = fairsharing_countries.groupby('countrycode')[['url']].count().sort_values('url', ascending=False)
plot = [
    go.Bar(x=frame.index, y=frame[column], name=label, **extra)
    for frame, column, label, extra in [
        (data1, 're3data_id', 're3data', {}),
        (data2, 'opendoar_id', 'openDOAR', {'visible': 'legendonly'}),
        (data3, 'eprintid', 'ROAR', {'visible': 'legendonly'}),
        (data4, 'url', 'FAIRsharing', {'visible': 'legendonly'}),
    ]
]
layout = go.Layout(
    title='Country coverage',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
go.Figure(plot, layout).show()
Continental coverage¶
In [36]:
# Per-continent repository counts for all four registries, drawn as one
# filled radar (Scatterpolar) trace per registry.
data1 = re3data_institutions.groupby('org_continent')[['re3data_id']].count()
data2 = opendoar_institutions.groupby('org_continent')[['opendoar_id']].count()
data3 = roar_df.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['url']].count()
plot = [
    go.Scatterpolar(r=frame[column], theta=frame.index, fill='toself', name=label)
    for frame, column, label in [
        (data1, 're3data_id', 're3data'),
        (data2, 'opendoar_id', 'OpenDOAR'),
        (data3, 'eprintid', 'ROAR'),
        (data4, 'url', 'FAIRsharing'),
    ]
]
layout = go.Layout(polar=dict(radialaxis=dict(visible=True)))
go.Figure(plot, layout).show()