4.1 MiB
4.1 MiB
In [13]:
import ast
import csv
import json
import reverse_geocoder as rg
import numpy as np
import pandas as pd
import pycountry_convert
import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles
import plotly
from plotly.offline import iplot, init_notebook_mode
import plotly.graph_objs as go
import plotly.express as px
pd.set_option('display.max_columns', None)
In [14]:
def country_to_countrycode(country):
    """Convert a country name to a country code via ``pycountry_convert``.

    Parameters
    ----------
    country : str or missing value
        Country name passed to
        ``pycountry_convert.country_name_to_country_alpha3``.

    Returns
    -------
    str or float
        The code returned by ``pycountry_convert``, or ``np.nan`` when
        ``country`` is missing or the lookup fails.
    """
    if pd.isna(country):
        return np.nan
    try:
        return pycountry_convert.country_name_to_country_alpha3(country)
    except Exception:
        # Best-effort lookup: any conversion error maps to NaN. Unlike a bare
        # ``except:``, this lets KeyboardInterrupt / SystemExit propagate so a
        # long ``.map()`` over a column can still be interrupted.
        return np.nan
def countrycode_iso2_to_countrycode_iso3(country):
    """Convert a two-letter country code to the three-letter code.

    The conversion goes through the country name: alpha-2 -> name -> alpha-3,
    using the ``pycountry_convert`` helpers.

    Parameters
    ----------
    country : str or missing value
        Alpha-2 country code in the form ``pycountry_convert`` expects.

    Returns
    -------
    str or float
        The alpha-3 code, or ``np.nan`` when ``country`` is missing or either
        lookup step fails.
    """
    if pd.isna(country):
        return np.nan
    try:
        country_name = pycountry_convert.country_alpha2_to_country_name(country)
        return pycountry_convert.country_name_to_country_alpha3(country_name)
    except Exception:
        # Best-effort lookup: any conversion error maps to NaN. Unlike a bare
        # ``except:``, KeyboardInterrupt / SystemExit are not swallowed.
        return np.nan
def countrycode_to_continent(country_code):
    """Map a three-letter country code to a continent code.

    The conversion goes alpha-3 -> alpha-2 -> continent code, using the
    ``pycountry_convert`` helpers.

    Parameters
    ----------
    country_code : str or missing value
        Alpha-3 country code.

    Returns
    -------
    str or float
        The continent code, or ``np.nan`` when ``country_code`` is missing or
        either lookup step fails.
    """
    if pd.isna(country_code):
        return np.nan
    try:
        alpha2 = pycountry_convert.country_alpha3_to_country_alpha2(country_code)
        return pycountry_convert.country_alpha2_to_continent_code(alpha2)
    except Exception:
        # Best-effort lookup: any conversion error maps to NaN. Unlike a bare
        # ``except:``, KeyboardInterrupt / SystemExit are not swallowed.
        return np.nan
Loading datasets¶
re3data
In [15]:
# Load the re3data dump. The listed columns are parsed with ast.literal_eval,
# i.e. the TSV is expected to hold Python literals (lists / dicts) in them.
re3data_df = pd.read_csv(
    '../data/raw/re3data.tsv',
    delimiter='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'subject',
            'keyword',
            'additionalName',
            'repositoryIdentifier',
            'type',
            'contentType',
            'providerType',
            'institution',
        )
    },
)
re3data_df.head()
Out[15]:
In [16]:
# Summary statistics for every re3data column (numeric and non-numeric alike).
re3data_df.describe(include='all')
Out[16]:
OpenDOAR
In [17]:
# Load the OpenDOAR dump. The listed columns are parsed with ast.literal_eval;
# the repository id is kept as a string rather than being inferred as a number.
opendoar_df = pd.read_csv(
    '../data/raw/openDoar.tsv',
    delimiter='\t',
    converters={
        column: ast.literal_eval
        for column in (
            'repository_metadata.content_subjects',
            'repository_metadata.alternativename',
            'repository_metadata.content_types',
            'organization',
        )
    },
    dtype={'system_metadata.id': str},
)
opendoar_df.head()
Out[17]:
In [18]:
# Summary statistics for every OpenDOAR column (numeric and non-numeric alike).
opendoar_df.describe(include='all')
Out[18]:
ROAR
In [19]:
# Read the ROAR export with every column as a string, then collapse all rows
# sharing an eprintid into one row whose cells are the sets of observed values.
roar_df = (
    pd.read_csv('../data/raw/export_roar_CSV.csv', dtype='str')
    .groupby('eprintid')
    .aggregate(set)
)
def value_or_list(cell_set):
    """Collapse the set of values aggregated for one cell.

    Parameters
    ----------
    cell_set : iterable of hashable
        Values collected for one cell; may contain NaN placeholders.
        The argument itself is not modified.

    Returns
    -------
    float, object or list
        ``np.nan`` when nothing but NaN (or nothing at all) is present, the
        single remaining value when there is exactly one, otherwise a list of
        the distinct values sorted by their string form.
    """
    # NaN never compares equal to itself, so ``set.discard(np.nan)`` only
    # removes the np.nan singleton; filter on the value so every float NaN
    # is dropped regardless of object identity.
    values = {v for v in cell_set if not (isinstance(v, float) and v != v)}
    if not values:
        return np.nan
    if len(values) == 1:
        return next(iter(values))
    # Set iteration order depends on string hashing; sort so that re-running
    # the notebook yields the same lists.
    return sorted(values, key=str)
# Turn every aggregated set into a scalar / list / NaN and bring the
# 'eprintid' group key back as a regular column.
roar_df = roar_df.applymap(value_or_list).reset_index()
roar_df.head()
Out[19]:
In [20]:
# Summary statistics for every ROAR column (numeric and non-numeric alike).
roar_df.describe(include='all')
Out[20]:
FAIRsharing
In [21]:
# The dump is read as one JSON document per line. JSON is UTF-8 by
# specification, so do not depend on the platform's default encoding.
with open('../data/raw/fairsharing_dump_api_09_2021.json', encoding='utf-8') as f:
    lines = f.read().splitlines()
# Parse each line exactly once (blank lines are skipped) and flatten the nested
# records into dotted column names such as 'attributes.subjects'.
fairsharing_df = pd.json_normalize([json.loads(line) for line in lines if line.strip()])
fairsharing_df.head()
Out[21]:
In [22]:
# Summary statistics for every FAIRsharing column (numeric and non-numeric alike).
fairsharing_df.describe(include='all')
Out[22]:
Subjects analysis¶
re3data
In [23]:
# One row per (repository, subject) pair.
re3data_subjects = re3data_df[['orgIdentifier', 'subject']].explode('subject')
# Keep only the subject's 'name'. ``na_action='ignore'`` passes missing values
# through untouched, which avoids an identity comparison against np.nan.
re3data_subjects['subject'] = re3data_subjects['subject'].map(
    lambda subject: subject['name'], na_action='ignore'
)
re3data_subjects
Out[23]:
In [24]:
# Number of repositories per subject label, ordered by label (descending).
data = (
    re3data_subjects
    .groupby('subject')[['orgIdentifier']]
    .count()
    .sort_values('subject', ascending=False)
)

# One bar trace per tier: a label belongs to a tier when it starts with exactly
# that many digits followed by whitespace. Raw strings keep \d and \s from
# being treated as (invalid) string escape sequences.
plot = []
for tier in [1, 2, 3, 5]:
    in_tier = data.index.str.contains(r'^\d{%s}\s' % tier, regex=True)
    tier_counts = data[in_tier]
    plot.append(
        go.Bar(
            x=tier_counts.index,
            y=tier_counts['orgIdentifier'],
            name='re3data tier %s-digits' % tier
        )
    )

layout = go.Layout(
    title='Subject coverage re3data',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# Figure.show() returns None, so keep the Figure itself in `fig` and show it
# in a separate statement.
fig = go.Figure(plot, layout)
fig.show()
OpenDOAR
In [25]:
# One row per (repository, content subject) pair.
opendoar_subjects = opendoar_df.explode('repository_metadata.content_subjects')
In [26]:
# Number of repositories per OpenDOAR content subject, most frequent first.
data = (
    opendoar_subjects
    .groupby('repository_metadata.content_subjects')[['system_metadata.id']]
    .count()
    .sort_values('system_metadata.id', ascending=False)
)
plot = [
    go.Bar(
        x=data.index,
        y=data['system_metadata.id'],
    )
]
layout = go.Layout(
    title='Subject coverage OpenDOAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# Figure.show() returns None, so keep the Figure itself in `fig` and show it
# in a separate statement.
fig = go.Figure(plot, layout)
fig.show()
ROAR
In [27]:
# One row per (repository, subject) pair; list-valued cells are expanded.
roar_subjects = roar_df.explode('subjects')
In [28]:
# Number of repositories per ROAR subject, most frequent first.
data = (
    roar_subjects
    .groupby('subjects')[['eprintid']]
    .count()
    .sort_values('eprintid', ascending=False)
)
plot = [
    go.Bar(
        x=data.index,
        y=data['eprintid'],
    )
]
layout = go.Layout(
    # This chart shows ROAR data; the title previously said "OpenDOAR".
    title='Subject coverage ROAR',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# Figure.show() returns None, so keep the Figure itself in `fig` and show it
# in a separate statement.
fig = go.Figure(plot, layout)
fig.show()
FAIRsharing
In [29]:
# One row per (record, subject) pair.
fairsharing_subjects = fairsharing_df.explode('attributes.subjects')
In [30]:
# Number of records per FAIRsharing subject, most frequent first.
data = (
    fairsharing_subjects
    .groupby('attributes.subjects')[['id']]
    .count()
    .sort_values('id', ascending=False)
)
plot = [
    go.Bar(
        x=data.index,
        y=data['id'],
        name='FAIRsharing'
    )
]
layout = go.Layout(
    title='Subject coverage FAIRsharing',
    xaxis=dict(tickangle=45, tickfont=dict(size=12))
)
# Figure.show() returns None, so keep the Figure itself in `fig` and show it
# in a separate statement.
fig = go.Figure(plot, layout)
fig.show()
Geographic analysis¶
re3data
In [31]:
# One row per (repository, institution) pair; repositories without an
# institution are dropped and the index is renumbered so that the positional
# index produced by json_normalize lines up in the join below.
re3data_institutions = (
    re3data_df.explode('institution')
    .loc[:, ['orgIdentifier', 'institution']]
    .dropna(subset=['institution'])
    .reset_index(drop=True)
)
# Spread the fields of each institution record into columns of their own.
re3data_institutions = re3data_institutions.join(
    pd.json_normalize(re3data_institutions['institution'])
)
re3data_institutions.head()
Out[31]:
In [32]:
# Derive the continent code from each institution's country code;
# codes that cannot be converted become NaN.
re3data_institutions['org_continent'] = re3data_institutions.institutionCountry.map(countrycode_to_continent)
In [33]:
# Country codes for which no continent could be derived.
re3data_institutions[re3data_institutions.org_continent.isna()].institutionCountry.unique()
Out[33]:
AAA is used for international collaborations, so we skip it. EEC is used for the European Commission, so we set its continent manually.
In [34]:
# Manual fix: rows with the 'EEC' country code get the continent code 'EU'.
re3data_institutions.loc[re3data_institutions.institutionCountry == 'EEC', 'org_continent'] = 'EU'
OpenDOAR
In [35]:
# One row per (repository, organization) pair; repositories without an
# organization are dropped and the index is renumbered so that the positional
# index produced by json_normalize lines up in the join below.
opendoar_institutions = (
    opendoar_df.explode('organization')
    .loc[:, ['system_metadata.id', 'organization']]
    .dropna(subset=['organization'])
    .reset_index(drop=True)
)
# Spread the fields of each organization record into columns of their own.
opendoar_institutions = opendoar_institutions.join(
    pd.json_normalize(opendoar_institutions['organization'])
)
# Upper-case the country code, then convert it to the three-letter code;
# missing values are passed through untouched at both steps.
opendoar_institutions['country'] = (
    opendoar_institutions['country']
    .map(str.upper, na_action='ignore')
    .map(countrycode_iso2_to_countrycode_iso3, na_action='ignore')
)
opendoar_institutions.head()
Out[35]:
In [36]:
# Derive the continent code from each organization's country code;
# codes that cannot be converted become NaN.
opendoar_institutions['org_continent'] = opendoar_institutions.country.map(countrycode_to_continent)
In [37]:
# Country codes for which no continent could be derived.
opendoar_institutions[opendoar_institutions.org_continent.isna()].country.unique()
Out[37]:
In [38]:
# Manual fix: rows with country code 'UMI' get the continent code 'NA'
# (presumably North America - TODO confirm). Note this is the literal string
# 'NA', not a missing value.
opendoar_institutions.loc[opendoar_institutions.country == 'UMI', 'org_continent'] = 'NA'
# Show the affected rows to check the assignment.
opendoar_institutions[opendoar_institutions.country == 'UMI']
Out[38]:
ROAR
In [39]:
# One row per (repository, country) pair. The country code is upper-cased and
# converted to the three-letter code, from which the continent code is derived.
roar_institutions = roar_df.explode('location_country').assign(
    location_country=lambda frame: (
        frame['location_country']
        .map(str.upper, na_action='ignore')
        .map(countrycode_iso2_to_countrycode_iso3)
    ),
    continent=lambda frame: frame['location_country'].map(countrycode_to_continent),
)
FAIRsharing
In [40]:
# One row per (record, country) pair. Country names are converted to country
# codes, from which the continent code is derived.
fairsharing_countries = fairsharing_df.explode('attributes.countries').assign(
    countrycode=lambda frame: frame['attributes.countries'].map(country_to_countrycode),
    continent=lambda frame: frame['countrycode'].map(countrycode_to_continent),
)
In [41]:
# Country names for which no country code could be derived.
fairsharing_countries[fairsharing_countries.countrycode.isna()]['attributes.countries'].unique()
Out[41]:
In [42]:
# Country names for which no continent could be derived.
fairsharing_countries[fairsharing_countries.continent.isna()]['attributes.countries'].unique()
Out[42]:
Manually fix some rows
In [43]:
# 'Republic of Ireland' is renamed to 'Ireland' and given its codes by hand.
# The `countrycode` column holds three-letter codes (it is filled by
# country_to_countrycode), so Ireland must be 'IRL' rather than the two-letter
# 'IE'; otherwise it shows up as a separate entry from the other registries.
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Republic of Ireland', ['attributes.countries', 'countrycode', 'continent']] = ['Ireland', 'IRL', 'EU']
# 'European Union' is not a country; give it a placeholder code and the
# European continent code.
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'European Union', ['countrycode', 'continent']] = ['EU', 'EU']
Make Antarctica disappear (only one repo)
In [44]:
# Antarctica rows get country code 'AQ' and no continent, so they are left out
# of the continent grouping (groupby skips NaN keys by default).
# NOTE(review): 'AQ' is a two-letter code while the rest of `countrycode` is
# three-letter - presumably deliberate so the row is hidden; confirm.
fairsharing_countries.loc[fairsharing_countries['attributes.countries'] == 'Antarctica', ['countrycode', 'continent']] = ['AQ', np.nan]
# Show the affected rows to check the assignment.
fairsharing_countries[fairsharing_countries.countrycode == 'AQ']
Out[44]:
Country coverage¶
In [45]:
# Entries per country for each registry, most frequent first.
data1 = (
    re3data_institutions
    .groupby('institutionCountry')[['orgIdentifier']]
    .count()
    .sort_values('orgIdentifier', ascending=False)
)
data2 = (
    opendoar_institutions
    .groupby('country')[['system_metadata.id']]
    .count()
    .sort_values('system_metadata.id', ascending=False)
)
data3 = (
    roar_institutions
    .groupby('location_country')[['eprintid']]
    .count()
    .sort_values('eprintid', ascending=False)
)
data4 = (
    fairsharing_countries
    .groupby('countrycode')[['id']]
    .count()
    .sort_values('id', ascending=False)
)

# One bar trace per registry. Only re3data is drawn initially; the other
# traces start hidden and can be switched on from the legend.
plot = [
    go.Bar(x=counts.index, y=counts[id_column], name=label, **extra)
    for counts, id_column, label, extra in [
        (data1, 'orgIdentifier', 're3data', {}),
        (data2, 'system_metadata.id', 'openDOAR', {'visible': 'legendonly'}),
        (data3, 'eprintid', 'ROAR', {'visible': 'legendonly'}),
        (data4, 'id', 'FAIRsharing', {'visible': 'legendonly'}),
    ]
]
layout = go.Layout(
    title='Country coverage',
    xaxis={'tickangle': 45, 'tickfont': {'size': 12}},
)
go.Figure(data=plot, layout=layout).show()
Continental coverage¶
In [46]:
# Entries per continent for each registry.
data1 = re3data_institutions.groupby('org_continent')[['orgIdentifier']].count()
data2 = opendoar_institutions.groupby('org_continent')[['system_metadata.id']].count()
data3 = roar_institutions.groupby('continent')[['eprintid']].count()
data4 = fairsharing_countries.groupby('continent')[['id']].count()

# One filled polar trace per registry: the angle is the continent code and the
# radius is the number of entries.
plot = [
    go.Scatterpolar(
        r=counts[id_column],
        theta=counts.index,
        fill='toself',
        name=label,
    )
    for counts, id_column, label in [
        (data1, 'orgIdentifier', 're3data'),
        (data2, 'system_metadata.id', 'OpenDOAR'),
        (data3, 'eprintid', 'ROAR'),
        (data4, 'id', 'FAIRsharing'),
    ]
]
layout = go.Layout(polar={'radialaxis': {'visible': True}})
go.Figure(data=plot, layout=layout).show()