openaire_beginners_kit/data/beginners_kit.ipynb

3544 lines
164 KiB
Plaintext
Raw Normal View History

2023-05-08 14:12:10 +02:00
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# OpenAIRE Beginners Kit\n",
"\n",
"The OpenAIRE Research Graph is an Open Access dataset containing metadata about research products (literature, datasets, software, etc.) linked to other entities of the research ecosystem like organisations, project grants, and data sources.\n",
"\n",
"The large size of the OpenAIRE Research Graph is a major impediment for beginners who want to familiarise themselves with the underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure, which is not easily accessible to everyone.\n",
"\n",
"The OpenAIRE Beginners Kit aims to address this issue. It consists of two components: a subset of the Graph composed of the research products published between 2022-06-29 and 2022-12-29, all the entities connected to them and the respective relationships, and the present Jupyter notebook that demonstrates how you can use PySpark to analyse the Graph and get answers to some interesting research questions.\n",
"\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download data"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"slideshow": {
"slide_type": "notes"
},
"tags": [
"zenodo_base_url"
]
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Downloading communities_infrastructures.tar\n",
"Extracting communities_infrastructures.tar\n",
"Downloading dataset.tar\n",
"Extracting dataset.tar\n",
"Downloading datasource.tar\n",
"Extracting datasource.tar\n",
"Downloading organization.tar\n",
"Extracting organization.tar\n",
"Downloading otherresearchproduct.tar\n",
"Extracting otherresearchproduct.tar\n",
"Downloading project.tar\n",
"Extracting project.tar\n",
"Downloading publication.tar\n",
"Extracting publication.tar\n",
"Downloading relation.tar\n",
"Extracting relation.tar\n",
"Downloading software.tar\n",
"Extracting software.tar\n"
]
}
],
"source": [
"import os\n",
"import shutil\n",
"import subprocess\n",
"\n",
"# Base URL of the Zenodo record; each archive name below is appended to it.\n",
"base_url = \"https://zenodo.org/record/7490192/files/\"\n",
"data_dir = \"data\"\n",
"\n",
"# One tar archive per entity type of the Beginners Kit.\n",
"items = [\n",
"    \"communities_infrastructures.tar\",\n",
"    \"dataset.tar\",\n",
"    \"datasource.tar\",\n",
"    \"organization.tar\",\n",
"    \"otherresearchproduct.tar\",\n",
"    \"project.tar\",\n",
"    \"publication.tar\",\n",
"    \"relation.tar\",\n",
"    \"software.tar\",\n",
"]\n",
"\n",
"# Start from an empty data directory so that the cell can be re-run safely.\n",
"shutil.rmtree(data_dir, ignore_errors=True)\n",
"os.makedirs(data_dir)\n",
"\n",
"for item in items:\n",
"    archive_path = os.path.join(data_dir, item)\n",
"    print(f\"Downloading {item}\")\n",
"    # Passing an argument list (no shell) keeps the '?' in the URL literal, and\n",
"    # check=True raises CalledProcessError if wget fails instead of silently continuing.\n",
"    subprocess.run([\"wget\", f\"{base_url}{item}?download=1\", \"-O\", archive_path], check=True)\n",
"    print(f\"Extracting {item}\")\n",
"    subprocess.run([\"tar\", \"-xf\", archive_path, \"-C\", data_dir], check=True)\n",
"    # The archive is no longer needed once its content has been extracted.\n",
"    os.remove(archive_path)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Have a look at the input data"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"import json\n",
"\n",
"import pyspark.sql.functions as F\n",
"from pyspark.sql.functions import col\n",
"from pyspark.sql.types import StructType\n",
"from pyspark.sql import SparkSession\n",
"from IPython.display import JSON as pretty_print\n",
"\n",
"\n",
"spark = SparkSession.builder.getOrCreate()\n",
"\n",
"\n",
"\n",
"publicationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"container\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"conferencedate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"conferenceplace\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"edition\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"ep\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"iss\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnLinking\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{}
,\"name\":\"issnOnline\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnPrinted\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"sp\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"vol\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"meta
data\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name
"datasetSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"me
tadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"geolocation\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"box\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"place\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"point\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"stri
ng\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":
"softwareSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"m
etadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"documentationUrl\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"
type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\
"otherSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contactgroup\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"contactperson\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\"
,\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\
"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\
"datasourceSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"accessrights\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"certificates\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"citationguidelineurl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"databaseaccessrestriction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"datasourcetype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"datauploadrestriction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"dateofvalidation\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"englishname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"journal\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"issnLinking\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnOnline\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnPrinted\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"languages\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"logourl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"missionstatementurl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"officialname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openairecompatibility\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\"
:true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pidsystems\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"policies\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"releasestartdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"uploadrights\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"versioning\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
"organizationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"alternativenames\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"legalname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"legalshortname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
"projectSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"acronym\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"callidentifier\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"enddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"funding\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"funding_stream\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"jurisdiction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"shortName\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"granted\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"currency\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"fundedamount\",\"nullable\":true,\"type\":\"double\"},{\"metadata\":{},\"name\":\"totalcost\",\"nullable\":true,\"type\":\"double\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"h2020programme\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"keywords\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openaccessmandatefordataset\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"openaccessmandateforpublications\",\"nullable\":true,\"type\":\
"boolean\"},{\"metadata\":{},\"name\":\"startdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"summary\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"title\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
"communitySchema = '{\"fields\":[{\"metadata\":{},\"name\":\"acronym\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"zenodo_community\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
"relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of publications 2685793\n",
"number of datasets 128092\n",
"number of software 26992\n",
"number of other research products 22779\n",
"number of results 2863656\n",
"number of datasources 47356\n",
"number of organizations 7411\n",
"number of communities 17\n",
"number of projects 15780\n",
"number of relationships 14004807\n"
]
}
],
"source": [
"# Set the input path: the folder that contains the untarred dataset folders\n",
"# (e.g. '/data/openaire_dump_subset/' when the data has been moved to a cluster).\n",
"inputPath = 'data/'\n",
"\n",
"\n",
"def _read_entity(schema_json, folder):\n",
"    \"\"\"Read the JSON files under inputPath + folder with the schema serialised in schema_json.\"\"\"\n",
"    schema = StructType.fromJson(json.loads(schema_json))\n",
"    return spark.read.schema(schema).json(inputPath + folder)\n",
"\n",
"\n",
"# load entities and relationships\n",
"publication = _read_entity(publicationSchema, 'publication')\n",
"dataset = _read_entity(datasetSchema, 'dataset')\n",
"software = _read_entity(softwareSchema, 'software')\n",
"other = _read_entity(otherSchema, 'otherresearchproduct')\n",
"datasource = _read_entity(datasourceSchema, 'datasource')\n",
"organization = _read_entity(organizationSchema, 'organization')\n",
"project = _read_entity(projectSchema, 'project')\n",
"community = _read_entity(communitySchema, 'communities_infrastructures')\n",
"relation = _read_entity(relationSchema, 'relation')\n",
"\n",
"# results = all four research product types stacked by column name\n",
"results = (\n",
"    publication\n",
"    .unionByName(dataset, allowMissingColumns=True)\n",
"    .unionByName(software, allowMissingColumns=True)\n",
"    .unionByName(other, allowMissingColumns=True)\n",
")\n",
"\n",
"# register every DataFrame as a temporary view so it can be queried with Spark SQL\n",
"views = {\n",
"    \"publications\": publication,\n",
"    \"datasets\": dataset,\n",
"    \"software\": software,\n",
"    \"others\": other,\n",
"    \"results\": results,\n",
"    \"datasources\": datasource,\n",
"    \"organizations\": organization,\n",
"    \"projects\": project,\n",
"    \"communities\": community,\n",
"    \"relations\": relation,\n",
"}\n",
"for view_name, frame in views.items():\n",
"    frame.createOrReplaceTempView(view_name)\n",
"\n",
"# count and print their number\n",
"for label, frame in (\n",
"    (\"number of publications %s\", publication),\n",
"    (\"number of datasets %s\", dataset),\n",
"    (\"number of software %s\", software),\n",
"    (\"number of other research products %s\", other),\n",
"    (\"number of results %s\", results),\n",
"    (\"number of datasources %s\", datasource),\n",
"    (\"number of organizations %s\", organization),\n",
"    (\"number of communities %s\", community),\n",
"    (\"number of projects %s\", project),\n",
"    (\"number of relationships %s\", relation),\n",
"):\n",
"    print(label % frame.count())"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"application/json": {
"author": [
{
"fullname": "Son, D.A.",
"name": "D. A.",
"rank": 1,
"surname": "Son"
},
{
"fullname": "Anh, N.T.N",
"name": "N. T. N.",
"rank": 2,
"surname": "Anh"
},
{
"fullname": "Tung, P.A.",
"name": "P. A.",
"rank": 3,
"surname": "Tung"
}
],
"bestaccessright": {
"code": "c_abf2",
"label": "OPEN",
"scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/"
},
"container": {
"edition": "",
"ep": "",
"iss": "",
"issnLinking": "",
"issnOnline": "0012-835X",
"issnPrinted": "0012-835X",
"name": "East African Medical Journal",
"sp": "",
"vol": ""
},
"contributor": [],
"country": [],
"coverage": [],
"dateofcollection": "2022-09-04T02:25:47.163Z",
"description": [
"Objectives: To understand the physical activity and cognitive impairment among elderly people. Materials and methods: The study was conducted on 1210 elderly people (aged 60 and older) in 3 district in Ha Nam province: Binh Luc, Duy Tien and Kim Bang, from January 2020 to June 2020. Results: The prevalence of elderly people with symptoms of cognitive impairment was 46.36%. There was a significant difference in the rate of cognitive impairment between 2 groups of continuous exercise (41.92%) and the group Non-exercise (51.62%) (p<0.05). Compared to the Non-exercise group, the risk of cognitive impairment of group that exercise 15 - 45 minutes/week (OR = 0.69; 95% CI: 0.51 - 0.93; p = 0.014); group that exercise 45 - 90 minute/week group (OR = 0.61; 95% CI: 0.44 - 0.84; p = 0.003) and group that exercise over 90 minutes/week (OR = 0.43; 95% CI: 0.27 - 0.67; p = 0.000) were all significantly lower. Conclusion: The more time the participants spent exercise, the lower the risk of developing cognitive impairment they get. It is necessary to expand research, continue to apply the international standard cognitive screening test, also guide and encourage the older adults to carry out physical activities according to their capabilities and international recommendations."
],
"format": [
"application/pdf"
],
"id": "50|78975075580c::2ff84f3173897001283274434e8f3eaa",
"instance": [
{
"accessright": {
"code": "c_abf2",
"label": "OPEN",
"scheme": "http://vocabularies.coar-repositories.org/documentation/access_rights/"
},
"alternateIdentifier": [],
"pid": [],
"publicationdate": "2022-08-29",
"refereed": "peerReviewed",
"type": "Article",
"url": [
"https://www.ajol.info/index.php/eamj/article/view/230697"
]
}
],
"language": {
"code": "eng",
"label": "English"
},
"lastupdatetimestamp": 1671492313610,
"maintitle": "Physical activity habits and cognitive impairment in the elderly in some districts of Ha Nam province, Vietnam in 2020",
"originalId": [
"oai:ajol.info:article/230697"
],
"pid": [],
"publicationdate": "2022-08-29",
"publisher": "Kenya Medical Association",
"source": [
"East African Medical Journal; Vol. 99 No. 7 (2022); 5014-5020",
"0012-835X"
],
"subjects": [],
"type": "publication"
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 13,
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# Example of a generic result entity, selected by its OpenAIRE id\n",
"# (documentation: https://graph.openaire.eu/docs/data-model/entities/result)\n",
"pretty_print(\n",
"    json.loads(\n",
"        publication\n",
"        .filter(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\")\n",
"        .toJSON()\n",
"        .first()\n",
"    ),\n",
"    expanded=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"application/json": {
"datasourcetype": {
"scheme": "datarepository::unknown",
"value": "Data Repository"
},
"description": "Scholars Portal Dataverse is a repository of research data in all fields of research. Researchers can share, publish, archive, find and cite data across all research fields. Researchers from subscribing institutions can use Dataverse to directly deposit data, create metadata, release and share data openly or privately, visualize and explore data, and search for data.",
"id": "10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e",
"languages": [],
"officialname": "Scholars Portal Dataverse",
"openairecompatibility": "Not yet registered",
"originalId": [
"fairsharing_::2542",
"opendoar____::10329",
"re3data_____::r3d100010691"
],
"pid": [
{
"scheme": "doi",
"value": "10.25504/FAIRsharing.kwzydf"
},
{
"scheme": "re3data",
"value": "r3d100010691"
}
],
"policies": [],
"releasestartdate": "2012-01-01",
"subjects": [
"Data Management",
"Subject Agnostic",
"Experimental measurement",
"Protocol",
"Data storage"
],
"versioning": false,
"websiteurl": "https://dataverse.scholarsportal.info/"
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 12,
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# Example of a data source entity, selected by its OpenAIRE id\n",
"# (documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)\n",
"pretty_print(\n",
"    json.loads(\n",
"        datasource\n",
"        .filter(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\")\n",
"        .toJSON()\n",
"        .first()\n",
"    ),\n",
"    expanded=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"application/json": {
"alternativenames": [
"Hospital Authority",
"HA"
],
"country": {
"code": "CN",
"label": "China (People's Republic of)"
},
"id": "20|openorgs____::5836463160e0e5d1cd12997f7d2f0257",
"legalname": "Hospital Authority",
"legalshortname": "HA",
"pid": [
{
"scheme": "ISNI",
"value": "0000 0004 1764 4320"
},
{
"scheme": "FundRef",
"value": "501100003808"
},
{
"scheme": "FundRef",
"value": "501100006577"
},
{
"scheme": "GRID",
"value": "grid.414370.5"
},
{
"scheme": "ROR",
"value": "https://ror.org/05sn8t512"
},
{
"scheme": "Wikidata",
"value": "Q5908350"
},
{
"scheme": "ISNI",
"value": "0000 0004 1764 4320"
},
{
"scheme": "FundRef",
"value": "501100003808"
},
{
"scheme": "FundRef",
"value": "501100006577"
},
{
"scheme": "GRID",
"value": "grid.414370.5"
},
{
"scheme": "ROR",
"value": "https://ror.org/05sn8t512"
},
{
"scheme": "Wikidata",
"value": "Q5908350"
}
],
"websiteurl": "http://www.ha.org.hk/visitor/ha_index.asp?Content_ID=0&Lang=ENG&Dimension=100&Ver=HTML"
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 16,
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# Example of an organization entity, selected by its OpenAIRE id\n",
"# (documentation: https://graph.openaire.eu/docs/data-model/entities/organization)\n",
"pretty_print(\n",
"    json.loads(\n",
"        organization\n",
"        .filter(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\")\n",
"        .toJSON()\n",
"        .first()\n",
"    ),\n",
"    expanded=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"application/json": {
"acronym": "BFC",
"code": "ANR-15-IDEX-0003",
"funding": [
{
"jurisdiction": "FR",
"name": "French National Research Agency (ANR)",
"shortName": "ANR"
}
],
"granted": {
"currency": "EUR",
"fundedamount": 49328900,
"totalcost": 0
},
"h2020programme": [],
"id": "40|anr_________::3590b45fea74b726d3c3e9872a2dcbf8",
"openaccessmandatefordataset": false,
"openaccessmandateforpublications": false,
"subject": [],
"summary": "Le projet ISITE-BFC réunit 16 partenaires : Université Bourgogne Franche-Comté (UBFC), porteur du projet ISITE-BFC ; Les 7 établissements membres dUBFC : Université de Bourgogne (uB), Université de Franche-Comté (UFC), Université de Technologie de Belfort-Montbéliard (UTBM) , AgroSup Dijon, Ecole Nationale Supérieure de Mécanique et de Microtechniques (ENSMM), Burgundy School of Business (BSB) et l'Ecole Nationale Supérieure des Arts et Métiers (ENSAM en cours d'intégration en tant qu'établissement membre d'UBFC ). Quatre organismes nationaux de recherche actifs en Bourgogne Franche-Comté (BFC): lINRA, le CNRS, lINSERM et le CEA. Quatre établissements de santé implantés en BFC : le CHRU Besançon, le CHU Dijon, le Centre Georges François Leclerc (CGFL, Centre de Lutte Contre le Cancer) et lEtablissement Français du Sang (EFS). ISITE-BFC a pour but de propulser l'université-cible du projet UBFC sur la scène internationale. UBFC représente en BFC une communauté de 58 000 étudiants et 8 800 personnels dont 2 200 enseignants-chercheurs actifs dans 60 laboratoires et fédérations de recherche. La Région BFC a annoncé que, parmi les établissements d'ESR de BFC, UBFC sera son interlocuteur privilégié en matière de soutien à l'innovation. En 2016, les membres du consortium ont obtenus 62 nouveaux projets nationaux et 31 nouveaux projets internationaux correspondant à des budgets cumulés supérieurs à 14 M€ et à 10 M€ respectivement. En fédérant les établissements membres et en unissant ainsi les forces en matière denseignement supérieur et de recherche, UBFC intègre dès 2016 le classement international des universités proposé par le Times Higher Education entre la 501ème et 600ème place parmi 980 institutions internationales classées. Les premières élections du CA et du CAC d'UBFC ont eu lieu en avril 2016. Le premier CA élu d'UBFC a ensuite choisi le premier président élu d'UBFC. 
Toutes les instances de gouvernance d'UBFC et du projet ISITE-BFC sont devenues opérationnelles en 2016. Le Conseil des Membres, constitué de lensemble des chefs détablissements membres d'UBFC, était bien sûr déjà fonctionnel avant 2016 puisqu'il avait travaillé à la rédaction des statuts de la COMUE UBFC créée en 2015. Durant 2016, conformément aux statuts, ce Conseil a continué à agir comme comité de pilotage de l'Université-Cible UBFC. En automne 2016, en donnant pour instruction à leurs personnels enseignants-chercheurs respectifs de soumettre les projets ANR et H2020 en y définissant UBFC comme porteur légal, les chefs d'établissements membres d'UBFC ont réalisé un engagement important prévu dans le projet ISITE-BFC pour la trajectoire d'UBFC. Le projet ISITE-BFC occupe une place centrale dans l'articulation avec la politique contractuelle du MENESR. ISITE-BFC est en effet au cœur des perspectives décrites dans le rapport HCERES d'UBFC déposé en 2016. Le dialogue contractuel de site avec lEtat a également été initié en 2016 et élaboré principalement durant l'automne 2016. Le contrat de site a été signé en avril 2017 et inscrit également l'ISITE-BFC dans ses objectifs stratégiques. Durant 2016, les messages à l'attention de la communauté des enseignants-chercheurs ont été multiplié pour les inciter à appliquer la signature scientifique unique UBFC. L'objectif annoncé dans la convention de préfinancement a été atteint, à savoir 30 % d'application de la signature scientifique unique UBFC sur la période de juillet 2015 (date de démarrage de l'application de la signature unique UBFC) à fin 2016. Sur la période de janvier à décembre 2016, ce taux est de 48 %. Depuis 2016, chaque personnel membre d'un établissement membre d'UBFC dispose d'une adresse de courrier éléctronique du type prenom.nom@ubfc.fr. Le projet ISITE-BFC a été sélectionné en janvier 2016. Il a bénéficié d'un préfinancement de 5 M€ à partir d'août 2016. 
Dès que l'enveloppe du préfinancement a été connue, le Comit
"title": "ISITE « BFC"
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 17,
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# Example of a project entity\n",
"# (documentation: https://graph.openaire.eu/docs/data-model/entities/project)\n",
"# The record is selected by id, like the other entity examples, so the same project is\n",
"# displayed on every run; an unfiltered first() returns whichever record Spark reads first.\n",
"pretty_print(\n",
"    json.loads(\n",
"        project\n",
"        .where(\"id='40|anr_________::3590b45fea74b726d3c3e9872a2dcbf8'\")\n",
"        .toJSON()\n",
"        .first()\n",
"    ),\n",
"    expanded=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"application/json": {
"acronym": "mes",
"description": "This community was initially defined to include a very broad range of topics, with the intention to generate a number of more focused and sustainable dashboards for research communities and initiatives. As outlined in the logo of this community, we intend to setup a community dashboard for EuroMarine (a consortium of 56 research and academic organisations) and monitoring dashboards for marine research initiatives, including infrastructures (e.g. EMBRC & EMSO), advisory boards (e.g. Marine Boards & ICES), and transnational funding bodies (e.g. JPI-Oceans and Tara Foundation).",
"id": "00|context_____::d2db8a610f8c7c0785d2d92a6e8c450e",
"name": "European Marine Science",
"subject": [
"marine",
"ocean",
"fish",
"aqua",
"sea"
],
"type": "Research Community",
"zenodo_community": "https://zenodo.org/communities/oac_mes"
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 18,
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# Example of a community entity, selected by its acronym\n",
"# (documentation: https://graph.openaire.eu/docs/data-model/entities/community)\n",
"pretty_print(\n",
"    json.loads(\n",
"        community\n",
"        .filter(\"acronym='mes'\")\n",
"        .toJSON()\n",
"        .first()\n",
"    ),\n",
"    expanded=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"application/json": {
"provenance": {
"provenance": "Harvested",
"trust": "0.9"
},
"reltype": {
"name": "isHostedBy",
"type": "provision"
},
"source": {
"id": "50|doi_________::536dffbcf19e4d1f48f99bfb0d86d2e1",
"type": "result"
},
"target": {
"id": "10|doajarticles::00f13fb5bcb74cf81c03e783bff91faf",
"type": "datasource"
},
"validated": false
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"execution_count": 19,
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "execute_result"
}
],
"source": [
"# Example of a relationship: the first record returned by the relation DataFrame\n",
"# (documentation: https://graph.openaire.eu/docs/data-model/relationships)\n",
"pretty_print(\n",
"    json.loads(\n",
"        relation\n",
"        .toJSON()\n",
"        .first()\n",
"    ),\n",
"    expanded=True,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>isProvidedBy</td>\n",
" <td>3534319</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>provides</td>\n",
" <td>3534312</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>hosts</td>\n",
" <td>2696438</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>isHostedBy</td>\n",
" <td>2696436</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>IsRelatedTo</td>\n",
" <td>399737</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>isAuthorInstitutionOf</td>\n",
" <td>231642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>hasAuthorInstitution</td>\n",
" <td>231642</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>IsCitedBy</td>\n",
" <td>174058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Cites</td>\n",
" <td>174058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>HasVersion</td>\n",
" <td>44402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>IsVersionOf</td>\n",
" <td>44402</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>isProducedBy</td>\n",
" <td>38672</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>produces</td>\n",
" <td>38671</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>IsPartOf</td>\n",
" <td>34520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>HasPart</td>\n",
" <td>34520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>hasParticipant</td>\n",
" <td>31035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>isParticipant</td>\n",
" <td>31035</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>IsIdenticalTo</td>\n",
" <td>12974</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>HasAmongTopNSimilarDocuments</td>\n",
" <td>5903</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>IsAmongTopNSimilarDocuments</td>\n",
" <td>5903</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name count\n",
"0 isProvidedBy 3534319\n",
"1 provides 3534312\n",
"2 hosts 2696438\n",
"3 isHostedBy 2696436\n",
"4 IsRelatedTo 399737\n",
"5 isAuthorInstitutionOf 231642\n",
"6 hasAuthorInstitution 231642\n",
"7 IsCitedBy 174058\n",
"8 Cites 174058\n",
"9 HasVersion 44402\n",
"10 IsVersionOf 44402\n",
"11 isProducedBy 38672\n",
"12 produces 38671\n",
"13 IsPartOf 34520\n",
"14 HasPart 34520\n",
"15 hasParticipant 31035\n",
"16 isParticipant 31035\n",
"17 IsIdenticalTo 12974\n",
"18 HasAmongTopNSimilarDocuments 5903\n",
"19 IsAmongTopNSimilarDocuments 5903"
]
},
"execution_count": 22,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top 20 relationship types by number of relations.\n",
"# The type name is a secondary sort key so that types with equal counts\n",
"# come back in the same order on every run.\n",
"query = \"\"\"\n",
"SELECT reltype.name,\n",
"       COUNT(*) AS count\n",
"FROM relations\n",
"GROUP BY reltype.name\n",
"ORDER BY count DESC, name\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>subject term</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>General Medicine</td>\n",
" <td>242423</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>Electrical and Electronic Engineering</td>\n",
" <td>66295</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>General Materials Science</td>\n",
" <td>62012</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>General Chemistry</td>\n",
" <td>56444</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Biochemistry</td>\n",
" <td>52956</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>Computer Science Applications</td>\n",
" <td>52099</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>Mechanical Engineering</td>\n",
" <td>46967</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Condensed Matter Physics</td>\n",
" <td>46413</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Surgery</td>\n",
" <td>42772</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>General Environmental Science</td>\n",
" <td>41371</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Public Health, Environmental and Occupational ...</td>\n",
" <td>40836</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>FOS: Computer and information sciences</td>\n",
" <td>40609</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Oncology</td>\n",
" <td>40491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Molecular Biology</td>\n",
" <td>39883</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>General Engineering</td>\n",
" <td>39537</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>FOS: Physical sciences</td>\n",
" <td>39021</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Social and Behavioral Sciences</td>\n",
" <td>38058</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Renewable Energy, Sustainability and the Envir...</td>\n",
" <td>36529</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Education</td>\n",
" <td>36364</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Materials Chemistry</td>\n",
" <td>35187</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" subject term count\n",
"0 General Medicine 242423\n",
"1 Electrical and Electronic Engineering 66295\n",
"2 General Materials Science 62012\n",
"3 General Chemistry 56444\n",
"4 Biochemistry 52956\n",
"5 Computer Science Applications 52099\n",
"6 Mechanical Engineering 46967\n",
"7 Condensed Matter Physics 46413\n",
"8 Surgery 42772\n",
"9 General Environmental Science 41371\n",
"10 Public Health, Environmental and Occupational ... 40836\n",
"11 FOS: Computer and information sciences 40609\n",
"12 Oncology 40491\n",
"13 Molecular Biology 39883\n",
"14 General Engineering 39537\n",
"15 FOS: Physical sciences 39021\n",
"16 Social and Behavioral Sciences 38058\n",
"17 Renewable Energy, Sustainability and the Envir... 36529\n",
"18 Education 36364\n",
"19 Materials Chemistry 35187"
]
},
"execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top 20 subject terms by number of occurrences across publications.\n",
"# Each publication's subjects array is exploded into one row per term value.\n",
"# The term is a secondary sort key so that equal counts are ordered the same way on every run.\n",
"query = \"\"\"\n",
"WITH terms AS (\n",
"    SELECT EXPLODE(subjects.subject.value) AS term\n",
"    FROM publications\n",
")\n",
"SELECT term AS `subject term`,\n",
"       COUNT(*) AS count\n",
"FROM terms\n",
"GROUP BY term\n",
"ORDER BY count DESC, `subject term`\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>left</th>\n",
" <th>right</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>business</td>\n",
" <td>business.industry</td>\n",
" <td>12625</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>business.industry</td>\n",
" <td>medicine.medical_specialty</td>\n",
" <td>5327</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>business</td>\n",
" <td>medicine.medical_specialty</td>\n",
" <td>5323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>business.industry</td>\n",
" <td>medicine</td>\n",
" <td>5190</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>business</td>\n",
" <td>medicine</td>\n",
" <td>5187</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>medicine</td>\n",
" <td>medicine.medical_specialty</td>\n",
" <td>4396</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>business.industry</td>\n",
" <td>medicine.disease</td>\n",
" <td>3997</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>business</td>\n",
" <td>medicine.disease</td>\n",
" <td>3994</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>medicine</td>\n",
" <td>medicine.disease</td>\n",
" <td>3754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>Computer science</td>\n",
" <td>business</td>\n",
" <td>3275</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Computer science</td>\n",
" <td>business.industry</td>\n",
" <td>3239</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>media_common</td>\n",
" <td>media_common.quotation_subject</td>\n",
" <td>3234</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>medicine.disease</td>\n",
" <td>medicine.medical_specialty</td>\n",
" <td>3153</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Medicine</td>\n",
" <td>business</td>\n",
" <td>2630</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Medicine</td>\n",
" <td>business.industry</td>\n",
" <td>2630</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Artificial intelligence</td>\n",
" <td>business.industry</td>\n",
" <td>1758</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Artificial intelligence</td>\n",
" <td>business</td>\n",
" <td>1754</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Internal medicine</td>\n",
" <td>medicine.medical_specialty</td>\n",
" <td>1715</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>Internal medicine</td>\n",
" <td>business</td>\n",
" <td>1670</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Internal medicine</td>\n",
" <td>business.industry</td>\n",
" <td>1670</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" left right count\n",
"0 business business.industry 12625\n",
"1 business.industry medicine.medical_specialty 5327\n",
"2 business medicine.medical_specialty 5323\n",
"3 business.industry medicine 5190\n",
"4 business medicine 5187\n",
"5 medicine medicine.medical_specialty 4396\n",
"6 business.industry medicine.disease 3997\n",
"7 business medicine.disease 3994\n",
"8 medicine medicine.disease 3754\n",
"9 Computer science business 3275\n",
"10 Computer science business.industry 3239\n",
"11 media_common media_common.quotation_subject 3234\n",
"12 medicine.disease medicine.medical_specialty 3153\n",
"13 Medicine business 2630\n",
"14 Medicine business.industry 2630\n",
"15 Artificial intelligence business.industry 1758\n",
"16 Artificial intelligence business 1754\n",
"17 Internal medicine medicine.medical_specialty 1715\n",
"18 Internal medicine business 1670\n",
"19 Internal medicine business.industry 1670"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top 20 pairs of subject terms that appear together on the same publication.\n",
"# - subjects whose scheme is 'keyword' are excluded;\n",
"# - `l.subject < r.subject` keeps each unordered pair once and drops self-pairs;\n",
"# - the aliases `left` and `right` are back-quoted because LEFT and RIGHT are SQL keywords;\n",
"# - the pair itself is a secondary sort key so equal counts are ordered the same way on every run.\n",
"query = \"\"\"\n",
"WITH exploded AS (\n",
"    SELECT id, EXPLODE(subjects.subject) AS subject\n",
"    FROM publications\n",
"),\n",
"subjects AS (\n",
"    SELECT id, subject.value AS subject\n",
"    FROM exploded\n",
"    WHERE subject.scheme != 'keyword'\n",
")\n",
"SELECT l.subject AS `left`,\n",
"       r.subject AS `right`,\n",
"       COUNT(*) AS count\n",
"FROM subjects AS l\n",
"JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n",
"GROUP BY l.subject, r.subject\n",
"ORDER BY count DESC, `left`, `right`\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>conferencedate</th>\n",
" <th>conferenceplace</th>\n",
" <th>edition</th>\n",
" <th>ep</th>\n",
" <th>iss</th>\n",
" <th>issnLinking</th>\n",
" <th>issnOnline</th>\n",
" <th>issnPrinted</th>\n",
" <th>name</th>\n",
" <th>sp</th>\n",
" <th>vol</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td></td>\n",
" <td>0012-835X</td>\n",
" <td>0012-835X</td>\n",
" <td>East African Medical Journal</td>\n",
" <td></td>\n",
" <td></td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0032-5910</td>\n",
" <td>Powder Technology</td>\n",
" <td>117586</td>\n",
" <td>406</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1110-8460</td>\n",
" <td>None</td>\n",
" <td>المجلة العلمیة لعلوم وفنون الریاضة</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1319</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0883-5403</td>\n",
" <td>The Journal of Arthroplasty</td>\n",
" <td>1314</td>\n",
" <td>37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>837</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1435-8115</td>\n",
" <td>1431-9276</td>\n",
" <td>Microscopy and Microanalysis</td>\n",
" <td>836</td>\n",
" <td>28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>42133</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1944-8252</td>\n",
" <td>1944-8244</td>\n",
" <td>ACS Applied Materials &amp;amp; Interfaces</td>\n",
" <td>42123</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0272-8842</td>\n",
" <td>Ceramics International</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1023</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>0020-0255</td>\n",
" <td>Information Sciences</td>\n",
" <td>994</td>\n",
" <td>612</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2632-959X</td>\n",
" <td>None</td>\n",
" <td>Nano Express</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1863-4613</td>\n",
" <td>1865-1704</td>\n",
" <td>International Review of Economics</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2651-4141</td>\n",
" <td>Ankara Hacı Bayram Veli Üniversitesi Hukuk Fak...</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2107-0180</td>\n",
" <td>0378-7966</td>\n",
" <td>European Journal of Drug Metabolism and Pharma...</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1742-6596</td>\n",
" <td>1742-6588</td>\n",
" <td>Journal of Physics: Conference Series</td>\n",
" <td>012008</td>\n",
" <td>2304</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>9454</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2574-0962</td>\n",
" <td>2574-0962</td>\n",
" <td>ACS Applied Energy Materials</td>\n",
" <td>9447</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2022 International Conference on Intelligent C...</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1321</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2093-6311</td>\n",
" <td>1598-2351</td>\n",
" <td>International Journal of Steel Structures</td>\n",
" <td>1306</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>1475-4762</td>\n",
" <td>0004-0894</td>\n",
" <td>Area</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2326-831X</td>\n",
" <td>2326-8298</td>\n",
" <td>Annual Review of Statistics and Its Application</td>\n",
" <td>None</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>Spintronics XV</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>None</td>\n",
" <td>2072-6694</td>\n",
" <td>None</td>\n",
" <td>Cancers</td>\n",
" <td>3291</td>\n",
" <td>14</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" conferencedate conferenceplace edition ep iss issnLinking issnOnline \n",
"0 None None 0012-835X \\\n",
"1 None None None None None None None \n",
"2 None None None 0 None None 1110-8460 \n",
"3 None None None 1319 None None None \n",
"4 None None None 837 None None 1435-8115 \n",
"5 None None None 42133 None None 1944-8252 \n",
"6 None None None None None None None \n",
"7 None None None 1023 None None None \n",
"8 None None None None None None 2632-959X \n",
"9 None None None None None None 1863-4613 \n",
"10 None None None None None None None \n",
"11 None None None None None None 2107-0180 \n",
"12 None None None None None None 1742-6596 \n",
"13 None None None 9454 None None 2574-0962 \n",
"14 None None None None None None None \n",
"15 None None None 1321 None None 2093-6311 \n",
"16 None None None None None None 1475-4762 \n",
"17 None None None None None None 2326-831X \n",
"18 None None None None None None None \n",
"19 None None None None None None 2072-6694 \n",
"\n",
" issnPrinted name sp \n",
"0 0012-835X East African Medical Journal \\\n",
"1 0032-5910 Powder Technology 117586 \n",
"2 None المجلة العلمیة لعلوم وفنون الریاضة 0 \n",
"3 0883-5403 The Journal of Arthroplasty 1314 \n",
"4 1431-9276 Microscopy and Microanalysis 836 \n",
"5 1944-8244 ACS Applied Materials &amp; Interfaces 42123 \n",
"6 0272-8842 Ceramics International None \n",
"7 0020-0255 Information Sciences 994 \n",
"8 None Nano Express None \n",
"9 1865-1704 International Review of Economics None \n",
"10 2651-4141 Ankara Hacı Bayram Veli Üniversitesi Hukuk Fak... None \n",
"11 0378-7966 European Journal of Drug Metabolism and Pharma... None \n",
"12 1742-6588 Journal of Physics: Conference Series 012008 \n",
"13 2574-0962 ACS Applied Energy Materials 9447 \n",
"14 None 2022 International Conference on Intelligent C... None \n",
"15 1598-2351 International Journal of Steel Structures 1306 \n",
"16 0004-0894 Area None \n",
"17 2326-8298 Annual Review of Statistics and Its Application None \n",
"18 None Spintronics XV None \n",
"19 None Cancers 3291 \n",
"\n",
" vol \n",
"0 \n",
"1 406 \n",
"2 0 \n",
"3 37 \n",
"4 28 \n",
"5 14 \n",
"6 None \n",
"7 612 \n",
"8 None \n",
"9 None \n",
"10 None \n",
"11 None \n",
"12 2304 \n",
"13 5 \n",
"14 None \n",
"15 22 \n",
"16 None \n",
"17 10 \n",
"18 None \n",
"19 14 "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Expand the fields of the `container` struct for up to 20 publications that have one.\n",
"query = \"\"\"SELECT container.* \n",
"FROM publications \n",
"WHERE container IS NOT NULL\"\"\"\n",
"(\n",
"    spark.sql(query)\n",
"    .limit(20)\n",
"    .toPandas()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>name</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>Scientific Reports</td>\n",
" <td>8152</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>SSRN Electronic Journal</td>\n",
" <td>8061</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>Blood</td>\n",
" <td>6527</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>PLOS ONE</td>\n",
" <td>6206</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>Cureus</td>\n",
" <td>5636</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>International Journal of Molecular Sciences</td>\n",
" <td>4793</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>International Journal of Environmental Researc...</td>\n",
" <td>4466</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>Academy of Management Proceedings</td>\n",
" <td>4391</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Sustainability</td>\n",
" <td>4334</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>ECS Meeting Abstracts</td>\n",
" <td>4235</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>Research, Society and Development</td>\n",
" <td>4042</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>Frontiers in Immunology</td>\n",
" <td>3750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Frontiers in Psychology</td>\n",
" <td>3667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Science of The Total Environment</td>\n",
" <td>3630</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>International journal of health sciences</td>\n",
" <td>3592</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Frontiers in Oncology</td>\n",
" <td>3562</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>European Heart Journal</td>\n",
" <td>3358</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Applied Sciences</td>\n",
" <td>3111</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>IOP Conference Series: Earth and Environmental...</td>\n",
" <td>3047</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>Journal of Cleaner Production</td>\n",
" <td>3030</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" name count\n",
"0 Scientific Reports 8152\n",
"1 SSRN Electronic Journal 8061\n",
"2 Blood 6527\n",
"3 PLOS ONE 6206\n",
"4 Cureus 5636\n",
"5 International Journal of Molecular Sciences 4793\n",
"6 International Journal of Environmental Researc... 4466\n",
"7 Academy of Management Proceedings 4391\n",
"8 Sustainability 4334\n",
"9 ECS Meeting Abstracts 4235\n",
"10 Research, Society and Development 4042\n",
"11 Frontiers in Immunology 3750\n",
"12 Frontiers in Psychology 3667\n",
"13 Science of The Total Environment 3630\n",
"14 International journal of health sciences 3592\n",
"15 Frontiers in Oncology 3562\n",
"16 European Heart Journal 3358\n",
"17 Applied Sciences 3111\n",
"18 IOP Conference Series: Earth and Environmental... 3047\n",
"19 Journal of Cleaner Production 3030"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count publications per container name and show the 20 names with the highest counts.\n",
"query = \"\"\"WITH journals AS (\n",
" SELECT container.* FROM publications WHERE container IS NOT NULL\n",
")\n",
"SELECT name, \n",
" count(*) AS count \n",
"FROM journals \n",
"GROUP BY name \n",
"ORDER BY count DESC\"\"\"\n",
"(\n",
"    spark.sql(query)\n",
"    .limit(20)\n",
"    .toPandas()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>project</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>NSERC - unidentified - unidentified</td>\n",
" <td>5817</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CIHR - unidentified - unidentified</td>\n",
" <td>2216</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>SSHRC - unidentified - unidentified</td>\n",
" <td>1044</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>EC - 822336 - Representation and Preservation ...</td>\n",
" <td>921</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>WT - unidentified - unidentified</td>\n",
" <td>588</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>EC - 773830 - Promoting One Health in Europe t...</td>\n",
" <td>155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>EC - 786314 - Continuity and Rupture in Centra...</td>\n",
" <td>60</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>EC - 633053 - Implementation of activities des...</td>\n",
" <td>55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>EC - 881603 - Graphene Flagship Core Project 3</td>\n",
" <td>47</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>EC - 945539 - Human Brain Project Specific Gra...</td>\n",
" <td>46</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>EC - 824093 - The strong interaction at the fr...</td>\n",
" <td>41</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>EC - 872522 - Expanding our knowledge on Citiz...</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>EC - 823717 - Enabling Science and Technology ...</td>\n",
" <td>40</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>EC - 900014 - Fracture mechanics testing of ir...</td>\n",
" <td>38</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>EC - 823914 - Advanced Research Infrastructure...</td>\n",
" <td>37</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>EC - 733032 - European Human Biomonitoring Ini...</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>NSF - 1852977 - The Management and Operation o...</td>\n",
" <td>31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>EC - 776613 - European Climate Prediction system</td>\n",
" <td>31</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>EC - 776816 - Project Ô: demonstration of plan...</td>\n",
" <td>30</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>EC - 812880 - Joint PhD Laboratory for New Mat...</td>\n",
" <td>30</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" project count\n",
"0 NSERC - unidentified - unidentified 5817\n",
"1 CIHR - unidentified - unidentified 2216\n",
"2 SSHRC - unidentified - unidentified 1044\n",
"3 EC - 822336 - Representation and Preservation ... 921\n",
"4 WT - unidentified - unidentified 588\n",
"5 EC - 773830 - Promoting One Health in Europe t... 155\n",
"6 EC - 786314 - Continuity and Rupture in Centra... 60\n",
"7 EC - 633053 - Implementation of activities des... 55\n",
"8 EC - 881603 - Graphene Flagship Core Project 3 47\n",
"9 EC - 945539 - Human Brain Project Specific Gra... 46\n",
"10 EC - 824093 - The strong interaction at the fr... 41\n",
"11 EC - 872522 - Expanding our knowledge on Citiz... 40\n",
"12 EC - 823717 - Enabling Science and Technology ... 40\n",
"13 EC - 900014 - Fracture mechanics testing of ir... 38\n",
"14 EC - 823914 - Advanced Research Infrastructure... 37\n",
"15 EC - 733032 - European Human Biomonitoring Ini... 32\n",
"16 NSF - 1852977 - The Management and Operation o... 31\n",
"17 EC - 776613 - European Climate Prediction system 31\n",
"18 EC - 776816 - Project Ô: demonstration of plan... 30\n",
"19 EC - 812880 - Joint PhD Laboratory for New Mat... 30"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top 20 projects by number of 'produces' relations whose source is the project.\n",
"# Label format: '<funder short names> - <project code> - <first 50 characters of the title>'.\n",
"# SUBSTRING starts at position 1 because SQL string positions are 1-based.\n",
"query = \"\"\"\n",
"SELECT CONCAT_WS(' - ',\n",
"                 IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'),\n",
"                 COALESCE(code, '-'),\n",
"                 SUBSTRING(title, 1, 50)) AS project,\n",
"       COUNT(*) AS count\n",
"FROM projects\n",
"JOIN relations\n",
"  ON projects.id = relations.source.id\n",
" AND reltype.name = 'produces'\n",
"GROUP BY project\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organization</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>CNRS</td>\n",
" <td>638</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>UH</td>\n",
" <td>579</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>CSIC</td>\n",
" <td>379</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>FHG</td>\n",
" <td>322</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>CNR</td>\n",
" <td>317</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>UCL</td>\n",
" <td>310</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>ETH Zurich</td>\n",
" <td>300</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>MPG</td>\n",
" <td>299</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>THE CHANCELLOR, MASTERS AND SCHOLARS OF THE UN...</td>\n",
" <td>271</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>CEA</td>\n",
" <td>267</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>KUL</td>\n",
" <td>255</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>UOXF</td>\n",
" <td>249</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>DTU</td>\n",
" <td>209</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Delft University of Technology</td>\n",
" <td>207</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>UCPH</td>\n",
" <td>203</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>Imperial</td>\n",
" <td>203</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>University of Edinburgh</td>\n",
" <td>181</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>Aalto University</td>\n",
" <td>180</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>AU</td>\n",
" <td>177</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>EPFL</td>\n",
" <td>172</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" organization count\n",
"0 CNRS 638\n",
"1 UH 579\n",
"2 CSIC 379\n",
"3 FHG 322\n",
"4 CNR 317\n",
"5 UCL 310\n",
"6 ETH Zurich 300\n",
"7 MPG 299\n",
"8 THE CHANCELLOR, MASTERS AND SCHOLARS OF THE UN... 271\n",
"9 CEA 267\n",
"10 KUL 255\n",
"11 UOXF 249\n",
"12 DTU 209\n",
"13 Delft University of Technology 207\n",
"14 UCPH 203\n",
"15 Imperial 203\n",
"16 University of Edinburgh 181\n",
"17 Aalto University 180\n",
"18 AU 177\n",
"19 EPFL 172"
]
},
"execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top 20 organizations by number of 'isParticipant' relations whose source is the organization.\n",
"# NOTE(review): rows are grouped by the displayed name (short name, else legal name), so\n",
"# organizations that share a name would be counted together - confirm this is intended.\n",
"query = \"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
"GROUP BY organization \n",
"ORDER BY count DESC\"\"\"\n",
"(\n",
"    spark.sql(query)\n",
"    .limit(20)\n",
"    .toPandas()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organization</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>UPV</td>\n",
" <td>4980</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>UL</td>\n",
" <td>4819</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>University of Oxford</td>\n",
" <td>3859</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>University of Cambridge</td>\n",
" <td>3670</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>UPC</td>\n",
" <td>3041</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>ULP</td>\n",
" <td>2855</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>AMU</td>\n",
" <td>2624</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>KUL</td>\n",
" <td>2582</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>UB</td>\n",
" <td>2576</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>University of Zagreb</td>\n",
" <td>2555</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>AAU</td>\n",
" <td>2522</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>University of California System</td>\n",
" <td>2497</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>University of Edinburgh</td>\n",
" <td>2422</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Andalas University</td>\n",
" <td>2350</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>Amsterdam UMC</td>\n",
" <td>2323</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>ETH Zurich</td>\n",
" <td>2276</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>UPM</td>\n",
" <td>2191</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>INRIA</td>\n",
" <td>2096</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>UH</td>\n",
" <td>2082</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>VUA</td>\n",
" <td>1982</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" organization count\n",
"0 UPV 4980\n",
"1 UL 4819\n",
"2 University of Oxford 3859\n",
"3 University of Cambridge 3670\n",
"4 UPC 3041\n",
"5 ULP 2855\n",
"6 AMU 2624\n",
"7 KUL 2582\n",
"8 UB 2576\n",
"9 University of Zagreb 2555\n",
"10 AAU 2522\n",
"11 University of California System 2497\n",
"12 University of Edinburgh 2422\n",
"13 Andalas University 2350\n",
"14 Amsterdam UMC 2323\n",
"15 ETH Zurich 2276\n",
"16 UPM 2191\n",
"17 INRIA 2096\n",
"18 UH 2082\n",
"19 VUA 1982"
]
},
"execution_count": 40,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top 20 organizations by number of 'isAuthorInstitutionOf' relations whose source is the organization.\n",
"# NOTE(review): rows are grouped by the displayed name (short name, else legal name), so\n",
"# organizations that share a name would be counted together - confirm this is intended.\n",
"query = \"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n",
"GROUP BY organization\n",
"ORDER BY count DESC\"\"\"\n",
"(\n",
"    spark.sql(query)\n",
"    .limit(20)\n",
"    .toPandas()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organization</th>\n",
" <th>publication</th>\n",
" <th>dataset</th>\n",
" <th>software</th>\n",
" <th>other</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>UPV</td>\n",
" <td>4974</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>UL</td>\n",
" <td>4493</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>326</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>University of Oxford</td>\n",
" <td>3711</td>\n",
" <td>104</td>\n",
" <td>0</td>\n",
" <td>44</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>University of Cambridge</td>\n",
" <td>3468</td>\n",
" <td>99</td>\n",
" <td>4</td>\n",
" <td>99</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>UPC</td>\n",
" <td>3023</td>\n",
" <td>6</td>\n",
" <td>0</td>\n",
" <td>12</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>ULP</td>\n",
" <td>2822</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>33</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>AMU</td>\n",
" <td>2567</td>\n",
" <td>8</td>\n",
" <td>1</td>\n",
" <td>48</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>University of Zagreb</td>\n",
" <td>2509</td>\n",
" <td>3</td>\n",
" <td>0</td>\n",
" <td>43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>University of California System</td>\n",
" <td>2483</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>14</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>AAU</td>\n",
" <td>2470</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>50</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>UB</td>\n",
" <td>2432</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>143</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>University of Edinburgh</td>\n",
" <td>2414</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>Andalas University</td>\n",
" <td>2342</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>8</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>Amsterdam UMC</td>\n",
" <td>2323</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>UPM</td>\n",
" <td>2188</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>ETH Zurich</td>\n",
" <td>2186</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>90</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>INRIA</td>\n",
" <td>2068</td>\n",
" <td>0</td>\n",
" <td>7</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>KUL</td>\n",
" <td>2060</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>521</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>INSERM</td>\n",
" <td>1954</td>\n",
" <td>0</td>\n",
" <td>3</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>VUA</td>\n",
" <td>1945</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>37</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" organization publication dataset software other\n",
"0 UPV 4974 6 0 0\n",
"1 UL 4493 0 0 326\n",
"2 University of Oxford 3711 104 0 44\n",
"3 University of Cambridge 3468 99 4 99\n",
"4 UPC 3023 6 0 12\n",
"5 ULP 2822 0 0 33\n",
"6 AMU 2567 8 1 48\n",
"7 University of Zagreb 2509 3 0 43\n",
"8 University of California System 2483 0 0 14\n",
"9 AAU 2470 1 1 50\n",
"10 UB 2432 0 1 143\n",
"11 University of Edinburgh 2414 1 1 6\n",
"12 Andalas University 2342 0 0 8\n",
"13 Amsterdam UMC 2323 0 0 0\n",
"14 UPM 2188 0 0 3\n",
"15 ETH Zurich 2186 0 0 90\n",
"16 INRIA 2068 0 7 21\n",
"17 KUL 2060 0 1 521\n",
"18 INSERM 1954 0 3 5\n",
"19 VUA 1945 0 0 37"
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Research products per author institution, split by product type; top 20 by publication count.\n",
"# Every JOIN carries its own ON clause so that no join step is left without a condition\n",
"# (an unconditioned JOIN is an implicit cross join that only the optimizer can rescue).\n",
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization,\n",
"       COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n",
"       COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n",
"       COUNT(IF(type = 'software', 1, NULL)) AS software,\n",
"       COUNT(IF(type = 'other', 1, NULL)) AS other\n",
"FROM organizations\n",
"JOIN relations\n",
"  ON organizations.id = relations.source.id\n",
" AND reltype.name = 'isAuthorInstitutionOf'\n",
"JOIN results\n",
"  ON results.id = relations.target.id\n",
"GROUP BY organization\n",
"ORDER BY publication DESC\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>organization</th>\n",
" <th>open</th>\n",
" <th>embargo</th>\n",
" <th>closed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>UPV</td>\n",
" <td>4770</td>\n",
" <td>11</td>\n",
" <td>199</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>UL</td>\n",
" <td>4603</td>\n",
" <td>67</td>\n",
" <td>144</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>University of Oxford</td>\n",
" <td>2999</td>\n",
" <td>837</td>\n",
" <td>21</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>UPC</td>\n",
" <td>2607</td>\n",
" <td>158</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>KUL</td>\n",
" <td>2518</td>\n",
" <td>15</td>\n",
" <td>28</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>UB</td>\n",
" <td>2481</td>\n",
" <td>19</td>\n",
" <td>58</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>University of California System</td>\n",
" <td>2450</td>\n",
" <td>1</td>\n",
" <td>32</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>University of Edinburgh</td>\n",
" <td>2394</td>\n",
" <td>1</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>Andalas University</td>\n",
" <td>2350</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>ETH Zurich</td>\n",
" <td>2251</td>\n",
" <td>12</td>\n",
" <td>10</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>University of Zagreb</td>\n",
" <td>2191</td>\n",
" <td>141</td>\n",
" <td>13</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>UH</td>\n",
" <td>2060</td>\n",
" <td>0</td>\n",
" <td>17</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>UPM</td>\n",
" <td>1997</td>\n",
" <td>35</td>\n",
" <td>11</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>ULP</td>\n",
" <td>1977</td>\n",
" <td>679</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>University of Cambridge</td>\n",
" <td>1944</td>\n",
" <td>12</td>\n",
" <td>181</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>University of Copenhagen</td>\n",
" <td>1896</td>\n",
" <td>0</td>\n",
" <td>5</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>Amsterdam UMC</td>\n",
" <td>1870</td>\n",
" <td>15</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>CSIC</td>\n",
" <td>1585</td>\n",
" <td>2</td>\n",
" <td>77</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>VUA</td>\n",
" <td>1503</td>\n",
" <td>10</td>\n",
" <td>55</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>UWO</td>\n",
" <td>1427</td>\n",
" <td>0</td>\n",
" <td>25</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" organization open embargo closed\n",
"0 UPV 4770 11 199\n",
"1 UL 4603 67 144\n",
"2 University of Oxford 2999 837 21\n",
"3 UPC 2607 158 3\n",
"4 KUL 2518 15 28\n",
"5 UB 2481 19 58\n",
"6 University of California System 2450 1 32\n",
"7 University of Edinburgh 2394 1 10\n",
"8 Andalas University 2350 0 0\n",
"9 ETH Zurich 2251 12 10\n",
"10 University of Zagreb 2191 141 13\n",
"11 UH 2060 0 17\n",
"12 UPM 1997 35 11\n",
"13 ULP 1977 679 1\n",
"14 University of Cambridge 1944 12 181\n",
"15 University of Copenhagen 1896 0 5\n",
"16 Amsterdam UMC 1870 15 3\n",
"17 CSIC 1585 2 77\n",
"18 VUA 1503 10 55\n",
"19 UWO 1427 0 25"
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Best access right of research products per author institution; top 20 by number of OPEN products.\n",
"# Every JOIN carries its own ON clause so that no join step is left without a condition\n",
"# (an unconditioned JOIN is an implicit cross join that only the optimizer can rescue).\n",
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization,\n",
"       COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
"       COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
"       COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations\n",
"JOIN relations\n",
"  ON organizations.id = relations.source.id\n",
" AND reltype.name = 'isAuthorInstitutionOf'\n",
"JOIN results\n",
"  ON results.id = relations.target.id\n",
"GROUP BY organization\n",
"ORDER BY open DESC\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>country</th>\n",
" <th>open</th>\n",
" <th>embargo</th>\n",
" <th>closed</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>ES</td>\n",
" <td>23724</td>\n",
" <td>309</td>\n",
" <td>618</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GB</td>\n",
" <td>21034</td>\n",
" <td>1044</td>\n",
" <td>994</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>DE</td>\n",
" <td>15356</td>\n",
" <td>368</td>\n",
" <td>2772</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>US</td>\n",
" <td>11900</td>\n",
" <td>36</td>\n",
" <td>5577</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>FR</td>\n",
" <td>9348</td>\n",
" <td>176</td>\n",
" <td>3779</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>CH</td>\n",
" <td>6908</td>\n",
" <td>136</td>\n",
" <td>536</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>PT</td>\n",
" <td>6221</td>\n",
" <td>814</td>\n",
" <td>57</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>HR</td>\n",
" <td>5944</td>\n",
" <td>157</td>\n",
" <td>35</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>BE</td>\n",
" <td>5636</td>\n",
" <td>245</td>\n",
" <td>412</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>FI</td>\n",
" <td>5421</td>\n",
" <td>35</td>\n",
" <td>43</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>IT</td>\n",
" <td>4989</td>\n",
" <td>119</td>\n",
" <td>1658</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>NL</td>\n",
" <td>4963</td>\n",
" <td>29</td>\n",
" <td>240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>DK</td>\n",
" <td>4651</td>\n",
" <td>56</td>\n",
" <td>491</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>SI</td>\n",
" <td>4642</td>\n",
" <td>67</td>\n",
" <td>398</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>CO</td>\n",
" <td>4124</td>\n",
" <td>59</td>\n",
" <td>155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>ID</td>\n",
" <td>4060</td>\n",
" <td>0</td>\n",
" <td>9</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>SE</td>\n",
" <td>3690</td>\n",
" <td>1</td>\n",
" <td>93</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>CA</td>\n",
" <td>3458</td>\n",
" <td>66</td>\n",
" <td>778</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>NO</td>\n",
" <td>3338</td>\n",
" <td>2</td>\n",
" <td>56</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>TR</td>\n",
" <td>2759</td>\n",
" <td>139</td>\n",
" <td>1397</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" country open embargo closed\n",
"0 ES 23724 309 618\n",
"1 GB 21034 1044 994\n",
"2 DE 15356 368 2772\n",
"3 US 11900 36 5577\n",
"4 FR 9348 176 3779\n",
"5 CH 6908 136 536\n",
"6 PT 6221 814 57\n",
"7 HR 5944 157 35\n",
"8 BE 5636 245 412\n",
"9 FI 5421 35 43\n",
"10 IT 4989 119 1658\n",
"11 NL 4963 29 240\n",
"12 DK 4651 56 491\n",
"13 SI 4642 67 398\n",
"14 CO 4124 59 155\n",
"15 ID 4060 0 9\n",
"16 SE 3690 1 93\n",
"17 CA 3458 66 778\n",
"18 NO 3338 2 56\n",
"19 TR 2759 139 1397"
]
},
"execution_count": 43,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Best access right of research products per country code of the author institution;\n",
"# top 20 countries by number of OPEN products. Organizations without a country are excluded.\n",
"# Every JOIN carries its own ON clause so that no join step is left without a condition\n",
"# (an unconditioned JOIN is an implicit cross join that only the optimizer can rescue).\n",
"query = \"\"\"\n",
"SELECT organizations.country.code AS country,\n",
"       COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
"       COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
"       COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations\n",
"JOIN relations\n",
"  ON organizations.id = relations.source.id\n",
" AND reltype.name = 'isAuthorInstitutionOf'\n",
"JOIN results\n",
"  ON results.id = relations.target.id\n",
"WHERE organizations.country IS NOT NULL\n",
"GROUP BY organizations.country.code\n",
"ORDER BY open DESC\n",
"\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>left</th>\n",
" <th>right</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>DE</td>\n",
" <td>DE</td>\n",
" <td>12806</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>GB</td>\n",
" <td>GB</td>\n",
" <td>9955</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>DE</td>\n",
" <td>GB</td>\n",
" <td>6269</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>IT</td>\n",
" <td>IT</td>\n",
" <td>5240</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ES</td>\n",
" <td>ES</td>\n",
" <td>4906</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>FR</td>\n",
" <td>FR</td>\n",
" <td>4830</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>DE</td>\n",
" <td>IT</td>\n",
" <td>4683</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>DE</td>\n",
" <td>FR</td>\n",
" <td>4573</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>DE</td>\n",
" <td>ES</td>\n",
" <td>4472</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>NL</td>\n",
" <td>NL</td>\n",
" <td>3613</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>DE</td>\n",
" <td>NL</td>\n",
" <td>3427</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>GB</td>\n",
" <td>IT</td>\n",
" <td>3332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>FR</td>\n",
" <td>GB</td>\n",
" <td>3328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>ES</td>\n",
" <td>GB</td>\n",
" <td>3195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>GB</td>\n",
" <td>NL</td>\n",
" <td>2860</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>CH</td>\n",
" <td>DE</td>\n",
" <td>2676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>ES</td>\n",
" <td>IT</td>\n",
" <td>2665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>FR</td>\n",
" <td>IT</td>\n",
" <td>2456</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>ES</td>\n",
" <td>FR</td>\n",
" <td>2365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>US</td>\n",
" <td>US</td>\n",
" <td>2040</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" left right count\n",
"0 DE DE 12806\n",
"1 GB GB 9955\n",
"2 DE GB 6269\n",
"3 IT IT 5240\n",
"4 ES ES 4906\n",
"5 FR FR 4830\n",
"6 DE IT 4683\n",
"7 DE FR 4573\n",
"8 DE ES 4472\n",
"9 NL NL 3613\n",
"10 DE NL 3427\n",
"11 GB IT 3332\n",
"12 FR GB 3328\n",
"13 ES GB 3195\n",
"14 GB NL 2860\n",
"15 CH DE 2676\n",
"16 ES IT 2665\n",
"17 FR IT 2456\n",
"18 ES FR 2365\n",
"19 US US 2040"
]
},
"execution_count": 44,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Count pairs of organisations that are 'isParticipant' sources of the same target\n",
"# (presumably a project, going by the CTE name), grouped by their country codes.\n",
"# NOTE: `<=` keeps same-country pairs. For those, every organisation row is also\n",
"# paired with itself and two distinct organisations appear in both orders, so the\n",
"# same-country counts are not directly comparable with the cross-country ones.\n",
"query = \"\"\"WITH countryProject AS (\n",
"    SELECT organizations.country.code AS country,\n",
"           relations.target.id AS id\n",
"    FROM organizations\n",
"    JOIN relations ON relations.reltype.name = 'isParticipant'\n",
"                  AND relations.source.id = organizations.id\n",
"    WHERE organizations.country IS NOT NULL\n",
")\n",
"SELECT l.country AS left,\n",
"       r.country AS right,\n",
"       COUNT(*) AS count\n",
"FROM countryProject AS l\n",
"JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n",
"GROUP BY left, right\n",
"ORDER BY count DESC\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>left</th>\n",
" <th>right</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>DE</td>\n",
" <td>GB</td>\n",
" <td>6269</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>DE</td>\n",
" <td>IT</td>\n",
" <td>4683</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>DE</td>\n",
" <td>FR</td>\n",
" <td>4573</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>DE</td>\n",
" <td>ES</td>\n",
" <td>4472</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>DE</td>\n",
" <td>NL</td>\n",
" <td>3427</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>GB</td>\n",
" <td>IT</td>\n",
" <td>3332</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>FR</td>\n",
" <td>GB</td>\n",
" <td>3328</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>ES</td>\n",
" <td>GB</td>\n",
" <td>3195</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>GB</td>\n",
" <td>NL</td>\n",
" <td>2860</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>CH</td>\n",
" <td>DE</td>\n",
" <td>2676</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>ES</td>\n",
" <td>IT</td>\n",
" <td>2665</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>FR</td>\n",
" <td>IT</td>\n",
" <td>2456</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>ES</td>\n",
" <td>FR</td>\n",
" <td>2365</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>CH</td>\n",
" <td>GB</td>\n",
" <td>1955</td>\n",
" </tr>\n",
" <tr>\n",
" <th>14</th>\n",
" <td>DE</td>\n",
" <td>SE</td>\n",
" <td>1804</td>\n",
" </tr>\n",
" <tr>\n",
" <th>15</th>\n",
" <td>BE</td>\n",
" <td>DE</td>\n",
" <td>1759</td>\n",
" </tr>\n",
" <tr>\n",
" <th>16</th>\n",
" <td>FR</td>\n",
" <td>NL</td>\n",
" <td>1726</td>\n",
" </tr>\n",
" <tr>\n",
" <th>17</th>\n",
" <td>IT</td>\n",
" <td>NL</td>\n",
" <td>1708</td>\n",
" </tr>\n",
" <tr>\n",
" <th>18</th>\n",
" <td>ES</td>\n",
" <td>NL</td>\n",
" <td>1596</td>\n",
" </tr>\n",
" <tr>\n",
" <th>19</th>\n",
" <td>GB</td>\n",
" <td>SE</td>\n",
" <td>1491</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" left right count\n",
"0 DE GB 6269\n",
"1 DE IT 4683\n",
"2 DE FR 4573\n",
"3 DE ES 4472\n",
"4 DE NL 3427\n",
"5 GB IT 3332\n",
"6 FR GB 3328\n",
"7 ES GB 3195\n",
"8 GB NL 2860\n",
"9 CH DE 2676\n",
"10 ES IT 2665\n",
"11 FR IT 2456\n",
"12 ES FR 2365\n",
"13 CH GB 1955\n",
"14 DE SE 1804\n",
"15 BE DE 1759\n",
"16 FR NL 1726\n",
"17 IT NL 1708\n",
"18 ES NL 1596\n",
"19 GB SE 1491"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Same pairing as a self-join on the shared 'isParticipant' target, but the strict\n",
"# `<` keeps only pairs from two different countries, each unordered country pair\n",
"# being reported once with the alphabetically smaller code on the left.\n",
"query = \"\"\"WITH countryProject AS (\n",
"    SELECT organizations.country.code AS country,\n",
"           relations.target.id AS id\n",
"    FROM organizations\n",
"    JOIN relations ON relations.reltype.name = 'isParticipant'\n",
"                  AND relations.source.id = organizations.id\n",
"    WHERE organizations.country IS NOT NULL\n",
")\n",
"SELECT l.country AS left,\n",
"       r.country AS right,\n",
"       COUNT(*) AS count\n",
"FROM countryProject AS l\n",
"JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n",
"GROUP BY left, right\n",
"ORDER BY count DESC\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"# Count how often two organisations are 'isParticipant' sources of the same target\n",
"# (presumably a project, going by the CTE name).\n",
"# Organisations are identified by name: legalshortname, falling back to legalname.\n",
"# NOTE: the strict `<` on the name drops self-pairs, but also any pair whose two\n",
"# names are identical or where a name is NULL.\n",
"query = \"\"\"WITH orgProject AS (\n",
"    SELECT COALESCE(organizations.legalshortname, organizations.legalname) AS organization,\n",
"           relations.target.id AS id\n",
"    FROM organizations\n",
"    JOIN relations ON relations.reltype.name = 'isParticipant'\n",
"                  AND relations.source.id = organizations.id\n",
")\n",
"SELECT l.organization AS left,\n",
"       r.organization AS right,\n",
"       COUNT(*) AS count\n",
"FROM orgProject AS l\n",
"JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n",
"GROUP BY left, right\n",
"ORDER BY count DESC\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"# Count how often two organisations are 'isAuthorInstitutionOf' sources of the same\n",
"# target (presumably a research result - TODO confirm against the relation schema).\n",
"# Organisations are identified by name: legalshortname, falling back to legalname.\n",
"# NOTE: the strict `<` on the name drops self-pairs, but also any pair whose two\n",
"# names are identical or where a name is NULL.\n",
"query = \"\"\"WITH orgResult AS (\n",
"    SELECT COALESCE(organizations.legalshortname, organizations.legalname) AS organization,\n",
"           relations.target.id AS id\n",
"    FROM organizations\n",
"    JOIN relations ON relations.reltype.name = 'isAuthorInstitutionOf'\n",
"                  AND relations.source.id = organizations.id\n",
")\n",
"SELECT l.organization AS left,\n",
"       r.organization AS right,\n",
"       COUNT(*) AS count\n",
"FROM orgResult AS l\n",
"JOIN orgResult AS r ON l.id = r.id AND l.organization < r.organization\n",
"GROUP BY left, right\n",
"ORDER BY count DESC\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {
"autoscroll": "auto"
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>accessright</th>\n",
" <th>year</th>\n",
" <th>count</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>OPEN</td>\n",
" <td>2022</td>\n",
" <td>1391279</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>CLOSED</td>\n",
" <td>2022</td>\n",
" <td>672566</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>EMBARGO</td>\n",
" <td>2022</td>\n",
" <td>14258</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>RESTRICTED</td>\n",
" <td>2022</td>\n",
" <td>12312</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" accessright year count\n",
"0 OPEN 2022 1391279\n",
"1 CLOSED 2022 672566\n",
"2 EMBARGO 2022 14258\n",
"3 RESTRICTED 2022 12312"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of results per best access right and publication year, where the year is\n",
"# the first four characters of publicationdate (SQL SUBSTRING positions are 1-based).\n",
"query = \"\"\"SELECT bestaccessright.label AS accessright,\n",
"       SUBSTRING(publicationdate, 1, 4) AS year,\n",
"       COUNT(*) AS count\n",
"FROM results\n",
"WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n",
"GROUP BY accessright, year\n",
"ORDER BY count DESC\"\"\"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"# Count 'IsSupplementedBy' relations whose source is a publication and whose\n",
"# target is a dataset. The aggregate returns a single row, so no limit is needed.\n",
"query = \"\"\"SELECT COUNT(*) AS count\n",
"FROM relations\n",
"JOIN publications ON publications.id = relations.source.id\n",
"JOIN datasets ON datasets.id = relations.target.id\n",
"WHERE relations.reltype.name = 'IsSupplementedBy'\"\"\"\n",
"spark.sql(query).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.10"
},
"name": "openaire_beginners_kit SQL"
},
"nbformat": 4,
"nbformat_minor": 4
}