{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# OpenAIRE Beginners Kit\n", "\n", "The OpenAIRE Research Graph is an Open Access dataset containing metadata about research products (literature, datasets, software, etc.) linked to other entities of the research ecosystem like organisations, project grants, and data sources.\n", "\n", "The large size of the OpenAIRE Research Graph is a major impediment for beginners to familiarise with the underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure which cannot be easily accessible to everyone.\n", "\n", "The OpenAIRE Beginner’s Kit aims to address this issue. It consists of two components: a subset of the Graph composed of the research products published between 2022-06-29 and 2022-12-29, all the entities connected to them and the respective relationships, and the present Zeppelin notebook that demonstrates how you can use PySpark to analyse the Graph and get answers to some interesting research questions.\n", "\n" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Download data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "slideshow": { "slide_type": "notes" }, "tags": [] }, "outputs": [], "source": [ "!rm -rf data\n", "!mkdir data\n", "\n", "import os\n", "base_url = \"https://zenodo.org/record/7490192/files/\"\n", "\n", "\n", "items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n", "\n", "for item in items: \n", " print(f\"Downloading {item}\")\n", " os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n", " print(f\"Extracting {item}\")\n", " os.system(f'tar -xf data/{item} -C data/; rm data/{item}')" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "## Have a look at the input data" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "import json\n", "\n", "import pyspark.sql.functions as F\n", "from pyspark.sql.functions import col\n", "from pyspark.sql.types import StructType\n", "from pyspark.sql import SparkSession\n", "from IPython.display import JSON as pretty_print\n", "\n", "import pandas as pd\n", "pd.set_option('display.max_columns', None)\n", "pd.set_option('display.max_colwidth', 100)\n", "\n", "spark = SparkSession.builder.getOrCreate()\n", "\n", "publicationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"container\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"conferencedate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"conferenceplace\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"edition\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"ep\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"iss\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnLinking\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnOnline\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnPrinted\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"sp\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"vol\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "datasetSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"geolocation\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"box\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"place\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"point\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"size\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"version\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "softwareSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"documentationUrl\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"programmingLanguage\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "otherSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contactgroup\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"contactperson\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"tool\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "datasourceSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"accessrights\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"certificates\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"citationguidelineurl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"databaseaccessrestriction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"datasourcetype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"datauploadrestriction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"dateofvalidation\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"englishname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"journal\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"issnLinking\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnOnline\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnPrinted\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"languages\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"logourl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"missionstatementurl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"officialname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openairecompatibility\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pidsystems\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"policies\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"releasestartdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"uploadrights\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"versioning\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "organizationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"alternativenames\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"legalname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"legalshortname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "projectSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"acronym\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"callidentifier\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"enddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"funding\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"funding_stream\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"jurisdiction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"shortName\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"granted\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"currency\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"fundedamount\",\"nullable\":true,\"type\":\"double\"},{\"metadata\":{},\"name\":\"totalcost\",\"nullable\":true,\"type\":\"double\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"h2020programme\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"keywords\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openaccessmandatefordataset\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"openaccessmandateforpublications\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"startdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"summary\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"title\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "communitySchema = '{\"fields\":[{\"metadata\":{},\"name\":\"acronym\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"zenodo_community\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n", "relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "inputPath = 'data/'\n", " \n", "# load entities and relationships\n", "publication = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n", "dataset = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n", "software = spark.read.schema(StructType.fromJson(json.loads(softwareSchema))).json(inputPath + 'software')\n", "other = spark.read.schema(StructType.fromJson(json.loads(otherSchema))).json(inputPath + 'otherresearchproduct')\n", "results = publication.unionByName(dataset, allowMissingColumns=True).unionByName(software, allowMissingColumns=True).unionByName(other, allowMissingColumns=True)\n", "datasource = spark.read.schema(StructType.fromJson(json.loads(datasourceSchema))).json(inputPath + 'datasource')\n", "organization = spark.read.schema(StructType.fromJson(json.loads(organizationSchema))).json(inputPath + 'organization')\n", "project = spark.read.schema(StructType.fromJson(json.loads(projectSchema))).json(inputPath + 'project')\n", "community = spark.read.schema(StructType.fromJson(json.loads(communitySchema))).json(inputPath + 'communities_infrastructures')\n", "relation = spark.read.schema(StructType.fromJson(json.loads(relationSchema))).json(inputPath + 'relation')\n", "\n", "publication.createOrReplaceTempView(\"publications\")\n", "dataset.createOrReplaceTempView(\"datasets\")\n", "software.createOrReplaceTempView(\"software\")\n", "other.createOrReplaceTempView(\"others\")\n", "results.createOrReplaceTempView(\"results\")\n", "datasource.createOrReplaceTempView(\"datasources\")\n", "organization.createOrReplaceTempView(\"organizations\")\n", "project.createOrReplaceTempView(\"projects\")\n", "community.createOrReplaceTempView(\"communities\")\n", "relation.createOrReplaceTempView(\"relations\")\n", "\n", "# count and print their number\n", "print(\"number of publications %s\"%publication.count())\n", "print(\"number of datasets %s\"%dataset.count())\n", "print(\"number of software %s\"%software.count())\n", "print(\"number of other research products %s\"%other.count())\n", "print(\"number of results %s\"%results.count())\n", "print(\"number of datasources %s\"%datasource.count())\n", "print(\"number of organizations %s\"%organization.count())\n", "print(\"number of communities %s\"%community.count())\n", "print(\"number of projects %s\"%project.count())\n", "print(\"number of relationships %s\"%relation.count())" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Let's show some data. \n", "For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "pretty_print(json.loads(project.toJSON().first()), expanded=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "pretty_print(json.loads(relation.toJSON().first()), expanded=True)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "## Exercises " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "All the exercises follow the following template.\n", "```python\n", "query = \"\"\"\n", "SELECT \n", "FROM \n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()\n", "```" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task**: Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto", "tags": [] }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT reltype.name, \n", " COUNT(*) AS count \n", "FROM relations \n", "GROUP BY reltype.name \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the most occurring publication subject term; sort results in descending order; limit to the first 20" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH terms AS (\n", " SELECT explode(subjects.subject.value) AS `term`\n", " FROM publications\n", ")\n", "SELECT term AS `subject term`,\n", " COUNT(*) AS count \n", "FROM terms \n", "GROUP BY term \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### **Task:** Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoiding repetition; limit to the first 20" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto", "tags": [] }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH subjects AS (\n", " WITH exploded_subjects (\n", " SELECT id, EXPLODE(subjects.subject) AS subject \n", " FROM publications) \n", " SELECT id, subject.value AS `subject` \n", " FROM exploded_subjects \n", " WHERE subject.scheme != 'keyword'\n", ")\n", "SELECT l.subject AS left, \n", " r.subject AS right, \n", " COUNT(*) AS count\n", "FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n", "GROUP BY left, right\n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show journal information" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto", "tags": [] }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n", "FROM publications \n", "WHERE container IS NOT NULL\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the journals with the highest number of results published in" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto", "tags": [] }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH journals AS (\n", " SELECT container.*\n", " FROM publications\n", " WHERE container IS NOT NULL\n", ")\n", "SELECT name, count(*) AS count \n", "FROM journals \n", "GROUP BY name \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the number of projects per organization; sort results in descending order; limit to the first 20" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT COALESCE(legalshortname, legalname) AS organization, \n", " COUNT(*) AS count \n", "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n", "GROUP BY organization \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { "tags": [] }, "source": [ "### **Task:** Show projects with the highest number of associated results. \n", "Note: An \"unidentified\" project is a placeholder for all the association to a funder without knowing the specific project. It should be removed from the count." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT funding.shortName, code, title, COUNT(*) AS count \n", "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n", "GROUP BY funding.shortName, code, title\n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Strings can be manipulated as well on the fly" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT CONCAT_WS(' / ', \n", " IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n", " COALESCE(code, '?'), \n", " SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n", "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n", "GROUP BY project \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products to organization and not authors to organizations" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "Organization short names can be empty, so the legal name could be a fallback option." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT legalshortname, legalname\n", "FROM organizations \n", "WHERE legalshortname IS NULL\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT COALESCE(legalshortname, legalname) AS organization,\n", " COUNT(*) AS count \n", "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n", "GROUP BY organization\n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the number of research products (per type) per organization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT COALESCE(legalshortname, legalname) AS organization, \n", " COUNT(*) AS total,\n", " COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n", " COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n", " COUNT(IF(type = 'software', 1, NULL)) AS software,\n", " COUNT(IF(type = 'other', 1, NULL)) AS other\n", "FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n", "GROUP BY organization \n", "ORDER BY total DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show result access types per organization" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT COALESCE(legalshortname, legalname) AS organization, \n", " COUNT(*) as total,\n", " COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n", " COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n", " COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n", "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n", "GROUP BY organization\n", "ORDER BY total DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the result access types per country of the organizations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT organizations.country.code AS country, \n", " COUNT(*) AS total,\n", " COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n", " COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n", " COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n", "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n", "WHERE organizations.country IS NOT NULL\n", "GROUP BY organizations.country.code\n", "ORDER BY total DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the countries collaboration network in projects with respect to the partecipating organizations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH countryProject AS (\n", " SELECT country.code AS country, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", " WHERE country IS NOT NULL\n", ")\n", "SELECT l.country AS left, \n", " r.country AS right,\n", " COUNT(*) AS count \n", "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n", "GROUP BY left, right \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import igraph as ig\n", "\n", "G = ig.Graph.TupleList(\n", " edges=edges[['left', 'right', 'count']].values,\n", " vertex_name_attr='countrycode',\n", " edge_attrs = ['weight'],\n", " directed=False)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G.vcount()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G.ecount()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G.vs[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G.es[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "fig, ax = plt.subplots()\n", "ig.plot(G, vertex_label=G.vs['countrycode'], target=ax)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G.vs.find(countrycode_eq = 'MY') # maldives" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "H = G.induced_subgraph(G.neighborhood(50))\n", "H.summary()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "H.vs['color'] = 'grey'\n", "H.vs[0]['color'] = 'red'\n", "fig, ax = plt.subplots()\n", "ig.plot(H, target=ax)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "G.transitivity_local_undirected(50)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the international collaboration in projects with respect to the participating organizations" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH countryProject AS (\n", " SELECT country.code AS country, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", " WHERE country IS NOT NULL\n", ")\n", "SELECT l.country AS left, \n", " r.country AS right, \n", " COUNT(*) AS count \n", "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n", "GROUP BY left, right \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas() " ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the kernel organisations often collaborating in projects" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH orgProject AS (\n", " SELECT COALESCE(legalshortname, legalname) AS organization, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", ")\n", "SELECT l.organization AS left,\n", " r.organization AS right,\n", " COUNT(*) AS count\n", "FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n", "GROUP BY left, right \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the kernel organizations often co-authoring papers" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "WITH orgProduct AS (\n", " SELECT COALESCE(legalshortname, legalname) AS organization, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n", ")\n", "SELECT l.organization AS left, \n", " r.organization AS right,\n", " COUNT(*) AS count \n", "FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n", "GROUP BY left, right \n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the access right s over the years" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT bestaccessright.label AS accessright,\n", " SUBSTRING(publicationdate, 0,4) AS year,\n", " COUNT(*) AS count\n", "FROM results\n", "WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n", "GROUP BY accessright, year\n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Show the number of publications supplemented by datasets" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "autoscroll": "auto" }, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT COUNT(*) AS count\n", "FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### **Task:** Count and sort publications by citations" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "query = \"\"\"\n", "SELECT publications.id, pid.value, COUNT(*) AS count\n", "FROM publications JOIN relations ON publications.id = relations.source.id AND reltype.name = 'IsCitedBy'\n", "GROUP BY publications.id, pid.value\n", "ORDER BY count DESC\n", "\"\"\"\n", "\n", "spark.sql(query).limit(20).toPandas()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.10" }, "name": "openaire_beginners_kit SQL" }, "nbformat": 4, "nbformat_minor": 4 }