1011 lines
70 KiB
Plaintext
1011 lines
70 KiB
Plaintext
{
|
||
"cells": [
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"# OpenAIRE Beginners Kit\n",
|
||
"\n",
|
||
"The OpenAIRE Research Graph is an Open Access dataset containing metadata about research products (literature, datasets, software, etc.) linked to other entities of the research ecosystem like organisations, project grants, and data sources.\n",
|
||
"\n",
|
||
"The large size of the OpenAIRE Research Graph is a major impediment for beginners to familiarise with the underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure which cannot be easily accessible to everyone.\n",
|
||
"\n",
|
||
"The OpenAIRE Beginner’s Kit aims to address this issue. It consists of two components: a subset of the Graph composed of the research products published between 2022-06-29 and 2022-12-29, all the entities connected to them and the respective relationships, and the present Zeppelin notebook that demonstrates how you can use PySpark to analyse the Graph and get answers to some interesting research questions.\n",
|
||
"\n"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Download data"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"slideshow": {
|
||
"slide_type": "notes"
|
||
},
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"# !rm -rf data\n",
|
||
"# !mkdir data\n",
|
||
"\n",
|
||
"# import os\n",
|
||
"# base_url = \"https://zenodo.org/record/7490192/files/\"\n",
|
||
"\n",
|
||
"\n",
|
||
"# items =[\"communities_infrastructures.tar\",\"dataset.tar\",\"datasource.tar\",\"organization.tar\",\"otherresearchproduct.tar\",\"project.tar\",\"publication.tar\",\"relation.tar\", \"software.tar\"]\n",
|
||
"\n",
|
||
"# for item in items: \n",
|
||
"# print(f\"Downloading {item}\")\n",
|
||
"# os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
|
||
"# print(f\"Extracting {item}\")\n",
|
||
"# os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"## Load the datasets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"import json\n",
|
||
"\n",
|
||
"import pyspark.sql.functions as F\n",
|
||
"from pyspark.sql.functions import col\n",
|
||
"from pyspark.sql.types import StructType\n",
|
||
"from pyspark.sql import SparkSession\n",
|
||
"from IPython.display import JSON as pretty_print\n",
|
||
"\n",
|
||
"import pandas as pd\n",
|
||
"pd.set_option('display.max_columns', None)\n",
|
||
"pd.set_option('display.max_colwidth', 100)\n",
|
||
"\n",
|
||
"spark = SparkSession.builder.getOrCreate()\n",
|
||
"\n",
|
||
"publicationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"container\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"conferencedate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"conferenceplace\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"edition\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"ep\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"iss\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnLinking\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnOnline\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnPrinted\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"sp\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"vol\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"datasetSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"geolocation\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"box\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"place\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"point\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"size\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"version\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"softwareSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"documentationUrl\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"programmingLanguage\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"otherSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"author\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"fullname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"rank\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"surname\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"bestaccessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"contactgroup\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"contactperson\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"contributor\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"coverage\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"dateofcollection\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"embargoenddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"format\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"indicators\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impactMeasures\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"impulse\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"influence_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"popularity_alt\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"class\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"score\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"usageCounts\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"downloads\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"views\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"instance\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"accessright\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openAccessRoute\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"alternateIdentifier\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"license\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"refereed\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"url\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"language\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"lastupdatetimestamp\",\"nullable\":true,\"type\":\"long\"},{\"metadata\":{},\"name\":\"maintitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"publicationdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"publisher\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"subtitle\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"tool\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"datasourceSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"accessrights\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"certificates\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"citationguidelineurl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"databaseaccessrestriction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"datasourcetype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"datauploadrestriction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"dateofvalidation\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"englishname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"journal\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"issnLinking\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnOnline\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"issnPrinted\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"languages\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"logourl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"missionstatementurl\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"officialname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openairecompatibility\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"originalId\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"pidsystems\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"policies\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"releasestartdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subjects\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"uploadrights\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"versioning\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"organizationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"alternativenames\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"country\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"label\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"legalname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"legalshortname\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"pid\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"scheme\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"value\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"projectSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"acronym\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"callidentifier\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"enddate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"funding\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"funding_stream\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"jurisdiction\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"shortName\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"granted\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"currency\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"fundedamount\",\"nullable\":true,\"type\":\"double\"},{\"metadata\":{},\"name\":\"totalcost\",\"nullable\":true,\"type\":\"double\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"h2020programme\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":{\"fields\":[{\"metadata\":{},\"name\":\"code\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"},\"type\":\"array\"}},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"keywords\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"openaccessmandatefordataset\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"openaccessmandateforpublications\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"startdate\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"summary\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"title\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"websiteurl\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"communitySchema = '{\"fields\":[{\"metadata\":{},\"name\":\"acronym\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"description\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"subject\",\"nullable\":true,\"type\":{\"containsNull\":true,\"elementType\":\"string\",\"type\":\"array\"}},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"zenodo_community\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'\n",
|
||
"relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"inputPath = 'data/'\n",
|
||
" \n",
|
||
"# load entities and relationships\n",
|
||
"publication = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')\n",
|
||
"dataset = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')\n",
|
||
"software = spark.read.schema(StructType.fromJson(json.loads(softwareSchema))).json(inputPath + 'software')\n",
|
||
"other = spark.read.schema(StructType.fromJson(json.loads(otherSchema))).json(inputPath + 'otherresearchproduct')\n",
|
||
"results = publication.unionByName(dataset, allowMissingColumns=True).unionByName(software, allowMissingColumns=True).unionByName(other, allowMissingColumns=True)\n",
|
||
"datasource = spark.read.schema(StructType.fromJson(json.loads(datasourceSchema))).json(inputPath + 'datasource')\n",
|
||
"organization = spark.read.schema(StructType.fromJson(json.loads(organizationSchema))).json(inputPath + 'organization')\n",
|
||
"project = spark.read.schema(StructType.fromJson(json.loads(projectSchema))).json(inputPath + 'project')\n",
|
||
"community = spark.read.schema(StructType.fromJson(json.loads(communitySchema))).json(inputPath + 'communities_infrastructures')\n",
|
||
"relation = spark.read.schema(StructType.fromJson(json.loads(relationSchema))).json(inputPath + 'relation')\n",
|
||
"\n",
|
||
"publication.createOrReplaceTempView(\"publications\")\n",
|
||
"dataset.createOrReplaceTempView(\"datasets\")\n",
|
||
"software.createOrReplaceTempView(\"software\")\n",
|
||
"other.createOrReplaceTempView(\"others\")\n",
|
||
"results.createOrReplaceTempView(\"results\")\n",
|
||
"datasource.createOrReplaceTempView(\"datasources\")\n",
|
||
"organization.createOrReplaceTempView(\"organizations\")\n",
|
||
"project.createOrReplaceTempView(\"projects\")\n",
|
||
"community.createOrReplaceTempView(\"communities\")\n",
|
||
"relation.createOrReplaceTempView(\"relations\")\n",
|
||
"\n",
|
||
"# count and print their number\n",
|
||
"print(\"number of publications %s\"%publication.count())\n",
|
||
"print(\"number of datasets %s\"%dataset.count())\n",
|
||
"print(\"number of software %s\"%software.count())\n",
|
||
"print(\"number of other research products %s\"%other.count())\n",
|
||
"print(\"number of results %s\"%results.count())\n",
|
||
"print(\"number of datasources %s\"%datasource.count())\n",
|
||
"print(\"number of organizations %s\"%organization.count())\n",
|
||
"print(\"number of communities %s\"%community.count())\n",
|
||
"print(\"number of projects %s\"%project.count())\n",
|
||
"print(\"number of relationships %s\"%relation.count())"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Let's show some data. \n",
|
||
"For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=False)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=False)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=False)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pretty_print(json.loads(project.toJSON().first()), expanded=False)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=False)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"pretty_print(json.loads(relation.toJSON().first()), expanded=False)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"## Exercises "
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"All the exercises follow the template below.\n",
|
||
"```python\n",
|
||
"query = \"\"\"\n",
|
||
"SELECT <columns>\n",
|
||
"FROM <table>\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()\n",
|
||
"```"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task**: Split relations based on their semantics and count them; sort results in descending order, limit to the first 20. "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT reltype.name, \n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM relations \n",
|
||
"GROUP BY reltype.name \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the most occurring publication subjects; sort results in descending order; limit to the first 20"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH terms AS (\n",
|
||
" SELECT explode(subjects.subject.value) AS `term`\n",
|
||
" FROM publications\n",
|
||
")\n",
|
||
"SELECT term AS `subject term`,\n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM terms \n",
|
||
"GROUP BY term \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### **Task:** Show the most co-occurring publication subjects from controlled vocabularies (i.e., scheme != 'keyword') avoiding repetition; limit to the first 20"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH subjects AS (\n",
|
||
" WITH exploded_subjects (\n",
|
||
" SELECT id, EXPLODE(subjects.subject) AS subject \n",
|
||
" FROM publications) \n",
|
||
" SELECT id, subject.value AS `subject` \n",
|
||
" FROM exploded_subjects \n",
|
||
" WHERE subject.scheme != 'keyword'\n",
|
||
")\n",
|
||
"SELECT l.subject AS left, \n",
|
||
" r.subject AS right, \n",
|
||
" COUNT(*) AS count\n",
|
||
"FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n",
|
||
"GROUP BY left, right\n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show journal information"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n",
|
||
"FROM publications \n",
|
||
"WHERE container IS NOT NULL\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the journals with the highest number of results published in"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto",
|
||
"tags": []
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH journals AS (\n",
|
||
" SELECT container.*\n",
|
||
" FROM publications\n",
|
||
" WHERE container IS NOT NULL\n",
|
||
")\n",
|
||
"SELECT name, count(*) AS count \n",
|
||
"FROM journals \n",
|
||
"GROUP BY name \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the number of projects per organization; sort results in descending order; limit to the first 20"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
|
||
"GROUP BY organization \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {
|
||
"tags": []
|
||
},
|
||
"source": [
|
||
"### **Task:** Show projects with the highest number of associated results. \n",
|
||
"Note: An \"unidentified\" project is a placeholder for all the association to a funder without knowing the specific project. It should be removed from the count."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT funding.shortName, code, title, COUNT(*) AS count \n",
|
||
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
|
||
"GROUP BY funding.shortName, code, title\n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Strings can be manipulated as well on the fly"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT CONCAT_WS(' / ', \n",
|
||
" IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n",
|
||
" COALESCE(code, '?'), \n",
|
||
" SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
|
||
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n",
|
||
"GROUP BY project \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products to organization and not authors to organizations"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"Organization short names can be empty, so the legal name could be a fallback option."
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT legalshortname, legalname\n",
|
||
"FROM organizations \n",
|
||
"WHERE legalshortname IS NULL\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT COALESCE(legalshortname, legalname) AS organization,\n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n",
|
||
"GROUP BY organization\n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the number of research products (per type) per organization"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
|
||
" COUNT(*) AS total,\n",
|
||
" COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n",
|
||
" COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n",
|
||
" COUNT(IF(type = 'software', 1, NULL)) AS software,\n",
|
||
" COUNT(IF(type = 'other', 1, NULL)) AS other\n",
|
||
"FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n",
|
||
"GROUP BY organization \n",
|
||
"ORDER BY total DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show result access types per organization"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
|
||
" COUNT(*) as total,\n",
|
||
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
|
||
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
|
||
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
|
||
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
|
||
"GROUP BY organization\n",
|
||
"ORDER BY total DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the result access types per country of the organizations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT organizations.country.code AS country, \n",
|
||
" COUNT(*) AS total,\n",
|
||
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
|
||
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
|
||
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
|
||
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
|
||
"WHERE organizations.country IS NOT NULL\n",
|
||
"GROUP BY organizations.country.code\n",
|
||
"ORDER BY total DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the collaboration network of countries participating in projects with respect to the partecipating organizations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH countryProject AS (\n",
|
||
" SELECT country.code AS country, \n",
|
||
" target.id AS id \n",
|
||
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
|
||
" WHERE country IS NOT NULL\n",
|
||
")\n",
|
||
"SELECT l.country AS left, \n",
|
||
" r.country AS right,\n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n",
|
||
"GROUP BY left, right \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import igraph as ig\n",
|
||
"\n",
|
||
"G = ig.Graph.TupleList(\n",
|
||
" edges=edges[['left', 'right', 'count']].values,\n",
|
||
" vertex_name_attr='countrycode',\n",
|
||
" edge_attrs = ['weight'],\n",
|
||
" directed=False)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"G.vcount()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"G.ecount()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"G.vs[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"G.es[0]"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"import matplotlib.pyplot as plt\n",
|
||
"fig, ax = plt.subplots()\n",
|
||
"ig.plot(G, vertex_label=G.vs['countrycode'], target=ax)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"G.vs.find(countrycode_eq = 'MY') # maldives"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"H = G.induced_subgraph(G.neighborhood(50))\n",
|
||
"H.summary()"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"H.vs['color'] = 'grey'\n",
|
||
"H.vs[0]['color'] = 'red'\n",
|
||
"fig, ax = plt.subplots()\n",
|
||
"ig.plot(H, target=ax)"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"G.transitivity_local_undirected(50)"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show international project collaborations; focus on organizations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH countryProject AS (\n",
|
||
" SELECT country.code AS country, \n",
|
||
" target.id AS id \n",
|
||
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
|
||
" WHERE country IS NOT NULL\n",
|
||
")\n",
|
||
"SELECT l.country AS left, \n",
|
||
" r.country AS right, \n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n",
|
||
"GROUP BY left, right \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).toPandas() "
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the organisations collaborating in projects more often "
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH orgProject AS (\n",
|
||
" SELECT COALESCE(legalshortname, legalname) AS organization, \n",
|
||
" target.id AS id \n",
|
||
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
|
||
")\n",
|
||
"SELECT l.organization AS left,\n",
|
||
" r.organization AS right,\n",
|
||
" COUNT(*) AS count\n",
|
||
"FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n",
|
||
"GROUP BY left, right \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the organizations co-authoring papers more often"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"WITH orgProduct AS (\n",
|
||
" SELECT COALESCE(legalshortname, legalname) AS organization, \n",
|
||
" target.id AS id \n",
|
||
" FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n",
|
||
")\n",
|
||
"SELECT l.organization AS left, \n",
|
||
" r.organization AS right,\n",
|
||
" COUNT(*) AS count \n",
|
||
"FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n",
|
||
"GROUP BY left, right \n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Summarize the access rights over the years"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT bestaccessright.label AS accessright,\n",
|
||
" SUBSTRING(publicationdate, 0,4) AS year,\n",
|
||
" COUNT(*) AS count\n",
|
||
"FROM results\n",
|
||
"WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n",
|
||
"GROUP BY accessright, year\n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Show the number of publications supplemented by datasets"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {
|
||
"autoscroll": "auto"
|
||
},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT COUNT(*) AS count\n",
|
||
"FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
},
|
||
{
|
||
"attachments": {},
|
||
"cell_type": "markdown",
|
||
"metadata": {},
|
||
"source": [
|
||
"### **Task:** Count and sort publications by citations"
|
||
]
|
||
},
|
||
{
|
||
"cell_type": "code",
|
||
"execution_count": null,
|
||
"metadata": {},
|
||
"outputs": [],
|
||
"source": [
|
||
"query = \"\"\"\n",
|
||
"SELECT publications.id, pid.value, COUNT(*) AS count\n",
|
||
"FROM publications JOIN relations ON publications.id = relations.source.id AND reltype.name = 'IsCitedBy'\n",
|
||
"GROUP BY publications.id, pid.value\n",
|
||
"ORDER BY count DESC\n",
|
||
"\"\"\"\n",
|
||
"\n",
|
||
"spark.sql(query).limit(20).toPandas()"
|
||
]
|
||
}
|
||
],
|
||
"metadata": {
|
||
"kernelspec": {
|
||
"display_name": "Python 3 (ipykernel)",
|
||
"language": "python",
|
||
"name": "python3"
|
||
},
|
||
"language_info": {
|
||
"codemirror_mode": {
|
||
"name": "ipython",
|
||
"version": 3
|
||
},
|
||
"file_extension": ".py",
|
||
"mimetype": "text/x-python",
|
||
"name": "python",
|
||
"nbconvert_exporter": "python",
|
||
"pygments_lexer": "ipython3",
|
||
"version": "3.10.10"
|
||
},
|
||
"name": "openaire_beginners_kit SQL"
|
||
},
|
||
"nbformat": 4,
|
||
"nbformat_minor": 4
|
||
}
|