openaire_beginners_kit/data/beginners_kit.ipynb

70 KiB
Raw Blame History

OpenAIRE Beginners Kit

The OpenAIRE Research Graph is an Open Access dataset containing metadata about research products (literature, datasets, software, etc.) linked to other entities of the research ecosystem like organisations, project grants, and data sources.

The large size of the OpenAIRE Research Graph is a major impediment for beginners who want to familiarise themselves with the underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure, which is not easily accessible to everyone.

The OpenAIRE Beginners Kit aims to address this issue. It consists of two components: (1) a subset of the Graph composed of the research products published between 2022-06-29 and 2022-12-29, together with all the entities connected to them and the respective relationships; and (2) the present Jupyter notebook, which demonstrates how you can use PySpark to analyse the Graph and get answers to some interesting research questions.

Download data

In [ ]:
import os
import shutil
import subprocess

# Local directory that receives the extracted dump. It is wiped and recreated on
# every run so that a re-execution never mixes files from two downloads.
DATA_DIR = "data"
base_url = "https://zenodo.org/record/7490192/files/"

# One tar archive per entity type of the dump, plus the relations between them.
items = [
    "communities_infrastructures.tar",
    "dataset.tar",
    "datasource.tar",
    "organization.tar",
    "otherresearchproduct.tar",
    "project.tar",
    "publication.tar",
    "relation.tar",
    "software.tar",
]

shutil.rmtree(DATA_DIR, ignore_errors=True)
os.makedirs(DATA_DIR)

for item in items:
    archive_path = os.path.join(DATA_DIR, item)
    try:
        print(f"Downloading {item}")
        # Arguments are passed as a list (no shell), so the "?" in the URL is never
        # subject to shell globbing. check=True raises CalledProcessError when wget
        # exits non-zero, instead of silently continuing with a missing/partial file.
        subprocess.run(
            ["wget", f"{base_url}{item}?download=1", "-O", archive_path],
            check=True,
        )
        print(f"Extracting {item}")
        subprocess.run(["tar", "-xf", archive_path, "-C", DATA_DIR], check=True)
    finally:
        # The archive is only an intermediate artefact: drop it whether or not the
        # download/extraction succeeded (wget -O may leave a partial file behind).
        if os.path.exists(archive_path):
            os.remove(archive_path)

Have a look at the input data

In [ ]:
import json

import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType
from pyspark.sql import SparkSession
from IPython.display import JSON as pretty_print

import pandas as pd
# pandas display settings used for the DataFrame previews in this notebook:
# no limit on the number of columns shown, column width capped at 100.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = 100

# Obtain the Spark session via the builder's getOrCreate().
spark = SparkSession.builder.getOrCreate()

publicationSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"container","nullable":true,"type":{"fields":[{"metadata":{},"name":"conferencedate","nullable":true,"type":"string"},{"metadata":{},"name":"conferenceplace","nullable":true,"type":"string"},{"metadata":{},"name":"edition","nullable":true,"type":"string"},{"metadata":{},"name":"ep","nullable":true,"type":"string"},{"metadata":{},"name":"iss","nullable":true,"type":"string"},{"metadata":{},"name":"issnLinking","nullable":true,"type":"string"},{"metadata":{},"name":"issnOnline","nullable":true,"type":"string"},{"metadata":{},"name":"issnPrinted","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"sp","nullable":true,"type":"string"},{"metadata":{},"name":"vol","nullable":true,"type":"string"}],"t
ype":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullabl
e":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type"
:"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
datasetSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"me
tadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"geolocation","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"box","nullable":true,"type":"string"},{"metadata":{},"name":"place","nullable":true,"type":"string"},{"metadata":{},"name":"point","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"meta
data":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"eleme
ntType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"size","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"version","nullable":true,"type":"string"}],"type":"struct"}'
softwareSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"m
etadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"documentationUrl","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nulla
ble":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"meta
data":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"programmingLanguage","nullable":true,"type":"string"},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
otherSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contactgroup","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"contactperson","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":
"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"vi
ews","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{
"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"tool","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
datasourceSchema = '{"fields":[{"metadata":{},"name":"accessrights","nullable":true,"type":"string"},{"metadata":{},"name":"certificates","nullable":true,"type":"string"},{"metadata":{},"name":"citationguidelineurl","nullable":true,"type":"string"},{"metadata":{},"name":"databaseaccessrestriction","nullable":true,"type":"string"},{"metadata":{},"name":"datasourcetype","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"datauploadrestriction","nullable":true,"type":"string"},{"metadata":{},"name":"dateofvalidation","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"englishname","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"journal","nullable":true,"type":{"fields":[{"metadata":{},"name":"issnLinking","nullable":true,"type":"string"},{"metadata":{},"name":"issnOnline","nullable":true,"type":"string"},{"metadata":{},"name":"issnPrinted","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"languages","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"logourl","nullable":true,"type":"string"},{"metadata":{},"name":"missionstatementurl","nullable":true,"type":"string"},{"metadata":{},"name":"officialname","nullable":true,"type":"string"},{"metadata":{},"name":"openairecompatibility","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"st
ring"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"pidsystems","nullable":true,"type":"string"},{"metadata":{},"name":"policies","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"releasestartdate","nullable":true,"type":"string"},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"uploadrights","nullable":true,"type":"string"},{"metadata":{},"name":"versioning","nullable":true,"type":"boolean"},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
organizationSchema = '{"fields":[{"metadata":{},"name":"alternativenames","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"legalname","nullable":true,"type":"string"},{"metadata":{},"name":"legalshortname","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
projectSchema = '{"fields":[{"metadata":{},"name":"acronym","nullable":true,"type":"string"},{"metadata":{},"name":"callidentifier","nullable":true,"type":"string"},{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"enddate","nullable":true,"type":"string"},{"metadata":{},"name":"funding","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"funding_stream","nullable":true,"type":{"fields":[{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"jurisdiction","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"shortName","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"granted","nullable":true,"type":{"fields":[{"metadata":{},"name":"currency","nullable":true,"type":"string"},{"metadata":{},"name":"fundedamount","nullable":true,"type":"double"},{"metadata":{},"name":"totalcost","nullable":true,"type":"double"}],"type":"struct"}},{"metadata":{},"name":"h2020programme","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"keywords","nullable":true,"type":"string"},{"metadata":{},"name":"openaccessmandatefordataset","nullable":true,"type":"boolean"},{"metadata":{},"name":"openaccessmandateforpublications","nullable":true,"type":"boolean"},{"metadata":{},"name":"startdate","nullable":true,"type":"string"},{"metadata":{},"name":"subject","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"summary","nullable":true,"type":"string"},{"metadata":{},"name":"title","nullable
":true,"type":"string"},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
communitySchema = '{"fields":[{"metadata":{},"name":"acronym","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"subject","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"zenodo_community","nullable":true,"type":"string"}],"type":"struct"}'
relationSchema = '{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"reltype","nullable":true,"type":{"fields":[{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"source","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"target","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"validated","nullable":true,"type":"boolean"},{"metadata":{},"name":"validationDate","nullable":true,"type":"string"}],"type":"struct"}'

Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path

In [ ]:
inputPath = 'data/'


def _load_entity(schema_json, folder):
    """Read the JSON records stored under ``inputPath + folder`` with an explicit schema.

    ``schema_json`` is a Spark schema serialised as a JSON string; it is parsed
    and turned into a ``StructType`` before being passed to the reader.
    """
    schema = StructType.fromJson(json.loads(schema_json))
    return spark.read.schema(schema).json(inputPath + folder)


# Research products: one DataFrame per result type.
publication = _load_entity(publicationSchema, 'publication')
dataset = _load_entity(datasetSchema, 'dataset')
software = _load_entity(softwareSchema, 'software')
other = _load_entity(otherSchema, 'otherresearchproduct')

# All result types stacked into one DataFrame; columns are matched by name and
# a column missing from one side is allowed.
results = publication
for _typed_results in (dataset, software, other):
    results = results.unionByName(_typed_results, allowMissingColumns=True)

# Remaining entities and the relationships.
datasource = _load_entity(datasourceSchema, 'datasource')
organization = _load_entity(organizationSchema, 'organization')
project = _load_entity(projectSchema, 'project')
community = _load_entity(communitySchema, 'communities_infrastructures')
relation = _load_entity(relationSchema, 'relation')

# Register every DataFrame as a temporary view so it can be queried by name with spark.sql.
_views = {
    "publications": publication,
    "datasets": dataset,
    "software": software,
    "others": other,
    "results": results,
    "datasources": datasource,
    "organizations": organization,
    "projects": project,
    "communities": community,
    "relations": relation,
}
for _view_name, _frame in _views.items():
    _frame.createOrReplaceTempView(_view_name)

# Print the row count of each DataFrame, one line per entity type.
_counted = [
    ("publications", publication),
    ("datasets", dataset),
    ("software", software),
    ("other research products", other),
    ("results", results),
    ("datasources", datasource),
    ("organizations", organization),
    ("communities", community),
    ("projects", project),
    ("relationships", relation),
]
for _label, _frame in _counted:
    print(f"number of {_label} {_frame.count()}")

Let's show some data. For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)

In [ ]:
# Select one publication by its identifier, take the first matching row as JSON and render it.
publication_id = '50|78975075580c::2ff84f3173897001283274434e8f3eaa'
publication_json = publication.where(f"id='{publication_id}'").toJSON().first()
pretty_print(json.loads(publication_json), expanded=True)

Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)

In [ ]:
# Select one data source by its identifier, take the first matching row as JSON and render it.
datasource_id = '10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'
datasource_json = datasource.where(f"id='{datasource_id}'").toJSON().first()
pretty_print(json.loads(datasource_json), expanded=True)

An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)

In [ ]:
# Select one organization by its identifier, take the first matching row as JSON and render it.
organization_id = '20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'
organization_json = organization.where(f"id='{organization_id}'").toJSON().first()
pretty_print(json.loads(organization_json), expanded=True)
In [ ]:
# Render the first row of the project DataFrame as JSON.
project_json = project.toJSON().first()
pretty_print(json.loads(project_json), expanded=True)
In [ ]:
# Render the first community whose acronym is 'mes' as JSON.
community_json = community.where("acronym='mes'").toJSON().first()
pretty_print(json.loads(community_json), expanded=True)

And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)

In [ ]:
# Render the first row of the relation DataFrame as JSON.
relation_json = relation.toJSON().first()
pretty_print(json.loads(relation_json), expanded=True)

Exercises

All the exercises follow the template below.

query = """
SELECT <columns>
FROM <table>
"""

spark.sql(query).limit(20).toPandas()

Task: Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20.

In [ ]:
# Count the rows of `relations` per reltype.name, largest count first.
# The first 20 rows of the result are converted to a pandas DataFrame for display.
query = """
SELECT reltype.name, 
        COUNT(*) AS count 
FROM relations 
GROUP BY reltype.name 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the most frequently occurring publication subject terms; sort results in descending order; limit to the first 20

In [ ]:
# `terms` explodes subjects.subject.value of every publication into one row per term;
# the outer query counts the rows per term, largest count first (first 20 rows shown).
query = """
WITH terms AS (
    SELECT explode(subjects.subject.value) AS `term`
    FROM publications
)
SELECT term AS `subject term`,
        COUNT(*) AS count 
FROM terms 
GROUP BY term 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoiding repetition; limit to the first 20

In [ ]:
# `subjects` holds one (publication id, subject value) row per subject whose scheme
# is not 'keyword'. The self-join pairs subjects attached to the same publication;
# `l.subject < r.subject` keeps each unordered pair once and drops self-pairs.
# The inner CTE is introduced with an explicit AS, like the other CTEs in this notebook.
query = """
WITH subjects AS (
    WITH exploded_subjects AS (
        SELECT id, EXPLODE(subjects.subject) AS subject
        FROM publications)
    SELECT id, subject.value AS `subject`
    FROM exploded_subjects
    WHERE subject.scheme != 'keyword'
)
SELECT l.subject AS left,
       r.subject AS right,
       COUNT(*) AS count
FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject
GROUP BY left, right
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show journal information

In [ ]:
# List the three ISSN fields and the name of `container` for publications
# whose container is not null (first 20 rows shown).
query = """
SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name 
FROM publications 
WHERE container IS NOT NULL
"""

spark.sql(query).limit(20).toPandas()

Task: Show the journals in which the highest number of results were published

In [ ]:
# `journals` expands the `container` struct into columns for publications with a
# non-null container; the outer query counts rows per container name, largest first.
query = """
WITH journals AS (
    SELECT container.*
    FROM publications
    WHERE container IS NOT NULL
)
SELECT name, count(*) AS count 
FROM journals 
GROUP BY name 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the number of projects per organization; sort results in descending order; limit to the first 20

In [ ]:
# Join each organization to the 'isParticipant' relations it is the source of and
# count those relations per organization. The label is legalshortname, falling
# back to legalname when the short name is null.
query = """
SELECT COALESCE(legalshortname, legalname) AS organization, 
        COUNT(*) AS count 
FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'
GROUP BY organization 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show projects with the highest number of associated results.

Note: An "unidentified" project is a placeholder for all the associations to a funder for which the specific project is not known. It should be removed from the count.

In [ ]:
# Count the 'produces' relations whose source is a project, grouped by
# (funding.shortName, code, title), largest count first. Projects whose title
# matches '%unidentified%' (ILIKE, i.e. case-insensitive) are excluded.
query = """
SELECT funding.shortName, code, title, COUNT(*) AS count 
FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' 
GROUP BY funding.shortName, code, title
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Strings can also be manipulated on the fly.

In [ ]:
# Count 'produces' relations per project, grouping by a single label built with
# CONCAT_WS(' / ', ...) from three parts:
#   - funding.shortName joined with ',' ('-' replaces null elements), or '?' when
#     SIZE(funding.shortName) is not greater than 0;
#   - the project code, or '?' when it is null;
#   - a prefix of the title, SUBSTRING(title, 0, 50).
# Projects whose title matches '%unidentified%' (case-insensitive) are excluded.
query = """
SELECT CONCAT_WS(' / ',  
                IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), 
                COALESCE(code, '?'), 
                SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count 
FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' 
GROUP BY project 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Organization short names can be empty, so the legal name could be a fallback option.

In [ ]:
# Organizations whose legalshortname is NULL, shown next to their legalname
# (first 20 rows).
query = """
SELECT legalshortname, legalname
FROM organizations 
WHERE legalshortname IS NULL
"""

spark.sql(query).limit(20).toPandas()
In [ ]:
# Count the 'isAuthorInstitutionOf' relations per source organization, largest
# count first. COALESCE uses legalname whenever legalshortname is null.
query = """
SELECT COALESCE(legalshortname, legalname) AS organization,
        COUNT(*) AS count 
FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' 
GROUP BY organization
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the number of research products (per type) per organization

In [ ]:
# Per organization: total number of 'isAuthorInstitutionOf' relations that match a
# row of `results`, plus one conditional count per value of the result `type`.
# COUNT skips NULLs, so COUNT(IF(cond, 1, NULL)) counts only the rows where cond holds.
# NOTE(review): the first JOIN carries no ON clause; all three join conditions sit
# in the single trailing ON.
query = """
SELECT COALESCE(legalshortname, legalname) AS organization, 
       COUNT(*) AS total,
       COUNT(IF(type = 'publication', 1, NULL)) AS publication,
       COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,
       COUNT(IF(type = 'software', 1, NULL)) AS software,
       COUNT(IF(type = 'other', 1, NULL)) AS other
FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' 
GROUP BY organization 
ORDER BY total DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show result access types per organization

In [ ]:
# Per organization: total number of 'isAuthorInstitutionOf' relations that match a
# row of `results`, plus conditional counts for bestaccessright.label equal to
# 'OPEN', 'EMBARGO' and 'CLOSED'. Rows with any other label only add to `total`.
# NOTE(review): the first JOIN carries no ON clause; all three join conditions sit
# in the single trailing ON.
query = """
SELECT COALESCE(legalshortname, legalname) AS organization, 
       COUNT(*) as total,
       COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,
       COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,
       COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed
FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id  AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY total DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the result access types per country of the organizations

In [ ]:
# Per organization country code: total number of 'isAuthorInstitutionOf' relations
# that match a row of `results`, plus conditional counts for bestaccessright.label
# equal to 'OPEN', 'EMBARGO' and 'CLOSED'. Organizations with a null `country`
# are filtered out.
# NOTE(review): the first JOIN carries no ON clause; all three join conditions sit
# in the single trailing ON.
query = """
SELECT organizations.country.code AS country, 
       COUNT(*) AS total,
       COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,
       COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,
       COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed
FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id  AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
WHERE organizations.country IS NOT NULL
GROUP BY organizations.country.code
ORDER BY total DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the country collaboration network in projects with respect to the participating organizations

In [ ]:
# `countryProject` yields (country code, relation target id) for every
# 'isParticipant' relation whose source is an organization with a non-null country.
# The self-join pairs rows that share the same target id; `l.country <= r.country`
# keeps one orientation of each country pair and also keeps same-country pairs.
query = """
WITH countryProject AS (
    SELECT country.code AS country, 
           target.id AS id 
    FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
    WHERE country IS NOT NULL
)
SELECT l.country AS left, 
       r.country AS right,
       COUNT(*) AS count 
FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country
GROUP BY left, right 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()
In [ ]:
import igraph as ig

# `query` still holds the country collaboration query from the cell above, which
# only displays its result. Materialise the complete result here as the edge list:
# one row per (left, right, count). No 20-row cap is applied, because later cells
# address vertex index 50, which a 20-edge graph could never contain.
edges = spark.sql(query).toPandas()

# Undirected graph: the first two columns name the endpoints (stored in the
# `countrycode` vertex attribute), the third column becomes the edge `weight`.
G = ig.Graph.TupleList(
    edges=edges[['left', 'right', 'count']].values,
    vertex_name_attr='countrycode',
    edge_attrs=['weight'],
    directed=False)
In [ ]:
# Number of vertices in G.
G.vcount()
In [ ]:
# Number of edges in G.
G.ecount()
In [ ]:
# Inspect the vertex at index 0 and its attributes.
G.vs[0]
In [ ]:
# Inspect the edge at index 0 and its attributes.
G.es[0]
In [ ]:
import matplotlib.pyplot as plt
# Draw the whole graph on a matplotlib Axes, labelling each vertex with its
# `countrycode` attribute.
fig, ax = plt.subplots()
ig.plot(G, vertex_label=G.vs['countrycode'], target=ax)
In [ ]:
G.vs.find(countrycode_eq = 'MY') # Malaysia ('MY' is its ISO 3166-1 alpha-2 code; Maldives is 'MV')
In [ ]:
# Build the subgraph of G induced by the vertices returned by G.neighborhood(50),
# then print its summary.
# NOTE(review): 50 is a hard-coded vertex index, presumably the index of the 'MY'
# vertex looked up in the previous cell. Vertex numbering follows the row order of
# the query result, so confirm the index before relying on it.
H = G.induced_subgraph(G.neighborhood(50))
H.summary()
In [ ]:
# Colour every vertex of H grey, mark the vertex at index 0 in red, and draw H.
# NOTE(review): this presumably intends to highlight the vertex the neighbourhood
# was built around; whether that vertex ends up at index 0 of H is not established
# here — TODO confirm.
H.vs['color'] = 'grey'
H.vs[0]['color'] = 'red'
fig, ax = plt.subplots()
ig.plot(H, target=ax)
In [ ]:
# Local transitivity of the vertex at hard-coded index 50.
# NOTE(review): presumably the same vertex as the 'MY' lookup above — confirm the index.
G.transitivity_local_undirected(50)

Task: Show the international collaboration in projects with respect to the participating organizations

In [ ]:
# `countryProject` yields (country code, relation target id) for every
# 'isParticipant' relation whose source is an organization with a non-null country.
# The self-join pairs rows that share the same target id; the strict
# `l.country < r.country` keeps one orientation of each pair and drops
# same-country pairs.
query = """
WITH countryProject AS (
    SELECT country.code AS country, 
           target.id AS id 
    FROM organizations JOIN relations ON  reltype.name = 'isParticipant' AND source.id = organizations.id
    WHERE country IS NOT NULL
)
SELECT l.country AS left, 
       r.country AS right, 
       COUNT(*) AS count 
FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country
GROUP BY left, right 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas() 

Task: Show the kernel organisations often collaborating in projects

In [ ]:
# `orgProject` yields (organization label, relation target id) for every
# 'isParticipant' relation whose source is an organization; the label is
# legalshortname or, when null, legalname. The self-join pairs organizations that
# share the same target id; `l.organization < r.organization` keeps each unordered
# pair once and drops self-pairs.
query = """
WITH orgProject AS (
    SELECT COALESCE(legalshortname, legalname) AS organization, 
           target.id AS id 
    FROM organizations JOIN relations ON  reltype.name = 'isParticipant' AND source.id = organizations.id
)
SELECT l.organization AS left,
       r.organization AS right,
       COUNT(*) AS count
FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization
GROUP BY left, right 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the kernel organizations often co-authoring papers

In [ ]:
# `orgProduct` yields (organization label, relation target id) for every
# 'isAuthorInstitutionOf' relation whose source is an organization; the label is
# legalshortname or, when null, legalname. The self-join pairs organizations that
# share the same target id; `l.organization < r.organization` keeps each unordered
# pair once and drops self-pairs.
query = """
WITH orgProduct AS (
    SELECT COALESCE(legalshortname, legalname) AS organization, 
           target.id AS id 
    FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id
)
SELECT l.organization AS left, 
       r.organization AS right,
       COUNT(*) AS count 
FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization
GROUP BY left, right 
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the access rights over the years

In [ ]:
# Count results per (bestaccessright.label, SUBSTRING(publicationdate, 0, 4)),
# skipping rows where bestaccessright or publicationdate is null; largest count first.
# NOTE(review): presumably publicationdate starts with a four-digit year, so the
# substring is the year — confirm against the data.
query = """
SELECT bestaccessright.label AS accessright,
       SUBSTRING(publicationdate, 0,4) AS year,
       COUNT(*) AS count
FROM results
WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL
GROUP BY accessright, year
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()

Task: Show the number of publications supplemented by datasets

In [ ]:
# Count the publications that are the source of at least one 'IsSupplementedBy'
# relation whose target is a dataset. COUNT(DISTINCT publications.id) is used
# because a publication linked to several datasets produces one joined row per
# link, and the task asks for the number of publications, not the number of links.
# Each join carries its own ON condition; the relation-type filter is in WHERE.
query = """
SELECT COUNT(DISTINCT publications.id) AS count
FROM relations
     JOIN publications ON publications.id = relations.source.id
     JOIN datasets ON datasets.id = relations.target.id
WHERE reltype.name = 'IsSupplementedBy'
"""

spark.sql(query).limit(20).toPandas()

Task: Count and sort publications by citations

In [ ]:
# For publications that are the source of 'IsCitedBy' relations, count those
# relations per (publications.id, pid.value), largest count first (first 20 rows shown).
query = """
SELECT publications.id, pid.value, COUNT(*) AS count
FROM publications JOIN relations ON publications.id = relations.source.id AND reltype.name = 'IsCitedBy'
GROUP BY publications.id, pid.value
ORDER BY count DESC
"""

spark.sql(query).limit(20).toPandas()