164 KiB
OpenAIRE Beginners Kit¶
The OpenAIRE Research Graph is an Open Access dataset containing metadata about research products (literature, datasets, software, etc.) linked to other entities of the research ecosystem like organisations, project grants, and data sources.
The large size of the OpenAIRE Research Graph is a major impediment for beginners to familiarise with the underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure which cannot be easily accessible to everyone.
The OpenAIRE Beginner’s Kit aims to address this issue. It consists of two components: a subset of the Graph composed of the research products published between 2022-06-29 and 2022-12-29, all the entities connected to them and the respective relationships, and the present Zeppelin notebook that demonstrates how you can use PySpark to analyse the Graph and get answers to some interesting research questions.
Download data¶
!rm -rf data
!mkdir data
import os
base_url = "https://zenodo.org/record/7490192/files/"
items =["communities_infrastructures.tar","dataset.tar","datasource.tar","organization.tar","otherresearchproduct.tar","project.tar","publication.tar","relation.tar", "software.tar"]
for item in items:
print(f"Downloading {item}")
os.system(f'wget {base_url}{item}?download=1 -O data/{item}')
print(f"Extracting {item}")
os.system(f'tar -xf data/{item} -C data/; rm data/{item}')
Have a look at the input data¶
import json
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType
from pyspark.sql import SparkSession
from IPython.display import JSON as pretty_print
spark = SparkSession.builder.getOrCreate()
publicationSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"container","nullable":true,"type":{"fields":[{"metadata":{},"name":"conferencedate","nullable":true,"type":"string"},{"metadata":{},"name":"conferenceplace","nullable":true,"type":"string"},{"metadata":{},"name":"edition","nullable":true,"type":"string"},{"metadata":{},"name":"ep","nullable":true,"type":"string"},{"metadata":{},"name":"iss","nullable":true,"type":"string"},{"metadata":{},"name":"issnLinking","nullable":true,"type":"string"},{"metadata":{},"name":"issnOnline","nullable":true,"type":"string"},{"metadata":{},"name":"issnPrinted","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"sp","nullable":true,"type":"string"},{"metadata":{},"name":"vol","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
datasetSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"geolocation","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"box","nullable":true,"type":"string"},{"metadata":{},"name":"place","nullable":true,"type":"string"},{"metadata":{},"name":"point","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"size","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"version","nullable":true,"type":"string"}],"type":"struct"}'
softwareSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"documentationUrl","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"programmingLanguage","nullable":true,"type":"string"},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
otherSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contactgroup","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"contactperson","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"tool","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
datasourceSchema = '{"fields":[{"metadata":{},"name":"accessrights","nullable":true,"type":"string"},{"metadata":{},"name":"certificates","nullable":true,"type":"string"},{"metadata":{},"name":"citationguidelineurl","nullable":true,"type":"string"},{"metadata":{},"name":"databaseaccessrestriction","nullable":true,"type":"string"},{"metadata":{},"name":"datasourcetype","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"datauploadrestriction","nullable":true,"type":"string"},{"metadata":{},"name":"dateofvalidation","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"englishname","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"journal","nullable":true,"type":{"fields":[{"metadata":{},"name":"issnLinking","nullable":true,"type":"string"},{"metadata":{},"name":"issnOnline","nullable":true,"type":"string"},{"metadata":{},"name":"issnPrinted","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"languages","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"logourl","nullable":true,"type":"string"},{"metadata":{},"name":"missionstatementurl","nullable":true,"type":"string"},{"metadata":{},"name":"officialname","nullable":true,"type":"string"},{"metadata":{},"name":"openairecompatibility","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"pidsystems","nullable":true,"type":"string"},{"metadata":{},"name":"policies","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"releasestartdate","nullable":true,"type":"string"},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"uploadrights","nullable":true,"type":"string"},{"metadata":{},"name":"versioning","nullable":true,"type":"boolean"},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
organizationSchema = '{"fields":[{"metadata":{},"name":"alternativenames","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"legalname","nullable":true,"type":"string"},{"metadata":{},"name":"legalshortname","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
projectSchema = '{"fields":[{"metadata":{},"name":"acronym","nullable":true,"type":"string"},{"metadata":{},"name":"callidentifier","nullable":true,"type":"string"},{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"enddate","nullable":true,"type":"string"},{"metadata":{},"name":"funding","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"funding_stream","nullable":true,"type":{"fields":[{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"jurisdiction","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"shortName","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"granted","nullable":true,"type":{"fields":[{"metadata":{},"name":"currency","nullable":true,"type":"string"},{"metadata":{},"name":"fundedamount","nullable":true,"type":"double"},{"metadata":{},"name":"totalcost","nullable":true,"type":"double"}],"type":"struct"}},{"metadata":{},"name":"h2020programme","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"keywords","nullable":true,"type":"string"},{"metadata":{},"name":"openaccessmandatefordataset","nullable":true,"type":"boolean"},{"metadata":{},"name":"openaccessmandateforpublications","nullable":true,"type":"boolean"},{"metadata":{},"name":"startdate","nullable":true,"type":"string"},{"metadata":{},"name":"subject","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"summary","nullable":true,"type":"string"},{"metadata":{},"name":"title","nullable":true,"type":"string"},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
communitySchema = '{"fields":[{"metadata":{},"name":"acronym","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"subject","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"zenodo_community","nullable":true,"type":"string"}],"type":"struct"}'
relationSchema = '{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"reltype","nullable":true,"type":{"fields":[{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"source","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"target","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"validated","nullable":true,"type":"boolean"},{"metadata":{},"name":"validationDate","nullable":true,"type":"string"}],"type":"struct"}'
#set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/openaire_dump_subset/'); untar each folder in the dataset and move it to the chosen path
inputPath = 'data/'
# load entities and relationships
publication = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')
dataset = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')
software = spark.read.schema(StructType.fromJson(json.loads(softwareSchema))).json(inputPath + 'software')
other = spark.read.schema(StructType.fromJson(json.loads(otherSchema))).json(inputPath + 'otherresearchproduct')
#results = publication.dropColumn('container').unionByName(dataset.dropColumns('size', 'version', 'geolocation'), allowMissingColumns=True).unionByName(software.dropColumns('documentationUrl', 'codeRepositoryUrl', 'programmingLanguage'), allowMissingColumns=True).unionByName(other.dropColumns('contactperson', 'contactgroup', 'tool'), allowMissingColumns=True)
results = publication.unionByName(dataset, allowMissingColumns=True).unionByName(software, allowMissingColumns=True).unionByName(other, allowMissingColumns=True)
datasource = spark.read.schema(StructType.fromJson(json.loads(datasourceSchema))).json(inputPath + 'datasource')
organization = spark.read.schema(StructType.fromJson(json.loads(organizationSchema))).json(inputPath + 'organization')
project = spark.read.schema(StructType.fromJson(json.loads(projectSchema))).json(inputPath + 'project')
community = spark.read.schema(StructType.fromJson(json.loads(communitySchema))).json(inputPath + 'communities_infrastructures')
relation = spark.read.schema(StructType.fromJson(json.loads(relationSchema))).json(inputPath + 'relation')
publication.createOrReplaceTempView("publications")
dataset.createOrReplaceTempView("datasets")
software.createOrReplaceTempView("software")
other.createOrReplaceTempView("others")
results.createOrReplaceTempView("results")
datasource.createOrReplaceTempView("datasources")
organization.createOrReplaceTempView("organizations")
project.createOrReplaceTempView("projects")
community.createOrReplaceTempView("communities")
relation.createOrReplaceTempView("relations")
# count and print their number
print("number of publications %s"%publication.count())
print("number of datasets %s"%dataset.count())
print("number of software %s"%software.count())
print("number of other research products %s"%other.count())
print("number of results %s"%results.count())
print("number of datasources %s"%datasource.count())
print("number of organizations %s"%organization.count())
print("number of communities %s"%community.count())
print("number of projects %s"%project.count())
print("number of relationships %s"%relation.count())
# the generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)
pretty_print(json.loads(publication.where("id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'").toJSON().first()), expanded=True)
# the data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)
pretty_print(json.loads(datasource.where("id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'").toJSON().first()), expanded=True)
# the organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)
pretty_print(json.loads(organization.where("id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'").toJSON().first()), expanded=True)
# the project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)
pretty_print(json.loads(project.toJSON().first()), expanded=True)
# the community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)
pretty_print(json.loads(community.where("acronym='mes'").toJSON().first()), expanded=True)
# the relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)
pretty_print(json.loads(relation.toJSON().first()), expanded=True)
query ="""SELECT reltype.name,
COUNT(*) AS count
FROM relations
GROUP BY reltype.name
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""WITH terms AS (
SELECT explode(subjects.subject.value) AS `term` FROM publications
)
SELECT term AS `subject term`,
COUNT(*) AS count
FROM terms
GROUP BY term
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""
WITH subjects AS (
WITH tmp (SELECT id, EXPLODE(subjects.subject) AS subject FROM publications)
SELECT id, subject.value AS `subject` FROM tmp WHERE subject.scheme != 'keyword'
)
SELECT l.subject AS left,
r.subject AS right,
COUNT(*) AS count
FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject
GROUP BY left, right
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT container.*
FROM publications
WHERE container IS NOT NULL"""
spark.sql(query).limit(20).toPandas()
query="""WITH journals AS (
SELECT container.* FROM publications WHERE container IS NOT NULL
)
SELECT name,
count(*) AS count
FROM journals
GROUP BY name
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project,
COUNT(*) AS count
FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces'
GROUP BY project
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(*) AS count
FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'
GROUP BY organization
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(*) AS count
FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(IF(type = 'publication', 1, NULL)) AS publication,
COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,
COUNT(IF(type = 'software', 1, NULL)) AS software,
COUNT(IF(type = 'other', 1, NULL)) AS other
FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY publication DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,
COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,
COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed
FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY open DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT organizations.country.code AS country,
COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,
COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,
COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed
FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
WHERE organizations.country IS NOT NULL
GROUP BY organizations.country.code
ORDER BY open DESC"""
spark.sql(query).limit(20).toPandas()
query="""WITH countryProject AS (
SELECT country.code AS country,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
WHERE country IS NOT NULL
)
SELECT l.country AS left,
r.country AS right,
COUNT(*) AS count
FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country
GROUP BY left, right
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""WITH countryProject AS (
SELECT country.code AS country,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
WHERE country IS NOT NULL
)
SELECT l.country AS left,
r.country AS right,
COUNT(*) AS count
FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country
GROUP BY left, right
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""WITH orgProject AS (
SELECT COALESCE(legalshortname, legalname) AS organization,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
)
SELECT l.organization AS left,
r.organization AS right,
COUNT(*) AS count
FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization
GROUP BY left, right
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""WITH orgProject AS (
SELECT COALESCE(legalshortname, legalname) AS organization,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id
)
SELECT l.organization AS left,
r.organization AS right,
COUNT(*) AS count
FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization
GROUP BY left, right
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT bestaccessright.label AS accessright,
SUBSTRING(publicationdate, 0,4) AS year,
COUNT(*) AS count
FROM results
WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL
GROUP BY accessright, year
ORDER BY count DESC"""
spark.sql(query).limit(20).toPandas()
query="""SELECT COUNT(*) AS count
FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id"""
spark.sql(query).limit(20).toPandas()