75 KiB
OpenAIRE Beginners Kit¶
The OpenAIRE Graph is an Open Access dataset containing metadata about research products (literature, datasets, software, and other research products) linked to other entities of the research ecosystem, such as organisations, grants, and data sources.
The large size of the OpenAIRE Graph is a major impediment for beginners to familiarise with the underlying data model and explore its contents. Working with the Graph in its full size typically requires access to a huge distributed computing infrastructure which cannot be easily accessible to everyone.
The OpenAIRE Beginner’s Kit aims to address this issue. It consists of two components: a subset of the Graph composed of the research products published between 2022-06-29
and 2022-12-29
, all the entities connected to them and the respective relationships, and the present Zeppelin notebook that demonstrates how you can use PySpark
to analyse the Graph and get answers to some interesting research questions.
Download data¶
This can take some time depending on your network speed.
Uncomment and run only if you need to downlaod the data the first time: these lines just download the datasets from the deposition on Zenodo containing data for this kit (https://zenodo.org/record/7490192), untar the content and clean up. All the data needed will sit under the data
folder.
# !rm -rf data
# !mkdir data
# import os
# base_url = "https://zenodo.org/record/7490192/files/"
# items =["communities_infrastructures.tar","dataset.tar","datasource.tar","organization.tar","otherresearchproduct.tar","project.tar","publication.tar","relation.tar", "software.tar"]
# for item in items:
# print(f"Downloading {item}")
# os.system(f'wget {base_url}{item}?download=1 -O data/{item}')
# print(f"Extracting {item}")
# os.system(f'tar -xf data/{item} -C data/; rm data/{item}')
import pandas as pd import glob
files = sorted(glob.glob('./data/publication/part-*.txt.gz')) publications_df = pd.concat(pd.read_json(f) for f in files)
Import libraries¶
import json
import glob
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.sql.types import StructType
from pyspark.sql import SparkSession
from IPython.display import JSON as pretty_print
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)
Load the datasets¶
If you try to load the data straight into memory, one file would fit...
df = pd.read_json('./data/publication/part-00000.txt.gz', compression='gzip', lines=True)
df.head(2)
But if you try to load everything, even just all the publications, the chances are slim.
If you try uncommenting and running the following lines, after some time, the kernel will die while trying and restart.
# files = sorted(glob.glob('./data/publication/part-*.txt.gz'))
# publications_df = pd.concat(pd.read_json(f, compression='gzip', lines=True) for f in files)
So, let's see how Spark can help us.
First thing first, let's create the Spark session.
spark = SparkSession.builder.getOrCreate()
Let's define now a few variables containing the schema followed by OpenAIRE data.
publicationSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"container","nullable":true,"type":{"fields":[{"metadata":{},"name":"conferencedate","nullable":true,"type":"string"},{"metadata":{},"name":"conferenceplace","nullable":true,"type":"string"},{"metadata":{},"name":"edition","nullable":true,"type":"string"},{"metadata":{},"name":"ep","nullable":true,"type":"string"},{"metadata":{},"name":"iss","nullable":true,"type":"string"},{"metadata":{},"name":"issnLinking","nullable":true,"type":"string"},{"metadata":{},"name":"issnOnline","nullable":true,"type":"string"},{"metadata":{},"name":"issnPrinted","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"sp","nullable":true,"type":"string"},{"metadata":{},"name":"vol","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
datasetSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"geolocation","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"box","nullable":true,"type":"string"},{"metadata":{},"name":"place","nullable":true,"type":"string"},{"metadata":{},"name":"point","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"size","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"version","nullable":true,"type":"string"}],"type":"struct"}'
softwareSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"documentationUrl","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"programmingLanguage","nullable":true,"type":"string"},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
otherSchema = '{"fields":[{"metadata":{},"name":"author","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"fullname","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"rank","nullable":true,"type":"long"},{"metadata":{},"name":"surname","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"bestaccessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"contactgroup","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"contactperson","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"contributor","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"coverage","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"dateofcollection","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"embargoenddate","nullable":true,"type":"string"},{"metadata":{},"name":"format","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"indicators","nullable":true,"type":{"fields":[{"metadata":{},"name":"impactMeasures","nullable":true,"type":{"fields":[{"metadata":{},"name":"impulse","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"influence_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"popularity_alt","nullable":true,"type":{"fields":[{"metadata":{},"name":"class","nullable":true,"type":"string"},{"metadata":{},"name":"score","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"usageCounts","nullable":true,"type":{"fields":[{"metadata":{},"name":"downloads","nullable":true,"type":"string"},{"metadata":{},"name":"views","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"}},{"metadata":{},"name":"instance","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"accessright","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"},{"metadata":{},"name":"openAccessRoute","nullable":true,"type":"string"},{"metadata":{},"name":"scheme","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"alternateIdentifier","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"license","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"refereed","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"url","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"language","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"lastupdatetimestamp","nullable":true,"type":"long"},{"metadata":{},"name":"maintitle","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"publicationdate","nullable":true,"type":"string"},{"metadata":{},"name":"publisher","nullable":true,"type":"string"},{"metadata":{},"name":"source","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"subject","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"subtitle","nullable":true,"type":"string"},{"metadata":{},"name":"tool","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}'
datasourceSchema = '{"fields":[{"metadata":{},"name":"accessrights","nullable":true,"type":"string"},{"metadata":{},"name":"certificates","nullable":true,"type":"string"},{"metadata":{},"name":"citationguidelineurl","nullable":true,"type":"string"},{"metadata":{},"name":"databaseaccessrestriction","nullable":true,"type":"string"},{"metadata":{},"name":"datasourcetype","nullable":true,"type":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"datauploadrestriction","nullable":true,"type":"string"},{"metadata":{},"name":"dateofvalidation","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"englishname","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"journal","nullable":true,"type":{"fields":[{"metadata":{},"name":"issnLinking","nullable":true,"type":"string"},{"metadata":{},"name":"issnOnline","nullable":true,"type":"string"},{"metadata":{},"name":"issnPrinted","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"languages","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"logourl","nullable":true,"type":"string"},{"metadata":{},"name":"missionstatementurl","nullable":true,"type":"string"},{"metadata":{},"name":"officialname","nullable":true,"type":"string"},{"metadata":{},"name":"openairecompatibility","nullable":true,"type":"string"},{"metadata":{},"name":"originalId","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"pidsystems","nullable":true,"type":"string"},{"metadata":{},"name":"policies","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"releasestartdate","nullable":true,"type":"string"},{"metadata":{},"name":"subjects","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"uploadrights","nullable":true,"type":"string"},{"metadata":{},"name":"versioning","nullable":true,"type":"boolean"},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
organizationSchema = '{"fields":[{"metadata":{},"name":"alternativenames","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"country","nullable":true,"type":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"label","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"legalname","nullable":true,"type":"string"},{"metadata":{},"name":"legalshortname","nullable":true,"type":"string"},{"metadata":{},"name":"pid","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"scheme","nullable":true,"type":"string"},{"metadata":{},"name":"value","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
projectSchema = '{"fields":[{"metadata":{},"name":"acronym","nullable":true,"type":"string"},{"metadata":{},"name":"callidentifier","nullable":true,"type":"string"},{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"enddate","nullable":true,"type":"string"},{"metadata":{},"name":"funding","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"funding_stream","nullable":true,"type":{"fields":[{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"jurisdiction","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"shortName","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"granted","nullable":true,"type":{"fields":[{"metadata":{},"name":"currency","nullable":true,"type":"string"},{"metadata":{},"name":"fundedamount","nullable":true,"type":"double"},{"metadata":{},"name":"totalcost","nullable":true,"type":"double"}],"type":"struct"}},{"metadata":{},"name":"h2020programme","nullable":true,"type":{"containsNull":true,"elementType":{"fields":[{"metadata":{},"name":"code","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"}],"type":"struct"},"type":"array"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"keywords","nullable":true,"type":"string"},{"metadata":{},"name":"openaccessmandatefordataset","nullable":true,"type":"boolean"},{"metadata":{},"name":"openaccessmandateforpublications","nullable":true,"type":"boolean"},{"metadata":{},"name":"startdate","nullable":true,"type":"string"},{"metadata":{},"name":"subject","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"summary","nullable":true,"type":"string"},{"metadata":{},"name":"title","nullable":true,"type":"string"},{"metadata":{},"name":"websiteurl","nullable":true,"type":"string"}],"type":"struct"}'
communitySchema = '{"fields":[{"metadata":{},"name":"acronym","nullable":true,"type":"string"},{"metadata":{},"name":"description","nullable":true,"type":"string"},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"subject","nullable":true,"type":{"containsNull":true,"elementType":"string","type":"array"}},{"metadata":{},"name":"type","nullable":true,"type":"string"},{"metadata":{},"name":"zenodo_community","nullable":true,"type":"string"}],"type":"struct"}'
relationSchema = '{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":{"fields":[{"metadata":{},"name":"provenance","nullable":true,"type":"string"},{"metadata":{},"name":"trust","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"reltype","nullable":true,"type":{"fields":[{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"source","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"target","nullable":true,"type":{"fields":[{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"type","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"validated","nullable":true,"type":"boolean"},{"metadata":{},"name":"validationDate","nullable":true,"type":"string"}],"type":"struct"}'
Now, let's read the datasetsabout OpenAIRE entitities.
inputPath = 'data/'
publications = spark.read.schema(StructType.fromJson(json.loads(publicationSchema))).json(inputPath + 'publication')
datasets = spark.read.schema(StructType.fromJson(json.loads(datasetSchema))).json(inputPath + 'dataset')
softwares = spark.read.schema(StructType.fromJson(json.loads(softwareSchema))).json(inputPath + 'software')
others = spark.read.schema(StructType.fromJson(json.loads(otherSchema))).json(inputPath + 'otherresearchproduct')
results = publications.unionByName(datasets, allowMissingColumns=True).unionByName(softwares, allowMissingColumns=True).unionByName(others, allowMissingColumns=True)
datasources = spark.read.schema(StructType.fromJson(json.loads(datasourceSchema))).json(inputPath + 'datasource')
organizations = spark.read.schema(StructType.fromJson(json.loads(organizationSchema))).json(inputPath + 'organization')
projects = spark.read.schema(StructType.fromJson(json.loads(projectSchema))).json(inputPath + 'project')
communities = spark.read.schema(StructType.fromJson(json.loads(communitySchema))).json(inputPath + 'communities_infrastructures')
relations = spark.read.schema(StructType.fromJson(json.loads(relationSchema))).json(inputPath + 'relation')
Let's create some Temporary views
, which is similar to a real SQL table that you can query via Spark.
publications.createOrReplaceTempView("publications")
datasets.createOrReplaceTempView("datasets")
softwares.createOrReplaceTempView("software")
others.createOrReplaceTempView("others")
results.createOrReplaceTempView("results")
datasources.createOrReplaceTempView("datasources")
organizations.createOrReplaceTempView("organizations")
projects.createOrReplaceTempView("projects")
communities.createOrReplaceTempView("communities")
relations.createOrReplaceTempView("relations")
Ok, let's count the number of rows.
print("number of publications %s"%publications.count())
print("number of datasets %s"%datasets.count())
print("number of software %s"%softwares.count())
print("number of other research products %s"%others.count())
print("number of results %s"%results.count())
print("number of datasources %s"%datasources.count())
print("number of organizations %s"%organizations.count())
print("number of communities %s"%communities.count())
print("number of projects %s"%projects.count())
print("number of relations %s"%relations.count())
By the way, the same could be achieved in SQL via Spark.
spark.sql("SELECT COUNT(*) FROM publications").toPandas()
Let's show some data now. For example, a generic publication (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)
pretty_print(json.loads(publications
.where("id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'")
.toJSON()
.first()), expanded=False)
Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)
pretty_print(json.loads(datasources
.where("id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'")
.toJSON()
.first()), expanded=False)
An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)
pretty_print(json.loads(organizations
.where("id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'")
.toJSON()
.first()), expanded=False)
A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)
pretty_print(json.loads(projects
.toJSON()
.first()), expanded=False)
A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)
pretty_print(json.loads(communities
.where("acronym='mes'")
.toJSON()
.first()), expanded=False)
And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)
pretty_print(json.loads(relations.toJSON().first()), expanded=False)
Exercises¶
All the exercises follow the template below.
query = """
SELECT <columns>
FROM <table>
"""
spark.sql(query).limit(20).toPandas()
toPandas()
results in the collection of all records in the PySpark DataFrame to the driver program and should be done only on a small subset of the data. running on larger dataset’s results in memory error and crashes the application.
limit(20)
is added exactly for this purpose.
query = """
SELECT reltype.name, COUNT(*) AS count
FROM relations
GROUP BY reltype.name
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name
FROM publications
WHERE container IS NOT NULL
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT publications.id, pid.value, COUNT(*) AS count
FROM publications JOIN relations ON publications.id = relations.source.id
WHERE reltype.name = 'IsCitedBy'
GROUP BY publications.id, pid.value
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
WITH terms AS (
SELECT explode(subjects.subject.value) AS `term`
FROM publications
)
SELECT term AS `subject term`,
COUNT(*) AS count
FROM terms
GROUP BY term
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
WITH journals AS (
SELECT container.*
FROM publications
WHERE container IS NOT NULL
)
SELECT name, count(*) AS count
FROM journals
GROUP BY name
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(*) AS count
FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'
GROUP BY organization
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT funding.shortName, code, title, COUNT(*) AS count
FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%'
GROUP BY funding.shortName, code, title
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
Strings can be manipulated as well on the fly
query = """
SELECT CONCAT_WS(' / ',
IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'),
COALESCE(code, '?'),
SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count
FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%'
GROUP BY project
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
WITH subjects AS (
WITH exploded_subjects (
SELECT id, EXPLODE(subjects.subject) AS subject
FROM publications)
SELECT id, subject.value AS `subject`
FROM exploded_subjects
WHERE subject.scheme != 'keyword'
)
SELECT l.subject AS left,
r.subject AS right,
COUNT(*) AS count
FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject
GROUP BY left, right
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
Organization short names can be empty, so the legal name could be a fallback option.
query = """
SELECT legalshortname, legalname
FROM organizations
WHERE legalshortname IS NULL
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(*) AS count
FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(*) AS total,
COUNT(IF(type = 'publication', 1, NULL)) AS publication,
COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,
COUNT(IF(type = 'software', 1, NULL)) AS software,
COUNT(IF(type = 'other', 1, NULL)) AS other
FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY total DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT COALESCE(legalshortname, legalname) AS organization,
COUNT(*) as total,
COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,
COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,
COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed
FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
GROUP BY organization
ORDER BY total DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT organizations.country.code AS country,
COUNT(*) AS total,
COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,
COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,
COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed
FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'
WHERE organizations.country IS NOT NULL
GROUP BY organizations.country.code
ORDER BY total DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
WITH countryProject AS (
SELECT country.code AS country,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
WHERE country IS NOT NULL
)
SELECT l.country AS left,
r.country AS right,
COUNT(*) AS count
FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country
GROUP BY left, right
ORDER BY count DESC
"""
spark.sql(query).toPandas()
import igraph as ig
G = ig.Graph.TupleList(
edges=edges[['left', 'right', 'count']].values,
vertex_name_attr='countrycode',
edge_attrs = ['weight'],
directed=False)
G.vcount()
G.ecount()
G.vs[0]
G.es[0]
import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ig.plot(G, vertex_label=G.vs['countrycode'], target=ax)
G.vs.find(countrycode_eq = 'MY') # maldives
H = G.induced_subgraph(G.neighborhood(50))
H.summary()
H.vs['color'] = 'grey'
H.vs[0]['color'] = 'red'
fig, ax = plt.subplots()
ig.plot(H, target=ax)
G.transitivity_local_undirected(50)
query = """
WITH countryProject AS (
SELECT country.code AS country,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
WHERE country IS NOT NULL
)
SELECT l.country AS left,
r.country AS right,
COUNT(*) AS count
FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country
GROUP BY left, right
ORDER BY count DESC
"""
spark.sql(query).toPandas()
query = """
WITH orgProject AS (
SELECT COALESCE(legalshortname, legalname) AS organization,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id
)
SELECT l.organization AS left,
r.organization AS right,
COUNT(*) AS count
FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization
GROUP BY left, right
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
WITH orgProduct AS (
SELECT COALESCE(legalshortname, legalname) AS organization,
target.id AS id
FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id
)
SELECT l.organization AS left,
r.organization AS right,
COUNT(*) AS count
FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization
GROUP BY left, right
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT bestaccessright.label AS accessright,
SUBSTRING(publicationdate, 0,4) AS year,
COUNT(*) AS count
FROM results
WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL
GROUP BY accessright, year
ORDER BY count DESC
"""
spark.sql(query).limit(20).toPandas()
query = """
SELECT COUNT(*) AS count
FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id
"""
spark.sql(query).limit(20).toPandas()