improved MD format

2023-05-09 16:12:08 +02:00 · 2023-05-09 16:12:08 +02:00 · c560b9a02a
parent e3043ca57a
commit c560b9a02a
2 changed files with 204 additions and 151 deletions
--- a/data/.DS_Store
+++ b/data/.DS_Store
--- a/data/beginners_kit.ipynb
+++ b/data/beginners_kit.ipynb
@ -18,7 +18,7 @@
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Download data"
+    "## Download data"
   ]
  },
  {
@ -45,10 +45,7 @@
    "    print(f\"Downloading {item}\")\n",
    "    os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
    "    print(f\"Extracting {item}\")\n",
-    "    os.system(f'tar -xf data/{item} -C data/; rm data/{item}')\n",
+    "    os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
    "    \n",
    "    \n",
    "    \n"
   ]
  },
  {
@ -60,7 +57,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": null,
   "metadata": {
    "autoscroll": "auto"
   },
@ -89,6 +86,13 @@
    "relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
@ -97,12 +101,6 @@
   },
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "\n",
    "\n",
    "#set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/openaire_dump_subset/'); untar each folder in the dataset and move it to the chosen path\n",
    "\n",
    "inputPath = 'data/'\n",
    " \n",
    "# load entities and relationships\n",
@ -142,16 +140,11 @@
   ]
  },
  {
-   "cell_type": "code",
+   "cell_type": "markdown",
-   "execution_count": null,
+   "metadata": {},
   "metadata": {
    "autoscroll": "auto"
   },
   "outputs": [],
   "source": [
-    "# the generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)\n",
+    "Let's show some data. \n",
-    "pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)\n",
+    "For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)"
    "\n"
   ]
  },
  {
@ -162,11 +155,14 @@
   },
   "outputs": [],
   "source": [
-    "\n",
+    "pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)"
-    "\n",
+   ]
-    "# the data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)\n",
+  },
-    "pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)\n",
+  {
-    "\n"
+   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)"
   ]
  },
  {
@ -177,9 +173,14 @@
   },
   "outputs": [],
   "source": [
-    "# the organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)\n",
+    "pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)"
-    "pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)\n",
+   ]
-    "\n"
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)"
   ]
  },
  {
@ -190,8 +191,14 @@
   },
   "outputs": [],
   "source": [
-    "# the project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)\n",
+    "pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)"
-    "pretty_print(json.loads(project.toJSON().first()), expanded=True)\n"
+   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)"
   ]
  },
  {
@ -202,8 +209,36 @@
   },
   "outputs": [],
   "source": [
-    "# the community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)\n",
+    "pretty_print(json.loads(project.toJSON().first()), expanded=True)"
-    "pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)\n"
+   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "autoscroll": "auto"
   },
   "outputs": [],
   "source": [
    "pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)"
   ]
  },
  {
@ -214,71 +249,44 @@
   },
   "outputs": [],
   "source": [
    "# the relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)\n",
    "pretty_print(json.loads(relation.toJSON().first()), expanded=True)"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
    "tags": []
   },
   "source": [
-    "## Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. "
+    "## Exercises \n",
    "### **Task**: Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
-    "autoscroll": "auto"
+    "autoscroll": "auto",
    "tags": []
   },
   "outputs": [],
   "source": [
-    "query =\"\"\"SELECT reltype.name, \n",
+    "query = \"\"\"\n",
    "SELECT reltype.name, \n",
    "        COUNT(*) AS count \n",
    "FROM relations \n",
    "GROUP BY reltype.name \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n"
+    "spark.sql(query).limit(20).toPandas()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the most occurring publication subject term; sort results in descending order; limit to the first 20"
+    "### **Task:** Show the most frequently occurring publication subject terms; sort results in descending order; limit to the first 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "autoscroll": "auto"
   },
   "outputs": [],
   "source": [
    "query=\"\"\"WITH terms AS (\n",
    "    SELECT explode(subjects.subject.value) AS `term` FROM publications\n",
    ")\n",
    "SELECT term AS `subject term`, \n",
    "       COUNT(*) AS count \n",
    "FROM terms \n",
    "GROUP BY term \n",
    "ORDER BY count DESC\"\"\"\n",
    "\n",
    "spark.sql(query).limit(20).toPandas()\n",
    "\n",
    "    "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoinding repetition; limit to the first 20"
   ]
  },
  {
@ -288,82 +296,123 @@
    "autoscroll": "auto"
   },
   "outputs": [],
   "source": [
    "query = \"\"\"\n",
    "WITH terms AS (\n",
    "    SELECT explode(subjects.subject.value) AS `term`\n",
    "    FROM publications\n",
    ")\n",
    "SELECT term AS `subject term`,\n",
    "        COUNT(*) AS count \n",
    "FROM terms \n",
    "GROUP BY term \n",
    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
    "spark.sql(query).limit(20).toPandas()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "tags": []
   },
   "source": [
    "### **Task:** Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoiding repetition; limit to the first 20"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "autoscroll": "auto",
    "tags": []
   },
   "outputs": [],
   "source": [
    "query = \"\"\"\n",
    "WITH subjects AS (\n",
-    "    WITH exploded_subjects (SELECT id, EXPLODE(subjects.subject) AS subject FROM publications) \n",
+    "    WITH exploded_subjects (\n",
-    "    SELECT id, subject.value AS `subject` FROM exploded_subjects WHERE subject.scheme != 'keyword'\n",
+    "        SELECT id, EXPLODE(subjects.subject) AS subject \n",
    "        FROM publications) \n",
    "    SELECT id, subject.value AS `subject` \n",
    "    FROM exploded_subjects \n",
    "    WHERE subject.scheme != 'keyword'\n",
    ")\n",
    "SELECT l.subject AS left, \n",
    "       r.subject AS right, \n",
    "       COUNT(*) AS count\n",
    "FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n",
    "GROUP BY left, right\n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n",
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show journal information"
+    "### **Task:** Show journal information"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
-    "autoscroll": "auto"
+    "autoscroll": "auto",
    "tags": []
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n",
+    "query = \"\"\"\n",
    "SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n",
    "FROM publications \n",
-    "WHERE container IS NOT NULL\"\"\"\n",
+    "WHERE container IS NOT NULL\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n"
+    "spark.sql(query).limit(20).toPandas()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the journals with the highest number of results published in"
+    "### **Task:** Show the journals in which the highest number of results were published"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
-    "autoscroll": "auto"
+    "autoscroll": "auto",
    "tags": []
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"WITH journals AS (\n",
+    "query = \"\"\"\n",
-    "    SELECT container.* FROM publications WHERE container IS NOT NULL\n",
+    "WITH journals AS (\n",
    "    SELECT container.*\n",
    "    FROM publications\n",
    "    WHERE container IS NOT NULL\n",
    ")\n",
-    "SELECT name, \n",
+    "SELECT name, count(*) AS count \n",
    "       count(*) AS count \n",
    "FROM journals \n",
    "GROUP BY name \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
-   "metadata": {},
+   "metadata": {
    "tags": []
   },
   "source": [
-    "## Show projects with the highest number of associated results. The unidentified project is a placeholder for all the association to a funder without knowing the specific project. It will be removed from the count"
+    "### **Task:** Show projects with the highest number of associated results. The unidentified project is a placeholder for all associations to a funder for which the specific project is not known; it is excluded from the count"
   ]
  },
  {
@ -374,22 +423,21 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT CONCAT_WS(' - ',  IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project,\n",
+    "query = \"\"\"\n",
-    "       COUNT(*) AS count \n",
+    "SELECT CONCAT_WS(' - ',  IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
    "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
    "GROUP BY project \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the number of projects per organization; sort results in descending order; limit to the first 20"
+    "### **Task:** Show the number of projects per organization; sort results in descending order; limit to the first 20"
   ]
  },
  {
@ -400,22 +448,22 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
+    "query = \"\"\"\n",
    "SELECT COALESCE(legalshortname, legalname) AS organization, \n",
    "        COUNT(*) AS count \n",
    "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
    "GROUP BY organization \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "    \n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation linkes products and organization and not authors and organizations"
+    "### **Task:** Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products and organizations, not authors and organizations"
   ]
  },
  {
@ -426,21 +474,22 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
+    "query = \"\"\"\n",
    "SELECT COALESCE(legalshortname, legalname) AS organization,\n",
    "        COUNT(*) AS count \n",
    "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n",
    "GROUP BY organization\n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n"
+    "spark.sql(query).limit(20).toPandas()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the number of research products (per type) per organization"
+    "### **Task:** Show the number of research products (per type) per organization"
   ]
  },
  {
@ -451,24 +500,25 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
+    "query = \"\"\"\n",
    "SELECT COALESCE(legalshortname, legalname) AS organization, \n",
    "       COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n",
    "       COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n",
    "       COUNT(IF(type = 'software', 1, NULL)) AS software,\n",
    "       COUNT(IF(type = 'other', 1, NULL)) AS other\n",
    "FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n",
    "GROUP BY organization \n",
-    "ORDER BY publication DESC\"\"\"\n",
+    "ORDER BY publication DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n"
+    "spark.sql(query).limit(20).toPandas()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show result access types per organization"
+    "### **Task:** Show result access types per organization"
   ]
  },
  {
@ -479,24 +529,24 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
+    "query = \"\"\"\n",
    "SELECT COALESCE(legalshortname, legalname) AS organization, \n",
    "       COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
    "       COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
    "       COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
    "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id  AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
    "GROUP BY organization\n",
-    "ORDER BY open DESC\"\"\"\n",
+    "ORDER BY open DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the result access types per country of the organizations"
+    "### **Task:** Show the result access types per country of the organizations"
   ]
  },
  {
@ -507,25 +557,25 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT organizations.country.code AS country, \n",
+    "query = \"\"\"\n",
    "SELECT organizations.country.code AS country, \n",
    "       COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
    "       COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
    "       COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
    "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id  AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
    "WHERE organizations.country IS NOT NULL\n",
    "GROUP BY organizations.country.code\n",
-    "ORDER BY open DESC\"\"\"\n",
+    "ORDER BY open DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Show the countries collaboration network in projects with respect to the partecipating organizations"
+    "### **Task:** Show the countries collaboration network in projects with respect to the participating organizations"
   ]
  },
  {
@ -536,7 +586,8 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"WITH countryProject AS (\n",
+    "query = \"\"\"\n",
    "WITH countryProject AS (\n",
    "    SELECT country.code AS country, \n",
    "           target.id AS id \n",
    "    FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -547,18 +598,17 @@
    "       COUNT(*) AS count \n",
    "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n",
    "GROUP BY left, right \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "    \n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "# Show the international collaboration in projects with respect to the participating organizations"
+    "### **Task:** Show the international collaboration in projects with respect to the participating organizations"
   ]
  },
  {
@ -569,7 +619,8 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"WITH countryProject AS (\n",
+    "query = \"\"\"\n",
    "WITH countryProject AS (\n",
    "    SELECT country.code AS country, \n",
    "           target.id AS id \n",
    "    FROM organizations JOIN relations ON  reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -580,18 +631,17 @@
    "       COUNT(*) AS count \n",
    "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n",
    "GROUP BY left, right \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas() "
    " "
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the kernel organisations often collaborating in projects"
+    "### **Task:** Show the kernel organizations often collaborating in projects"
   ]
  },
  {
@ -602,7 +652,8 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"WITH orgProject AS (\n",
+    "query = \"\"\"\n",
    "WITH orgProject AS (\n",
    "    SELECT COALESCE(legalshortname, legalname) AS organization, \n",
    "           target.id AS id \n",
    "    FROM organizations JOIN relations ON  reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -612,18 +663,17 @@
    "       COUNT(*) AS count\n",
    "FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n",
    "GROUP BY left, right \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the kernel organizations often co-authoring papers"
+    "### **Task:** Show the kernel organizations often co-authoring papers"
   ]
  },
  {
@ -634,7 +684,8 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"WITH orgProduct AS (\n",
+    "query = \"\"\"\n",
    "WITH orgProduct AS (\n",
    "    SELECT COALESCE(legalshortname, legalname) AS organization, \n",
    "           target.id AS id \n",
    "    FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n",
@ -644,19 +695,17 @@
    "       COUNT(*) AS count \n",
    "FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n",
    "GROUP BY left, right \n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n",
+    "spark.sql(query).limit(20).toPandas()"
    "\n",
    "\n"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the access right s over the years"
+    "### **Task:** Show the access rights over the years"
   ]
  },
  {
@ -667,23 +716,24 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT bestaccessright.label AS accessright,\n",
+    "query = \"\"\"\n",
    "SELECT bestaccessright.label AS accessright,\n",
    "       SUBSTRING(publicationdate, 0,4) AS year,\n",
    "       COUNT(*) AS count\n",
    "FROM results\n",
    "WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n",
    "GROUP BY accessright, year\n",
-    "ORDER BY count DESC\"\"\"\n",
+    "ORDER BY count DESC\n",
    "\"\"\"\n",
    "\n",
-    "spark.sql(query).limit(20).toPandas()\n"
+    "spark.sql(query).limit(20).toPandas()"
   ]
  },
  {
   "attachments": {},
   "cell_type": "markdown",
   "metadata": {},
   "source": [
-    "## Show the number of publications supplemented by datasets"
+    "### **Task:** Show the number of publications supplemented by datasets"
   ]
  },
  {
@ -694,8 +744,11 @@
   },
   "outputs": [],
   "source": [
-    "query=\"\"\"SELECT COUNT(*) AS count\n",
+    "query = \"\"\"\n",
-    "FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\"\"\"\n",
+    "SELECT COUNT(*) AS count\n",
    "FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\n",
    "\"\"\"\n",
    "\n",
    "spark.sql(query).limit(20).toPandas()"
   ]
  },