moved cells

2023-06-22 17:15:54 +02:00 · 2023-06-22 17:15:54 +02:00 · 95e2f4a8ba
parent a4c6e5321b
commit 95e2f4a8ba
1 changed files with 109 additions and 122 deletions
--- a/data/beginners_kit.ipynb
+++ b/data/beginners_kit.ipynb
@ -569,6 +569,115 @@
    "spark.sql(query).limit(20).toPandas()"
   ]
  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-info\">\n",
+    "Show the journals with the highest number of published results.\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"\"\"\n",
+    "WITH journals AS (\n",
+    "    SELECT container.*\n",
+    "    FROM publications\n",
+    "    WHERE container IS NOT NULL\n",
+    ")\n",
+    "SELECT name, count(*) AS count \n",
+    "FROM journals \n",
+    "GROUP BY name \n",
+    "ORDER BY count DESC\n",
+    "\"\"\"\n",
+    "\n",
+    "spark.sql(query).limit(20).toPandas()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-info\">\n",
+    "Show the number of projects per organization; sort results in descending order; limit to the first 20.\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"\"\"\n",
+    "SELECT COALESCE(legalshortname, legalname) AS organization, \n",
+    "        COUNT(*) AS count \n",
+    "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
+    "GROUP BY organization \n",
+    "ORDER BY count DESC\n",
+    "\"\"\"\n",
+    "\n",
+    "spark.sql(query).limit(20).toPandas()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<div class=\"alert alert-info\">\n",
+    "Show projects with the highest number of associated results. \n",
+    "\n",
+    "Note: An \"unidentified\" project is a placeholder for all the associations to a funder for which the specific project is not known. It should be excluded from the count.\n",
+    "</div>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"\"\"\n",
+    "SELECT funding.shortName, code, title, COUNT(*) AS count \n",
+    "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
+    "GROUP BY funding.shortName, code, title\n",
+    "ORDER BY count DESC\n",
+    "\"\"\"\n",
+    "\n",
+    "spark.sql(query).limit(20).toPandas()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Strings can also be manipulated on the fly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "query = \"\"\"\n",
+    "SELECT CONCAT_WS(' / ',  \n",
+    "                IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n",
+    "                COALESCE(code, '?'), \n",
+    "                SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
+    "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n",
+    "GROUP BY project \n",
+    "ORDER BY count DESC\n",
+    "\"\"\"\n",
+    "\n",
+    "spark.sql(query).limit(20).toPandas()"
+   ]
+  },
  {
   "attachments": {},
   "cell_type": "markdown",
@ -610,128 +719,6 @@
    "spark.sql(query).limit(20).toPandas()"
   ]
  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-info\">\n",
-    "Show the journals with the highest number of results published in\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "autoscroll": "auto",
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "query = \"\"\"\n",
-    "WITH journals AS (\n",
-    "    SELECT container.*\n",
-    "    FROM publications\n",
-    "    WHERE container IS NOT NULL\n",
-    ")\n",
-    "SELECT name, count(*) AS count \n",
-    "FROM journals \n",
-    "GROUP BY name \n",
-    "ORDER BY count DESC\n",
-    "\"\"\"\n",
-    "\n",
-    "spark.sql(query).limit(20).toPandas()"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "<div class=\"alert alert-info\">\n",
-    "Show the number of projects per organization; sort results in descending order; limit to the first 20.\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "autoscroll": "auto"
-   },
-   "outputs": [],
-   "source": [
-    "query = \"\"\"\n",
-    "SELECT COALESCE(legalshortname, legalname) AS organization, \n",
-    "        COUNT(*) AS count \n",
-    "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
-    "GROUP BY organization \n",
-    "ORDER BY count DESC\n",
-    "\"\"\"\n",
-    "\n",
-    "spark.sql(query).limit(20).toPandas()"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "<div class=\"alert alert-info\">\n",
-    "Show projects with the highest number of associated results. \n",
-    "\n",
-    "Note: An \"unidentified\" project is a placeholder for all the association to a funder without knowing the specific project. It should be removed from the count.\n",
-    "</div>"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "query = \"\"\"\n",
-    "SELECT funding.shortName, code, title, COUNT(*) AS count \n",
-    "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
-    "GROUP BY funding.shortName, code, title\n",
-    "ORDER BY count DESC\n",
-    "\"\"\"\n",
-    "\n",
-    "spark.sql(query).limit(20).toPandas()"
-   ]
-  },
-  {
-   "attachments": {},
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Strings can be manipulated as well on the fly"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {
-    "autoscroll": "auto"
-   },
-   "outputs": [],
-   "source": [
-    "query = \"\"\"\n",
-    "SELECT CONCAT_WS(' / ',  \n",
-    "                IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n",
-    "                COALESCE(code, '?'), \n",
-    "                SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
-    "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n",
-    "GROUP BY project \n",
-    "ORDER BY count DESC\n",
-    "\"\"\"\n",
-    "\n",
-    "spark.sql(query).limit(20).toPandas()"
-   ]
-  },
  {
   "attachments": {},
   "cell_type": "markdown",