From 95e2f4a8ba1037ce1655248257457eb94720dfa6 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Thu, 22 Jun 2023 17:15:54 +0200 Subject: [PATCH] moved cells --- data/beginners_kit.ipynb | 231 ++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 122 deletions(-) diff --git a/data/beginners_kit.ipynb b/data/beginners_kit.ipynb index 748991b..617efe8 100644 --- a/data/beginners_kit.ipynb +++ b/data/beginners_kit.ipynb @@ -569,6 +569,115 @@ "spark.sql(query).limit(20).toPandas()" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Show the journals with the highest number of results published in\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "WITH journals AS (\n", + " SELECT container.*\n", + " FROM publications\n", + " WHERE container IS NOT NULL\n", + ")\n", + "SELECT name, count(*) AS count \n", + "FROM journals \n", + "GROUP BY name \n", + "ORDER BY count DESC\n", + "\"\"\"\n", + "\n", + "spark.sql(query).limit(20).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Show the number of projects per organization; sort results in descending order; limit to the first 20.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "SELECT COALESCE(legalshortname, legalname) AS organization, \n", + " COUNT(*) AS count \n", + "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n", + "GROUP BY organization \n", + "ORDER BY count DESC\n", + "\"\"\"\n", + "\n", + "spark.sql(query).limit(20).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "
\n", + "Show projects with the highest number of associated results. \n", + "\n", + "Note: An \"unidentified\" project is a placeholder for all the associations to a funder where the specific project is unknown. It should be removed from the count.\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "SELECT funding.shortName, code, title, COUNT(*) AS count \n", + "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n", + "GROUP BY funding.shortName, code, title\n", + "ORDER BY count DESC\n", + "\"\"\"\n", + "\n", + "spark.sql(query).limit(20).toPandas()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Strings can be manipulated as well on the fly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"\"\"\n", + "SELECT CONCAT_WS(' / ', \n", + " IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n", + " COALESCE(code, '?'), \n", + " SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n", + "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n", + "GROUP BY project \n", + "ORDER BY count DESC\n", + "\"\"\"\n", + "\n", + "spark.sql(query).limit(20).toPandas()" + ] + }, { "attachments": {}, "cell_type": "markdown", @@ -610,128 +719,6 @@ "spark.sql(query).limit(20).toPandas()" ] }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "Show the journals with the highest number of results published in\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "autoscroll": "auto", - "tags": [] - }, - "outputs": [], - "source": [ - "query = \"\"\"\n", - "WITH journals AS (\n", - " SELECT container.*\n", - " FROM publications\n", - " WHERE container IS NOT NULL\n", - ")\n", - "SELECT name, count(*) AS count \n", - "FROM journals \n", - "GROUP BY name \n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "spark.sql(query).limit(20).toPandas()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "
\n", - "Show the number of projects per organization; sort results in descending order; limit to the first 20.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "autoscroll": "auto" - }, - "outputs": [], - "source": [ - "query = \"\"\"\n", - "SELECT COALESCE(legalshortname, legalname) AS organization, \n", - " COUNT(*) AS count \n", - "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n", - "GROUP BY organization \n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "spark.sql(query).limit(20).toPandas()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "tags": [] - }, - "source": [ - "
\n", - "Show projects with the highest number of associated results. \n", - "\n", - "Note: An \"unidentified\" project is a placeholder for all the association to a funder without knowing the specific project. It should be removed from the count.\n", - "
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query = \"\"\"\n", - "SELECT funding.shortName, code, title, COUNT(*) AS count \n", - "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n", - "GROUP BY funding.shortName, code, title\n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "spark.sql(query).limit(20).toPandas()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Strings can be manipulated as well on the fly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "autoscroll": "auto" - }, - "outputs": [], - "source": [ - "query = \"\"\"\n", - "SELECT CONCAT_WS(' / ', \n", - " IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n", - " COALESCE(code, '?'), \n", - " SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n", - "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n", - "GROUP BY project \n", - "ORDER BY count DESC\n", - "\"\"\"\n", - "\n", - "spark.sql(query).limit(20).toPandas()" - ] - }, { "attachments": {}, "cell_type": "markdown",