moved cells

This commit is contained in:
Andrea Mannocci 2023-06-22 17:15:54 +02:00
parent a4c6e5321b
commit 95e2f4a8ba
1 changed files with 109 additions and 122 deletions

View File

@ -569,6 +569,115 @@
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Show the journals in which the highest number of results have been published.\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
"WITH journals AS (\n",
" SELECT container.*\n",
" FROM publications\n",
" WHERE container IS NOT NULL\n",
")\n",
"SELECT name, count(*) AS count \n",
"FROM journals \n",
"GROUP BY name \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Show the number of projects per organization; sort results in descending order; limit to the first 20.\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
"GROUP BY organization \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Show projects with the highest number of associated results. \n",
"\n",
"Note: An \"unidentified\" project is a placeholder for all associations to a funder for which the specific project is not known. Such projects should be excluded from the count.\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT funding.shortName, code, title, COUNT(*) AS count \n",
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
"GROUP BY funding.shortName, code, title\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Strings can also be manipulated on the fly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT CONCAT_WS(' / ', \n",
" IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n",
" COALESCE(code, '?'), \n",
" SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n",
"GROUP BY project \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
@ -610,128 +719,6 @@
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Show the journals in which the highest number of results have been published.\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto",
"tags": []
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"WITH journals AS (\n",
" SELECT container.*\n",
" FROM publications\n",
" WHERE container IS NOT NULL\n",
")\n",
"SELECT name, count(*) AS count \n",
"FROM journals \n",
"GROUP BY name \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"<div class=\"alert alert-info\">\n",
"Show the number of projects per organization; sort results in descending order; limit to the first 20.\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
"GROUP BY organization \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"<div class=\"alert alert-info\">\n",
"Show projects with the highest number of associated results. \n",
"\n",
"Note: An \"unidentified\" project is a placeholder for all associations to a funder for which the specific project is not known. Such projects should be excluded from the count.\n",
"</div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT funding.shortName, code, title, COUNT(*) AS count \n",
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
"GROUP BY funding.shortName, code, title\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"Strings can also be manipulated on the fly."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"query = \"\"\"\n",
"SELECT CONCAT_WS(' / ', \n",
" IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '?'), \n",
" COALESCE(code, '?'), \n",
" SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND NOT projects.title ilike '%unidentified%' \n",
"GROUP BY project \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",