diff --git a/data/.DS_Store b/data/.DS_Store new file mode 100644 index 0000000..cfb6f6f Binary files /dev/null and b/data/.DS_Store differ diff --git a/data/beginners_kit.ipynb b/data/beginners_kit.ipynb index 63dbcad..658bfe2 100644 --- a/data/beginners_kit.ipynb +++ b/data/beginners_kit.ipynb @@ -18,7 +18,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "# Download data" + "## Download data" ] }, { @@ -45,10 +45,7 @@ " print(f\"Downloading {item}\")\n", " os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n", " print(f\"Extracting {item}\")\n", - " os.system(f'tar -xf data/{item} -C data/; rm data/{item}')\n", - " \n", - " \n", - " \n" + " os.system(f'tar -xf data/{item} -C data/; rm data/{item}')" ] }, { @@ -60,7 +57,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": { "autoscroll": "auto" }, @@ -89,6 +86,13 @@ "relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean
\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path" + ] + }, { "cell_type": "code", "execution_count": null, @@ -97,12 +101,6 @@ }, "outputs": [], "source": [ - "\n", - "\n", - "\n", - "\n", - "#set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/openaire_dump_subset/'); untar each folder in the dataset and move it to the chosen path\n", - "\n", "inputPath = 'data/'\n", " \n", "# load entities and relationships\n", @@ -142,16 +140,11 @@ ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "autoscroll": "auto" - }, - "outputs": [], + "cell_type": "markdown", + "metadata": {}, "source": [ - "# the generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)\n", - "pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)\n", - "\n" + "Let's show some data. 
\n", + "For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)" ] }, { @@ -162,11 +155,14 @@ }, "outputs": [], "source": [ - "\n", - "\n", - "# the data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)\n", - "pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)\n", - "\n" + "pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)" ] }, { @@ -177,9 +173,14 @@ }, "outputs": [], "source": [ - "# the organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)\n", - "pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)\n", - "\n" + "pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)" ] }, { @@ -190,8 +191,14 @@ }, "outputs": [], "source": [ - "# the project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)\n", - "pretty_print(json.loads(project.toJSON().first()), expanded=True)\n" + "pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)" ] }, { @@ -202,8 +209,36 @@ 
}, "outputs": [], "source": [ - "# the community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)\n", - "pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)\n" + "pretty_print(json.loads(project.toJSON().first()), expanded=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "autoscroll": "auto" + }, + "outputs": [], + "source": [ + "pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "tags": [] + }, + "source": [ + "And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)" ] }, { @@ -214,41 +249,44 @@ }, "outputs": [], "source": [ - "# the relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)\n", "pretty_print(json.loads(relation.toJSON().first()), expanded=True)" ] }, { - "attachments": {}, "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "## Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. " + "## Exercises \n", + "### **Task**: Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. 
" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "autoscroll": "auto" + "autoscroll": "auto", + "tags": [] }, "outputs": [], "source": [ - "query =\"\"\"SELECT reltype.name, \n", - " COUNT(*) AS count \n", + "query = \"\"\"\n", + "SELECT reltype.name, \n", + " COUNT(*) AS count \n", "FROM relations \n", "GROUP BY reltype.name \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the most occurring publication subject term; sort results in descending order; limit to the first 20" + "### **Task:** Show the most occurring publication subject term; sort results in descending order; limit to the first 20" ] }, { @@ -259,111 +297,122 @@ }, "outputs": [], "source": [ - "query=\"\"\"WITH terms AS (\n", - " SELECT explode(subjects.subject.value) AS `term` FROM publications\n", + "query = \"\"\"\n", + "WITH terms AS (\n", + " SELECT explode(subjects.subject.value) AS `term`\n", + " FROM publications\n", ")\n", - "SELECT term AS `subject term`, \n", - " COUNT(*) AS count \n", + "SELECT term AS `subject term`,\n", + " COUNT(*) AS count \n", "FROM terms \n", "GROUP BY term \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n", - " " + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "## Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoinding repetition; limit to the first 20" + "### **Task:** Show the most co-occurring publication subjects from controlled vocabularies (i.e. 
scheme != 'keyword') avoiding repetition; limit to the first 20" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "autoscroll": "auto" + "autoscroll": "auto", + "tags": [] }, "outputs": [], "source": [ - "query=\"\"\"\n", + "query = \"\"\"\n", "WITH subjects AS (\n", - " WITH exploded_subjects (SELECT id, EXPLODE(subjects.subject) AS subject FROM publications) \n", - " SELECT id, subject.value AS `subject` FROM exploded_subjects WHERE subject.scheme != 'keyword'\n", + " WITH exploded_subjects (\n", + " SELECT id, EXPLODE(subjects.subject) AS subject \n", + " FROM publications) \n", + " SELECT id, subject.value AS `subject` \n", + " FROM exploded_subjects \n", + " WHERE subject.scheme != 'keyword'\n", ")\n", "SELECT l.subject AS left, \n", " r.subject AS right, \n", " COUNT(*) AS count\n", "FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n", "GROUP BY left, right\n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show journal information" + "### **Task:** Show journal information" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "autoscroll": "auto" + "autoscroll": "auto", + "tags": [] }, "outputs": [], "source": [ - "query=\"\"\"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n", + "query = \"\"\"\n", + "SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n", "FROM publications \n", - "WHERE container IS NOT NULL\"\"\"\n", + "WHERE container IS NOT NULL\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the journals with the highest number of 
results published in" + "### **Task:** Show the journals with the highest number of results published in" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "autoscroll": "auto" + "autoscroll": "auto", + "tags": [] }, "outputs": [], "source": [ - "query=\"\"\"WITH journals AS (\n", - " SELECT container.* FROM publications WHERE container IS NOT NULL\n", + "query = \"\"\"\n", + "WITH journals AS (\n", + " SELECT container.*\n", + " FROM publications\n", + " WHERE container IS NOT NULL\n", ")\n", - "SELECT name, \n", - " count(*) AS count \n", + "SELECT name, count(*) AS count \n", "FROM journals \n", "GROUP BY name \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", - "metadata": {}, + "metadata": { + "tags": [] + }, "source": [ - "## Show projects with the highest number of associated results. The unidentified project is a placeholder for all the association to a funder without knowing the specific project. It will be removed from the count" + "### **Task:** Show projects with the highest number of associated results. The unidentified project is a placeholder for all the association to a funder without knowing the specific project. 
It will be removed from the count" ] }, { @@ -374,22 +423,21 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project,\n", - " COUNT(*) AS count \n", + "query = \"\"\"\n", + "SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n", "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n", "GROUP BY project \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the number of projects per organization; sort results in descending order; limit to the first 20" + "### **Task:** Show the number of projects per organization; sort results in descending order; limit to the first 20" ] }, { @@ -400,22 +448,22 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", - " COUNT(*) AS count \n", + "query = \"\"\"\n", + "SELECT COALESCE(legalshortname, legalname) AS organization, \n", + " COUNT(*) AS count \n", "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n", "GROUP BY organization \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - " \n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the number of research products per organization; sort results in descending order; limit to the first 20. 
The relation used is the affiliation, since in our data this relation linkes products and organization and not authors and organizations" + "### **Task:** Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products and organizations and not authors and organizations" ] }, { @@ -426,21 +474,22 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", - " COUNT(*) AS count \n", + "query = \"\"\"\n", + "SELECT COALESCE(legalshortname, legalname) AS organization,\n", + " COUNT(*) AS count \n", "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n", "GROUP BY organization\n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the number of research products (per type) per organization" + "### **Task:** Show the number of research products (per type) per organization" ] }, { @@ -451,24 +500,25 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", + "query = \"\"\"\n", + "SELECT COALESCE(legalshortname, legalname) AS organization, \n", " COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n", " COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n", " COUNT(IF(type = 'software', 1, NULL)) AS software,\n", " COUNT(IF(type = 'other', 1, NULL)) AS other\n", "FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n", "GROUP BY organization \n", - "ORDER BY publication DESC\"\"\"\n", + "ORDER BY publication DESC\n", + "\"\"\"\n", "\n", - 
"spark.sql(query).limit(20).toPandas()\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show result access types per organization" + "### **Task:** Show result access types per organization" ] }, { @@ -479,24 +529,24 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", + "query = \"\"\"\n", + "SELECT COALESCE(legalshortname, legalname) AS organization, \n", " COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n", " COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n", " COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n", "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n", "GROUP BY organization\n", - "ORDER BY open DESC\"\"\"\n", + "ORDER BY open DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the result access types per country of the organizations" + "### **Task:** Show the result access types per country of the organizations" ] }, { @@ -507,25 +557,25 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT organizations.country.code AS country, \n", + "query = \"\"\"\n", + "SELECT organizations.country.code AS country, \n", " COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n", " COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n", " COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n", "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n", "WHERE organizations.country IS NOT NULL\n", "GROUP BY organizations.country.code\n", - "ORDER BY open 
DESC\"\"\"\n", + "ORDER BY open DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Show the countries collaboration network in projects with respect to the partecipating organizations" + "### **Task:** Show the countries collaboration network in projects with respect to the participating organizations" ] }, { @@ -536,7 +586,8 @@ }, "outputs": [], "source": [ - "query=\"\"\"WITH countryProject AS (\n", + "query = \"\"\"\n", + "WITH countryProject AS (\n", " SELECT country.code AS country, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", @@ -547,18 +598,17 @@ " COUNT(*) AS count \n", "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n", "GROUP BY left, right \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - " \n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "# Show the international collaboration in projects with respect to the participating organizations" + "### **Task:** Show the international collaboration in projects with respect to the participating organizations" ] }, { @@ -569,7 +619,8 @@ }, "outputs": [], "source": [ - "query=\"\"\"WITH countryProject AS (\n", + "query = \"\"\"\n", + "WITH countryProject AS (\n", " SELECT country.code AS country, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", @@ -580,18 +631,17 @@ " COUNT(*) AS count \n", "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n", "GROUP BY left, right \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", 
"\n", - "spark.sql(query).limit(20).toPandas()\n", - " " + "spark.sql(query).limit(20).toPandas() " ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the kernel organisations often collaborating in projects" + "### **Task:** Show the kernel organisations often collaborating in projects" ] }, { @@ -602,7 +652,8 @@ }, "outputs": [], "source": [ - "query=\"\"\"WITH orgProject AS (\n", + "query = \"\"\"\n", + "WITH orgProject AS (\n", " SELECT COALESCE(legalshortname, legalname) AS organization, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", @@ -612,18 +663,17 @@ " COUNT(*) AS count\n", "FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n", "GROUP BY left, right \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the kernel organizations often co-authoring papers" + "### **Task:** Show the kernel organizations often co-authoring papers" ] }, { @@ -634,7 +684,8 @@ }, "outputs": [], "source": [ - "query=\"\"\"WITH orgProduct AS (\n", + "query = \"\"\"\n", + "WITH orgProduct AS (\n", " SELECT COALESCE(legalshortname, legalname) AS organization, \n", " target.id AS id \n", " FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n", @@ -644,19 +695,17 @@ " COUNT(*) AS count \n", "FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n", "GROUP BY left, right \n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n", - "\n", - "\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", 
"metadata": {}, "source": [ - "## Show the access right s over the years" + "### **Task:** Show the access right s over the years" ] }, { @@ -667,23 +716,24 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT bestaccessright.label AS accessright,\n", + "query = \"\"\"\n", + "SELECT bestaccessright.label AS accessright,\n", " SUBSTRING(publicationdate, 0,4) AS year,\n", " COUNT(*) AS count\n", "FROM results\n", "WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n", "GROUP BY accessright, year\n", - "ORDER BY count DESC\"\"\"\n", + "ORDER BY count DESC\n", + "\"\"\"\n", "\n", - "spark.sql(query).limit(20).toPandas()\n" + "spark.sql(query).limit(20).toPandas()" ] }, { - "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ - "## Show the number of publications supplemented by datasets" + "### **Task:** Show the number of publications supplemented by datasets" ] }, { @@ -694,8 +744,11 @@ }, "outputs": [], "source": [ - "query=\"\"\"SELECT COUNT(*) AS count\n", - "FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\"\"\"\n", + "query = \"\"\"\n", + "SELECT COUNT(*) AS count\n", + "FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\n", + "\"\"\"\n", + "\n", "spark.sql(query).limit(20).toPandas()" ] },