improved MD format

This commit is contained in:
Andrea Mannocci 2023-05-09 16:12:08 +02:00
parent e3043ca57a
commit c560b9a02a
2 changed files with 204 additions and 151 deletions

BIN
data/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -18,7 +18,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"# Download data"
"## Download data"
]
},
{
@ -45,10 +45,7 @@
" print(f\"Downloading {item}\")\n",
" os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
" print(f\"Extracting {item}\")\n",
" os.system(f'tar -xf data/{item} -C data/; rm data/{item}')\n",
" \n",
" \n",
" \n"
" os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
]
},
{
@ -60,7 +57,7 @@
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
@ -89,6 +86,13 @@
"relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path"
]
},
{
"cell_type": "code",
"execution_count": null,
@ -97,12 +101,6 @@
},
"outputs": [],
"source": [
"\n",
"\n",
"\n",
"\n",
"#set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/openaire_dump_subset/'); untar each folder in the dataset and move it to the chosen path\n",
"\n",
"inputPath = 'data/'\n",
" \n",
"# load entities and relationships\n",
@ -142,16 +140,11 @@
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"cell_type": "markdown",
"metadata": {},
"source": [
"# the generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)\n",
"pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)\n",
"\n"
"Let's show some data. \n",
"For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)"
]
},
{
@ -162,11 +155,14 @@
},
"outputs": [],
"source": [
"\n",
"\n",
"# the data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)\n",
"pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)\n",
"\n"
"pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)"
]
},
{
@ -177,9 +173,14 @@
},
"outputs": [],
"source": [
"# the organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)\n",
"pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)\n",
"\n"
"pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)"
]
},
{
@ -190,8 +191,14 @@
},
"outputs": [],
"source": [
"# the project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)\n",
"pretty_print(json.loads(project.toJSON().first()), expanded=True)\n"
"pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)"
]
},
{
@ -202,8 +209,36 @@
},
"outputs": [],
"source": [
"# the community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)\n",
"pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)\n"
"pretty_print(json.loads(project.toJSON().first()), expanded=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)"
]
},
{
@ -214,41 +249,44 @@
},
"outputs": [],
"source": [
"# the relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)\n",
"pretty_print(json.loads(relation.toJSON().first()), expanded=True)"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"metadata": {
"tags": []
},
"source": [
"## Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. "
"## Exercises \n",
"### **Task**: Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
"autoscroll": "auto",
"tags": []
},
"outputs": [],
"source": [
"query =\"\"\"SELECT reltype.name, \n",
" COUNT(*) AS count \n",
"query = \"\"\"\n",
"SELECT reltype.name, \n",
" COUNT(*) AS count \n",
"FROM relations \n",
"GROUP BY reltype.name \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the most occurring publication subject term; sort results in descending order; limit to the first 20"
"### **Task:** Show the most occurring publication subject term; sort results in descending order; limit to the first 20"
]
},
{
@ -259,111 +297,122 @@
},
"outputs": [],
"source": [
"query=\"\"\"WITH terms AS (\n",
" SELECT explode(subjects.subject.value) AS `term` FROM publications\n",
"query = \"\"\"\n",
"WITH terms AS (\n",
" SELECT explode(subjects.subject.value) AS `term`\n",
" FROM publications\n",
")\n",
"SELECT term AS `subject term`, \n",
" COUNT(*) AS count \n",
"SELECT term AS `subject term`,\n",
" COUNT(*) AS count \n",
"FROM terms \n",
"GROUP BY term \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n",
" "
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"metadata": {
"tags": []
},
"source": [
"## Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoinding repetition; limit to the first 20"
"### **Task:** Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoiding repetition; limit to the first 20"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
"autoscroll": "auto",
"tags": []
},
"outputs": [],
"source": [
"query=\"\"\"\n",
"query = \"\"\"\n",
"WITH subjects AS (\n",
" WITH exploded_subjects (SELECT id, EXPLODE(subjects.subject) AS subject FROM publications) \n",
" SELECT id, subject.value AS `subject` FROM exploded_subjects WHERE subject.scheme != 'keyword'\n",
" WITH exploded_subjects (\n",
" SELECT id, EXPLODE(subjects.subject) AS subject \n",
" FROM publications) \n",
" SELECT id, subject.value AS `subject` \n",
" FROM exploded_subjects \n",
" WHERE subject.scheme != 'keyword'\n",
")\n",
"SELECT l.subject AS left, \n",
" r.subject AS right, \n",
" COUNT(*) AS count\n",
"FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n",
"GROUP BY left, right\n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show journal information"
"### **Task:** Show journal information"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
"autoscroll": "auto",
"tags": []
},
"outputs": [],
"source": [
"query=\"\"\"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n",
"query = \"\"\"\n",
"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n",
"FROM publications \n",
"WHERE container IS NOT NULL\"\"\"\n",
"WHERE container IS NOT NULL\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the journals with the highest number of results published in"
"### **Task:** Show the journals with the highest number of results published in"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
"autoscroll": "auto",
"tags": []
},
"outputs": [],
"source": [
"query=\"\"\"WITH journals AS (\n",
" SELECT container.* FROM publications WHERE container IS NOT NULL\n",
"query = \"\"\"\n",
"WITH journals AS (\n",
" SELECT container.*\n",
" FROM publications\n",
" WHERE container IS NOT NULL\n",
")\n",
"SELECT name, \n",
" count(*) AS count \n",
"SELECT name, count(*) AS count \n",
"FROM journals \n",
"GROUP BY name \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"metadata": {
"tags": []
},
"source": [
"## Show projects with the highest number of associated results. The unidentified project is a placeholder for all the association to a funder without knowing the specific project. It will be removed from the count"
"### **Task:** Show projects with the highest number of associated results. The unidentified project is a placeholder for all the associations to a funder for which the specific project is not known. It will be removed from the count"
]
},
{
@ -374,22 +423,21 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project,\n",
" COUNT(*) AS count \n",
"query = \"\"\"\n",
"SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
"GROUP BY project \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the number of projects per organization; sort results in descending order; limit to the first 20"
"### **Task:** Show the number of projects per organization; sort results in descending order; limit to the first 20"
]
},
{
@ -400,22 +448,22 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
"GROUP BY organization \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
" \n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation linkes products and organization and not authors and organizations"
"### **Task:** Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products and organizations and not authors and organizations"
]
},
{
@ -426,21 +474,22 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n",
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization,\n",
" COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n",
"GROUP BY organization\n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the number of research products (per type) per organization"
"### **Task:** Show the number of research products (per type) per organization"
]
},
{
@ -451,24 +500,25 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n",
" COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n",
" COUNT(IF(type = 'software', 1, NULL)) AS software,\n",
" COUNT(IF(type = 'other', 1, NULL)) AS other\n",
"FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n",
"GROUP BY organization \n",
"ORDER BY publication DESC\"\"\"\n",
"ORDER BY publication DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show result access types per organization"
"### **Task:** Show result access types per organization"
]
},
{
@ -479,24 +529,24 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
"GROUP BY organization\n",
"ORDER BY open DESC\"\"\"\n",
"ORDER BY open DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the result access types per country of the organizations"
"### **Task:** Show the result access types per country of the organizations"
]
},
{
@ -507,25 +557,25 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT organizations.country.code AS country, \n",
"query = \"\"\"\n",
"SELECT organizations.country.code AS country, \n",
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
"WHERE organizations.country IS NOT NULL\n",
"GROUP BY organizations.country.code\n",
"ORDER BY open DESC\"\"\"\n",
"ORDER BY open DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Show the countries collaboration network in projects with respect to the partecipating organizations"
"### **Task:** Show the countries collaboration network in projects with respect to the participating organizations"
]
},
{
@ -536,7 +586,8 @@
},
"outputs": [],
"source": [
"query=\"\"\"WITH countryProject AS (\n",
"query = \"\"\"\n",
"WITH countryProject AS (\n",
" SELECT country.code AS country, \n",
" target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -547,18 +598,17 @@
" COUNT(*) AS count \n",
"FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n",
"GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
" \n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"# Show the international collaboration in projects with respect to the participating organizations"
"### **Task:** Show the international collaboration in projects with respect to the participating organizations"
]
},
{
@ -569,7 +619,8 @@
},
"outputs": [],
"source": [
"query=\"\"\"WITH countryProject AS (\n",
"query = \"\"\"\n",
"WITH countryProject AS (\n",
" SELECT country.code AS country, \n",
" target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -580,18 +631,17 @@
" COUNT(*) AS count \n",
"FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n",
"GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
" "
"spark.sql(query).limit(20).toPandas() "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the kernel organisations often collaborating in projects"
"### **Task:** Show the kernel organisations often collaborating in projects"
]
},
{
@ -602,7 +652,8 @@
},
"outputs": [],
"source": [
"query=\"\"\"WITH orgProject AS (\n",
"query = \"\"\"\n",
"WITH orgProject AS (\n",
" SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -612,18 +663,17 @@
" COUNT(*) AS count\n",
"FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n",
"GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the kernel organizations often co-authoring papers"
"### **Task:** Show the kernel organizations often co-authoring papers"
]
},
{
@ -634,7 +684,8 @@
},
"outputs": [],
"source": [
"query=\"\"\"WITH orgProduct AS (\n",
"query = \"\"\"\n",
"WITH orgProduct AS (\n",
" SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n",
@ -644,19 +695,17 @@
" COUNT(*) AS count \n",
"FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n",
"GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n",
"\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the access right s over the years"
"### **Task:** Show the access rights over the years"
]
},
{
@ -667,23 +716,24 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT bestaccessright.label AS accessright,\n",
"query = \"\"\"\n",
"SELECT bestaccessright.label AS accessright,\n",
" SUBSTRING(publicationdate, 0,4) AS year,\n",
" COUNT(*) AS count\n",
"FROM results\n",
"WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n",
"GROUP BY accessright, year\n",
"ORDER BY count DESC\"\"\"\n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n"
"spark.sql(query).limit(20).toPandas()"
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the number of publications supplemented by datasets"
"### **Task:** Show the number of publications supplemented by datasets"
]
},
{
@ -694,8 +744,11 @@
},
"outputs": [],
"source": [
"query=\"\"\"SELECT COUNT(*) AS count\n",
"FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\"\"\"\n",
"query = \"\"\"\n",
"SELECT COUNT(*) AS count\n",
"FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},