improved MD format

This commit is contained in:
Andrea Mannocci 2023-05-09 16:12:08 +02:00
parent e3043ca57a
commit c560b9a02a
2 changed files with 204 additions and 151 deletions

BIN
data/.DS_Store vendored Normal file

Binary file not shown.

View File

@ -18,7 +18,7 @@
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Download data" "## Download data"
] ]
}, },
{ {
@ -45,10 +45,7 @@
" print(f\"Downloading {item}\")\n", " print(f\"Downloading {item}\")\n",
" os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n", " os.system(f'wget {base_url}{item}?download=1 -O data/{item}')\n",
" print(f\"Extracting {item}\")\n", " print(f\"Extracting {item}\")\n",
" os.system(f'tar -xf data/{item} -C data/; rm data/{item}')\n", " os.system(f'tar -xf data/{item} -C data/; rm data/{item}')"
" \n",
" \n",
" \n"
] ]
}, },
{ {
@ -60,7 +57,7 @@
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": 8, "execution_count": null,
"metadata": { "metadata": {
"autoscroll": "auto" "autoscroll": "auto"
}, },
@ -89,6 +86,13 @@
"relationSchema = '{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'" "relationSchema = 
'{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"provenance\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"trust\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"reltype\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"name\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"source\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"target\",\"nullable\":true,\"type\":{\"fields\":[{\"metadata\":{},\"name\":\"id\",\"nullable\":true,\"type\":\"string\"},{\"metadata\":{},\"name\":\"type\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}},{\"metadata\":{},\"name\":\"validated\",\"nullable\":true,\"type\":\"boolean\"},{\"metadata\":{},\"name\":\"validationDate\",\"nullable\":true,\"type\":\"string\"}],\"type\":\"struct\"}'"
] ]
}, },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/'); untar each folder in the dataset and move it to the chosen path"
]
},
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
@ -97,12 +101,6 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"\n",
"\n",
"\n",
"\n",
"#set the input path: the path on the cluster where the dataset will be stored (e.g. '/data/openaire_dump_subset/'); untar each folder in the dataset and move it to the chosen path\n",
"\n",
"inputPath = 'data/'\n", "inputPath = 'data/'\n",
" \n", " \n",
"# load entities and relationships\n", "# load entities and relationships\n",
@ -142,16 +140,11 @@
] ]
}, },
{ {
"cell_type": "code", "cell_type": "markdown",
"execution_count": null, "metadata": {},
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [ "source": [
"# the generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)\n", "Let's show some data. \n",
"pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)\n", "For example, a generic result (link to documentation: https://graph.openaire.eu/docs/data-model/entities/result)"
"\n"
] ]
}, },
{ {
@ -162,11 +155,14 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"\n", "pretty_print(json.loads(publication.where(\"id='50|78975075580c::2ff84f3173897001283274434e8f3eaa'\").toJSON().first()), expanded=True)"
"\n", ]
"# the data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)\n", },
"pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)\n", {
"\n" "cell_type": "markdown",
"metadata": {},
"source": [
"Or a data source (link to documentation: https://graph.openaire.eu/docs/data-model/entities/data-source)"
] ]
}, },
{ {
@ -177,9 +173,14 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# the organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)\n", "pretty_print(json.loads(datasource.where(\"id='10|fairsharing_::c3a690be93aa602ee2dc0ccab5b7b67e'\").toJSON().first()), expanded=True)"
"pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)\n", ]
"\n" },
{
"cell_type": "markdown",
"metadata": {},
"source": [
"An organization (link to documentation: https://graph.openaire.eu/docs/data-model/entities/organization)"
] ]
}, },
{ {
@ -190,8 +191,14 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# the project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)\n", "pretty_print(json.loads(organization.where(\"id='20|openorgs____::5836463160e0e5d1cd12997f7d2f0257'\").toJSON().first()), expanded=True)"
"pretty_print(json.loads(project.toJSON().first()), expanded=True)\n" ]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"A project (link to documentation: https://graph.openaire.eu/docs/data-model/entities/project)"
] ]
}, },
{ {
@ -202,8 +209,36 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# the community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)\n", "pretty_print(json.loads(project.toJSON().first()), expanded=True)"
"pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)\n" ]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"A community (link to documentation: https://graph.openaire.eu/docs/data-model/entities/community)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"pretty_print(json.loads(community.where(\"acronym='mes'\").toJSON().first()), expanded=True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"And finally, a relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)"
] ]
}, },
{ {
@ -214,71 +249,44 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"# the relation (link to documentation: https://graph.openaire.eu/docs/data-model/relationships)\n",
"pretty_print(json.loads(relation.toJSON().first()), expanded=True)" "pretty_print(json.loads(relation.toJSON().first()), expanded=True)"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"tags": []
},
"source": [ "source": [
"## Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. " "## Exercises \n",
"### **Task**: Split relations based on their semantics and compute their numbers; sort results in descending order, limit to the first 20. "
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"autoscroll": "auto" "autoscroll": "auto",
"tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query =\"\"\"SELECT reltype.name, \n", "query = \"\"\"\n",
"SELECT reltype.name, \n",
" COUNT(*) AS count \n", " COUNT(*) AS count \n",
"FROM relations \n", "FROM relations \n",
"GROUP BY reltype.name \n", "GROUP BY reltype.name \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n" "spark.sql(query).limit(20).toPandas()"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the most occurring publication subject term; sort results in descending order; limit to the first 20" "### **Task:** Show the most occurring publication subject term; sort results in descending order; limit to the first 20"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto"
},
"outputs": [],
"source": [
"query=\"\"\"WITH terms AS (\n",
" SELECT explode(subjects.subject.value) AS `term` FROM publications\n",
")\n",
"SELECT term AS `subject term`, \n",
" COUNT(*) AS count \n",
"FROM terms \n",
"GROUP BY term \n",
"ORDER BY count DESC\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()\n",
"\n",
" "
]
},
{
"attachments": {},
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoiding repetition; limit to the first 20"
] ]
}, },
{ {
@ -288,82 +296,123 @@
"autoscroll": "auto" "autoscroll": "auto"
}, },
"outputs": [], "outputs": [],
"source": [
"query = \"\"\"\n",
"WITH terms AS (\n",
" SELECT explode(subjects.subject.value) AS `term`\n",
" FROM publications\n",
")\n",
"SELECT term AS `subject term`,\n",
" COUNT(*) AS count \n",
"FROM terms \n",
"GROUP BY term \n",
"ORDER BY count DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "markdown",
"metadata": {
"tags": []
},
"source": [
"### **Task:** Show the most co-occurring publication subjects from controlled vocabularies (i.e. scheme != 'keyword') avoiding repetition; limit to the first 20"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"autoscroll": "auto",
"tags": []
},
"outputs": [],
"source": [ "source": [
"query = \"\"\"\n", "query = \"\"\"\n",
"WITH subjects AS (\n", "WITH subjects AS (\n",
" WITH exploded_subjects (SELECT id, EXPLODE(subjects.subject) AS subject FROM publications) \n", " WITH exploded_subjects (\n",
" SELECT id, subject.value AS `subject` FROM exploded_subjects WHERE subject.scheme != 'keyword'\n", " SELECT id, EXPLODE(subjects.subject) AS subject \n",
" FROM publications) \n",
" SELECT id, subject.value AS `subject` \n",
" FROM exploded_subjects \n",
" WHERE subject.scheme != 'keyword'\n",
")\n", ")\n",
"SELECT l.subject AS left, \n", "SELECT l.subject AS left, \n",
" r.subject AS right, \n", " r.subject AS right, \n",
" COUNT(*) AS count\n", " COUNT(*) AS count\n",
"FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n", "FROM subjects AS l JOIN subjects AS r ON l.id = r.id AND l.subject < r.subject\n",
"GROUP BY left, right\n", "GROUP BY left, right\n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n",
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show journal information" "### **Task:** Show journal information"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"autoscroll": "auto" "autoscroll": "auto",
"tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n", "query = \"\"\"\n",
"SELECT container.issnLinking, container.issnOnline, container.issnPrinted, container.name \n",
"FROM publications \n", "FROM publications \n",
"WHERE container IS NOT NULL\"\"\"\n", "WHERE container IS NOT NULL\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n" "spark.sql(query).limit(20).toPandas()"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the journals with the highest number of results published in" "### **Task:** Show the journals with the highest number of results published in"
] ]
}, },
{ {
"cell_type": "code", "cell_type": "code",
"execution_count": null, "execution_count": null,
"metadata": { "metadata": {
"autoscroll": "auto" "autoscroll": "auto",
"tags": []
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"WITH journals AS (\n", "query = \"\"\"\n",
" SELECT container.* FROM publications WHERE container IS NOT NULL\n", "WITH journals AS (\n",
" SELECT container.*\n",
" FROM publications\n",
" WHERE container IS NOT NULL\n",
")\n", ")\n",
"SELECT name, \n", "SELECT name, count(*) AS count \n",
" count(*) AS count \n",
"FROM journals \n", "FROM journals \n",
"GROUP BY name \n", "GROUP BY name \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {
"tags": []
},
"source": [ "source": [
"## Show projects with the highest number of associated results. The unidentified project is a placeholder for all associations to a funder where the specific project is unknown. It will be removed from the count" "### **Task:** Show projects with the highest number of associated results. The unidentified project is a placeholder for all associations to a funder where the specific project is unknown. It will be removed from the count"
] ]
}, },
{ {
@ -374,22 +423,21 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project,\n", "query = \"\"\"\n",
" COUNT(*) AS count \n", "SELECT CONCAT_WS(' - ', IF(SIZE(funding.shortName) > 0, ARRAY_JOIN(funding.shortName, ',', '-'), '-'), COALESCE(code, '-'), SUBSTRING(title, 0, 50)) AS project, COUNT(*) AS count \n",
"FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n", "FROM projects JOIN relations ON projects.id = relations.source.id AND reltype.name = 'produces' AND not projects.title ilike '%unidentified%' \n",
"GROUP BY project \n", "GROUP BY project \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the number of projects per organization; sort results in descending order; limit to the first 20" "### **Task:** Show the number of projects per organization; sort results in descending order; limit to the first 20"
] ]
}, },
{ {
@ -400,22 +448,22 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", "query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS count \n", " COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n", "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isParticipant'\n",
"GROUP BY organization \n", "GROUP BY organization \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
" \n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products and organizations and not authors and organizations" "### **Task:** Show the number of research products per organization; sort results in descending order; limit to the first 20. The relation used is the affiliation, since in our data this relation links products and organizations and not authors and organizations"
] ]
}, },
{ {
@ -426,21 +474,22 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", "query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization,\n",
" COUNT(*) AS count \n", " COUNT(*) AS count \n",
"FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n", "FROM organizations JOIN relations ON organizations.id = relations.source.id AND reltype.name = 'isAuthorInstitutionOf' \n",
"GROUP BY organization\n", "GROUP BY organization\n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n" "spark.sql(query).limit(20).toPandas()"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the number of research products (per type) per organization" "### **Task:** Show the number of research products (per type) per organization"
] ]
}, },
{ {
@ -451,24 +500,25 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", "query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n", " COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n",
" COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n", " COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n",
" COUNT(IF(type = 'software', 1, NULL)) AS software,\n", " COUNT(IF(type = 'software', 1, NULL)) AS software,\n",
" COUNT(IF(type = 'other', 1, NULL)) AS other\n", " COUNT(IF(type = 'other', 1, NULL)) AS other\n",
"FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n", "FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n",
"GROUP BY organization \n", "GROUP BY organization \n",
"ORDER BY publication DESC\"\"\"\n", "ORDER BY publication DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n" "spark.sql(query).limit(20).toPandas()"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show result access types per organization" "### **Task:** Show result access types per organization"
] ]
}, },
{ {
@ -479,24 +529,24 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT COALESCE(legalshortname, legalname) AS organization, \n", "query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n", " COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n", " COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n", " COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n", "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
"GROUP BY organization\n", "GROUP BY organization\n",
"ORDER BY open DESC\"\"\"\n", "ORDER BY open DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the result access types per country of the organizations" "### **Task:** Show the result access types per country of the organizations"
] ]
}, },
{ {
@ -507,25 +557,25 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT organizations.country.code AS country, \n", "query = \"\"\"\n",
"SELECT organizations.country.code AS country, \n",
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n", " COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n", " COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n", " COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n", "FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
"WHERE organizations.country IS NOT NULL\n", "WHERE organizations.country IS NOT NULL\n",
"GROUP BY organizations.country.code\n", "GROUP BY organizations.country.code\n",
"ORDER BY open DESC\"\"\"\n", "ORDER BY open DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Show the country collaboration network in projects with respect to the participating organizations" "### **Task:** Show the country collaboration network in projects with respect to the participating organizations"
] ]
}, },
{ {
@ -536,7 +586,8 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"WITH countryProject AS (\n", "query = \"\"\"\n",
"WITH countryProject AS (\n",
" SELECT country.code AS country, \n", " SELECT country.code AS country, \n",
" target.id AS id \n", " target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -547,18 +598,17 @@
" COUNT(*) AS count \n", " COUNT(*) AS count \n",
"FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n", "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country <= r.country\n",
"GROUP BY left, right \n", "GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
" \n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"# Show the international collaboration in projects with respect to the participating organizations" "### **Task:** Show the international collaboration in projects with respect to the participating organizations"
] ]
}, },
{ {
@ -569,7 +619,8 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"WITH countryProject AS (\n", "query = \"\"\"\n",
"WITH countryProject AS (\n",
" SELECT country.code AS country, \n", " SELECT country.code AS country, \n",
" target.id AS id \n", " target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -580,18 +631,17 @@
" COUNT(*) AS count \n", " COUNT(*) AS count \n",
"FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n", "FROM countryProject AS l JOIN countryProject AS r ON l.id = r.id AND l.country < r.country\n",
"GROUP BY left, right \n", "GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas() "
" "
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the kernel organisations often collaborating in projects" "### **Task:** Show the kernel organisations often collaborating in projects"
] ]
}, },
{ {
@ -602,7 +652,8 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"WITH orgProject AS (\n", "query = \"\"\"\n",
"WITH orgProject AS (\n",
" SELECT COALESCE(legalshortname, legalname) AS organization, \n", " SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" target.id AS id \n", " target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n", " FROM organizations JOIN relations ON reltype.name = 'isParticipant' AND source.id = organizations.id\n",
@ -612,18 +663,17 @@
" COUNT(*) AS count\n", " COUNT(*) AS count\n",
"FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n", "FROM orgProject AS l JOIN orgProject AS r ON l.id = r.id AND l.organization < r.organization\n",
"GROUP BY left, right \n", "GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the kernel organizations often co-authoring papers" "### **Task:** Show the kernel organizations often co-authoring papers"
] ]
}, },
{ {
@ -634,7 +684,8 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"WITH orgProduct AS (\n", "query = \"\"\"\n",
"WITH orgProduct AS (\n",
" SELECT COALESCE(legalshortname, legalname) AS organization, \n", " SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" target.id AS id \n", " target.id AS id \n",
" FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n", " FROM organizations JOIN relations ON reltype.name = 'isAuthorInstitutionOf' AND source.id = organizations.id\n",
@ -644,19 +695,17 @@
" COUNT(*) AS count \n", " COUNT(*) AS count \n",
"FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n", "FROM orgProduct AS l JOIN orgProduct AS r ON l.id = r.id AND l.organization < r.organization\n",
"GROUP BY left, right \n", "GROUP BY left, right \n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n", "spark.sql(query).limit(20).toPandas()"
"\n",
"\n"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the access rights over the years" "### **Task:** Show the access rights over the years"
] ]
}, },
{ {
@ -667,23 +716,24 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT bestaccessright.label AS accessright,\n", "query = \"\"\"\n",
"SELECT bestaccessright.label AS accessright,\n",
" SUBSTRING(publicationdate, 0,4) AS year,\n", " SUBSTRING(publicationdate, 0,4) AS year,\n",
" COUNT(*) AS count\n", " COUNT(*) AS count\n",
"FROM results\n", "FROM results\n",
"WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n", "WHERE bestaccessright IS NOT NULL AND publicationdate IS NOT NULL\n",
"GROUP BY accessright, year\n", "GROUP BY accessright, year\n",
"ORDER BY count DESC\"\"\"\n", "ORDER BY count DESC\n",
"\"\"\"\n",
"\n", "\n",
"spark.sql(query).limit(20).toPandas()\n" "spark.sql(query).limit(20).toPandas()"
] ]
}, },
{ {
"attachments": {},
"cell_type": "markdown", "cell_type": "markdown",
"metadata": {}, "metadata": {},
"source": [ "source": [
"## Show the number of publications supplemented by datasets" "### **Task:** Show the number of publications supplemented by datasets"
] ]
}, },
{ {
@ -694,8 +744,11 @@
}, },
"outputs": [], "outputs": [],
"source": [ "source": [
"query=\"\"\"SELECT COUNT(*) AS count\n", "query = \"\"\"\n",
"FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\"\"\"\n", "SELECT COUNT(*) AS count\n",
"FROM relations JOIN publications JOIN datasets ON reltype.name = 'IsSupplementedBy' AND publications.id = relations.source.id AND datasets.id = relations.target.id\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()" "spark.sql(query).limit(20).toPandas()"
] ]
}, },