added some network analysis

This commit is contained in:
Andrea Mannocci 2023-06-06 15:54:52 +02:00
parent 673fa5f9b5
commit 549b1477f2
1 changed file with 108 additions and 3 deletions

View File

@ -597,13 +597,14 @@
"source": [
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) AS total,\n",
" COUNT(IF(type = 'publication', 1, NULL)) AS publication,\n",
" COUNT(IF(type = 'dataset', 1, NULL)) AS dataset,\n",
" COUNT(IF(type = 'software', 1, NULL)) AS software,\n",
" COUNT(IF(type = 'other', 1, NULL)) AS other\n",
"FROM results JOIN organizations JOIN relations ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf' \n",
"GROUP BY organization \n",
"ORDER BY publication DESC\n",
"ORDER BY total DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
@ -627,12 +628,13 @@
"source": [
"query = \"\"\"\n",
"SELECT COALESCE(legalshortname, legalname) AS organization, \n",
" COUNT(*) as total,\n",
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
"GROUP BY organization\n",
"ORDER BY open DESC\n",
"ORDER BY total DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
@ -656,13 +658,14 @@
"source": [
"query = \"\"\"\n",
"SELECT organizations.country.code AS country, \n",
" COUNT(*) AS total,\n",
" COUNT(IF(bestaccessright.label = 'OPEN', 1, NULL)) AS open,\n",
" COUNT(IF(bestaccessright.label = 'EMBARGO', 1, NULL)) AS embargo,\n",
" COUNT(IF(bestaccessright.label = 'CLOSED', 1, NULL)) AS closed\n",
"FROM organizations JOIN relations JOIN results ON organizations.id = relations.source.id AND results.id = relations.target.id AND reltype.name = 'isAuthorInstitutionOf'\n",
"WHERE organizations.country IS NOT NULL\n",
"GROUP BY organizations.country.code\n",
"ORDER BY open DESC\n",
"ORDER BY total DESC\n",
"\"\"\"\n",
"\n",
"spark.sql(query).limit(20).toPandas()"
@ -702,6 +705,108 @@
"spark.sql(query).limit(20).toPandas()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import igraph as ig\n",
"\n",
"G = ig.Graph.TupleList(\n",
" edges=edges[['left', 'right', 'count']].values,\n",
" vertex_name_attr='countrycode',\n",
" edge_attrs = ['weight'],\n",
" directed=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G.vcount()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G.ecount()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G.vs[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G.es[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import matplotlib.pyplot as plt\n",
"fig, ax = plt.subplots()\n",
"ig.plot(G, vertex_label=G.vs['countrycode'], target=ax)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G.vs.find(countrycode_eq = 'MY') # Malaysia ('MY' in ISO 3166-1 alpha-2; the Maldives would be 'MV')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"H = G.induced_subgraph(G.neighborhood(50))\n",
"H.summary()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"H.vs['color'] = 'grey'\n",
"H.vs[0]['color'] = 'red'\n",
"fig, ax = plt.subplots()\n",
"ig.plot(H, target=ax)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"G.transitivity_local_undirected(50)"
]
},
{
"attachments": {},
"cell_type": "markdown",