From 9739054a74cc22960d93d299a3aadf29523c1037 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Thu, 29 Apr 2021 18:50:02 +0200 Subject: [PATCH] new try with supervisioned ML --- notebooks/01-Exploration.ipynb | 2041 ++++++++++++---------- notebooks/02-Spam filter.ipynb | 1096 ------------ notebooks/03-Machine Learning.ipynb | 1028 ----------- notebooks/03-Supervised.ipynb | 2464 +++++++++++++++++++++++++++ 4 files changed, 3602 insertions(+), 3027 deletions(-) delete mode 100644 notebooks/02-Spam filter.ipynb delete mode 100644 notebooks/03-Machine Learning.ipynb create mode 100644 notebooks/03-Supervised.ipynb diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 7280a4b..7573fc0 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -190,7 +190,7 @@ }, { "cell_type": "code", - "execution_count": 221, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -480,7 +480,7 @@ "4 2 " ] }, - "execution_count": 221, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -532,6 +532,7 @@ " family_name\n", " biography\n", " other_names\n", + " urls\n", " primary_email\n", " keywords\n", " external_ids\n", @@ -567,8 +568,9 @@ " mannocci\n", " data scientist & researcher; scholarly knowled...\n", " NaN\n", + " [[personal website, https://andremann.github.i...\n", " andrea.mannocci@isti.cnr.it\n", - " [science of science, open science, research in...\n", + " [research infrastructures, scholarly knowledge...\n", " [[scopus author id, 55233589900]]\n", " [[information engineering, ph.d., università d...\n", " [[research associate, istituto di scienza e te...\n", @@ -605,11 +607,14 @@ " biography other_names \\\n", "3073261 data scientist & researcher; scholarly knowled... NaN \n", "\n", + " urls \\\n", + "3073261 [[personal website, https://andremann.github.i... \n", + "\n", " primary_email \\\n", "3073261 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", - "3073261 [science of science, open science, research in... \n", + "3073261 [research infrastructures, scholarly knowledge... \n", "\n", " external_ids \\\n", "3073261 [[scopus author id, 55233589900]] \n", @@ -678,6 +683,7 @@ " family_name\n", " biography\n", " other_names\n", + " urls\n", " primary_email\n", " keywords\n", " external_ids\n", @@ -713,8 +719,9 @@ " whatsapp\n", " <NA>\n", " NaN\n", + " [[otherwhatsapp, https://otherwhatsapp.com/], ...\n", " <NA>\n", - " [whatsapp gb apk, whatsapp gb, whatsapp gb bai...\n", + " [whatsapp gb 2020, whatsapp gb apk, whatsapp g...\n", " NaN\n", " NaN\n", " NaN\n", @@ -745,11 +752,14 @@ " orcid verified_email verified_primary_email \\\n", "9887272 0000-0001-6997-9470 True True \n", "\n", - " given_names family_name biography other_names primary_email \\\n", - "9887272 other whatsapp NaN \n", + " given_names family_name biography other_names \\\n", + "9887272 other whatsapp NaN \n", + "\n", + " urls primary_email \\\n", + "9887272 [[otherwhatsapp, https://otherwhatsapp.com/], ... \n", "\n", " keywords external_ids \\\n", - "9887272 [whatsapp gb apk, whatsapp gb, whatsapp gb bai... NaN \n", + "9887272 [whatsapp gb 2020, whatsapp gb apk, whatsapp g... NaN \n", "\n", " education employment n_works works_source \\\n", "9887272 NaN NaN 0 NaN \n", @@ -783,7 +793,7 @@ "cell_type": "code", "execution_count": 8, "metadata": { - "scrolled": true + "scrolled": false }, "outputs": [ { @@ -796,6 +806,7 @@ "family_name 10671715\n", "biography 354015\n", "other_names 554684\n", + "urls 715067\n", "primary_email 124722\n", "keywords 649637\n", "external_ids 1308598\n", @@ -841,7 +852,7 @@ "text/plain": [ "count 10989649\n", "unique 10989649\n", - "top 0000-0003-4717-4481\n", + "top 0000-0002-2510-4696\n", "freq 1\n", "Name: orcid, dtype: object" ] @@ -950,6 +961,7 @@ " family_name\n", " biography\n", " other_names\n", + " urls\n", " primary_email\n", " keywords\n", " external_ids\n", @@ -985,6 +997,7 @@ " warasart\n", " <NA>\n", " NaN\n", + " NaN\n", " maykin@owasp.org\n", " NaN\n", " NaN\n", @@ -1018,6 +1031,7 @@ " warasart\n", " <NA>\n", " NaN\n", + " NaN\n", " maykin@owasp.org\n", " NaN\n", " NaN\n", @@ -1051,9 +1065,9 @@ "7543981 0000-0002-0836-2271 True True \n", "9316843 0000-0001-9855-1676 True True \n", "\n", - " given_names family_name biography other_names primary_email \\\n", - "7543981 maykin warasart NaN maykin@owasp.org \n", - "9316843 maykin warasart NaN maykin@owasp.org \n", + " given_names family_name biography other_names urls primary_email \\\n", + "7543981 maykin warasart NaN NaN maykin@owasp.org \n", + "9316843 maykin warasart NaN NaN maykin@owasp.org \n", "\n", " keywords external_ids education employment n_works works_source \\\n", "7543981 NaN NaN NaN NaN 0 NaN \n", @@ -1118,6 +1132,7 @@ " family_name\n", " biography\n", " other_names\n", + " urls\n", " primary_email\n", " keywords\n", " external_ids\n", @@ -1153,6 +1168,7 @@ " perçin\n", " <NA>\n", " NaN\n", + " NaN\n", " opercin@erbakan.edu.tr\n", " NaN\n", " NaN\n", @@ -1186,6 +1202,7 @@ " perçin\n", " <NA>\n", " NaN\n", + " NaN\n", " opercin@erbakan.edu.tr\n", " NaN\n", " NaN\n", @@ -1219,13 +1236,13 @@ "347852 0000-0002-2232-9638 True True \n", "1681787 0000-0003-0033-0918 True True \n", "\n", - " given_names family_name biography other_names primary_email \\\n", - "347852 osman perçin NaN opercin@erbakan.edu.tr \n", - "1681787 osman perçin NaN opercin@erbakan.edu.tr \n", + " given_names family_name biography other_names urls \\\n", + "347852 osman perçin NaN NaN \n", + "1681787 osman perçin NaN NaN \n", "\n", - " keywords external_ids education \\\n", - "347852 NaN NaN NaN \n", - "1681787 NaN NaN NaN \n", + " primary_email keywords external_ids education \\\n", + "347852 opercin@erbakan.edu.tr NaN NaN NaN \n", + "1681787 opercin@erbakan.edu.tr NaN NaN NaN \n", "\n", " employment n_works \\\n", "347852 NaN 0 \n", @@ -1290,6 +1307,7 @@ " family_name\n", " biography\n", " other_names\n", + " urls\n", " primary_email\n", " keywords\n", " external_ids\n", @@ -1325,8 +1343,9 @@ " davey\n", " <NA>\n", " NaN\n", + " NaN\n", " patrick.davey@monash.edu\n", - " [inorganic chemistry, radiopharmaceuticals, ra...\n", + " [radiopharmaceuticals, bioinorganic chemistry,...\n", " NaN\n", " NaN\n", " [[phd student, monash university, melbourne, ,...\n", @@ -1358,6 +1377,7 @@ " davey\n", " <NA>\n", " NaN\n", + " NaN\n", " patrick.davey@monash.edu\n", " NaN\n", " NaN\n", @@ -1391,16 +1411,16 @@ "954085 0000-0002-9158-1757 True True \n", "5590332 0000-0002-8774-0030 True True \n", "\n", - " given_names family_name biography other_names \\\n", - "954085 patrick davey NaN \n", - "5590332 patrick davey NaN \n", + " given_names family_name biography other_names urls \\\n", + "954085 patrick davey NaN NaN \n", + "5590332 patrick davey NaN NaN \n", "\n", " primary_email \\\n", "954085 patrick.davey@monash.edu \n", "5590332 patrick.davey@monash.edu \n", "\n", " keywords external_ids \\\n", - "954085 [inorganic chemistry, radiopharmaceuticals, ra... NaN \n", + "954085 [radiopharmaceuticals, bioinorganic chemistry,... NaN \n", "5590332 NaN NaN \n", "\n", " education employment n_works \\\n", @@ -2476,9 +2496,9 @@ } }, "text/html": [ - "