diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 843eca0..ea30eb8 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 76, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -101,7 +101,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -121,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -143,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -222,16 +222,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 1\n", @@ -255,16 +255,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 1\n", " \n", " \n", " 2\n", @@ -288,16 +288,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 3\n", @@ -321,16 +321,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 1\n", " \n", " \n", " 4\n", @@ -354,16 +354,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 2.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 2\n", " \n", " \n", "\n", @@ -399,28 +399,28 @@ "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", - "0 0 0 0 0 0 NaN \n", - "1 0 0 0 0 0 NaN \n", - "2 0 0 0 0 0 NaN \n", - "3 0 0 0 0 0 NaN \n", - "4 0 0 0 0 0 NaN \n", + "0 0 0 0 0 False NaN \n", + "1 0 0 0 0 False NaN \n", + "2 0 0 0 0 False NaN \n", + "3 0 0 0 0 False NaN \n", + "4 0 0 0 0 False NaN \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", - "0 NaN NaN NaN NaN NaN NaN \n", - "1 NaN NaN NaN NaN NaN NaN \n", - "2 NaN NaN NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN NaN \n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", "\n", " n_education n_employment \n", - "0 NaN NaN \n", - "1 NaN 1.0 \n", - "2 NaN NaN \n", - "3 NaN 1.0 \n", - "4 NaN 2.0 " + "0 \n", + "1 1 \n", + "2 \n", + "3 1 \n", + "4 2 " ] }, - "execution_count": 80, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -501,14 +501,14 @@ " \n", " 3073261\n", " 0000-0002-5193-7851\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " andrea\n", " mannocci\n", " data scientist & researcher; scholarly knowled...\n", " NaN\n", " andrea.mannocci@isti.cnr.it\n", - " [research infrastructures, science of science,...\n", + " [open science, data science, research infrastr...\n", " [[scopus author id, 55233589900]]\n", " [[information engineering, ph.d., università d...\n", " [[research associate, istituto di scienza e te...\n", @@ -520,16 +520,16 @@ " 0\n", " 0\n", " 60\n", - " 1\n", + " True\n", " isti.cnr.it\n", " NaN\n", " [github.io, twitter.com, linkedin.com]\n", - " NaN\n", - " 3.0\n", - " 1.0\n", - " 5.0\n", - " 4.0\n", - " 5.0\n", + " <NA>\n", + " 3\n", + " 1\n", + " 5\n", + " 4\n", + " 5\n", " \n", " \n", "\n", @@ -537,7 +537,7 @@ ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", - "3073261 0000-0002-5193-7851 1 1 \n", + "3073261 0000-0002-5193-7851 True True \n", "\n", " given_names family_name \\\n", "3073261 andrea mannocci \n", @@ -549,7 +549,7 @@ "3073261 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", - "3073261 [research infrastructures, science of science,... \n", + "3073261 [open science, data science, research infrastr... \n", "\n", " external_ids \\\n", "3073261 [[scopus author id, 55233589900]] \n", @@ -567,13 +567,13 @@ "3073261 2017-09-12t14:28:33.467z 2021-03-17t15:40:07.776z 34 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", - "3073261 0 60 1 isti.cnr.it NaN \n", + "3073261 0 60 True isti.cnr.it NaN \n", "\n", " url_domains n_emails n_urls n_ids \\\n", - "3073261 [github.io, twitter.com, linkedin.com] NaN 3.0 1.0 \n", + "3073261 [github.io, twitter.com, linkedin.com] 3 1 \n", "\n", " n_keywords n_education n_employment \n", - "3073261 5.0 4.0 5.0 " + "3073261 5 4 5 " ] }, "execution_count": 6, @@ -647,13 +647,13 @@ " \n", " 9887272\n", " 0000-0001-6997-9470\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " other\n", " whatsapp\n", + " <NA>\n", " NaN\n", - " NaN\n", - " NaN\n", + " <NA>\n", " [whatsapp gb apk, whatsapp gb baixar, whatsapp...\n", " NaN\n", " NaN\n", @@ -666,16 +666,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " NaN\n", " NaN\n", " [otherwhatsapp.com, im-creator.com, facebook.c...\n", - " NaN\n", - " 27.0\n", - " NaN\n", - " 4.0\n", - " NaN\n", - " NaN\n", + " <NA>\n", + " 27\n", + " <NA>\n", + " 4\n", + " <NA>\n", + " <NA>\n", " \n", " \n", "\n", @@ -683,10 +683,10 @@ ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", - "9887272 0000-0001-6997-9470 1 1 \n", + "9887272 0000-0001-6997-9470 True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", - "9887272 other whatsapp NaN NaN NaN \n", + "9887272 other whatsapp NaN \n", "\n", " keywords external_ids \\\n", "9887272 [whatsapp gb apk, whatsapp gb baixar, whatsapp... NaN \n", @@ -695,16 +695,16 @@ "9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n", "\n", " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", - "9887272 2020-10-08t02:32:03.935z 0 0 0 0 0 \n", + "9887272 2020-10-08t02:32:03.935z 0 0 0 0 False \n", "\n", " primary_email_domain other_email_domains \\\n", "9887272 NaN NaN \n", "\n", " url_domains n_emails n_urls \\\n", - "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... NaN 27.0 \n", + "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... 27 \n", "\n", " n_ids n_keywords n_education n_employment \n", - "9887272 NaN 4.0 NaN NaN " + "9887272 4 " ] }, "execution_count": 7, @@ -778,7 +778,7 @@ "text/plain": [ "count 10989649\n", "unique 10989649\n", - "top 0000-0001-7886-4851\n", + "top 0000-0002-6749-5031\n", "freq 1\n", "Name: orcid, dtype: object" ] @@ -842,7 +842,7 @@ "5590332 patrick.davey@monash.edu\n", "9316843 maykin@owasp.org\n", "10375852 andycheng2026@163.com\n", - "Name: primary_email, dtype: object" + "Name: primary_email, dtype: string" ] }, "execution_count": 11, @@ -916,11 +916,11 @@ " \n", " 7543981\n", " 0000-0002-0836-2271\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " maykin\n", " warasart\n", - " NaN\n", + " <NA>\n", " NaN\n", " maykin@owasp.org\n", " NaN\n", @@ -935,25 +935,25 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " owasp.org\n", " [dga.or.th]\n", " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 9316843\n", " 0000-0001-9855-1676\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " maykin\n", " warasart\n", - " NaN\n", + " <NA>\n", " NaN\n", " maykin@owasp.org\n", " NaN\n", @@ -968,16 +968,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " owasp.org\n", " [dga.or.th, ieee.org]\n", " NaN\n", - " 2.0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 2\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", "\n", @@ -985,12 +985,12 @@ ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", - "7543981 0000-0002-0836-2271 1 1 \n", - "9316843 0000-0001-9855-1676 1 1 \n", + "7543981 0000-0002-0836-2271 True True \n", + "9316843 0000-0001-9855-1676 True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", - "7543981 maykin warasart NaN NaN maykin@owasp.org \n", - "9316843 maykin warasart NaN NaN maykin@owasp.org \n", + "7543981 maykin warasart NaN maykin@owasp.org \n", + "9316843 maykin warasart NaN maykin@owasp.org \n", "\n", " keywords external_ids education employment n_works works_source \\\n", "7543981 NaN NaN NaN NaN 0 NaN \n", @@ -1001,16 +1001,16 @@ "9316843 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z 0 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain \\\n", - "7543981 0 0 0 owasp.org \n", - "9316843 0 0 0 owasp.org \n", + "7543981 0 0 False owasp.org \n", + "9316843 0 0 False owasp.org \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids \\\n", - "7543981 [dga.or.th] NaN 1.0 NaN NaN \n", - "9316843 [dga.or.th, ieee.org] NaN 2.0 NaN NaN \n", + "7543981 [dga.or.th] NaN 1 \n", + "9316843 [dga.or.th, ieee.org] NaN 2 \n", "\n", " n_keywords n_education n_employment \n", - "7543981 NaN NaN NaN \n", - "9316843 NaN NaN NaN " + "7543981 \n", + "9316843 " ] }, "execution_count": 12, @@ -1084,11 +1084,11 @@ " \n", " 347852\n", " 0000-0002-2232-9638\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " osman\n", " perçin\n", - " NaN\n", + " <NA>\n", " NaN\n", " opercin@erbakan.edu.tr\n", " NaN\n", @@ -1103,25 +1103,25 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " erbakan.edu.tr\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " \n", " \n", " 1681787\n", " 0000-0003-0033-0918\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " osman\n", " perçin\n", - " NaN\n", + " <NA>\n", " NaN\n", " opercin@erbakan.edu.tr\n", " NaN\n", @@ -1136,16 +1136,16 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " erbakan.edu.tr\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 1\n", " \n", " \n", "\n", @@ -1153,12 +1153,12 @@ ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", - "347852 0000-0002-2232-9638 1 1 \n", - "1681787 0000-0003-0033-0918 1 1 \n", + "347852 0000-0002-2232-9638 True True \n", + "1681787 0000-0003-0033-0918 True True \n", "\n", " given_names family_name biography other_names primary_email \\\n", - "347852 osman perçin NaN NaN opercin@erbakan.edu.tr \n", - "1681787 osman perçin NaN NaN opercin@erbakan.edu.tr \n", + "347852 osman perçin NaN opercin@erbakan.edu.tr \n", + "1681787 osman perçin NaN opercin@erbakan.edu.tr \n", "\n", " keywords external_ids education \\\n", "347852 NaN NaN NaN \n", @@ -1173,16 +1173,16 @@ "1681787 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", - "347852 0 0 0 0 0 erbakan.edu.tr \n", - "1681787 0 0 0 0 0 erbakan.edu.tr \n", + "347852 0 0 0 0 False erbakan.edu.tr \n", + "1681787 0 0 0 0 False erbakan.edu.tr \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", - "347852 NaN NaN NaN NaN NaN NaN \n", - "1681787 NaN NaN NaN NaN NaN NaN \n", + "347852 NaN NaN \n", + "1681787 NaN NaN \n", "\n", " n_education n_employment \n", - "347852 NaN NaN \n", - "1681787 NaN 1.0 " + "347852 \n", + "1681787 1 " ] }, "execution_count": 13, @@ -1256,14 +1256,14 @@ " \n", " 954085\n", " 0000-0002-9158-1757\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " patrick\n", " davey\n", - " NaN\n", + " <NA>\n", " NaN\n", " patrick.davey@monash.edu\n", - " [radiochemistry, radiopharmaceuticals, inorgan...\n", + " [radiochemistry, radiopharmaceuticals, bioinor...\n", " NaN\n", " NaN\n", " [[phd student, monash university, melbourne, ,...\n", @@ -1275,25 +1275,25 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " False\n", " monash.edu\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 4.0\n", - " NaN\n", - " 1.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 4\n", + " <NA>\n", + " 1\n", " \n", " \n", " 5590332\n", " 0000-0002-8774-0030\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " patrick\n", " davey\n", - " NaN\n", + " <NA>\n", " NaN\n", " patrick.davey@monash.edu\n", " NaN\n", @@ -1308,16 +1308,16 @@ " 0\n", " 0\n", " 0\n", - " 1\n", + " True\n", " monash.edu\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 1\n", " \n", " \n", "\n", @@ -1325,19 +1325,19 @@ ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", - "954085 0000-0002-9158-1757 1 1 \n", - "5590332 0000-0002-8774-0030 1 1 \n", + "954085 0000-0002-9158-1757 True True \n", + "5590332 0000-0002-8774-0030 True True \n", "\n", " given_names family_name biography other_names \\\n", - "954085 patrick davey NaN NaN \n", - "5590332 patrick davey NaN NaN \n", + "954085 patrick davey NaN \n", + "5590332 patrick davey NaN \n", "\n", " primary_email \\\n", "954085 patrick.davey@monash.edu \n", "5590332 patrick.davey@monash.edu \n", "\n", " keywords external_ids \\\n", - "954085 [radiochemistry, radiopharmaceuticals, inorgan... NaN \n", + "954085 [radiochemistry, radiopharmaceuticals, bioinor... NaN \n", "5590332 NaN NaN \n", "\n", " education employment n_works \\\n", @@ -1349,16 +1349,16 @@ "5590332 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n", "\n", " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", - "954085 0 0 0 0 0 monash.edu \n", - "5590332 1 0 0 0 1 monash.edu \n", + "954085 0 0 0 0 False monash.edu \n", + "5590332 1 0 0 0 True monash.edu \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", - "954085 NaN NaN NaN NaN NaN 4.0 \n", - "5590332 NaN NaN NaN NaN NaN NaN \n", + "954085 NaN NaN 4 \n", + "5590332 NaN NaN \n", "\n", " n_education n_employment \n", - "954085 NaN 1.0 \n", - "5590332 NaN 1.0 " + "954085 1 \n", + "5590332 1 " ] }, "execution_count": 14, @@ -2413,9 +2413,9 @@ } }, "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" + "ename": "ValueError", + "evalue": "Cannot set non-string value '-1' into a StringArray.", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit, downcast)\u001b[0m\n\u001b[1;32m 4460\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4461\u001b[0m ) -> Optional[DataFrame]:\n\u001b[0;32m-> 4462\u001b[0;31m return super().fillna(\n\u001b[0m\u001b[1;32m 4463\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4464\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit, downcast)\u001b[0m\n\u001b[1;32m 6427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6428\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_list_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6429\u001b[0;31m new_data = self._mgr.fillna(\n\u001b[0m\u001b[1;32m 6430\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdowncast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6431\u001b[0m )\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, limit, inplace, downcast)\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m\"BlockManager\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 621\u001b[0;31m return self.apply(\n\u001b[0m\u001b[1;32m 622\u001b[0m \u001b[0;34m\"fillna\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdowncast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m )\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/blocks.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, limit, inplace, downcast)\u001b[0m\n\u001b[1;32m 1916\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1917\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1918\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1919\u001b[0m return [\n\u001b[1;32m 1920\u001b[0m self.make_block_same_class(\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, limit)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[0;31m# fill with value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 262\u001b[0;31m \u001b[0mnew_values\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 263\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStringDtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mna_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;34mf\"Cannot set non-string value '{value}' into a StringArray.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m )\n", + "\u001b[0;31mValueError\u001b[0m: Cannot set non-string value '-1' into a StringArray." + ] } ], "source": [ - "fig = px.imshow(df.fillna(-1).corr())\n", + "fig = px.imshow(df[[]].fillna(-1).corr())\n", "fig.show()" ] }, { "cell_type": "code", - "execution_count": 69, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "df[['verified_email', \n", - " 'verified_primary_email', \n", - " 'n_works', \n", - " 'n_doi',\n", - " 'n_arxiv', \n", - " 'n_pmc', \n", - " 'n_other_pids', \n", - " 'n_emails', \n", - " 'n_urls', \n", - " 'n_ids', \n", - " 'n_keywords', \n", - " 'n_employment', \n", - " 'n_education', \n", - " 'label']].to_pickle('../data/processed/features.pkl')" + "# df[['verified_email', \n", + "# 'verified_primary_email', \n", + "# 'n_works', \n", + "# 'n_doi',\n", + "# 'n_arxiv', \n", + "# 'n_pmc', \n", + "# 'n_other_pids', \n", + "# 'n_emails', \n", + "# 'n_urls', \n", + "# 'n_ids', \n", + "# 'n_keywords', \n", + "# 'n_employment', \n", + "# 'n_education', \n", + "# 'label']].to_pickle('../data/processed/features.pkl')" ] }, { @@ -17950,13 +16684,13 @@ " \n", " 17\n", " 0000-0002-0137-3066\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -17969,16 +16703,16 @@ " 0\n", " 0\n", " 0\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", " NaN\n", " False\n", @@ -17987,13 +16721,13 @@ " \n", " 19\n", " 0000-0002-0461-9711\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -18006,16 +16740,16 @@ " 0\n", " 0\n", " 0\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", " NaN\n", " False\n", @@ -18024,13 +16758,13 @@ " \n", " 22\n", " 0000-0002-0761-9450\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -18043,16 +16777,16 @@ " 0\n", " 0\n", " 0\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", " NaN\n", " False\n", @@ -18061,13 +16795,13 @@ " \n", " 33\n", " 0000-0002-4447-9215\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -18080,16 +16814,16 @@ " 0\n", " 0\n", " 0\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", " NaN\n", " False\n", @@ -18098,13 +16832,13 @@ " \n", " 44\n", " 0000-0003-0426-4065\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " [eliza i. gilbert]\n", - " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -18117,18 +16851,18 @@ " 0\n", " 0\n", " 0\n", + " True\n", + " NaN\n", + " NaN\n", + " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " 1\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", " False\n", " NaN\n", " \n", @@ -18172,14 +16906,14 @@ " \n", " 10989636\n", " 0000-0002-2906-0299\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " tiffany\n", " mackay\n", - " NaN\n", + " <NA>\n", " [tiffany russel sia]\n", - " NaN\n", - " [prostate cancer, oxytocin, radiolabelling, ga...\n", + " <NA>\n", + " [gpc-1, cancer, immunohistochemistry, clinical...\n", " [[researcherid, a-2121-2017]]\n", " [[faculty of medicine, master in pharmaceutica...\n", " [[clinical project lead, minomic international...\n", @@ -18191,16 +16925,16 @@ " 0\n", " 0\n", " 0\n", - " 1\n", + " True\n", " NaN\n", " NaN\n", " [oxytocin.com.au, linkedin.com]\n", - " NaN\n", - " 2.0\n", - " 1.0\n", - " 13.0\n", - " 2.0\n", - " 4.0\n", + " <NA>\n", + " 2\n", + " 1\n", + " 13\n", + " 2\n", + " 4\n", " [crossref, researcherid]\n", " 2.0\n", " True\n", @@ -18209,13 +16943,13 @@ " \n", " 10989637\n", " 0000-0001-5896-2024\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " giovanni, l\n", " tiscia\n", + " <NA>\n", " NaN\n", - " NaN\n", - " NaN\n", + " <NA>\n", " NaN\n", " [[scopus author id, 54948242800]]\n", " NaN\n", @@ -18228,16 +16962,16 @@ " 0\n", " 17\n", " 52\n", + " True\n", + " NaN\n", + " NaN\n", + " NaN\n", + " <NA>\n", + " <NA>\n", " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", - " NaN\n", - " NaN\n", - " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " [scopus - elsevier, europe pubmed central, cro...\n", " 3.0\n", " True\n", @@ -18246,13 +16980,13 @@ " \n", " 10989640\n", " 0000-0002-1070-2220\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " vinicios\n", " santanna\n", - " NaN\n", + " <NA>\n", " [vinicios sant anna, vinicios sant anna, vinic...\n", - " NaN\n", + " <NA>\n", " NaN\n", " [[scopus author id, 57201697952]]\n", " [[economics, ph.d., university of illinois at ...\n", @@ -18265,16 +16999,16 @@ " 0\n", " 0\n", " 1\n", - " 1\n", + " True\n", " NaN\n", " NaN\n", " [vpsantanna.com]\n", - " NaN\n", - " 1.0\n", - " 1.0\n", - " NaN\n", - " 2.0\n", - " NaN\n", + " <NA>\n", + " 1\n", + " 1\n", + " <NA>\n", + " 2\n", + " <NA>\n", " [crossref metadata search]\n", " 1.0\n", " True\n", @@ -18283,13 +17017,13 @@ " \n", " 10989643\n", " 0000-0003-2606-0936\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " luang\n", " xu\n", - " NaN\n", + " <NA>\n", " [xu lu-ang, lu lu]\n", - " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -18302,16 +17036,16 @@ " 0\n", " 0\n", " 1\n", + " True\n", + " NaN\n", + " NaN\n", + " NaN\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 1.0\n", " [scopus - elsevier, crossref]\n", " 2.0\n", " True\n", @@ -18320,13 +17054,13 @@ " \n", " 10989645\n", " 0000-0002-3800-6331\n", - " 1\n", - " 1\n", + " True\n", + " True\n", " zachary\n", " calamari\n", + " <NA>\n", " NaN\n", - " NaN\n", - " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " [[richard gilder graduate school, phd in compa...\n", @@ -18339,16 +17073,16 @@ " 0\n", " 1\n", " 0\n", - " 1\n", + " True\n", " NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 2.0\n", - " 2.0\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", + " 2\n", + " 2\n", " [crossref metadata search, crossref]\n", " 2.0\n", " True\n", @@ -18361,43 +17095,43 @@ ], "text/plain": [ " orcid verified_email verified_primary_email \\\n", - "17 0000-0002-0137-3066 1 1 \n", - "19 0000-0002-0461-9711 1 1 \n", - "22 0000-0002-0761-9450 1 1 \n", - "33 0000-0002-4447-9215 1 1 \n", - "44 0000-0003-0426-4065 1 1 \n", + "17 0000-0002-0137-3066 True True \n", + "19 0000-0002-0461-9711 True True \n", + "22 0000-0002-0761-9450 True True \n", + "33 0000-0002-4447-9215 True True \n", + "44 0000-0003-0426-4065 True True \n", "... ... ... ... \n", - "10989636 0000-0002-2906-0299 1 1 \n", - "10989637 0000-0001-5896-2024 1 1 \n", - "10989640 0000-0002-1070-2220 1 1 \n", - "10989643 0000-0003-2606-0936 1 1 \n", - "10989645 0000-0002-3800-6331 1 1 \n", + "10989636 0000-0002-2906-0299 True True \n", + "10989637 0000-0001-5896-2024 True True \n", + "10989640 0000-0002-1070-2220 True True \n", + "10989643 0000-0003-2606-0936 True True \n", + "10989645 0000-0002-3800-6331 True True \n", "\n", " given_names family_name biography \\\n", - "17 NaN NaN NaN \n", - "19 NaN NaN NaN \n", - "22 NaN NaN NaN \n", - "33 NaN NaN NaN \n", - "44 NaN NaN NaN \n", + "17 \n", + "19 \n", + "22 \n", + "33 \n", + "44 \n", "... ... ... ... \n", - "10989636 tiffany mackay NaN \n", - "10989637 giovanni, l tiscia NaN \n", - "10989640 vinicios santanna NaN \n", - "10989643 luang xu NaN \n", - "10989645 zachary calamari NaN \n", + "10989636 tiffany mackay \n", + "10989637 giovanni, l tiscia \n", + "10989640 vinicios santanna \n", + "10989643 luang xu \n", + "10989645 zachary calamari \n", "\n", " other_names primary_email \\\n", - "17 NaN NaN \n", - "19 NaN NaN \n", - "22 NaN NaN \n", - "33 NaN NaN \n", - "44 [eliza i. gilbert] NaN \n", + "17 NaN \n", + "19 NaN \n", + "22 NaN \n", + "33 NaN \n", + "44 [eliza i. gilbert] \n", "... ... ... \n", - "10989636 [tiffany russel sia] NaN \n", - "10989637 NaN NaN \n", - "10989640 [vinicios sant anna, vinicios sant anna, vinic... NaN \n", - "10989643 [xu lu-ang, lu lu] NaN \n", - "10989645 NaN NaN \n", + "10989636 [tiffany russel sia] \n", + "10989637 NaN \n", + "10989640 [vinicios sant anna, vinicios sant anna, vinic... \n", + "10989643 [xu lu-ang, lu lu] \n", + "10989645 NaN \n", "\n", " keywords \\\n", "17 NaN \n", @@ -18406,7 +17140,7 @@ "33 NaN \n", "44 NaN \n", "... ... \n", - "10989636 [prostate cancer, oxytocin, radiolabelling, ga... \n", + "10989636 [gpc-1, cancer, immunohistochemistry, clinical... \n", "10989637 NaN \n", "10989640 NaN \n", "10989643 NaN \n", @@ -18478,43 +17212,43 @@ "10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", "\n", " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", - "17 0 0 1 NaN NaN \n", - "19 0 0 1 NaN NaN \n", - "22 0 0 1 NaN NaN \n", - "33 0 0 1 NaN NaN \n", - "44 0 0 1 NaN NaN \n", + "17 0 0 True NaN NaN \n", + "19 0 0 True NaN NaN \n", + "22 0 0 True NaN NaN \n", + "33 0 0 True NaN NaN \n", + "44 0 0 True NaN NaN \n", "... ... ... ... ... ... \n", - "10989636 0 0 1 NaN NaN \n", - "10989637 17 52 1 NaN NaN \n", - "10989640 0 1 1 NaN NaN \n", - "10989643 0 1 1 NaN NaN \n", - "10989645 1 0 1 NaN NaN \n", + "10989636 0 0 True NaN NaN \n", + "10989637 17 52 True NaN NaN \n", + "10989640 0 1 True NaN NaN \n", + "10989643 0 1 True NaN NaN \n", + "10989645 1 0 True NaN NaN \n", "\n", " url_domains n_emails n_urls n_ids \\\n", - "17 NaN NaN NaN NaN \n", - "19 NaN NaN NaN NaN \n", - "22 NaN NaN NaN NaN \n", - "33 NaN NaN NaN NaN \n", - "44 NaN NaN NaN NaN \n", + "17 NaN \n", + "19 NaN \n", + "22 NaN \n", + "33 NaN \n", + "44 NaN \n", "... ... ... ... ... \n", - "10989636 [oxytocin.com.au, linkedin.com] NaN 2.0 1.0 \n", - "10989637 NaN NaN NaN 1.0 \n", - "10989640 [vpsantanna.com] NaN 1.0 1.0 \n", - "10989643 NaN NaN NaN NaN \n", - "10989645 NaN NaN NaN NaN \n", + "10989636 [oxytocin.com.au, linkedin.com] 2 1 \n", + "10989637 NaN 1 \n", + "10989640 [vpsantanna.com] 1 1 \n", + "10989643 NaN \n", + "10989645 NaN \n", "\n", " n_keywords n_education n_employment \\\n", - "17 NaN NaN NaN \n", - "19 NaN NaN NaN \n", - "22 NaN NaN NaN \n", - "33 NaN NaN NaN \n", - "44 NaN NaN 1.0 \n", + "17 \n", + "19 \n", + "22 \n", + "33 \n", + "44 1 \n", "... ... ... ... \n", - "10989636 13.0 2.0 4.0 \n", - "10989637 NaN NaN NaN \n", - "10989640 NaN 2.0 NaN \n", - "10989643 NaN NaN 1.0 \n", - "10989645 NaN 2.0 2.0 \n", + "10989636 13 2 4 \n", + "10989637 \n", + "10989640 2 \n", + "10989643 1 \n", + "10989645 2 2 \n", "\n", " ext_works_source \\\n", "17 NaN \n", @@ -18556,182 +17290,19 @@ }, { "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 10989649 entries, 0 to 10989648\n", - "Data columns (total 30 columns):\n", - " # Column Dtype \n", - "--- ------ ----- \n", - " 0 orcid string\n", - " 1 verified_email bool \n", - " 2 verified_primary_email bool \n", - " 3 given_names string\n", - " 4 family_name string\n", - " 5 biography string\n", - " 6 other_names object\n", - " 7 primary_email string\n", - " 8 keywords object\n", - " 9 external_ids object\n", - " 10 education object\n", - " 11 employment object\n", - " 12 n_works int64 \n", - " 13 works_source object\n", - " 14 activation_date string\n", - " 15 last_update_date string\n", - " 16 n_doi int64 \n", - " 17 n_arxiv int64 \n", - " 18 n_pmc int64 \n", - " 19 n_other_pids int64 \n", - " 20 label int64 \n", - " 21 primary_email_domain object\n", - " 22 other_email_domains object\n", - " 23 url_domains object\n", - " 24 n_emails UInt16\n", - " 25 n_urls UInt16\n", - " 26 n_ids UInt16\n", - " 27 n_keywords UInt16\n", - " 28 n_education UInt16\n", - " 29 n_employment UInt16\n", - "dtypes: UInt16(6), bool(2), int64(6), object(9), string(7)\n", - "memory usage: 2.0+ GB\n" - ] - } - ], - "source": [ - "# (df.n_works > 0) & (df.n_ids > 1)\n", - "df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 104, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ - "df.n_ids = df.n_ids.astype(pd.UInt16Dtype())" + "# (df.n_works > 0) & (df.n_ids > 1)" ] }, { "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 2016-07-27 10:09:13.585000+00:00\n", - "1 2016-07-27 10:09:13.585000+00:00\n", - "2 NaT\n", - "3 2016-07-27 10:09:13.585000+00:00\n", - "dtype: datetime64[ns, UTC]" - ] - }, - "execution_count": 107, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.Series(['2016-07-27t10:09:13.585z', '2016-07-27t10:09:13.585z', pd.NA, '2016-07-27t10:09:13.585z'])" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { - "ename": "KeyboardInterrupt", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", - "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mactivation_date\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", - "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/tools/datetimes.py\u001b[0m in \u001b[0;36mto_datetime\u001b[0;34m(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)\u001b[0m\n\u001b[1;32m 803\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcache_array\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 804\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 805\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_listlike\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 806\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 807\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mABCDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mabc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMutableMapping\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/tools/datetimes.py\u001b[0m in \u001b[0;36m_convert_listlike_datetimes\u001b[0;34m(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mformat\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0minfer_datetime_format\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 464\u001b[0m \u001b[0mutc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtz\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"utc\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 465\u001b[0;31m result, tz_parsed = objects_to_datetime64ns(\n\u001b[0m\u001b[1;32m 466\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 467\u001b[0m \u001b[0mdayfirst\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdayfirst\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py\u001b[0m in \u001b[0;36mobjects_to_datetime64ns\u001b[0;34m(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)\u001b[0m\n\u001b[1;32m 2073\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2074\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2075\u001b[0;31m result, tz_parsed = tslib.array_to_datetime(\n\u001b[0m\u001b[1;32m 2076\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2077\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32mpandas/_libs/tslib.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslib.array_to_datetime\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/tslib.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslib.array_to_datetime\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/tslibs/parsing.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslibs.parsing.parse_datetime_string\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32mpandas/_libs/tslibs/parsing.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslibs.parsing._parse_dateabbr_string\u001b[0;34m()\u001b[0m\n", - "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/_strptime.py\u001b[0m in \u001b[0;36m_strptime_datetime\u001b[0;34m(cls, data_string, format)\u001b[0m\n\u001b[1;32m 566\u001b[0m \"\"\"Return a class cls instance based on the input string and the\n\u001b[1;32m 567\u001b[0m format string.\"\"\"\n\u001b[0;32m--> 568\u001b[0;31m \u001b[0mtt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfraction\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgmtoff_fraction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_strptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 569\u001b[0m \u001b[0mtzname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgmtoff\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 570\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfraction\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/_strptime.py\u001b[0m in \u001b[0;36m_strptime\u001b[0;34m(data_string, format)\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0mfound\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat_regex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 349\u001b[0;31m raise ValueError(\"time data %r does not match format %r\" %\n\u001b[0m\u001b[1;32m 350\u001b[0m (data_string, format))\n\u001b[1;32m 351\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", - "\u001b[0;31mKeyboardInterrupt\u001b[0m: " - ] - } - ], - "source": [ - "pd.to_datetime(df.activation_date)" - ] - }, - { - "cell_type": "code", - "execution_count": 109, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "df['label'] = df['label'].astype('bool')\n" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 10989649 entries, 0 to 10989648\n", - "Data columns (total 30 columns):\n", - " # Column Dtype \n", - "--- ------ ----- \n", - " 0 orcid string\n", - " 1 verified_email bool \n", - " 2 verified_primary_email bool \n", - " 3 given_names string\n", - " 4 family_name string\n", - " 5 biography string\n", - " 6 other_names object\n", - " 7 primary_email string\n", - " 8 keywords object\n", - " 9 external_ids object\n", - " 10 education object\n", - " 11 employment object\n", - " 12 n_works int64 \n", - " 13 works_source object\n", - " 14 activation_date string\n", - " 15 last_update_date string\n", - " 16 n_doi int64 \n", - " 17 n_arxiv int64 \n", - " 18 n_pmc int64 \n", - " 19 n_other_pids int64 \n", - " 20 label bool \n", - " 21 primary_email_domain object\n", - " 22 other_email_domains object\n", - " 23 url_domains object\n", - " 24 n_emails UInt16\n", - " 25 n_urls UInt16\n", - " 26 n_ids UInt16\n", - " 27 n_keywords UInt16\n", - " 28 n_education UInt16\n", - " 29 n_employment UInt16\n", - "dtypes: UInt16(6), bool(3), int64(5), object(9), string(7)\n", - "memory usage: 1.9+ GB\n" - ] - } - ], - "source": [ - "df.info()" - ] + "source": [] }, { "cell_type": "code", diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 3861211..d75e6a1 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -58,11 +58,11 @@ def main(input_filepath, output_filepath): 'primary_email': 'string', 'activation_date': 'string', 'last_update_date': 'string', - 'n_works': pd.UInt16Dtype(), - 'n_doi': pd.UInt16Dtype(), - 'n_arxiv': pd.UInt16Dtype(), - 'n_pmc': pd.UInt16Dtype(), - 'n_other_pids': pd.UInt16Dtype()}) + 'n_works': pd.Int16Dtype(), + 'n_doi': pd.Int16Dtype(), + 'n_arxiv': pd.Int16Dtype(), + 'n_pmc': pd.Int16Dtype(), + 'n_other_pids': pd.Int16Dtype()}) # parse_dates=['activation_date', 'last_update_date']) logger.info('Loading list columns') @@ -105,22 +105,22 @@ def main(input_filepath, output_filepath): logger.info('Creating simple numeric columns') df['n_emails'] = df.other_emails.str.len() - df.n_emails = df.n_emails.astype(pd.UInt16Dtype()) + df.n_emails = df.n_emails.astype(pd.Int16Dtype()) df['n_urls'] = df.url_domains.str.len() - df.n_urls = df.n_urls.astype(pd.UInt16Dtype()) + df.n_urls = df.n_urls.astype(pd.Int16Dtype()) df['n_ids'] = df.external_ids.str.len() - df.n_ids = df.n_ids.astype(pd.UInt16Dtype()) + df.n_ids = df.n_ids.astype(pd.Int16Dtype()) df['n_keywords'] = df.keywords.str.len() - df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype()) + df.n_keywords = df.n_keywords.astype(pd.Int16Dtype()) df['n_education'] = df.education.str.len() - df.n_education = df.n_education.astype(pd.UInt16Dtype()) + df.n_education = df.n_education.astype(pd.Int16Dtype()) df['n_employment'] = df.employment.str.len() - df.n_employment = df.n_employment.astype(pd.UInt16Dtype()) + df.n_employment = df.n_employment.astype(pd.Int16Dtype()) logger.info('Dropping useless columns') df = df.drop(['urls', 'other_emails'], axis=1)