diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb
index 843eca0..ea30eb8 100644
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@@ -20,7 +20,7 @@
},
{
"cell_type": "code",
- "execution_count": 76,
+ "execution_count": 1,
"metadata": {},
"outputs": [
{
@@ -84,7 +84,7 @@
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -101,7 +101,7 @@
},
{
"cell_type": "code",
- "execution_count": 78,
+ "execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -121,7 +121,7 @@
},
{
"cell_type": "code",
- "execution_count": 79,
+ "execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
@@ -143,7 +143,7 @@
},
{
"cell_type": "code",
- "execution_count": 80,
+ "execution_count": 5,
"metadata": {},
"outputs": [
{
@@ -222,16 +222,16 @@
"
0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" \n",
" \n",
" 1 | \n",
@@ -255,16 +255,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 1 | \n",
"
\n",
" \n",
" 2 | \n",
@@ -288,16 +288,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
"
\n",
" \n",
" 3 | \n",
@@ -321,16 +321,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 1 | \n",
"
\n",
" \n",
" 4 | \n",
@@ -354,16 +354,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 2.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 2 | \n",
"
\n",
" \n",
"\n",
@@ -399,28 +399,28 @@
"4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
- "0 0 0 0 0 0 NaN \n",
- "1 0 0 0 0 0 NaN \n",
- "2 0 0 0 0 0 NaN \n",
- "3 0 0 0 0 0 NaN \n",
- "4 0 0 0 0 0 NaN \n",
+ "0 0 0 0 0 False NaN \n",
+ "1 0 0 0 0 False NaN \n",
+ "2 0 0 0 0 False NaN \n",
+ "3 0 0 0 0 False NaN \n",
+ "4 0 0 0 0 False NaN \n",
"\n",
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
- "0 NaN NaN NaN NaN NaN NaN \n",
- "1 NaN NaN NaN NaN NaN NaN \n",
- "2 NaN NaN NaN NaN NaN NaN \n",
- "3 NaN NaN NaN NaN NaN NaN \n",
- "4 NaN NaN NaN NaN NaN NaN \n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
"\n",
" n_education n_employment \n",
- "0 NaN NaN \n",
- "1 NaN 1.0 \n",
- "2 NaN NaN \n",
- "3 NaN 1.0 \n",
- "4 NaN 2.0 "
+ "0 \n",
+ "1 1 \n",
+ "2 \n",
+ "3 1 \n",
+ "4 2 "
]
},
- "execution_count": 80,
+ "execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -501,14 +501,14 @@
" \n",
" 3073261 | \n",
" 0000-0002-5193-7851 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" andrea | \n",
" mannocci | \n",
" data scientist & researcher; scholarly knowled... | \n",
" NaN | \n",
" andrea.mannocci@isti.cnr.it | \n",
- " [research infrastructures, science of science,... | \n",
+ " [open science, data science, research infrastr... | \n",
" [[scopus author id, 55233589900]] | \n",
" [[information engineering, ph.d., università d... | \n",
" [[research associate, istituto di scienza e te... | \n",
@@ -520,16 +520,16 @@
" 0 | \n",
" 0 | \n",
" 60 | \n",
- " 1 | \n",
+ " True | \n",
" isti.cnr.it | \n",
" NaN | \n",
" [github.io, twitter.com, linkedin.com] | \n",
- " NaN | \n",
- " 3.0 | \n",
- " 1.0 | \n",
- " 5.0 | \n",
- " 4.0 | \n",
- " 5.0 | \n",
+ " <NA> | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 5 | \n",
+ " 4 | \n",
+ " 5 | \n",
"
\n",
" \n",
"\n",
@@ -537,7 +537,7 @@
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
- "3073261 0000-0002-5193-7851 1 1 \n",
+ "3073261 0000-0002-5193-7851 True True \n",
"\n",
" given_names family_name \\\n",
"3073261 andrea mannocci \n",
@@ -549,7 +549,7 @@
"3073261 andrea.mannocci@isti.cnr.it \n",
"\n",
" keywords \\\n",
- "3073261 [research infrastructures, science of science,... \n",
+ "3073261 [open science, data science, research infrastr... \n",
"\n",
" external_ids \\\n",
"3073261 [[scopus author id, 55233589900]] \n",
@@ -567,13 +567,13 @@
"3073261 2017-09-12t14:28:33.467z 2021-03-17t15:40:07.776z 34 0 \n",
"\n",
" n_pmc n_other_pids label primary_email_domain other_email_domains \\\n",
- "3073261 0 60 1 isti.cnr.it NaN \n",
+ "3073261 0 60 True isti.cnr.it NaN \n",
"\n",
" url_domains n_emails n_urls n_ids \\\n",
- "3073261 [github.io, twitter.com, linkedin.com] NaN 3.0 1.0 \n",
+ "3073261 [github.io, twitter.com, linkedin.com] 3 1 \n",
"\n",
" n_keywords n_education n_employment \n",
- "3073261 5.0 4.0 5.0 "
+ "3073261 5 4 5 "
]
},
"execution_count": 6,
@@ -647,13 +647,13 @@
" \n",
" 9887272 | \n",
" 0000-0001-6997-9470 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" other | \n",
" whatsapp | \n",
+ " <NA> | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " <NA> | \n",
" [whatsapp gb apk, whatsapp gb baixar, whatsapp... | \n",
" NaN | \n",
" NaN | \n",
@@ -666,16 +666,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" NaN | \n",
" NaN | \n",
" [otherwhatsapp.com, im-creator.com, facebook.c... | \n",
- " NaN | \n",
- " 27.0 | \n",
- " NaN | \n",
- " 4.0 | \n",
- " NaN | \n",
- " NaN | \n",
+ " <NA> | \n",
+ " 27 | \n",
+ " <NA> | \n",
+ " 4 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
"
\n",
" \n",
"\n",
@@ -683,10 +683,10 @@
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
- "9887272 0000-0001-6997-9470 1 1 \n",
+ "9887272 0000-0001-6997-9470 True True \n",
"\n",
" given_names family_name biography other_names primary_email \\\n",
- "9887272 other whatsapp NaN NaN NaN \n",
+ "9887272 other whatsapp NaN \n",
"\n",
" keywords external_ids \\\n",
"9887272 [whatsapp gb apk, whatsapp gb baixar, whatsapp... NaN \n",
@@ -695,16 +695,16 @@
"9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n",
- "9887272 2020-10-08t02:32:03.935z 0 0 0 0 0 \n",
+ "9887272 2020-10-08t02:32:03.935z 0 0 0 0 False \n",
"\n",
" primary_email_domain other_email_domains \\\n",
"9887272 NaN NaN \n",
"\n",
" url_domains n_emails n_urls \\\n",
- "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... NaN 27.0 \n",
+ "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... 27 \n",
"\n",
" n_ids n_keywords n_education n_employment \n",
- "9887272 NaN 4.0 NaN NaN "
+ "9887272 4 "
]
},
"execution_count": 7,
@@ -778,7 +778,7 @@
"text/plain": [
"count 10989649\n",
"unique 10989649\n",
- "top 0000-0001-7886-4851\n",
+ "top 0000-0002-6749-5031\n",
"freq 1\n",
"Name: orcid, dtype: object"
]
@@ -842,7 +842,7 @@
"5590332 patrick.davey@monash.edu\n",
"9316843 maykin@owasp.org\n",
"10375852 andycheng2026@163.com\n",
- "Name: primary_email, dtype: object"
+ "Name: primary_email, dtype: string"
]
},
"execution_count": 11,
@@ -916,11 +916,11 @@
" \n",
" 7543981 | \n",
" 0000-0002-0836-2271 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" maykin | \n",
" warasart | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" maykin@owasp.org | \n",
" NaN | \n",
@@ -935,25 +935,25 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" owasp.org | \n",
" [dga.or.th] | \n",
" NaN | \n",
- " 1.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
"
\n",
" \n",
" 9316843 | \n",
" 0000-0001-9855-1676 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" maykin | \n",
" warasart | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" maykin@owasp.org | \n",
" NaN | \n",
@@ -968,16 +968,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" owasp.org | \n",
" [dga.or.th, ieee.org] | \n",
" NaN | \n",
- " 2.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " 2 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
"
\n",
" \n",
"\n",
@@ -985,12 +985,12 @@
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
- "7543981 0000-0002-0836-2271 1 1 \n",
- "9316843 0000-0001-9855-1676 1 1 \n",
+ "7543981 0000-0002-0836-2271 True True \n",
+ "9316843 0000-0001-9855-1676 True True \n",
"\n",
" given_names family_name biography other_names primary_email \\\n",
- "7543981 maykin warasart NaN NaN maykin@owasp.org \n",
- "9316843 maykin warasart NaN NaN maykin@owasp.org \n",
+ "7543981 maykin warasart NaN maykin@owasp.org \n",
+ "9316843 maykin warasart NaN maykin@owasp.org \n",
"\n",
" keywords external_ids education employment n_works works_source \\\n",
"7543981 NaN NaN NaN NaN 0 NaN \n",
@@ -1001,16 +1001,16 @@
"9316843 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z 0 0 \n",
"\n",
" n_pmc n_other_pids label primary_email_domain \\\n",
- "7543981 0 0 0 owasp.org \n",
- "9316843 0 0 0 owasp.org \n",
+ "7543981 0 0 False owasp.org \n",
+ "9316843 0 0 False owasp.org \n",
"\n",
" other_email_domains url_domains n_emails n_urls n_ids \\\n",
- "7543981 [dga.or.th] NaN 1.0 NaN NaN \n",
- "9316843 [dga.or.th, ieee.org] NaN 2.0 NaN NaN \n",
+ "7543981 [dga.or.th] NaN 1 \n",
+ "9316843 [dga.or.th, ieee.org] NaN 2 \n",
"\n",
" n_keywords n_education n_employment \n",
- "7543981 NaN NaN NaN \n",
- "9316843 NaN NaN NaN "
+ "7543981 \n",
+ "9316843 "
]
},
"execution_count": 12,
@@ -1084,11 +1084,11 @@
" \n",
" 347852 | \n",
" 0000-0002-2232-9638 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" osman | \n",
" perçin | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" opercin@erbakan.edu.tr | \n",
" NaN | \n",
@@ -1103,25 +1103,25 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" erbakan.edu.tr | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
"
\n",
" \n",
" 1681787 | \n",
" 0000-0003-0033-0918 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" osman | \n",
" perçin | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" opercin@erbakan.edu.tr | \n",
" NaN | \n",
@@ -1136,16 +1136,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" erbakan.edu.tr | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 1 | \n",
"
\n",
" \n",
"\n",
@@ -1153,12 +1153,12 @@
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
- "347852 0000-0002-2232-9638 1 1 \n",
- "1681787 0000-0003-0033-0918 1 1 \n",
+ "347852 0000-0002-2232-9638 True True \n",
+ "1681787 0000-0003-0033-0918 True True \n",
"\n",
" given_names family_name biography other_names primary_email \\\n",
- "347852 osman perçin NaN NaN opercin@erbakan.edu.tr \n",
- "1681787 osman perçin NaN NaN opercin@erbakan.edu.tr \n",
+ "347852 osman perçin NaN opercin@erbakan.edu.tr \n",
+ "1681787 osman perçin NaN opercin@erbakan.edu.tr \n",
"\n",
" keywords external_ids education \\\n",
"347852 NaN NaN NaN \n",
@@ -1173,16 +1173,16 @@
"1681787 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
- "347852 0 0 0 0 0 erbakan.edu.tr \n",
- "1681787 0 0 0 0 0 erbakan.edu.tr \n",
+ "347852 0 0 0 0 False erbakan.edu.tr \n",
+ "1681787 0 0 0 0 False erbakan.edu.tr \n",
"\n",
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
- "347852 NaN NaN NaN NaN NaN NaN \n",
- "1681787 NaN NaN NaN NaN NaN NaN \n",
+ "347852 NaN NaN \n",
+ "1681787 NaN NaN \n",
"\n",
" n_education n_employment \n",
- "347852 NaN NaN \n",
- "1681787 NaN 1.0 "
+ "347852 \n",
+ "1681787 1 "
]
},
"execution_count": 13,
@@ -1256,14 +1256,14 @@
" \n",
" 954085 | \n",
" 0000-0002-9158-1757 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" patrick | \n",
" davey | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" patrick.davey@monash.edu | \n",
- " [radiochemistry, radiopharmaceuticals, inorgan... | \n",
+ " [radiochemistry, radiopharmaceuticals, bioinor... | \n",
" NaN | \n",
" NaN | \n",
" [[phd student, monash university, melbourne, ,... | \n",
@@ -1275,25 +1275,25 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 0 | \n",
+ " False | \n",
" monash.edu | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 4.0 | \n",
- " NaN | \n",
- " 1.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 4 | \n",
+ " <NA> | \n",
+ " 1 | \n",
"
\n",
" \n",
" 5590332 | \n",
" 0000-0002-8774-0030 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" patrick | \n",
" davey | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" patrick.davey@monash.edu | \n",
" NaN | \n",
@@ -1308,16 +1308,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 1 | \n",
+ " True | \n",
" monash.edu | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 1 | \n",
"
\n",
" \n",
"\n",
@@ -1325,19 +1325,19 @@
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
- "954085 0000-0002-9158-1757 1 1 \n",
- "5590332 0000-0002-8774-0030 1 1 \n",
+ "954085 0000-0002-9158-1757 True True \n",
+ "5590332 0000-0002-8774-0030 True True \n",
"\n",
" given_names family_name biography other_names \\\n",
- "954085 patrick davey NaN NaN \n",
- "5590332 patrick davey NaN NaN \n",
+ "954085 patrick davey NaN \n",
+ "5590332 patrick davey NaN \n",
"\n",
" primary_email \\\n",
"954085 patrick.davey@monash.edu \n",
"5590332 patrick.davey@monash.edu \n",
"\n",
" keywords external_ids \\\n",
- "954085 [radiochemistry, radiopharmaceuticals, inorgan... NaN \n",
+ "954085 [radiochemistry, radiopharmaceuticals, bioinor... NaN \n",
"5590332 NaN NaN \n",
"\n",
" education employment n_works \\\n",
@@ -1349,16 +1349,16 @@
"5590332 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n",
"\n",
" n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
- "954085 0 0 0 0 0 monash.edu \n",
- "5590332 1 0 0 0 1 monash.edu \n",
+ "954085 0 0 0 0 False monash.edu \n",
+ "5590332 1 0 0 0 True monash.edu \n",
"\n",
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
- "954085 NaN NaN NaN NaN NaN 4.0 \n",
- "5590332 NaN NaN NaN NaN NaN NaN \n",
+ "954085 NaN NaN 4 \n",
+ "5590332 NaN NaN \n",
"\n",
" n_education n_employment \n",
- "954085 NaN 1.0 \n",
- "5590332 NaN 1.0 "
+ "954085 1 \n",
+ "5590332 1 "
]
},
"execution_count": 14,
@@ -2413,9 +2413,9 @@
}
},
"text/html": [
- ""
- ]
- },
- "metadata": {},
- "output_type": "display_data"
+ "ename": "ValueError",
+ "evalue": "Cannot set non-string value '-1' into a StringArray.",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mfig\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpx\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mimshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcorr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshow\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit, downcast)\u001b[0m\n\u001b[1;32m 4460\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4461\u001b[0m ) -> Optional[DataFrame]:\n\u001b[0;32m-> 4462\u001b[0;31m return super().fillna(\n\u001b[0m\u001b[1;32m 4463\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4464\u001b[0m \u001b[0mmethod\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mmethod\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, axis, inplace, limit, downcast)\u001b[0m\n\u001b[1;32m 6427\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6428\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mis_list_like\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 6429\u001b[0;31m new_data = self._mgr.fillna(\n\u001b[0m\u001b[1;32m 6430\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdowncast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6431\u001b[0m )\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, limit, inplace, downcast)\u001b[0m\n\u001b[1;32m 619\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 620\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0;34m\"BlockManager\"\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 621\u001b[0;31m return self.apply(\n\u001b[0m\u001b[1;32m 622\u001b[0m \u001b[0;34m\"fillna\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0minplace\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdowncast\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 623\u001b[0m )\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/managers.py\u001b[0m in \u001b[0;36mapply\u001b[0;34m(self, f, align_keys, ignore_failures, **kwargs)\u001b[0m\n\u001b[1;32m 425\u001b[0m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mb\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 426\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 427\u001b[0;31m \u001b[0mapplied\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 428\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mTypeError\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mNotImplementedError\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 429\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mignore_failures\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/internals/blocks.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, limit, inplace, downcast)\u001b[0m\n\u001b[1;32m 1916\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0minplace\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdowncast\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mNone\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1917\u001b[0m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0minplace\u001b[0m \u001b[0;32melse\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1918\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalues\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlimit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mlimit\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1919\u001b[0m return [\n\u001b[1;32m 1920\u001b[0m self.make_block_same_class(\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/_mixins.py\u001b[0m in \u001b[0;36mfillna\u001b[0;34m(self, value, method, limit)\u001b[0m\n\u001b[1;32m 260\u001b[0m \u001b[0;31m# fill with value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 261\u001b[0m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 262\u001b[0;31m \u001b[0mnew_values\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mmask\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 263\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 264\u001b[0m \u001b[0mnew_values\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcopy\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+ "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/string_.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 273\u001b[0m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mStringDtype\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mna_value\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 274\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 275\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 276\u001b[0m \u001b[0;34mf\"Cannot set non-string value '{value}' into a StringArray.\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 277\u001b[0m )\n",
+ "\u001b[0;31mValueError\u001b[0m: Cannot set non-string value '-1' into a StringArray."
+ ]
}
],
"source": [
- "fig = px.imshow(df.fillna(-1).corr())\n",
+ "fig = px.imshow(df[[]].fillna(-1).corr())\n",
"fig.show()"
]
},
{
"cell_type": "code",
- "execution_count": 69,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "df[['verified_email', \n",
- " 'verified_primary_email', \n",
- " 'n_works', \n",
- " 'n_doi',\n",
- " 'n_arxiv', \n",
- " 'n_pmc', \n",
- " 'n_other_pids', \n",
- " 'n_emails', \n",
- " 'n_urls', \n",
- " 'n_ids', \n",
- " 'n_keywords', \n",
- " 'n_employment', \n",
- " 'n_education', \n",
- " 'label']].to_pickle('../data/processed/features.pkl')"
+ "# df[['verified_email', \n",
+ "# 'verified_primary_email', \n",
+ "# 'n_works', \n",
+ "# 'n_doi',\n",
+ "# 'n_arxiv', \n",
+ "# 'n_pmc', \n",
+ "# 'n_other_pids', \n",
+ "# 'n_emails', \n",
+ "# 'n_urls', \n",
+ "# 'n_ids', \n",
+ "# 'n_keywords', \n",
+ "# 'n_employment', \n",
+ "# 'n_education', \n",
+ "# 'label']].to_pickle('../data/processed/features.pkl')"
]
},
{
@@ -17950,13 +16684,13 @@
" \n",
" 17 | \n",
" 0000-0002-0137-3066 | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
+ " True | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -17969,16 +16703,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" False | \n",
@@ -17987,13 +16721,13 @@
"
\n",
" 19 | \n",
" 0000-0002-0461-9711 | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
+ " True | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -18006,16 +16740,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" False | \n",
@@ -18024,13 +16758,13 @@
"
\n",
" 22 | \n",
" 0000-0002-0761-9450 | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
+ " True | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -18043,16 +16777,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" False | \n",
@@ -18061,13 +16795,13 @@
"
\n",
" 33 | \n",
" 0000-0002-4447-9215 | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
+ " True | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -18080,16 +16814,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" False | \n",
@@ -18098,13 +16832,13 @@
"
\n",
" 44 | \n",
" 0000-0003-0426-4065 | \n",
- " 1 | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " True | \n",
+ " True | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" [eliza i. gilbert] | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -18117,18 +16851,18 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
- " NaN | \n",
- " NaN | \n",
" False | \n",
" NaN | \n",
"
\n",
@@ -18172,14 +16906,14 @@
" \n",
" 10989636 | \n",
" 0000-0002-2906-0299 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" tiffany | \n",
" mackay | \n",
- " NaN | \n",
+ " <NA> | \n",
" [tiffany russel sia] | \n",
- " NaN | \n",
- " [prostate cancer, oxytocin, radiolabelling, ga... | \n",
+ " <NA> | \n",
+ " [gpc-1, cancer, immunohistochemistry, clinical... | \n",
" [[researcherid, a-2121-2017]] | \n",
" [[faculty of medicine, master in pharmaceutica... | \n",
" [[clinical project lead, minomic international... | \n",
@@ -18191,16 +16925,16 @@
" 0 | \n",
" 0 | \n",
" 0 | \n",
- " 1 | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" [oxytocin.com.au, linkedin.com] | \n",
- " NaN | \n",
- " 2.0 | \n",
- " 1.0 | \n",
- " 13.0 | \n",
- " 2.0 | \n",
- " 4.0 | \n",
+ " <NA> | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 13 | \n",
+ " 2 | \n",
+ " 4 | \n",
" [crossref, researcherid] | \n",
" 2.0 | \n",
" True | \n",
@@ -18209,13 +16943,13 @@
"
\n",
" 10989637 | \n",
" 0000-0001-5896-2024 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" giovanni, l | \n",
" tiscia | \n",
+ " <NA> | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" [[scopus author id, 54948242800]] | \n",
" NaN | \n",
@@ -18228,16 +16962,16 @@
" 0 | \n",
" 17 | \n",
" 52 | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" [scopus - elsevier, europe pubmed central, cro... | \n",
" 3.0 | \n",
" True | \n",
@@ -18246,13 +16980,13 @@
"
\n",
" 10989640 | \n",
" 0000-0002-1070-2220 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" vinicios | \n",
" santanna | \n",
- " NaN | \n",
+ " <NA> | \n",
" [vinicios sant anna, vinicios sant anna, vinic... | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" [[scopus author id, 57201697952]] | \n",
" [[economics, ph.d., university of illinois at ... | \n",
@@ -18265,16 +16999,16 @@
" 0 | \n",
" 0 | \n",
" 1 | \n",
- " 1 | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" [vpsantanna.com] | \n",
- " NaN | \n",
- " 1.0 | \n",
- " 1.0 | \n",
- " NaN | \n",
- " 2.0 | \n",
- " NaN | \n",
+ " <NA> | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ " 2 | \n",
+ " <NA> | \n",
" [crossref metadata search] | \n",
" 1.0 | \n",
" True | \n",
@@ -18283,13 +17017,13 @@
"
\n",
" 10989643 | \n",
" 0000-0003-2606-0936 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" luang | \n",
" xu | \n",
- " NaN | \n",
+ " <NA> | \n",
" [xu lu-ang, lu lu] | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
@@ -18302,16 +17036,16 @@
" 0 | \n",
" 0 | \n",
" 1 | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
" 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1.0 | \n",
" [scopus - elsevier, crossref] | \n",
" 2.0 | \n",
" True | \n",
@@ -18320,13 +17054,13 @@
"
\n",
" 10989645 | \n",
" 0000-0002-3800-6331 | \n",
- " 1 | \n",
- " 1 | \n",
+ " True | \n",
+ " True | \n",
" zachary | \n",
" calamari | \n",
+ " <NA> | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
+ " <NA> | \n",
" NaN | \n",
" NaN | \n",
" [[richard gilder graduate school, phd in compa... | \n",
@@ -18339,16 +17073,16 @@
" 0 | \n",
" 1 | \n",
" 0 | \n",
- " 1 | \n",
+ " True | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 2.0 | \n",
- " 2.0 | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 2 | \n",
+ " 2 | \n",
" [crossref metadata search, crossref] | \n",
" 2.0 | \n",
" True | \n",
@@ -18361,43 +17095,43 @@
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
- "17 0000-0002-0137-3066 1 1 \n",
- "19 0000-0002-0461-9711 1 1 \n",
- "22 0000-0002-0761-9450 1 1 \n",
- "33 0000-0002-4447-9215 1 1 \n",
- "44 0000-0003-0426-4065 1 1 \n",
+ "17 0000-0002-0137-3066 True True \n",
+ "19 0000-0002-0461-9711 True True \n",
+ "22 0000-0002-0761-9450 True True \n",
+ "33 0000-0002-4447-9215 True True \n",
+ "44 0000-0003-0426-4065 True True \n",
"... ... ... ... \n",
- "10989636 0000-0002-2906-0299 1 1 \n",
- "10989637 0000-0001-5896-2024 1 1 \n",
- "10989640 0000-0002-1070-2220 1 1 \n",
- "10989643 0000-0003-2606-0936 1 1 \n",
- "10989645 0000-0002-3800-6331 1 1 \n",
+ "10989636 0000-0002-2906-0299 True True \n",
+ "10989637 0000-0001-5896-2024 True True \n",
+ "10989640 0000-0002-1070-2220 True True \n",
+ "10989643 0000-0003-2606-0936 True True \n",
+ "10989645 0000-0002-3800-6331 True True \n",
"\n",
" given_names family_name biography \\\n",
- "17 NaN NaN NaN \n",
- "19 NaN NaN NaN \n",
- "22 NaN NaN NaN \n",
- "33 NaN NaN NaN \n",
- "44 NaN NaN NaN \n",
+ "17 \n",
+ "19 \n",
+ "22 \n",
+ "33 \n",
+ "44 \n",
"... ... ... ... \n",
- "10989636 tiffany mackay NaN \n",
- "10989637 giovanni, l tiscia NaN \n",
- "10989640 vinicios santanna NaN \n",
- "10989643 luang xu NaN \n",
- "10989645 zachary calamari NaN \n",
+ "10989636 tiffany mackay \n",
+ "10989637 giovanni, l tiscia \n",
+ "10989640 vinicios santanna \n",
+ "10989643 luang xu \n",
+ "10989645 zachary calamari \n",
"\n",
" other_names primary_email \\\n",
- "17 NaN NaN \n",
- "19 NaN NaN \n",
- "22 NaN NaN \n",
- "33 NaN NaN \n",
- "44 [eliza i. gilbert] NaN \n",
+ "17 NaN \n",
+ "19 NaN \n",
+ "22 NaN \n",
+ "33 NaN \n",
+ "44 [eliza i. gilbert] \n",
"... ... ... \n",
- "10989636 [tiffany russel sia] NaN \n",
- "10989637 NaN NaN \n",
- "10989640 [vinicios sant anna, vinicios sant anna, vinic... NaN \n",
- "10989643 [xu lu-ang, lu lu] NaN \n",
- "10989645 NaN NaN \n",
+ "10989636 [tiffany russel sia] \n",
+ "10989637 NaN \n",
+ "10989640 [vinicios sant anna, vinicios sant anna, vinic... \n",
+ "10989643 [xu lu-ang, lu lu] \n",
+ "10989645 NaN \n",
"\n",
" keywords \\\n",
"17 NaN \n",
@@ -18406,7 +17140,7 @@
"33 NaN \n",
"44 NaN \n",
"... ... \n",
- "10989636 [prostate cancer, oxytocin, radiolabelling, ga... \n",
+ "10989636 [gpc-1, cancer, immunohistochemistry, clinical... \n",
"10989637 NaN \n",
"10989640 NaN \n",
"10989643 NaN \n",
@@ -18478,43 +17212,43 @@
"10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n",
"\n",
" n_pmc n_other_pids label primary_email_domain other_email_domains \\\n",
- "17 0 0 1 NaN NaN \n",
- "19 0 0 1 NaN NaN \n",
- "22 0 0 1 NaN NaN \n",
- "33 0 0 1 NaN NaN \n",
- "44 0 0 1 NaN NaN \n",
+ "17 0 0 True NaN NaN \n",
+ "19 0 0 True NaN NaN \n",
+ "22 0 0 True NaN NaN \n",
+ "33 0 0 True NaN NaN \n",
+ "44 0 0 True NaN NaN \n",
"... ... ... ... ... ... \n",
- "10989636 0 0 1 NaN NaN \n",
- "10989637 17 52 1 NaN NaN \n",
- "10989640 0 1 1 NaN NaN \n",
- "10989643 0 1 1 NaN NaN \n",
- "10989645 1 0 1 NaN NaN \n",
+ "10989636 0 0 True NaN NaN \n",
+ "10989637 17 52 True NaN NaN \n",
+ "10989640 0 1 True NaN NaN \n",
+ "10989643 0 1 True NaN NaN \n",
+ "10989645 1 0 True NaN NaN \n",
"\n",
" url_domains n_emails n_urls n_ids \\\n",
- "17 NaN NaN NaN NaN \n",
- "19 NaN NaN NaN NaN \n",
- "22 NaN NaN NaN NaN \n",
- "33 NaN NaN NaN NaN \n",
- "44 NaN NaN NaN NaN \n",
+ "17 NaN \n",
+ "19 NaN \n",
+ "22 NaN \n",
+ "33 NaN \n",
+ "44 NaN \n",
"... ... ... ... ... \n",
- "10989636 [oxytocin.com.au, linkedin.com] NaN 2.0 1.0 \n",
- "10989637 NaN NaN NaN 1.0 \n",
- "10989640 [vpsantanna.com] NaN 1.0 1.0 \n",
- "10989643 NaN NaN NaN NaN \n",
- "10989645 NaN NaN NaN NaN \n",
+ "10989636 [oxytocin.com.au, linkedin.com] 2 1 \n",
+ "10989637 NaN 1 \n",
+ "10989640 [vpsantanna.com] 1 1 \n",
+ "10989643 NaN \n",
+ "10989645 NaN \n",
"\n",
" n_keywords n_education n_employment \\\n",
- "17 NaN NaN NaN \n",
- "19 NaN NaN NaN \n",
- "22 NaN NaN NaN \n",
- "33 NaN NaN NaN \n",
- "44 NaN NaN 1.0 \n",
+ "17 \n",
+ "19 \n",
+ "22 \n",
+ "33 \n",
+ "44 1 \n",
"... ... ... ... \n",
- "10989636 13.0 2.0 4.0 \n",
- "10989637 NaN NaN NaN \n",
- "10989640 NaN 2.0 NaN \n",
- "10989643 NaN NaN 1.0 \n",
- "10989645 NaN 2.0 2.0 \n",
+ "10989636 13 2 4 \n",
+ "10989637 \n",
+ "10989640 2 \n",
+ "10989643 1 \n",
+ "10989645 2 2 \n",
"\n",
" ext_works_source \\\n",
"17 NaN \n",
@@ -18556,182 +17290,19 @@
},
{
"cell_type": "code",
- "execution_count": 105,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 10989649 entries, 0 to 10989648\n",
- "Data columns (total 30 columns):\n",
- " # Column Dtype \n",
- "--- ------ ----- \n",
- " 0 orcid string\n",
- " 1 verified_email bool \n",
- " 2 verified_primary_email bool \n",
- " 3 given_names string\n",
- " 4 family_name string\n",
- " 5 biography string\n",
- " 6 other_names object\n",
- " 7 primary_email string\n",
- " 8 keywords object\n",
- " 9 external_ids object\n",
- " 10 education object\n",
- " 11 employment object\n",
- " 12 n_works int64 \n",
- " 13 works_source object\n",
- " 14 activation_date string\n",
- " 15 last_update_date string\n",
- " 16 n_doi int64 \n",
- " 17 n_arxiv int64 \n",
- " 18 n_pmc int64 \n",
- " 19 n_other_pids int64 \n",
- " 20 label int64 \n",
- " 21 primary_email_domain object\n",
- " 22 other_email_domains object\n",
- " 23 url_domains object\n",
- " 24 n_emails UInt16\n",
- " 25 n_urls UInt16\n",
- " 26 n_ids UInt16\n",
- " 27 n_keywords UInt16\n",
- " 28 n_education UInt16\n",
- " 29 n_employment UInt16\n",
- "dtypes: UInt16(6), bool(2), int64(6), object(9), string(7)\n",
- "memory usage: 2.0+ GB\n"
- ]
- }
- ],
- "source": [
- "# (df.n_works > 0) & (df.n_ids > 1)\n",
- "df.info()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 104,
+ "execution_count": null,
"metadata": {},
"outputs": [],
"source": [
- "df.n_ids = df.n_ids.astype(pd.UInt16Dtype())"
+ "# (df.n_works > 0) & (df.n_ids > 1)"
]
},
{
"cell_type": "code",
- "execution_count": 107,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0 2016-07-27 10:09:13.585000+00:00\n",
- "1 2016-07-27 10:09:13.585000+00:00\n",
- "2 NaT\n",
- "3 2016-07-27 10:09:13.585000+00:00\n",
- "dtype: datetime64[ns, UTC]"
- ]
- },
- "execution_count": 107,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "pd.Series(['2016-07-27t10:09:13.585z', '2016-07-27t10:09:13.585z', pd.NA, '2016-07-27t10:09:13.585z'])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 108,
- "metadata": {},
- "outputs": [
- {
- "ename": "KeyboardInterrupt",
- "evalue": "",
- "output_type": "error",
- "traceback": [
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
- "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_datetime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mactivation_date\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
- "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/tools/datetimes.py\u001b[0m in \u001b[0;36mto_datetime\u001b[0;34m(arg, errors, dayfirst, yearfirst, utc, format, exact, unit, infer_datetime_format, origin, cache)\u001b[0m\n\u001b[1;32m 803\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcache_array\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 804\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 805\u001b[0;31m \u001b[0mvalues\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconvert_listlike\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_values\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 806\u001b[0m \u001b[0mresult\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_constructor\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mindex\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mname\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mname\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 807\u001b[0m \u001b[0;32melif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mABCDataFrame\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mabc\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mMutableMapping\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/tools/datetimes.py\u001b[0m in \u001b[0;36m_convert_listlike_datetimes\u001b[0;34m(arg, format, name, tz, unit, errors, infer_datetime_format, dayfirst, yearfirst, exact)\u001b[0m\n\u001b[1;32m 463\u001b[0m \u001b[0;32massert\u001b[0m \u001b[0mformat\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mor\u001b[0m \u001b[0minfer_datetime_format\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 464\u001b[0m \u001b[0mutc\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtz\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m\"utc\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 465\u001b[0;31m result, tz_parsed = objects_to_datetime64ns(\n\u001b[0m\u001b[1;32m 466\u001b[0m \u001b[0marg\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 467\u001b[0m \u001b[0mdayfirst\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdayfirst\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m~/.virtualenvs/data-science/lib/python3.8/site-packages/pandas/core/arrays/datetimes.py\u001b[0m in \u001b[0;36mobjects_to_datetime64ns\u001b[0;34m(data, dayfirst, yearfirst, utc, errors, require_iso8601, allow_object)\u001b[0m\n\u001b[1;32m 2073\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2074\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 2075\u001b[0;31m result, tz_parsed = tslib.array_to_datetime(\n\u001b[0m\u001b[1;32m 2076\u001b[0m \u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2077\u001b[0m \u001b[0merrors\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0merrors\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32mpandas/_libs/tslib.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslib.array_to_datetime\u001b[0;34m()\u001b[0m\n",
- "\u001b[0;32mpandas/_libs/tslib.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslib.array_to_datetime\u001b[0;34m()\u001b[0m\n",
- "\u001b[0;32mpandas/_libs/tslibs/parsing.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslibs.parsing.parse_datetime_string\u001b[0;34m()\u001b[0m\n",
- "\u001b[0;32mpandas/_libs/tslibs/parsing.pyx\u001b[0m in \u001b[0;36mpandas._libs.tslibs.parsing._parse_dateabbr_string\u001b[0;34m()\u001b[0m\n",
- "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/_strptime.py\u001b[0m in \u001b[0;36m_strptime_datetime\u001b[0;34m(cls, data_string, format)\u001b[0m\n\u001b[1;32m 566\u001b[0m \"\"\"Return a class cls instance based on the input string and the\n\u001b[1;32m 567\u001b[0m format string.\"\"\"\n\u001b[0;32m--> 568\u001b[0;31m \u001b[0mtt\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfraction\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgmtoff_fraction\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_strptime\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mformat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 569\u001b[0m \u001b[0mtzname\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mgmtoff\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 570\u001b[0m \u001b[0margs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtt\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m6\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mfraction\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;32m/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/_strptime.py\u001b[0m in \u001b[0;36m_strptime\u001b[0;34m(data_string, format)\u001b[0m\n\u001b[1;32m 347\u001b[0m \u001b[0mfound\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mformat_regex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmatch\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 348\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 349\u001b[0;31m raise ValueError(\"time data %r does not match format %r\" %\n\u001b[0m\u001b[1;32m 350\u001b[0m (data_string, format))\n\u001b[1;32m 351\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata_string\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mfound\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
- "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
- ]
- }
- ],
- "source": [
- "pd.to_datetime(df.activation_date)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 109,
+ "execution_count": null,
"metadata": {},
"outputs": [],
- "source": [
- "df['label'] = df['label'].astype('bool')\n"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 110,
- "metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "\n",
- "RangeIndex: 10989649 entries, 0 to 10989648\n",
- "Data columns (total 30 columns):\n",
- " # Column Dtype \n",
- "--- ------ ----- \n",
- " 0 orcid string\n",
- " 1 verified_email bool \n",
- " 2 verified_primary_email bool \n",
- " 3 given_names string\n",
- " 4 family_name string\n",
- " 5 biography string\n",
- " 6 other_names object\n",
- " 7 primary_email string\n",
- " 8 keywords object\n",
- " 9 external_ids object\n",
- " 10 education object\n",
- " 11 employment object\n",
- " 12 n_works int64 \n",
- " 13 works_source object\n",
- " 14 activation_date string\n",
- " 15 last_update_date string\n",
- " 16 n_doi int64 \n",
- " 17 n_arxiv int64 \n",
- " 18 n_pmc int64 \n",
- " 19 n_other_pids int64 \n",
- " 20 label bool \n",
- " 21 primary_email_domain object\n",
- " 22 other_email_domains object\n",
- " 23 url_domains object\n",
- " 24 n_emails UInt16\n",
- " 25 n_urls UInt16\n",
- " 26 n_ids UInt16\n",
- " 27 n_keywords UInt16\n",
- " 28 n_education UInt16\n",
- " 29 n_employment UInt16\n",
- "dtypes: UInt16(6), bool(3), int64(5), object(9), string(7)\n",
- "memory usage: 1.9+ GB\n"
- ]
- }
- ],
- "source": [
- "df.info()"
- ]
+ "source": []
},
{
"cell_type": "code",
diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
index 3861211..d75e6a1 100644
--- a/src/data/make_dataset.py
+++ b/src/data/make_dataset.py
@@ -58,11 +58,11 @@ def main(input_filepath, output_filepath):
'primary_email': 'string',
'activation_date': 'string',
'last_update_date': 'string',
- 'n_works': pd.UInt16Dtype(),
- 'n_doi': pd.UInt16Dtype(),
- 'n_arxiv': pd.UInt16Dtype(),
- 'n_pmc': pd.UInt16Dtype(),
- 'n_other_pids': pd.UInt16Dtype()})
+ 'n_works': pd.Int16Dtype(),
+ 'n_doi': pd.Int16Dtype(),
+ 'n_arxiv': pd.Int16Dtype(),
+ 'n_pmc': pd.Int16Dtype(),
+ 'n_other_pids': pd.Int16Dtype()})
# parse_dates=['activation_date', 'last_update_date'])
logger.info('Loading list columns')
@@ -105,22 +105,22 @@ def main(input_filepath, output_filepath):
logger.info('Creating simple numeric columns')
df['n_emails'] = df.other_emails.str.len()
- df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
+ df.n_emails = df.n_emails.astype(pd.Int16Dtype())
df['n_urls'] = df.url_domains.str.len()
- df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
+ df.n_urls = df.n_urls.astype(pd.Int16Dtype())
df['n_ids'] = df.external_ids.str.len()
- df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
+ df.n_ids = df.n_ids.astype(pd.Int16Dtype())
df['n_keywords'] = df.keywords.str.len()
- df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
+ df.n_keywords = df.n_keywords.astype(pd.Int16Dtype())
df['n_education'] = df.education.str.len()
- df.n_education = df.n_education.astype(pd.UInt16Dtype())
+ df.n_education = df.n_education.astype(pd.Int16Dtype())
df['n_employment'] = df.employment.str.len()
- df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
+ df.n_employment = df.n_employment.astype(pd.Int16Dtype())
logger.info('Dropping useless columns')
df = df.drop(['urls', 'other_emails'], axis=1)