From 51e479c28768bebab41d927871ca497ad3f42bc3 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Tue, 30 Mar 2021 17:39:05 +0200 Subject: [PATCH] added study of education and employment --- notebooks/01-Exploration.ipynb | 4830 ++++++++++++++++++++++++++++++-- 1 file changed, 4539 insertions(+), 291 deletions(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index ea30eb8..a807910 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -508,7 +508,7 @@ " data scientist & researcher; scholarly knowled...\n", " NaN\n", " andrea.mannocci@isti.cnr.it\n", - " [open science, data science, research infrastr...\n", + " [open science, data science, science of scienc...\n", " [[scopus author id, 55233589900]]\n", " [[information engineering, ph.d., università d...\n", " [[research associate, istituto di scienza e te...\n", @@ -549,7 +549,7 @@ "3073261 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", - "3073261 [open science, data science, research infrastr... \n", + "3073261 [open science, data science, science of scienc... \n", "\n", " external_ids \\\n", "3073261 [[scopus author id, 55233589900]] \n", @@ -654,7 +654,7 @@ " <NA>\n", " NaN\n", " <NA>\n", - " [whatsapp gb apk, whatsapp gb baixar, whatsapp...\n", + " [whatsapp gb baixar, whatsapp gb 2020, whatsap...\n", " NaN\n", " NaN\n", " NaN\n", @@ -689,7 +689,7 @@ "9887272 other whatsapp NaN \n", "\n", " keywords external_ids \\\n", - "9887272 [whatsapp gb apk, whatsapp gb baixar, whatsapp... NaN \n", + "9887272 [whatsapp gb baixar, whatsapp gb 2020, whatsap... NaN \n", "\n", " education employment n_works works_source activation_date \\\n", "9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n", @@ -778,7 +778,7 @@ "text/plain": [ "count 10989649\n", "unique 10989649\n", - "top 0000-0002-6749-5031\n", + "top 0000-0001-5242-3687\n", "freq 1\n", "Name: orcid, dtype: object" ] @@ -807,10 +807,10 @@ { "data": { "text/plain": [ - "count 124722\n", - "unique 124718\n", - "top maykin@owasp.org\n", - "freq 2\n", + "count 124722\n", + "unique 124718\n", + "top opercin@erbakan.edu.tr\n", + "freq 2\n", "Name: primary_email, dtype: object" ] }, @@ -1263,7 +1263,7 @@ " <NA>\n", " NaN\n", " patrick.davey@monash.edu\n", - " [radiochemistry, radiopharmaceuticals, bioinor...\n", + " [radiochemistry, inorganic chemistry, bioinorg...\n", " NaN\n", " NaN\n", " [[phd student, monash university, melbourne, ,...\n", @@ -1337,7 +1337,7 @@ "5590332 patrick.davey@monash.edu \n", "\n", " keywords external_ids \\\n", - "954085 [radiochemistry, radiopharmaceuticals, bioinor... NaN \n", + "954085 [radiochemistry, inorganic chemistry, bioinorg... NaN \n", "5590332 NaN NaN \n", "\n", " education employment n_works \\\n", @@ -2413,9 +2413,9 @@ } }, "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "fig = px.imshow(df[[]].fillna(-1).corr())\n", + "fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())\n", "fig.show()" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 74, "metadata": {}, "outputs": [], "source": [ @@ -16620,7 +20831,7 @@ }, { "cell_type": "code", - "execution_count": 70, + "execution_count": 75, "metadata": {}, "outputs": [ { @@ -16904,6 +21115,43 @@ " ...\n", " \n", " \n", + " 10989635\n", + " 0000-0002-7340-9697\n", + " True\n", + " True\n", + " tawanda\n", + " marandure\n", + " <NA>\n", + " NaN\n", + " <NA>\n", + " NaN\n", + " [[scopus author id, 48261373600]]\n", + " [[animal science, msc sustainable agriculture,...\n", + " [[lecturer, zimbabwe open university faculty o...\n", + " 7\n", + " [scopus - elsevier]\n", + " 2015-11-05t08:52:08.743z\n", + " 2020-12-09t17:59:18.350z\n", + " 7\n", + " 0\n", + " 0\n", + " 7\n", + " True\n", + " NaN\n", + " NaN\n", + " NaN\n", + " <NA>\n", + " <NA>\n", + " 1\n", + " <NA>\n", + " 3\n", + " 3\n", + " [scopus - elsevier]\n", + " 1.0\n", + " True\n", + " NaN\n", + " \n", + " \n", " 10989636\n", " 0000-0002-2906-0299\n", " True\n", @@ -16913,7 +21161,7 @@ " <NA>\n", " [tiffany russel sia]\n", " <NA>\n", - " [gpc-1, cancer, immunohistochemistry, clinical...\n", + " [microfluidics, gpc-1, gallium-67, pet/ct, oxy...\n", " [[researcherid, a-2121-2017]]\n", " [[faculty of medicine, master in pharmaceutica...\n", " [[clinical project lead, minomic international...\n", @@ -16978,43 +21226,6 @@ " NaN\n", " \n", " \n", - " 10989640\n", - " 0000-0002-1070-2220\n", - " True\n", - " True\n", - " vinicios\n", - " santanna\n", - " <NA>\n", - " [vinicios sant anna, vinicios sant anna, vinic...\n", - " <NA>\n", - " NaN\n", - " [[scopus author id, 57201697952]]\n", - " [[economics, ph.d., university of illinois at ...\n", - " NaN\n", - " 1\n", - " [crossref metadata search]\n", - " 2016-03-19t21:24:42.821z\n", - " 2020-12-10t16:34:09.722z\n", - " 1\n", - " 0\n", - " 0\n", - " 1\n", - " True\n", - " NaN\n", - " NaN\n", - " [vpsantanna.com]\n", - " <NA>\n", - " 1\n", - " 1\n", - " <NA>\n", - " 2\n", - " <NA>\n", - " [crossref metadata search]\n", - " 1.0\n", - " True\n", - " NaN\n", - " \n", - " \n", " 10989643\n", " 0000-0003-2606-0936\n", " True\n", @@ -17090,7 +21301,7 @@ " \n", " \n", "\n", - "

2664886 rows × 34 columns

\n", + "

2075872 rows × 34 columns

\n", "" ], "text/plain": [ @@ -17101,50 +21312,37 @@ "33 0000-0002-4447-9215 True True \n", "44 0000-0003-0426-4065 True True \n", "... ... ... ... \n", + "10989635 0000-0002-7340-9697 True True \n", "10989636 0000-0002-2906-0299 True True \n", "10989637 0000-0001-5896-2024 True True \n", - "10989640 0000-0002-1070-2220 True True \n", "10989643 0000-0003-2606-0936 True True \n", "10989645 0000-0002-3800-6331 True True \n", "\n", - " given_names family_name biography \\\n", - "17 \n", - "19 \n", - "22 \n", - "33 \n", - "44 \n", - "... ... ... ... \n", - "10989636 tiffany mackay \n", - "10989637 giovanni, l tiscia \n", - "10989640 vinicios santanna \n", - "10989643 luang xu \n", - "10989645 zachary calamari \n", + " given_names family_name biography other_names \\\n", + "17 NaN \n", + "19 NaN \n", + "22 NaN \n", + "33 NaN \n", + "44 [eliza i. gilbert] \n", + "... ... ... ... ... \n", + "10989635 tawanda marandure NaN \n", + "10989636 tiffany mackay [tiffany russel sia] \n", + "10989637 giovanni, l tiscia NaN \n", + "10989643 luang xu [xu lu-ang, lu lu] \n", + "10989645 zachary calamari NaN \n", "\n", - " other_names primary_email \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 [eliza i. gilbert] \n", - "... ... ... \n", - "10989636 [tiffany russel sia] \n", - "10989637 NaN \n", - "10989640 [vinicios sant anna, vinicios sant anna, vinic... \n", - "10989643 [xu lu-ang, lu lu] \n", - "10989645 NaN \n", - "\n", - " keywords \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 NaN \n", - "... ... \n", - "10989636 [gpc-1, cancer, immunohistochemistry, clinical... \n", - "10989637 NaN \n", - "10989640 NaN \n", - "10989643 NaN \n", - "10989645 NaN \n", + " primary_email keywords \\\n", + "17 NaN \n", + "19 NaN \n", + "22 NaN \n", + "33 NaN \n", + "44 NaN \n", + "... ... ... \n", + "10989635 NaN \n", + "10989636 [microfluidics, gpc-1, gallium-67, pet/ct, oxy... \n", + "10989637 NaN \n", + "10989643 NaN \n", + "10989645 NaN \n", "\n", " external_ids \\\n", "17 NaN \n", @@ -17153,9 +21351,9 @@ "33 NaN \n", "44 NaN \n", "... ... \n", + "10989635 [[scopus author id, 48261373600]] \n", "10989636 [[researcherid, a-2121-2017]] \n", "10989637 [[scopus author id, 54948242800]] \n", - "10989640 [[scopus author id, 57201697952]] \n", "10989643 NaN \n", "10989645 NaN \n", "\n", @@ -17166,9 +21364,9 @@ "33 NaN \n", "44 NaN \n", "... ... \n", + "10989635 [[animal science, msc sustainable agriculture,... \n", "10989636 [[faculty of medicine, master in pharmaceutica... \n", "10989637 NaN \n", - "10989640 [[economics, ph.d., university of illinois at ... \n", "10989643 NaN \n", "10989645 [[richard gilder graduate school, phd in compa... \n", "\n", @@ -17179,9 +21377,9 @@ "33 NaN 0 \n", "44 [[, us fish and wildlife service, albuquerque,... 0 \n", "... ... ... \n", + "10989635 [[lecturer, zimbabwe open university faculty o... 7 \n", "10989636 [[clinical project lead, minomic international... 11 \n", "10989637 NaN 70 \n", - "10989640 NaN 1 \n", "10989643 [[post-doc, institute of biochemistry and cell... 2 \n", "10989645 [[assistant professor, baruch college, city un... 7 \n", "\n", @@ -17192,9 +21390,9 @@ "33 NaN \n", "44 NaN \n", "... ... \n", + "10989635 [scopus - elsevier] \n", "10989636 [crossref, researcherid, tiffany mackay] \n", "10989637 [scopus - elsevier, tiscia giovanni, l, europe... \n", - "10989640 [crossref metadata search] \n", "10989643 [scopus - elsevier, crossref] \n", "10989645 [crossref metadata search, zachary t. calamari... \n", "\n", @@ -17205,9 +21403,9 @@ "33 2017-07-24t09:37:50.242z 2019-11-15t08:31:24.820z 0 0 \n", "44 2017-08-07t18:32:31.802z 2020-04-08t16:48:55.732z 0 0 \n", "... ... ... ... ... \n", + "10989635 2015-11-05t08:52:08.743z 2020-12-09t17:59:18.350z 7 0 \n", "10989636 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n", "10989637 2016-07-27t10:09:13.585z 2020-12-07t22:23:05.706z 65 0 \n", - "10989640 2016-03-19t21:24:42.821z 2020-12-10t16:34:09.722z 1 0 \n", "10989643 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n", "10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", "\n", @@ -17218,9 +21416,9 @@ "33 0 0 True NaN NaN \n", "44 0 0 True NaN NaN \n", "... ... ... ... ... ... \n", + "10989635 0 7 True NaN NaN \n", "10989636 0 0 True NaN NaN \n", "10989637 17 52 True NaN NaN \n", - "10989640 0 1 True NaN NaN \n", "10989643 0 1 True NaN NaN \n", "10989645 1 0 True NaN NaN \n", "\n", @@ -17231,9 +21429,9 @@ "33 NaN \n", "44 NaN \n", "... ... ... ... ... \n", + "10989635 NaN 1 \n", "10989636 [oxytocin.com.au, linkedin.com] 2 1 \n", "10989637 NaN 1 \n", - "10989640 [vpsantanna.com] 1 1 \n", "10989643 NaN \n", "10989645 NaN \n", "\n", @@ -17244,9 +21442,9 @@ "33 \n", "44 1 \n", "... ... ... ... \n", + "10989635 3 3 \n", "10989636 13 2 4 \n", "10989637 \n", - "10989640 2 \n", "10989643 1 \n", "10989645 2 2 \n", "\n", @@ -17257,9 +21455,9 @@ "33 NaN \n", "44 NaN \n", "... ... \n", + "10989635 [scopus - elsevier] \n", "10989636 [crossref, researcherid] \n", "10989637 [scopus - elsevier, europe pubmed central, cro... \n", - "10989640 [crossref metadata search] \n", "10989643 [scopus - elsevier, crossref] \n", "10989645 [crossref metadata search, crossref] \n", "\n", @@ -17270,16 +21468,16 @@ "33 NaN False NaN \n", "44 NaN False NaN \n", "... ... ... ... \n", + "10989635 1.0 True NaN \n", "10989636 2.0 True NaN \n", "10989637 3.0 True NaN \n", - "10989640 1.0 True NaN \n", "10989643 2.0 True NaN \n", "10989645 2.0 True NaN \n", "\n", - "[2664886 rows x 34 columns]" + "[2075872 rows x 34 columns]" ] }, - "execution_count": 70, + "execution_count": 75, "metadata": {}, "output_type": "execute_result" } @@ -17290,7 +21488,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 76, "metadata": {}, "outputs": [], "source": [ @@ -17299,10 +21497,60 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 77, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 10989649 entries, 0 to 10989648\n", + "Data columns (total 34 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 orcid object \n", + " 1 verified_email bool \n", + " 2 verified_primary_email bool \n", + " 3 given_names string \n", + " 4 family_name string \n", + " 5 biography string \n", + " 6 other_names object \n", + " 7 primary_email string \n", + " 8 keywords object \n", + " 9 external_ids object \n", + " 10 education object \n", + " 11 employment object \n", + " 12 n_works Int16 \n", + " 13 works_source object \n", + " 14 activation_date string \n", + " 15 last_update_date string \n", + " 16 n_doi Int16 \n", + " 17 n_arxiv Int16 \n", + " 18 n_pmc Int16 \n", + " 19 n_other_pids Int16 \n", + " 20 label bool \n", + " 21 primary_email_domain object \n", + " 22 other_email_domains object \n", + " 23 url_domains object \n", + " 24 n_emails Int16 \n", + " 25 n_urls Int16 \n", + " 26 n_ids Int16 \n", + " 27 n_keywords Int16 \n", + " 28 n_education Int16 \n", + " 29 n_employment Int16 \n", + " 30 ext_works_source object \n", + " 31 n_ext_work_source float64\n", + " 32 authoritative object \n", + " 33 spam_score float64\n", + "dtypes: Int16(11), bool(3), float64(2), object(12), string(6)\n", + "memory usage: 2.0+ GB\n" + ] + } + ], + "source": [ + "df.info()" + ] }, { "cell_type": "code",