\n",
"\n",
" keywords external_ids \\\n",
- "9887272 [whatsapp gb apk, whatsapp gb baixar, whatsapp... NaN \n",
+ "9887272 [whatsapp gb baixar, whatsapp gb 2020, whatsap... NaN \n",
"\n",
" education employment n_works works_source activation_date \\\n",
"9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n",
@@ -778,7 +778,7 @@
"text/plain": [
"count 10989649\n",
"unique 10989649\n",
- "top 0000-0002-6749-5031\n",
+ "top 0000-0001-5242-3687\n",
"freq 1\n",
"Name: orcid, dtype: object"
]
@@ -807,10 +807,10 @@
{
"data": {
"text/plain": [
- "count 124722\n",
- "unique 124718\n",
- "top maykin@owasp.org\n",
- "freq 2\n",
+ "count 124722\n",
+ "unique 124718\n",
+ "top opercin@erbakan.edu.tr\n",
+ "freq 2\n",
"Name: primary_email, dtype: object"
]
},
@@ -1263,7 +1263,7 @@
" <NA> | \n",
" NaN | \n",
" patrick.davey@monash.edu | \n",
- " [radiochemistry, radiopharmaceuticals, bioinor... | \n",
+ " [radiochemistry, inorganic chemistry, bioinorg... | \n",
" NaN | \n",
" NaN | \n",
" [[phd student, monash university, melbourne, ,... | \n",
@@ -1337,7 +1337,7 @@
"5590332 patrick.davey@monash.edu \n",
"\n",
" keywords external_ids \\\n",
- "954085 [radiochemistry, radiopharmaceuticals, bioinor... NaN \n",
+ "954085 [radiochemistry, inorganic chemistry, bioinorg... NaN \n",
"5590332 NaN NaN \n",
"\n",
" education employment n_works \\\n",
@@ -2413,9 +2413,9 @@
}
},
"text/html": [
- ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
}
],
"source": [
- "fig = px.imshow(df[[]].fillna(-1).corr())\n",
+ "fig = px.imshow(df.select_dtypes(include=['bool','number']).fillna(-1).corr())\n",
"fig.show()"
]
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 74,
"metadata": {},
"outputs": [],
"source": [
@@ -16620,7 +20831,7 @@
},
{
"cell_type": "code",
- "execution_count": 70,
+ "execution_count": 75,
"metadata": {},
"outputs": [
{
@@ -16904,6 +21115,43 @@
" ... | \n",
" \n",
" \n",
+ " 10989635 | \n",
+ " 0000-0002-7340-9697 | \n",
+ " True | \n",
+ " True | \n",
+ " tawanda | \n",
+ " marandure | \n",
+ " <NA> | \n",
+ " NaN | \n",
+ " <NA> | \n",
+ " NaN | \n",
+ " [[scopus author id, 48261373600]] | \n",
+ " [[animal science, msc sustainable agriculture,... | \n",
+ " [[lecturer, zimbabwe open university faculty o... | \n",
+ " 7 | \n",
+ " [scopus - elsevier] | \n",
+ " 2015-11-05t08:52:08.743z | \n",
+ " 2020-12-09t17:59:18.350z | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " True | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " <NA> | \n",
+ " <NA> | \n",
+ " 1 | \n",
+ " <NA> | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " [scopus - elsevier] | \n",
+ " 1.0 | \n",
+ " True | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
" 10989636 | \n",
" 0000-0002-2906-0299 | \n",
" True | \n",
@@ -16913,7 +21161,7 @@
" <NA> | \n",
" [tiffany russel sia] | \n",
" <NA> | \n",
- " [gpc-1, cancer, immunohistochemistry, clinical... | \n",
+ " [microfluidics, gpc-1, gallium-67, pet/ct, oxy... | \n",
" [[researcherid, a-2121-2017]] | \n",
" [[faculty of medicine, master in pharmaceutica... | \n",
" [[clinical project lead, minomic international... | \n",
@@ -16978,43 +21226,6 @@
" NaN | \n",
"
\n",
" \n",
- " 10989640 | \n",
- " 0000-0002-1070-2220 | \n",
- " True | \n",
- " True | \n",
- " vinicios | \n",
- " santanna | \n",
- " <NA> | \n",
- " [vinicios sant anna, vinicios sant anna, vinic... | \n",
- " <NA> | \n",
- " NaN | \n",
- " [[scopus author id, 57201697952]] | \n",
- " [[economics, ph.d., university of illinois at ... | \n",
- " NaN | \n",
- " 1 | \n",
- " [crossref metadata search] | \n",
- " 2016-03-19t21:24:42.821z | \n",
- " 2020-12-10t16:34:09.722z | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " [vpsantanna.com] | \n",
- " <NA> | \n",
- " 1 | \n",
- " 1 | \n",
- " <NA> | \n",
- " 2 | \n",
- " <NA> | \n",
- " [crossref metadata search] | \n",
- " 1.0 | \n",
- " True | \n",
- " NaN | \n",
- "
\n",
- " \n",
" 10989643 | \n",
" 0000-0003-2606-0936 | \n",
" True | \n",
@@ -17090,7 +21301,7 @@
"
\n",
" \n",
"\n",
- "2664886 rows × 34 columns
\n",
+ "2075872 rows × 34 columns
\n",
""
],
"text/plain": [
@@ -17101,50 +21312,37 @@
"33 0000-0002-4447-9215 True True \n",
"44 0000-0003-0426-4065 True True \n",
"... ... ... ... \n",
+ "10989635 0000-0002-7340-9697 True True \n",
"10989636 0000-0002-2906-0299 True True \n",
"10989637 0000-0001-5896-2024 True True \n",
- "10989640 0000-0002-1070-2220 True True \n",
"10989643 0000-0003-2606-0936 True True \n",
"10989645 0000-0002-3800-6331 True True \n",
"\n",
- " given_names family_name biography \\\n",
- "17 \n",
- "19 \n",
- "22 \n",
- "33 \n",
- "44 \n",
- "... ... ... ... \n",
- "10989636 tiffany mackay \n",
- "10989637 giovanni, l tiscia \n",
- "10989640 vinicios santanna \n",
- "10989643 luang xu \n",
- "10989645 zachary calamari \n",
+ " given_names family_name biography other_names \\\n",
+ "17 NaN \n",
+ "19 NaN \n",
+ "22 NaN \n",
+ "33 NaN \n",
+ "44 [eliza i. gilbert] \n",
+ "... ... ... ... ... \n",
+ "10989635 tawanda marandure NaN \n",
+ "10989636 tiffany mackay [tiffany russel sia] \n",
+ "10989637 giovanni, l tiscia NaN \n",
+ "10989643 luang xu [xu lu-ang, lu lu] \n",
+ "10989645 zachary calamari NaN \n",
"\n",
- " other_names primary_email \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 [eliza i. gilbert] \n",
- "... ... ... \n",
- "10989636 [tiffany russel sia] \n",
- "10989637 NaN \n",
- "10989640 [vinicios sant anna, vinicios sant anna, vinic... \n",
- "10989643 [xu lu-ang, lu lu] \n",
- "10989645 NaN \n",
- "\n",
- " keywords \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... \n",
- "10989636 [gpc-1, cancer, immunohistochemistry, clinical... \n",
- "10989637 NaN \n",
- "10989640 NaN \n",
- "10989643 NaN \n",
- "10989645 NaN \n",
+ " primary_email keywords \\\n",
+ "17 NaN \n",
+ "19 NaN \n",
+ "22 NaN \n",
+ "33 NaN \n",
+ "44 NaN \n",
+ "... ... ... \n",
+ "10989635 NaN \n",
+ "10989636 [microfluidics, gpc-1, gallium-67, pet/ct, oxy... \n",
+ "10989637 NaN \n",
+ "10989643 NaN \n",
+ "10989645 NaN \n",
"\n",
" external_ids \\\n",
"17 NaN \n",
@@ -17153,9 +21351,9 @@
"33 NaN \n",
"44 NaN \n",
"... ... \n",
+ "10989635 [[scopus author id, 48261373600]] \n",
"10989636 [[researcherid, a-2121-2017]] \n",
"10989637 [[scopus author id, 54948242800]] \n",
- "10989640 [[scopus author id, 57201697952]] \n",
"10989643 NaN \n",
"10989645 NaN \n",
"\n",
@@ -17166,9 +21364,9 @@
"33 NaN \n",
"44 NaN \n",
"... ... \n",
+ "10989635 [[animal science, msc sustainable agriculture,... \n",
"10989636 [[faculty of medicine, master in pharmaceutica... \n",
"10989637 NaN \n",
- "10989640 [[economics, ph.d., university of illinois at ... \n",
"10989643 NaN \n",
"10989645 [[richard gilder graduate school, phd in compa... \n",
"\n",
@@ -17179,9 +21377,9 @@
"33 NaN 0 \n",
"44 [[, us fish and wildlife service, albuquerque,... 0 \n",
"... ... ... \n",
+ "10989635 [[lecturer, zimbabwe open university faculty o... 7 \n",
"10989636 [[clinical project lead, minomic international... 11 \n",
"10989637 NaN 70 \n",
- "10989640 NaN 1 \n",
"10989643 [[post-doc, institute of biochemistry and cell... 2 \n",
"10989645 [[assistant professor, baruch college, city un... 7 \n",
"\n",
@@ -17192,9 +21390,9 @@
"33 NaN \n",
"44 NaN \n",
"... ... \n",
+ "10989635 [scopus - elsevier] \n",
"10989636 [crossref, researcherid, tiffany mackay] \n",
"10989637 [scopus - elsevier, tiscia giovanni, l, europe... \n",
- "10989640 [crossref metadata search] \n",
"10989643 [scopus - elsevier, crossref] \n",
"10989645 [crossref metadata search, zachary t. calamari... \n",
"\n",
@@ -17205,9 +21403,9 @@
"33 2017-07-24t09:37:50.242z 2019-11-15t08:31:24.820z 0 0 \n",
"44 2017-08-07t18:32:31.802z 2020-04-08t16:48:55.732z 0 0 \n",
"... ... ... ... ... \n",
+ "10989635 2015-11-05t08:52:08.743z 2020-12-09t17:59:18.350z 7 0 \n",
"10989636 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n",
"10989637 2016-07-27t10:09:13.585z 2020-12-07t22:23:05.706z 65 0 \n",
- "10989640 2016-03-19t21:24:42.821z 2020-12-10t16:34:09.722z 1 0 \n",
"10989643 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n",
"10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n",
"\n",
@@ -17218,9 +21416,9 @@
"33 0 0 True NaN NaN \n",
"44 0 0 True NaN NaN \n",
"... ... ... ... ... ... \n",
+ "10989635 0 7 True NaN NaN \n",
"10989636 0 0 True NaN NaN \n",
"10989637 17 52 True NaN NaN \n",
- "10989640 0 1 True NaN NaN \n",
"10989643 0 1 True NaN NaN \n",
"10989645 1 0 True NaN NaN \n",
"\n",
@@ -17231,9 +21429,9 @@
"33 NaN \n",
"44 NaN \n",
"... ... ... ... ... \n",
+ "10989635 NaN 1 \n",
"10989636 [oxytocin.com.au, linkedin.com] 2 1 \n",
"10989637 NaN 1 \n",
- "10989640 [vpsantanna.com] 1 1 \n",
"10989643 NaN \n",
"10989645 NaN \n",
"\n",
@@ -17244,9 +21442,9 @@
"33 \n",
"44 1 \n",
"... ... ... ... \n",
+ "10989635 3 3 \n",
"10989636 13 2 4 \n",
"10989637 \n",
- "10989640 2 \n",
"10989643 1 \n",
"10989645 2 2 \n",
"\n",
@@ -17257,9 +21455,9 @@
"33 NaN \n",
"44 NaN \n",
"... ... \n",
+ "10989635 [scopus - elsevier] \n",
"10989636 [crossref, researcherid] \n",
"10989637 [scopus - elsevier, europe pubmed central, cro... \n",
- "10989640 [crossref metadata search] \n",
"10989643 [scopus - elsevier, crossref] \n",
"10989645 [crossref metadata search, crossref] \n",
"\n",
@@ -17270,16 +21468,16 @@
"33 NaN False NaN \n",
"44 NaN False NaN \n",
"... ... ... ... \n",
+ "10989635 1.0 True NaN \n",
"10989636 2.0 True NaN \n",
"10989637 3.0 True NaN \n",
- "10989640 1.0 True NaN \n",
"10989643 2.0 True NaN \n",
"10989645 2.0 True NaN \n",
"\n",
- "[2664886 rows x 34 columns]"
+ "[2075872 rows x 34 columns]"
]
},
- "execution_count": 70,
+ "execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
@@ -17290,7 +21488,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
@@ -17299,10 +21497,60 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 77,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 10989649 entries, 0 to 10989648\n",
+ "Data columns (total 34 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 orcid object \n",
+ " 1 verified_email bool \n",
+ " 2 verified_primary_email bool \n",
+ " 3 given_names string \n",
+ " 4 family_name string \n",
+ " 5 biography string \n",
+ " 6 other_names object \n",
+ " 7 primary_email string \n",
+ " 8 keywords object \n",
+ " 9 external_ids object \n",
+ " 10 education object \n",
+ " 11 employment object \n",
+ " 12 n_works Int16 \n",
+ " 13 works_source object \n",
+ " 14 activation_date string \n",
+ " 15 last_update_date string \n",
+ " 16 n_doi Int16 \n",
+ " 17 n_arxiv Int16 \n",
+ " 18 n_pmc Int16 \n",
+ " 19 n_other_pids Int16 \n",
+ " 20 label bool \n",
+ " 21 primary_email_domain object \n",
+ " 22 other_email_domains object \n",
+ " 23 url_domains object \n",
+ " 24 n_emails Int16 \n",
+ " 25 n_urls Int16 \n",
+ " 26 n_ids Int16 \n",
+ " 27 n_keywords Int16 \n",
+ " 28 n_education Int16 \n",
+ " 29 n_employment Int16 \n",
+ " 30 ext_works_source object \n",
+ " 31 n_ext_work_source float64\n",
+ " 32 authoritative object \n",
+ " 33 spam_score float64\n",
+ "dtypes: Int16(11), bool(3), float64(2), object(12), string(6)\n",
+ "memory usage: 2.0+ GB\n"
+ ]
+ }
+ ],
+ "source": [
+ "df.info()"
+ ]
},
{
"cell_type": "code",