From 8e159607eab3649651dae5bfeb2358a98f9c26b3 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Thu, 25 Mar 2021 16:35:30 +0100 Subject: [PATCH] moved lots of preprocessing under make --- notebooks/01-Exploration.ipynb | 8062 +++++++++++++++++--------------- 1 file changed, 4258 insertions(+), 3804 deletions(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index d94c9b1..4a624ae 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -200,8 +200,8 @@ " \n", " \n", " \n", - " 0000-0001-6097-3953\n", - " 1\n", + " 0\n", + " 0000-0001-6097-3953\n", " 0\n", " 0\n", " NaN\n", @@ -233,8 +233,8 @@ " NaN\n", " \n", " \n", - " 0000-0001-6112-5550\n", - " 1\n", + " 1\n", + " 0000-0001-6112-5550\n", " 1\n", " 1\n", " NaN\n", @@ -266,8 +266,8 @@ " 1.0\n", " \n", " \n", - " 0000-0001-6152-2695\n", - " 1\n", + " 2\n", + " 0000-0001-6152-2695\n", " 1\n", " 1\n", " NaN\n", @@ -299,8 +299,8 @@ " NaN\n", " \n", " \n", - " 0000-0001-6220-5683\n", - " 1\n", + " 3\n", + " 0000-0001-6220-5683\n", " 1\n", " 1\n", " NaN\n", @@ -332,8 +332,8 @@ " 1.0\n", " \n", " \n", - " 0000-0001-7071-8294\n", - " 1\n", + " 4\n", + " 0000-0001-7071-8294\n", " 1\n", " 1\n", " NaN\n", @@ -369,75 +369,54 @@ "" ], "text/plain": [ - " orcid verified_email verified_primary_email \\\n", - "0000-0001-6097-3953 1 0 0 \n", - "0000-0001-6112-5550 1 1 1 \n", - "0000-0001-6152-2695 1 1 1 \n", - "0000-0001-6220-5683 1 1 1 \n", - "0000-0001-7071-8294 1 1 1 \n", + " orcid verified_email verified_primary_email given_names \\\n", + "0 0000-0001-6097-3953 0 0 NaN \n", + "1 0000-0001-6112-5550 1 1 NaN \n", + "2 0000-0001-6152-2695 1 1 NaN \n", + "3 0000-0001-6220-5683 1 1 NaN \n", + "4 0000-0001-7071-8294 1 1 NaN \n", "\n", - " given_names family_name biography \\\n", - "0000-0001-6097-3953 NaN NaN NaN \n", - "0000-0001-6112-5550 NaN NaN NaN \n", - "0000-0001-6152-2695 NaN NaN NaN \n", - "0000-0001-6220-5683 NaN NaN NaN \n", - "0000-0001-7071-8294 NaN NaN NaN \n", + " family_name biography other_names primary_email keywords \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN [v.i. yurtaev; v. yurtaev] NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN \n", "\n", - " other_names primary_email keywords \\\n", - "0000-0001-6097-3953 NaN NaN NaN \n", - "0000-0001-6112-5550 [v.i. yurtaev; v. yurtaev] NaN NaN \n", - "0000-0001-6152-2695 NaN NaN NaN \n", - "0000-0001-6220-5683 NaN NaN NaN \n", - "0000-0001-7071-8294 NaN NaN NaN \n", + " external_ids education employment \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN [[professor, peoples friendship university of ... \n", + "2 NaN NaN NaN \n", + "3 NaN NaN [[research scientist, new york university abu ... \n", + "4 NaN NaN [[researcher (academic), universidad de zarago... \n", "\n", - " external_ids education \\\n", - "0000-0001-6097-3953 NaN NaN \n", - "0000-0001-6112-5550 NaN NaN \n", - "0000-0001-6152-2695 NaN NaN \n", - "0000-0001-6220-5683 NaN NaN \n", - "0000-0001-7071-8294 NaN NaN \n", + " n_works works_source activation_date last_update_date \\\n", + "0 0 NaN 2018-03-02t09:29:16.528z 2018-03-02t09:43:07.551z \n", + "1 0 NaN 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z \n", + "2 0 NaN 2019-12-11t15:31:56.388z 2020-01-28t15:34:17.309z \n", + "3 0 NaN 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z \n", + "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", "\n", - " employment \\\n", - "0000-0001-6097-3953 NaN \n", - "0000-0001-6112-5550 [[professor, peoples friendship university of ... \n", - "0000-0001-6152-2695 NaN \n", - "0000-0001-6220-5683 [[research scientist, new york university abu ... \n", - "0000-0001-7071-8294 [[researcher (academic), universidad de zarago... \n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "0 0 0 0 0 0 NaN \n", + "1 0 0 0 0 0 NaN \n", + "2 0 0 0 0 0 NaN \n", + "3 0 0 0 0 0 NaN \n", + "4 0 0 0 0 0 NaN \n", "\n", - " n_works works_source activation_date \\\n", - "0000-0001-6097-3953 0 NaN 2018-03-02t09:29:16.528z \n", - "0000-0001-6112-5550 0 NaN 2018-04-03t07:50:23.358z \n", - "0000-0001-6152-2695 0 NaN 2019-12-11t15:31:56.388z \n", - "0000-0001-6220-5683 0 NaN 2015-08-18t12:36:45.307z \n", - "0000-0001-7071-8294 0 NaN 2014-03-10t13:22:01.966z \n", + " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN \n", "\n", - " last_update_date n_doi n_arxiv n_pmc \\\n", - "0000-0001-6097-3953 2018-03-02t09:43:07.551z 0 0 0 \n", - "0000-0001-6112-5550 2020-03-18t09:42:44.753z 0 0 0 \n", - "0000-0001-6152-2695 2020-01-28t15:34:17.309z 0 0 0 \n", - "0000-0001-6220-5683 2020-09-23t13:37:54.180z 0 0 0 \n", - "0000-0001-7071-8294 2016-06-14t22:17:54.470z 0 0 0 \n", - "\n", - " n_other_pids label primary_email_domain \\\n", - "0000-0001-6097-3953 0 0 NaN \n", - "0000-0001-6112-5550 0 0 NaN \n", - "0000-0001-6152-2695 0 0 NaN \n", - "0000-0001-6220-5683 0 0 NaN \n", - "0000-0001-7071-8294 0 0 NaN \n", - "\n", - " other_email_domains url_domains n_emails n_urls n_ids \\\n", - "0000-0001-6097-3953 NaN NaN NaN NaN NaN \n", - "0000-0001-6112-5550 NaN NaN NaN NaN NaN \n", - "0000-0001-6152-2695 NaN NaN NaN NaN NaN \n", - "0000-0001-6220-5683 NaN NaN NaN NaN NaN \n", - "0000-0001-7071-8294 NaN NaN NaN NaN NaN \n", - "\n", - " n_keywords n_education n_employment \n", - "0000-0001-6097-3953 NaN NaN NaN \n", - "0000-0001-6112-5550 NaN NaN 1.0 \n", - "0000-0001-6152-2695 NaN NaN NaN \n", - "0000-0001-6220-5683 NaN NaN 1.0 \n", - "0000-0001-7071-8294 NaN NaN 2.0 " + " n_education n_employment \n", + "0 NaN NaN \n", + "1 NaN 1.0 \n", + "2 NaN NaN \n", + "3 NaN 1.0 \n", + "4 NaN 2.0 " ] }, "execution_count": 5, @@ -518,14 +497,82 @@ " \n", " \n", " \n", + " \n", + " 3073261\n", + " 0000-0002-5193-7851\n", + " 1\n", + " 1\n", + " andrea\n", + " mannocci\n", + " data scientist & researcher; scholarly knowled...\n", + " NaN\n", + " andrea.mannocci@isti.cnr.it\n", + " [research infrastructures, science of science,...\n", + " [[scopus author id, 55233589900]]\n", + " [[information engineering, ph.d., università d...\n", + " [[research associate, istituto di scienza e te...\n", + " 37\n", + " [scopus - elsevier, crossref metadata search, ...\n", + " 2017-09-12t14:28:33.467z\n", + " 2021-03-17t15:40:07.776z\n", + " 34\n", + " 0\n", + " 0\n", + " 60\n", + " 1\n", + " isti.cnr.it\n", + " NaN\n", + " [github.io, twitter.com, linkedin.com]\n", + " NaN\n", + " 3.0\n", + " 1.0\n", + " 5.0\n", + " 4.0\n", + " 5.0\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [orcid, verified_email, verified_primary_email, given_names, family_name, biography, other_names, primary_email, keywords, external_ids, education, employment, n_works, works_source, activation_date, last_update_date, n_doi, n_arxiv, n_pmc, n_other_pids, label, primary_email_domain, other_email_domains, url_domains, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment]\n", - "Index: []" + " orcid verified_email verified_primary_email \\\n", + "3073261 0000-0002-5193-7851 1 1 \n", + "\n", + " given_names family_name \\\n", + "3073261 andrea mannocci \n", + "\n", + " biography other_names \\\n", + "3073261 data scientist & researcher; scholarly knowled... NaN \n", + "\n", + " primary_email \\\n", + "3073261 andrea.mannocci@isti.cnr.it \n", + "\n", + " keywords \\\n", + "3073261 [research infrastructures, science of science,... \n", + "\n", + " external_ids \\\n", + "3073261 [[scopus author id, 55233589900]] \n", + "\n", + " education \\\n", + "3073261 [[information engineering, ph.d., università d... \n", + "\n", + " employment n_works \\\n", + "3073261 [[research associate, istituto di scienza e te... 37 \n", + "\n", + " works_source \\\n", + "3073261 [scopus - elsevier, crossref metadata search, ... \n", + "\n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "3073261 2017-09-12t14:28:33.467z 2021-03-17t15:40:07.776z 34 0 \n", + "\n", + " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", + "3073261 0 60 1 isti.cnr.it NaN \n", + "\n", + " url_domains n_emails n_urls n_ids \\\n", + "3073261 [github.io, twitter.com, linkedin.com] NaN 3.0 1.0 \n", + "\n", + " n_keywords n_education n_employment \n", + "3073261 5.0 4.0 5.0 " ] }, "execution_count": 6, @@ -596,14 +643,67 @@ " \n", " \n", " \n", + " \n", + " 9887272\n", + " 0000-0001-6997-9470\n", + " 1\n", + " 1\n", + " other\n", + " whatsapp\n", + " NaN\n", + " NaN\n", + " NaN\n", + " [whatsapp gb apk, whatsapp gb baixar, whatsapp...\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0\n", + " NaN\n", + " 2020-10-07t10:37:12.237z\n", + " 2020-10-08t02:32:03.935z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " [otherwhatsapp.com, im-creator.com, facebook.c...\n", + " NaN\n", + " 27.0\n", + " NaN\n", + " 4.0\n", + " NaN\n", + " NaN\n", + " \n", " \n", "\n", "" ], "text/plain": [ - "Empty DataFrame\n", - "Columns: [orcid, verified_email, verified_primary_email, given_names, family_name, biography, other_names, primary_email, keywords, external_ids, education, employment, n_works, works_source, activation_date, last_update_date, n_doi, n_arxiv, n_pmc, n_other_pids, label, primary_email_domain, other_email_domains, url_domains, n_emails, n_urls, n_ids, n_keywords, n_education, n_employment]\n", - "Index: []" + " orcid verified_email verified_primary_email \\\n", + "9887272 0000-0001-6997-9470 1 1 \n", + "\n", + " given_names family_name biography other_names primary_email \\\n", + "9887272 other whatsapp NaN NaN NaN \n", + "\n", + " keywords external_ids \\\n", + "9887272 [whatsapp gb apk, whatsapp gb baixar, whatsapp... NaN \n", + "\n", + " education employment n_works works_source activation_date \\\n", + "9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n", + "\n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", + "9887272 2020-10-08t02:32:03.935z 0 0 0 0 0 \n", + "\n", + " primary_email_domain other_email_domains \\\n", + "9887272 NaN NaN \n", + "\n", + " url_domains n_emails n_urls \\\n", + "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... NaN 27.0 \n", + "\n", + " n_ids n_keywords n_education n_employment \n", + "9887272 NaN 4.0 NaN NaN " ] }, "execution_count": 7, @@ -625,36 +725,36 @@ { "data": { "text/plain": [ - "orcid 10916574\n", - "verified_email 10916574\n", - "verified_primary_email 10916574\n", - "given_names 10886150\n", - "family_name 10601571\n", - "biography 348649\n", - "other_names 551482\n", - "primary_email 123851\n", - "keywords 646400\n", - "external_ids 1301959\n", - "education 2430233\n", - "employment 2665092\n", - "n_works 10916574\n", - "works_source 2721431\n", - "activation_date 10916574\n", - "last_update_date 10916574\n", - "n_doi 10916574\n", - "n_arxiv 10916574\n", - "n_pmc 10916574\n", - "n_other_pids 10916574\n", - "label 10916574\n", - "primary_email_domain 123851\n", - "other_email_domains 48306\n", - "url_domains 707687\n", - "n_emails 48306\n", - "n_urls 707687\n", - "n_ids 1301959\n", - "n_keywords 646400\n", - "n_education 2430233\n", - "n_employment 2665092\n", + "orcid 10989649\n", + "verified_email 10989649\n", + "verified_primary_email 10989649\n", + "given_names 10959039\n", + "family_name 10671715\n", + "biography 354015\n", + "other_names 554684\n", + "primary_email 124722\n", + "keywords 649637\n", + "external_ids 1308598\n", + "education 2441645\n", + "employment 2680488\n", + "n_works 10989649\n", + "works_source 2740939\n", + "activation_date 10989649\n", + "last_update_date 10989649\n", + "n_doi 10989649\n", + "n_arxiv 10989649\n", + "n_pmc 10989649\n", + "n_other_pids 10989649\n", + "label 10989649\n", + "primary_email_domain 124722\n", + "other_email_domains 48615\n", + "url_domains 715067\n", + "n_emails 48615\n", + "n_urls 715067\n", + "n_ids 1308598\n", + "n_keywords 649637\n", + "n_education 2441645\n", + "n_employment 2680488\n", "dtype: int64" ] }, @@ -675,15 +775,11 @@ { "data": { "text/plain": [ - "count 10916574.0\n", - "mean 1.0\n", - "std 0.0\n", - "min 1.0\n", - "25% 1.0\n", - "50% 1.0\n", - "75% 1.0\n", - "max 1.0\n", - "Name: orcid, dtype: float64" + "count 10989649\n", + "unique 10989649\n", + "top 0000-0003-2532-0046\n", + "freq 1\n", + "Name: orcid, dtype: object" ] }, "execution_count": 9, @@ -710,10 +806,10 @@ { "data": { "text/plain": [ - "count 123851\n", - "unique 123848\n", - "top maykin@owasp.org\n", - "freq 2\n", + "count 124722\n", + "unique 124718\n", + "top patrick.davey@monash.edu\n", + "freq 2\n", "Name: primary_email, dtype: object" ] }, @@ -741,9 +837,10 @@ { "data": { "text/plain": [ - "0000-0003-0033-0918 opercin@erbakan.edu.tr\n", - "0000-0002-8774-0030 patrick.davey@monash.edu\n", - "0000-0001-9855-1676 maykin@owasp.org\n", + "1681787 opercin@erbakan.edu.tr\n", + "5590332 patrick.davey@monash.edu\n", + "9316843 maykin@owasp.org\n", + "10375852 andycheng2026@163.com\n", "Name: primary_email, dtype: object" ] }, @@ -816,8 +913,8 @@ " \n", " \n", " \n", - " 0000-0002-0836-2271\n", - " 1\n", + " 7543981\n", + " 0000-0002-0836-2271\n", " 1\n", " 1\n", " maykin\n", @@ -849,8 +946,8 @@ " NaN\n", " \n", " \n", - " 0000-0001-9855-1676\n", - " 1\n", + " 9316843\n", + " 0000-0001-9855-1676\n", " 1\n", " 1\n", " maykin\n", @@ -886,41 +983,33 @@ "" ], "text/plain": [ - " orcid verified_email verified_primary_email \\\n", - "0000-0002-0836-2271 1 1 1 \n", - "0000-0001-9855-1676 1 1 1 \n", + " orcid verified_email verified_primary_email \\\n", + "7543981 0000-0002-0836-2271 1 1 \n", + "9316843 0000-0001-9855-1676 1 1 \n", "\n", - " given_names family_name biography other_names \\\n", - "0000-0002-0836-2271 maykin warasart NaN NaN \n", - "0000-0001-9855-1676 maykin warasart NaN NaN \n", + " given_names family_name biography other_names primary_email \\\n", + "7543981 maykin warasart NaN NaN maykin@owasp.org \n", + "9316843 maykin warasart NaN NaN maykin@owasp.org \n", "\n", - " primary_email keywords external_ids education \\\n", - "0000-0002-0836-2271 maykin@owasp.org NaN NaN NaN \n", - "0000-0001-9855-1676 maykin@owasp.org NaN NaN NaN \n", + " keywords external_ids education employment n_works works_source \\\n", + "7543981 NaN NaN NaN NaN 0 NaN \n", + "9316843 NaN NaN NaN NaN 0 NaN \n", "\n", - " employment n_works works_source \\\n", - "0000-0002-0836-2271 NaN 0 NaN \n", - "0000-0001-9855-1676 NaN 0 NaN \n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "7543981 2020-09-15t04:43:55.709z 2020-09-15t05:17:28.509z 0 0 \n", + "9316843 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z 0 0 \n", "\n", - " activation_date last_update_date \\\n", - "0000-0002-0836-2271 2020-09-15t04:43:55.709z 2020-09-15t05:17:28.509z \n", - "0000-0001-9855-1676 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z \n", + " n_pmc n_other_pids label primary_email_domain \\\n", + "7543981 0 0 0 owasp.org \n", + "9316843 0 0 0 owasp.org \n", "\n", - " n_doi n_arxiv n_pmc n_other_pids label \\\n", - "0000-0002-0836-2271 0 0 0 0 0 \n", - "0000-0001-9855-1676 0 0 0 0 0 \n", + " other_email_domains url_domains n_emails n_urls n_ids \\\n", + "7543981 [dga.or.th] NaN 1.0 NaN NaN \n", + "9316843 [dga.or.th, ieee.org] NaN 2.0 NaN NaN \n", "\n", - " primary_email_domain other_email_domains url_domains \\\n", - "0000-0002-0836-2271 owasp.org [dga.or.th] NaN \n", - "0000-0001-9855-1676 owasp.org [dga.or.th, ieee.org] NaN \n", - "\n", - " n_emails n_urls n_ids n_keywords n_education \\\n", - "0000-0002-0836-2271 1.0 NaN NaN NaN NaN \n", - "0000-0001-9855-1676 2.0 NaN NaN NaN NaN \n", - "\n", - " n_employment \n", - "0000-0002-0836-2271 NaN \n", - "0000-0001-9855-1676 NaN " + " n_keywords n_education n_employment \n", + "7543981 NaN NaN NaN \n", + "9316843 NaN NaN NaN " ] }, "execution_count": 12, @@ -992,8 +1081,8 @@ " \n", " \n", " \n", - " 0000-0002-2232-9638\n", - " 1\n", + " 347852\n", + " 0000-0002-2232-9638\n", " 1\n", " 1\n", " osman\n", @@ -1025,8 +1114,8 @@ " NaN\n", " \n", " \n", - " 0000-0003-0033-0918\n", - " 1\n", + " 1681787\n", + " 0000-0003-0033-0918\n", " 1\n", " 1\n", " osman\n", @@ -1062,41 +1151,37 @@ "" ], "text/plain": [ - " orcid verified_email verified_primary_email \\\n", - "0000-0002-2232-9638 1 1 1 \n", - "0000-0003-0033-0918 1 1 1 \n", + " orcid verified_email verified_primary_email \\\n", + "347852 0000-0002-2232-9638 1 1 \n", + "1681787 0000-0003-0033-0918 1 1 \n", "\n", - " given_names family_name biography other_names \\\n", - "0000-0002-2232-9638 osman perçin NaN NaN \n", - "0000-0003-0033-0918 osman perçin NaN NaN \n", + " given_names family_name biography other_names primary_email \\\n", + "347852 osman perçin NaN NaN opercin@erbakan.edu.tr \n", + "1681787 osman perçin NaN NaN opercin@erbakan.edu.tr \n", "\n", - " primary_email keywords external_ids education \\\n", - "0000-0002-2232-9638 opercin@erbakan.edu.tr NaN NaN NaN \n", - "0000-0003-0033-0918 opercin@erbakan.edu.tr NaN NaN NaN \n", + " keywords external_ids education \\\n", + "347852 NaN NaN NaN \n", + "1681787 NaN NaN NaN \n", "\n", - " employment \\\n", - "0000-0002-2232-9638 NaN \n", - "0000-0003-0033-0918 [[, necmettin erbakan university, konya, , tr,... \n", + " employment n_works \\\n", + "347852 NaN 0 \n", + "1681787 [[, necmettin erbakan university, konya, , tr,... 0 \n", "\n", - " n_works works_source activation_date \\\n", - "0000-0002-2232-9638 0 NaN 2015-01-12t13:47:55.549z \n", - "0000-0003-0033-0918 0 NaN 2015-10-13t05:47:12.014z \n", + " works_source activation_date last_update_date \\\n", + "347852 NaN 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z \n", + "1681787 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n", "\n", - " last_update_date n_doi n_arxiv n_pmc \\\n", - "0000-0002-2232-9638 2020-01-27t07:38:24.269z 0 0 0 \n", - "0000-0003-0033-0918 2020-12-25t13:52:03.976z 0 0 0 \n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "347852 0 0 0 0 0 erbakan.edu.tr \n", + "1681787 0 0 0 0 0 erbakan.edu.tr \n", "\n", - " n_other_pids label primary_email_domain \\\n", - "0000-0002-2232-9638 0 0 erbakan.edu.tr \n", - "0000-0003-0033-0918 0 0 erbakan.edu.tr \n", + " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", + "347852 NaN NaN NaN NaN NaN NaN \n", + "1681787 NaN NaN NaN NaN NaN NaN \n", "\n", - " other_email_domains url_domains n_emails n_urls n_ids \\\n", - "0000-0002-2232-9638 NaN NaN NaN NaN NaN \n", - "0000-0003-0033-0918 NaN NaN NaN NaN NaN \n", - "\n", - " n_keywords n_education n_employment \n", - "0000-0002-2232-9638 NaN NaN NaN \n", - "0000-0003-0033-0918 NaN NaN 1.0 " + " n_education n_employment \n", + "347852 NaN NaN \n", + "1681787 NaN 1.0 " ] }, "execution_count": 13, @@ -1168,8 +1253,8 @@ " \n", " \n", " \n", - " 0000-0002-9158-1757\n", - " 1\n", + " 954085\n", + " 0000-0002-9158-1757\n", " 1\n", " 1\n", " patrick\n", @@ -1177,7 +1262,7 @@ " NaN\n", " NaN\n", " patrick.davey@monash.edu\n", - " [radiochemistry, bioinorganic chemistry, inorg...\n", + " [radiochemistry, radiopharmaceuticals, inorgan...\n", " NaN\n", " NaN\n", " [[phd student, monash university, melbourne, ,...\n", @@ -1201,8 +1286,8 @@ " 1.0\n", " \n", " \n", - " 0000-0002-8774-0030\n", - " 1\n", + " 5590332\n", + " 0000-0002-8774-0030\n", " 1\n", " 1\n", " patrick\n", @@ -1222,7 +1307,7 @@ " 0\n", " 0\n", " 0\n", - " 0\n", + " 1\n", " monash.edu\n", " NaN\n", " NaN\n", @@ -1238,49 +1323,41 @@ "" ], "text/plain": [ - " orcid verified_email verified_primary_email \\\n", - "0000-0002-9158-1757 1 1 1 \n", - "0000-0002-8774-0030 1 1 1 \n", + " orcid verified_email verified_primary_email \\\n", + "954085 0000-0002-9158-1757 1 1 \n", + "5590332 0000-0002-8774-0030 1 1 \n", "\n", - " given_names family_name biography other_names \\\n", - "0000-0002-9158-1757 patrick davey NaN NaN \n", - "0000-0002-8774-0030 patrick davey NaN NaN \n", + " given_names family_name biography other_names \\\n", + "954085 patrick davey NaN NaN \n", + "5590332 patrick davey NaN NaN \n", "\n", - " primary_email \\\n", - "0000-0002-9158-1757 patrick.davey@monash.edu \n", - "0000-0002-8774-0030 patrick.davey@monash.edu \n", + " primary_email \\\n", + "954085 patrick.davey@monash.edu \n", + "5590332 patrick.davey@monash.edu \n", "\n", - " keywords \\\n", - "0000-0002-9158-1757 [radiochemistry, bioinorganic chemistry, inorg... \n", - "0000-0002-8774-0030 NaN \n", + " keywords external_ids \\\n", + "954085 [radiochemistry, radiopharmaceuticals, inorgan... NaN \n", + "5590332 NaN NaN \n", "\n", - " external_ids education \\\n", - "0000-0002-9158-1757 NaN NaN \n", - "0000-0002-8774-0030 NaN NaN \n", + " education employment n_works \\\n", + "954085 NaN [[phd student, monash university, melbourne, ,... 0 \n", + "5590332 NaN [[phd student, monash university, melbourne, v... 1 \n", "\n", - " employment \\\n", - "0000-0002-9158-1757 [[phd student, monash university, melbourne, ,... \n", - "0000-0002-8774-0030 [[phd student, monash university, melbourne, v... \n", + " works_source activation_date last_update_date \\\n", + "954085 NaN 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z \n", + "5590332 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n", "\n", - " n_works works_source activation_date \\\n", - "0000-0002-9158-1757 0 NaN 2019-05-09t23:01:02.170z \n", - "0000-0002-8774-0030 1 [crossref] 2018-09-11t10:47:10.997z \n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "954085 0 0 0 0 0 monash.edu \n", + "5590332 1 0 0 0 1 monash.edu \n", "\n", - " last_update_date n_doi n_arxiv n_pmc \\\n", - "0000-0002-9158-1757 2019-08-20t03:00:17.844z 0 0 0 \n", - "0000-0002-8774-0030 2021-02-09t06:21:44.138z 1 0 0 \n", + " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", + "954085 NaN NaN NaN NaN NaN 4.0 \n", + "5590332 NaN NaN NaN NaN NaN NaN \n", "\n", - " n_other_pids label primary_email_domain \\\n", - "0000-0002-9158-1757 0 0 monash.edu \n", - "0000-0002-8774-0030 0 0 monash.edu \n", - "\n", - " other_email_domains url_domains n_emails n_urls n_ids \\\n", - "0000-0002-9158-1757 NaN NaN NaN NaN NaN \n", - "0000-0002-8774-0030 NaN NaN NaN NaN NaN \n", - "\n", - " n_keywords n_education n_employment \n", - "0000-0002-9158-1757 4.0 NaN 1.0 \n", - "0000-0002-8774-0030 NaN NaN 1.0 " + " n_education n_employment \n", + "954085 NaN 1.0 \n", + "5590332 NaN 1.0 " ] }, "execution_count": 14, @@ -1300,10 +1377,10 @@ { "data": { "text/plain": [ - "count 123851\n", - "unique 17089\n", + "count 124722\n", + "unique 17160\n", "top gmail.com\n", - "freq 26540\n", + "freq 26750\n", "Name: primary_email_domain, dtype: object" ] }, @@ -1352,42 +1429,42 @@ " \n", " \n", " gmail.com\n", - " 26540\n", + " 26750\n", " \n", " \n", " hotmail.com\n", - " 3769\n", + " 3801\n", " \n", " \n", " yahoo.com\n", - " 2614\n", + " 2625\n", " \n", " \n", " 163.com\n", - " 2109\n", + " 2132\n", " \n", " \n", " yuhs.ac\n", - " 1132\n", + " 1134\n", " \n", " \n", " ...\n", " ...\n", " \n", " \n", - " imean-biotech.com\n", + " imf.csic.es\n", " 1\n", " \n", " \n", - " imec.msu.ru\n", + " imf.org\n", " 1\n", " \n", " \n", - " imedea.uib-csic.es\n", + " imfd.tu-freiberg.de\n", " 1\n", " \n", " \n", - " imes.uni-hannover.de\n", + " imft.fr\n", " 1\n", " \n", " \n", @@ -1396,25 +1473,25 @@ " \n", " \n", "\n", - "

17089 rows × 1 columns

\n", + "

17160 rows × 1 columns

\n", "" ], "text/plain": [ " orcid\n", "primary_email_domain \n", - "gmail.com 26540\n", - "hotmail.com 3769\n", - "yahoo.com 2614\n", - "163.com 2109\n", - "yuhs.ac 1132\n", + "gmail.com 26750\n", + "hotmail.com 3801\n", + "yahoo.com 2625\n", + "163.com 2132\n", + "yuhs.ac 1134\n", "... ...\n", - "imean-biotech.com 1\n", - "imec.msu.ru 1\n", - "imedea.uib-csic.es 1\n", - "imes.uni-hannover.de 1\n", + "imf.csic.es 1\n", + "imf.org 1\n", + "imfd.tu-freiberg.de 1\n", + "imft.fr 1\n", "zzuli.edu.cn 1\n", "\n", - "[17089 rows x 1 columns]" + "[17160 rows x 1 columns]" ] }, "execution_count": 16, @@ -1458,8 +1535,8 @@ "bu.edu", "usgs.gov", "mail.ru", - "yahoo.com.br", "usp.br", + "yahoo.com.br", "ua.pt", "umich.edu", "ust.hk", @@ -1472,43 +1549,43 @@ "ucl.ac.uk", "ieee.org", "naver.com", - "st-annes.ox.ac.uk", "stcatz.ox.ac.uk", + "st-annes.ox.ac.uk", "yahoo.fr", "ucm.es", "live.com" ], "y": [ - 26540, - 3769, - 2614, - 2109, - 1132, - 1056, - 940, - 762, - 630, - 584, - 575, - 458, - 457, - 300, + 26750, + 3801, + 2625, + 2132, + 1134, + 1059, + 948, + 766, + 629, + 586, + 579, + 464, + 459, + 302, 290, 277, - 258, - 247, - 242, + 260, + 248, + 244, 235, - 225, + 226, 218, - 207, - 204, - 187, + 210, + 205, + 188, 184, 184, - 172, - 171, - 163 + 174, + 174, + 165 ] } ], @@ -2335,9 +2412,9 @@ } }, "text/html": [ - "