From 55377051926f35fe464392488e84f7cb2ef1de32 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Wed, 24 Mar 2021 12:06:27 +0100 Subject: [PATCH] a few optimisations by creating variables so to make operations once before charting results --- notebooks/01-Exploration.ipynb | 390 ++++++++++++++++++--------------- 1 file changed, 212 insertions(+), 178 deletions(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 4efffc5..98e01b9 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -135,7 +135,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -340,7 +340,7 @@ "[5 rows x 24 columns]" ] }, - "execution_count": 6, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -359,7 +359,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -467,7 +467,7 @@ "[1 rows x 24 columns]" ] }, - "execution_count": 7, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -478,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -574,7 +574,7 @@ "[1 rows x 24 columns]" ] }, - "execution_count": 8, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } @@ -585,7 +585,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -618,7 +618,7 @@ "dtype: int64" ] }, - "execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } @@ -629,7 +629,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -637,12 +637,12 @@ "text/plain": [ "count 10916574\n", "unique 10916574\n", - "top 0000-0002-5454-7613\n", + "top 0000-0001-8786-4765\n", "freq 1\n", "Name: orcid, dtype: object" ] }, - "execution_count": 10, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -660,7 +660,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -673,7 +673,7 @@ "Name: primary_email, dtype: object" ] }, - "execution_count": 20, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -691,7 +691,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -703,7 +703,7 @@ "Name: primary_email, dtype: object" ] }, - "execution_count": 21, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -714,7 +714,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "metadata": {}, "outputs": [ { @@ -749,6 +749,7 @@ " urls\n", " primary_email\n", " ...\n", + " employment\n", " n_works\n", " works_source\n", " activation_date\n", @@ -758,7 +759,6 @@ " n_pmc\n", " n_other_pids\n", " label\n", - " primary_email_domain\n", " \n", " \n", " \n", @@ -775,6 +775,7 @@ " NaN\n", " maykin@owasp.org\n", " ...\n", + " NaN\n", " 0\n", " NaN\n", " 2020-10-23t17:51:51.925z\n", @@ -784,7 +785,6 @@ " 0\n", " 0\n", " 0\n", - " owasp.org\n", " \n", " \n", " 6347224\n", @@ -799,6 +799,7 @@ " NaN\n", " maykin@owasp.org\n", " ...\n", + " NaN\n", " 0\n", " NaN\n", " 2020-09-15t04:43:55.709z\n", @@ -808,11 +809,10 @@ " 0\n", " 0\n", " 0\n", - " owasp.org\n", " \n", " \n", "\n", - "

2 rows × 25 columns

\n", + "

2 rows × 24 columns

\n", "" ], "text/plain": [ @@ -824,22 +824,18 @@ "4450046 maykin warasart NaN NaN NaN maykin@owasp.org \n", "6347224 maykin warasart NaN NaN NaN maykin@owasp.org \n", "\n", - " ... n_works works_source activation_date \\\n", - "4450046 ... 0 NaN 2020-10-23t17:51:51.925z \n", - "6347224 ... 0 NaN 2020-09-15t04:43:55.709z \n", + " ... employment n_works works_source activation_date \\\n", + "4450046 ... NaN 0 NaN 2020-10-23t17:51:51.925z \n", + "6347224 ... NaN 0 NaN 2020-09-15t04:43:55.709z \n", "\n", - " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", - "4450046 2021-01-01t15:00:52.053z 0 0 0 0 0 \n", - "6347224 2020-09-15t05:17:28.509z 0 0 0 0 0 \n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \n", + "4450046 2021-01-01t15:00:52.053z 0 0 0 0 0 \n", + "6347224 2020-09-15t05:17:28.509z 0 0 0 0 0 \n", "\n", - " primary_email_domain \n", - "4450046 owasp.org \n", - "6347224 owasp.org \n", - "\n", - "[2 rows x 25 columns]" + "[2 rows x 24 columns]" ] }, - "execution_count": 22, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -850,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 13, "metadata": {}, "outputs": [ { @@ -885,6 +881,7 @@ " urls\n", " primary_email\n", " ...\n", + " employment\n", " n_works\n", " works_source\n", " activation_date\n", @@ -894,7 +891,6 @@ " n_pmc\n", " n_other_pids\n", " label\n", - " primary_email_domain\n", " \n", " \n", " \n", @@ -911,6 +907,7 @@ " NaN\n", " opercin@erbakan.edu.tr\n", " ...\n", + " NaN\n", " 0\n", " NaN\n", " 2015-01-12t13:47:55.549z\n", @@ -920,7 +917,6 @@ " 0\n", " 0\n", " 0\n", - " erbakan.edu.tr\n", " \n", " \n", " 9529005\n", @@ -935,6 +931,7 @@ " NaN\n", " opercin@erbakan.edu.tr\n", " ...\n", + " [[, necmettin erbakan university, konya, , tr,...\n", " 0\n", " NaN\n", " 2015-10-13t05:47:12.014z\n", @@ -944,11 +941,10 @@ " 0\n", " 0\n", " 0\n", - " erbakan.edu.tr\n", " \n", " \n", "\n", - "

2 rows × 25 columns

\n", + "

2 rows × 24 columns

\n", "" ], "text/plain": [ @@ -960,22 +956,26 @@ "6840791 osman perçin NaN NaN NaN \n", "9529005 osman perçin NaN NaN NaN \n", "\n", - " primary_email ... n_works works_source \\\n", - "6840791 opercin@erbakan.edu.tr ... 0 NaN \n", - "9529005 opercin@erbakan.edu.tr ... 0 NaN \n", + " primary_email ... \\\n", + "6840791 opercin@erbakan.edu.tr ... \n", + "9529005 opercin@erbakan.edu.tr ... \n", "\n", - " activation_date last_update_date n_doi n_arxiv \\\n", - "6840791 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z 0 0 \n", - "9529005 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z 0 0 \n", + " employment n_works \\\n", + "6840791 NaN 0 \n", + "9529005 [[, necmettin erbakan university, konya, , tr,... 0 \n", "\n", - " n_pmc n_other_pids label primary_email_domain \n", - "6840791 0 0 0 erbakan.edu.tr \n", - "9529005 0 0 0 erbakan.edu.tr \n", + " works_source activation_date last_update_date \\\n", + "6840791 NaN 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z \n", + "9529005 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n", "\n", - "[2 rows x 25 columns]" + " n_doi n_arxiv n_pmc n_other_pids label \n", + "6840791 0 0 0 0 0 \n", + "9529005 0 0 0 0 0 \n", + "\n", + "[2 rows x 24 columns]" ] }, - "execution_count": 23, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -986,7 +986,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -1021,6 +1021,7 @@ " urls\n", " primary_email\n", " ...\n", + " employment\n", " n_works\n", " works_source\n", " activation_date\n", @@ -1030,7 +1031,6 @@ " n_pmc\n", " n_other_pids\n", " label\n", - " primary_email_domain\n", " \n", " \n", " \n", @@ -1047,6 +1047,7 @@ " NaN\n", " patrick.davey@monash.edu\n", " ...\n", + " [[phd student, monash university, melbourne, ,...\n", " 0\n", " NaN\n", " 2019-05-09t23:01:02.170z\n", @@ -1056,7 +1057,6 @@ " 0\n", " 0\n", " 0\n", - " monash.edu\n", " \n", " \n", " 7027865\n", @@ -1071,6 +1071,7 @@ " NaN\n", " patrick.davey@monash.edu\n", " ...\n", + " [[phd student, monash university, melbourne, v...\n", " 1\n", " [crossref]\n", " 2018-09-11t10:47:10.997z\n", @@ -1080,11 +1081,10 @@ " 0\n", " 0\n", " 1\n", - " monash.edu\n", " \n", " \n", "\n", - "

2 rows × 25 columns

\n", + "

2 rows × 24 columns

\n", "" ], "text/plain": [ @@ -1096,22 +1096,26 @@ "944993 patrick davey NaN NaN NaN \n", "7027865 patrick davey NaN NaN NaN \n", "\n", - " primary_email ... n_works works_source \\\n", - "944993 patrick.davey@monash.edu ... 0 NaN \n", - "7027865 patrick.davey@monash.edu ... 1 [crossref] \n", + " primary_email ... \\\n", + "944993 patrick.davey@monash.edu ... \n", + "7027865 patrick.davey@monash.edu ... \n", "\n", - " activation_date last_update_date n_doi n_arxiv \\\n", - "944993 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z 0 0 \n", - "7027865 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z 1 0 \n", + " employment n_works \\\n", + "944993 [[phd student, monash university, melbourne, ,... 0 \n", + "7027865 [[phd student, monash university, melbourne, v... 1 \n", "\n", - " n_pmc n_other_pids label primary_email_domain \n", - "944993 0 0 0 monash.edu \n", - "7027865 0 0 1 monash.edu \n", + " works_source activation_date last_update_date \\\n", + "944993 NaN 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z \n", + "7027865 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n", "\n", - "[2 rows x 25 columns]" + " n_doi n_arxiv n_pmc n_other_pids label \n", + "944993 0 0 0 0 0 \n", + "7027865 1 0 0 0 1 \n", + "\n", + "[2 rows x 24 columns]" ] }, - "execution_count": 24, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -1122,16 +1126,16 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)" + "df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])" ] }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 16, "metadata": {}, "outputs": [ { @@ -1144,7 +1148,7 @@ "Name: primary_email_domain, dtype: object" ] }, - "execution_count": 26, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } @@ -1155,7 +1159,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 17, "metadata": {}, "outputs": [ { @@ -1254,19 +1258,22 @@ "[17089 rows x 1 columns]" ] }, - "execution_count": 27, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)\n", - "primary_emails" + "top_primary_emails = df[['primary_email_domain', 'orcid']]\\\n", + " .groupby('primary_email_domain')\\\n", + " .count()\\\n", + " .sort_values('orcid', ascending=False)\n", + "top_primary_emails" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -2154,7 +2161,7 @@ } }, "title": { - "text": "Top 30 email domains" + "text": "Top-30 email domains" }, "xaxis": { "range": [ @@ -2169,9 +2176,9 @@ } }, "text/html": [ - "