diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index a807910..323c070 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -13,9 +13,9 @@ "source": [ "TODO:\n", "- Understanding the reason for fake profiles can bring insight on how to catch them (could be trivial with prior knowledge, e.g., SEO hacking => URLs)\n", - "- Make casistics (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)\n", - "- Temporal dimension of any use?\n", - "- Can we access private info thanks to the OpenAIRE-ORCID agreement?\n" + "- Study different cases (e.g. author publishing with empty orcid, author publishing but not on OpenAIRE, etc.)\n", + "- Temporal dimension; is it of any use?\n", + "- Can we access private info thanks to the OpenAIRE-ORCID agreement? No.\n" ] }, { @@ -50,13 +50,14 @@ ], "source": [ "import glob\n", - "\n", - "import pandas as pd\n", "import ast\n", "import tldextract\n", + "\n", "import numpy as np\n", + "import pandas as pd\n", "\n", "import antispam\n", + "import profanity_check\n", "\n", "import plotly\n", "from plotly.offline import iplot, init_notebook_mode\n", @@ -125,13 +126,17 @@ "metadata": {}, "outputs": [], "source": [ - "SCAFFOLD = '0000-0001-5004-7761'\n", - "WHATSAPP = '0000-0001-6997-9470'\n", - "PENIS = '0000-0002-3399-7287'\n", - "BITCOIN = '0000-0002-7518-6845'\n", - "FITNESS_CHINA = '0000-0002-1234-835X' # URL record + employment\n", - "CANNABIS = '0000-0002-9025-8632' # URL > 70 + works (REMOVED)\n", - "PLUMBER = '0000-0002-1700-8311' # URL > 10 + works " + "FAKE_HEAP = {\n", + " 'scaffold': '0000-0001-5004-7761',\n", + " 'whatsapp': '0000-0001-6997-9470',\n", + " 'penis': '0000-0002-3399-7287',\n", + " 'bitcoin': '0000-0002-7518-6845',\n", + " 'fitness': '0000-0002-1234-835X', # URL record + employment\n", + " 'cannabis': '0000-0002-9025-8632', # URL > 70 + works (now REMOVED)\n", + " 'plumber': '0000-0002-1700-8311', # URL > 10 + works\n", + " 
'furniture': '0000-0001-7478-4539',\n", + " 'cleaners': '0000-0002-7392-3792'\n", + "}" ] }, { @@ -216,8 +221,8 @@ " NaN\n", " 0\n", " NaN\n", - " 2018-03-02t09:29:16.528z\n", - " 2018-03-02t09:43:07.551z\n", + " 2018-03-02 09:29:16.528000+00:00\n", + " 2018-03-02 09:43:07.551000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -249,8 +254,8 @@ " [[professor, peoples friendship university of ...\n", " 0\n", " NaN\n", - " 2018-04-03t07:50:23.358z\n", - " 2020-03-18t09:42:44.753z\n", + " 2018-04-03 07:50:23.358000+00:00\n", + " 2020-03-18 09:42:44.753000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -282,8 +287,8 @@ " NaN\n", " 0\n", " NaN\n", - " 2019-12-11t15:31:56.388z\n", - " 2020-01-28t15:34:17.309z\n", + " 2019-12-11 15:31:56.388000+00:00\n", + " 2020-01-28 15:34:17.309000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -315,8 +320,8 @@ " [[research scientist, new york university abu ...\n", " 0\n", " NaN\n", - " 2015-08-18t12:36:45.307z\n", - " 2020-09-23t13:37:54.180z\n", + " 2015-08-18 12:36:45.307000+00:00\n", + " 2020-09-23 13:37:54.180000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -348,8 +353,8 @@ " [[researcher (academic), universidad de zarago...\n", " 0\n", " NaN\n", - " 2014-03-10t13:22:01.966z\n", - " 2016-06-14t22:17:54.470z\n", + " 2014-03-10 13:22:01.966000+00:00\n", + " 2016-06-14 22:17:54.470000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -391,33 +396,33 @@ "3 NaN NaN [[research scientist, new york university abu ... \n", "4 NaN NaN [[researcher (academic), universidad de zarago... 
\n", "\n", - " n_works works_source activation_date last_update_date \\\n", - "0 0 NaN 2018-03-02t09:29:16.528z 2018-03-02t09:43:07.551z \n", - "1 0 NaN 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z \n", - "2 0 NaN 2019-12-11t15:31:56.388z 2020-01-28t15:34:17.309z \n", - "3 0 NaN 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z \n", - "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", + " n_works works_source activation_date \\\n", + "0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n", + "1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n", + "2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n", + "3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n", + "4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n", "\n", - " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", - "0 0 0 0 0 False NaN \n", - "1 0 0 0 0 False NaN \n", - "2 0 0 0 0 False NaN \n", - "3 0 0 0 0 False NaN \n", - "4 0 0 0 0 False NaN \n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", + "0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n", + "1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n", + "2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n", + "3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n", + "4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n", "\n", - " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", - "0 NaN NaN \n", - "1 NaN NaN \n", - "2 NaN NaN \n", - "3 NaN NaN \n", - "4 NaN NaN \n", + " label primary_email_domain other_email_domains url_domains n_emails \\\n", + "0 False NaN NaN NaN \n", + "1 False NaN NaN NaN \n", + "2 False NaN NaN NaN \n", + "3 False NaN NaN NaN \n", + "4 False NaN NaN NaN \n", "\n", - " n_education n_employment \n", - "0 \n", - "1 1 \n", - "2 \n", - "3 1 \n", - "4 2 " + " n_urls n_ids n_keywords n_education n_employment \n", + "0 \n", + "1 1 \n", + "2 \n", + "3 1 \n", + "4 2 " ] }, "execution_count": 5, @@ -508,14 +513,14 @@ " data scientist & researcher; scholarly knowled...\n", " NaN\n", " andrea.mannocci@isti.cnr.it\n", - " [open science, data 
science, science of scienc...\n", + " [science of science, open science, research in...\n", " [[scopus author id, 55233589900]]\n", " [[information engineering, ph.d., università d...\n", " [[research associate, istituto di scienza e te...\n", " 37\n", " [scopus - elsevier, crossref metadata search, ...\n", - " 2017-09-12t14:28:33.467z\n", - " 2021-03-17t15:40:07.776z\n", + " 2017-09-12 14:28:33.467000+00:00\n", + " 2021-03-17 15:40:07.776000+00:00\n", " 34\n", " 0\n", " 0\n", @@ -549,7 +554,7 @@ "3073261 andrea.mannocci@isti.cnr.it \n", "\n", " keywords \\\n", - "3073261 [open science, data science, science of scienc... \n", + "3073261 [science of science, open science, research in... \n", "\n", " external_ids \\\n", "3073261 [[scopus author id, 55233589900]] \n", @@ -563,17 +568,17 @@ " works_source \\\n", "3073261 [scopus - elsevier, crossref metadata search, ... \n", "\n", - " activation_date last_update_date n_doi n_arxiv \\\n", - "3073261 2017-09-12t14:28:33.467z 2021-03-17t15:40:07.776z 34 0 \n", + " activation_date last_update_date \\\n", + "3073261 2017-09-12 14:28:33.467000+00:00 2021-03-17 15:40:07.776000+00:00 \n", "\n", - " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", - "3073261 0 60 True isti.cnr.it NaN \n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "3073261 34 0 0 60 True isti.cnr.it \n", "\n", - " url_domains n_emails n_urls n_ids \\\n", - "3073261 [github.io, twitter.com, linkedin.com] 3 1 \n", + " other_email_domains url_domains n_emails \\\n", + "3073261 NaN [github.io, twitter.com, linkedin.com] \n", "\n", - " n_keywords n_education n_employment \n", - "3073261 5 4 5 " + " n_urls n_ids n_keywords n_education n_employment \n", + "3073261 3 1 5 4 5 " ] }, "execution_count": 6, @@ -654,14 +659,14 @@ " <NA>\n", " NaN\n", " <NA>\n", - " [whatsapp gb baixar, whatsapp gb 2020, whatsap...\n", + " [whatsapp gb apk, whatsapp gb, whatsapp gb bai...\n", " NaN\n", " NaN\n", " NaN\n", " 0\n", " 
NaN\n", - " 2020-10-07t10:37:12.237z\n", - " 2020-10-08t02:32:03.935z\n", + " 2020-10-07 10:37:12.237000+00:00\n", + " 2020-10-08 02:32:03.935000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -689,16 +694,19 @@ "9887272 other whatsapp NaN \n", "\n", " keywords external_ids \\\n", - "9887272 [whatsapp gb baixar, whatsapp gb 2020, whatsap... NaN \n", + "9887272 [whatsapp gb apk, whatsapp gb, whatsapp gb bai... NaN \n", "\n", - " education employment n_works works_source activation_date \\\n", - "9887272 NaN NaN 0 NaN 2020-10-07t10:37:12.237z \n", + " education employment n_works works_source \\\n", + "9887272 NaN NaN 0 NaN \n", "\n", - " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", - "9887272 2020-10-08t02:32:03.935z 0 0 0 0 False \n", + " activation_date last_update_date \\\n", + "9887272 2020-10-07 10:37:12.237000+00:00 2020-10-08 02:32:03.935000+00:00 \n", "\n", - " primary_email_domain other_email_domains \\\n", - "9887272 NaN NaN \n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "9887272 0 0 0 0 False NaN \n", + "\n", + " other_email_domains \\\n", + "9887272 NaN \n", "\n", " url_domains n_emails n_urls \\\n", "9887272 [otherwhatsapp.com, im-creator.com, facebook.c... 
27 \n", @@ -713,7 +721,7 @@ } ], "source": [ - "df[df['orcid'] == WHATSAPP]" + "df[df['orcid'] == FAKE_HEAP['whatsapp']]" ] }, { @@ -778,7 +786,7 @@ "text/plain": [ "count 10989649\n", "unique 10989649\n", - "top 0000-0001-5242-3687\n", + "top 0000-0002-6591-4060\n", "freq 1\n", "Name: orcid, dtype: object" ] @@ -807,10 +815,10 @@ { "data": { "text/plain": [ - "count 124722\n", - "unique 124718\n", - "top opercin@erbakan.edu.tr\n", - "freq 2\n", + "count 124722\n", + "unique 124718\n", + "top andycheng2026@163.com\n", + "freq 2\n", "Name: primary_email, dtype: object" ] }, @@ -929,8 +937,8 @@ " NaN\n", " 0\n", " NaN\n", - " 2020-09-15t04:43:55.709z\n", - " 2020-09-15t05:17:28.509z\n", + " 2020-09-15 04:43:55.709000+00:00\n", + " 2020-09-15 05:17:28.509000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -962,8 +970,8 @@ " NaN\n", " 0\n", " NaN\n", - " 2020-10-23t17:51:51.925z\n", - " 2021-01-01t15:00:52.053z\n", + " 2020-10-23 17:51:51.925000+00:00\n", + " 2021-01-01 15:00:52.053000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -996,13 +1004,13 @@ "7543981 NaN NaN NaN NaN 0 NaN \n", "9316843 NaN NaN NaN NaN 0 NaN \n", "\n", - " activation_date last_update_date n_doi n_arxiv \\\n", - "7543981 2020-09-15t04:43:55.709z 2020-09-15t05:17:28.509z 0 0 \n", - "9316843 2020-10-23t17:51:51.925z 2021-01-01t15:00:52.053z 0 0 \n", + " activation_date last_update_date \\\n", + "7543981 2020-09-15 04:43:55.709000+00:00 2020-09-15 05:17:28.509000+00:00 \n", + "9316843 2020-10-23 17:51:51.925000+00:00 2021-01-01 15:00:52.053000+00:00 \n", "\n", - " n_pmc n_other_pids label primary_email_domain \\\n", - "7543981 0 0 False owasp.org \n", - "9316843 0 0 False owasp.org \n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "7543981 0 0 0 0 False owasp.org \n", + "9316843 0 0 0 0 False owasp.org \n", "\n", " other_email_domains url_domains n_emails n_urls n_ids \\\n", "7543981 [dga.or.th] NaN 1 \n", @@ -1097,8 +1105,8 @@ " NaN\n", " 0\n", " NaN\n", - " 2015-01-12t13:47:55.549z\n", - 
" 2020-01-27t07:38:24.269z\n", + " 2015-01-12 13:47:55.549000+00:00\n", + " 2020-01-27 07:38:24.269000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -1130,8 +1138,8 @@ " [[, necmettin erbakan university, konya, , tr,...\n", " 0\n", " NaN\n", - " 2015-10-13t05:47:12.014z\n", - " 2020-12-25t13:52:03.976z\n", + " 2015-10-13 05:47:12.014000+00:00\n", + " 2020-12-25 13:52:03.976000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -1168,21 +1176,21 @@ "347852 NaN 0 \n", "1681787 [[, necmettin erbakan university, konya, , tr,... 0 \n", "\n", - " works_source activation_date last_update_date \\\n", - "347852 NaN 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z \n", - "1681787 NaN 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z \n", + " works_source activation_date \\\n", + "347852 NaN 2015-01-12 13:47:55.549000+00:00 \n", + "1681787 NaN 2015-10-13 05:47:12.014000+00:00 \n", "\n", - " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", - "347852 0 0 0 0 False erbakan.edu.tr \n", - "1681787 0 0 0 0 False erbakan.edu.tr \n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", + "347852 2020-01-27 07:38:24.269000+00:00 0 0 0 0 \n", + "1681787 2020-12-25 13:52:03.976000+00:00 0 0 0 0 \n", "\n", - " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", - "347852 NaN NaN \n", - "1681787 NaN NaN \n", + " label primary_email_domain other_email_domains url_domains n_emails \\\n", + "347852 False erbakan.edu.tr NaN NaN \n", + "1681787 False erbakan.edu.tr NaN NaN \n", "\n", - " n_education n_employment \n", - "347852 \n", - "1681787 1 " + " n_urls n_ids n_keywords n_education n_employment \n", + "347852 \n", + "1681787 1 " ] }, "execution_count": 13, @@ -1263,14 +1271,14 @@ " <NA>\n", " NaN\n", " patrick.davey@monash.edu\n", - " [radiochemistry, inorganic chemistry, bioinorg...\n", + " [inorganic chemistry, radiopharmaceuticals, ra...\n", " NaN\n", " NaN\n", " [[phd student, monash university, melbourne, ,...\n", " 0\n", " NaN\n", - " 
2019-05-09t23:01:02.170z\n", - " 2019-08-20t03:00:17.844z\n", + " 2019-05-09 23:01:02.170000+00:00\n", + " 2019-08-20 03:00:17.844000+00:00\n", " 0\n", " 0\n", " 0\n", @@ -1302,8 +1310,8 @@ " [[phd student, monash university, melbourne, v...\n", " 1\n", " [crossref]\n", - " 2018-09-11t10:47:10.997z\n", - " 2021-02-09t06:21:44.138z\n", + " 2018-09-11 10:47:10.997000+00:00\n", + " 2021-02-09 06:21:44.138000+00:00\n", " 1\n", " 0\n", " 0\n", @@ -1337,28 +1345,28 @@ "5590332 patrick.davey@monash.edu \n", "\n", " keywords external_ids \\\n", - "954085 [radiochemistry, inorganic chemistry, bioinorg... NaN \n", + "954085 [inorganic chemistry, radiopharmaceuticals, ra... NaN \n", "5590332 NaN NaN \n", "\n", " education employment n_works \\\n", "954085 NaN [[phd student, monash university, melbourne, ,... 0 \n", "5590332 NaN [[phd student, monash university, melbourne, v... 1 \n", "\n", - " works_source activation_date last_update_date \\\n", - "954085 NaN 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z \n", - "5590332 [crossref] 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z \n", + " works_source activation_date \\\n", + "954085 NaN 2019-05-09 23:01:02.170000+00:00 \n", + "5590332 [crossref] 2018-09-11 10:47:10.997000+00:00 \n", "\n", - " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", - "954085 0 0 0 0 False monash.edu \n", - "5590332 1 0 0 0 True monash.edu \n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n", + "954085 2019-08-20 03:00:17.844000+00:00 0 0 0 0 \n", + "5590332 2021-02-09 06:21:44.138000+00:00 1 0 0 0 \n", "\n", - " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", - "954085 NaN NaN 4 \n", - "5590332 NaN NaN \n", + " label primary_email_domain other_email_domains url_domains n_emails \\\n", + "954085 False monash.edu NaN NaN \n", + "5590332 True monash.edu NaN NaN \n", "\n", - " n_education n_employment \n", - "954085 1 \n", - "5590332 1 " + " n_urls n_ids n_keywords n_education n_employment \n", + 
"954085 4 1 \n", + "5590332 1 " ] }, "execution_count": 14, @@ -2413,9 +2421,9 @@ } }, "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "BIO_SNIPPET = 'really cool to find an entire community of people'\n", + "dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)].explode('url_domains').groupby('url_domains')[['orcid']].count().sort_values('orcid', ascending=False)\n", + "\n", + "set_top_n(50)\n", + "data = [\n", + " go.Bar(\n", + " x=dup_bios_df[:TOP_N].index,\n", + " y=dup_bios_df[:TOP_N]['orcid']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='URL distribution for bio \"%s\"' % BIO_SNIPPET,\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Dup bios dates**" + ] + }, + { + "cell_type": "code", + "execution_count": 130, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "histfunc": "count", + "type": "histogram", + "x": [ + "2020-11-06T06:10:20.070000+00:00", + "2020-11-13T01:04:19.859000+00:00", + "2020-11-05T00:38:21.096000+00:00", + "2020-12-08T05:38:30.786000+00:00", + "2020-12-10T08:54:56.127000+00:00", + "2020-11-20T09:11:08.356000+00:00", + "2020-12-10T05:26:14.534000+00:00", + "2020-12-04T02:41:11.756000+00:00", + "2020-11-26T04:16:15.824000+00:00", + "2020-10-12T04:58:17.220000+00:00", + "2020-12-08T00:35:36.543000+00:00", + "2020-11-30T01:30:22.357000+00:00", + "2020-10-19T01:51:35.391000+00:00", + "2020-12-07T04:43:46.569000+00:00", + "2020-11-12T06:33:38.112000+00:00", + "2020-10-20T05:55:09.939000+00:00", + "2020-11-18T02:15:05.122000+00:00", + "2020-12-04T00:53:17.885000+00:00", + "2020-10-16T02:03:07.922000+00:00", + "2020-12-07T01:05:49.858000+00:00", + "2020-12-09T09:51:39.412000+00:00", + "2020-10-16T05:25:52.218000+00:00", + 
"2021-01-04T15:49:35.727000+00:00", + "2020-12-21T13:42:08.792000+00:00", + "2020-12-07T05:45:44.145000+00:00", + "2020-12-01T06:01:05.133000+00:00", + "2020-12-10T03:52:43.220000+00:00", + "2020-12-08T10:18:31.859000+00:00", + "2020-12-22T13:49:16.908000+00:00", + "2020-12-04T04:35:16.628000+00:00", + "2020-12-04T06:16:50.173000+00:00", + "2020-11-23T01:24:52.965000+00:00", + "2020-12-18T13:09:47.463000+00:00", + "2020-12-09T03:17:51.528000+00:00", + "2020-10-01T00:37:22.535000+00:00", + "2020-12-07T04:11:41.932000+00:00", + "2020-12-10T02:07:49.921000+00:00", + "2020-12-07T02:03:15.901000+00:00", + "2020-12-08T01:10:25.988000+00:00", + "2020-12-09T02:50:31.195000+00:00", + "2020-12-10T02:31:54.901000+00:00", + "2020-12-04T06:49:59.988000+00:00", + "2020-12-20T14:35:28.033000+00:00", + "2020-11-13T02:36:00.122000+00:00", + "2020-12-02T01:51:22.669000+00:00", + "2020-11-18T00:37:25.591000+00:00", + "2020-11-05T01:53:15.027000+00:00", + "2020-10-09T00:29:08.619000+00:00", + "2020-10-28T01:35:34.572000+00:00", + "2020-10-05T04:42:51.206000+00:00", + "2020-12-11T07:30:16.966000+00:00", + "2020-11-27T05:05:54.359000+00:00", + "2020-12-04T01:13:35.987000+00:00", + "2020-12-04T02:52:36.139000+00:00", + "2020-12-09T04:01:45.997000+00:00", + "2020-12-07T07:27:31.433000+00:00", + "2020-11-02T01:46:58.533000+00:00", + "2020-11-16T01:34:33.962000+00:00", + "2020-10-08T00:35:13.677000+00:00", + "2020-12-07T01:01:40.495000+00:00", + "2020-10-08T00:40:42.151000+00:00", + "2020-12-09T16:53:58.895000+00:00", + "2020-12-03T04:17:44.994000+00:00", + "2020-10-29T00:32:03.420000+00:00", + "2020-10-21T00:33:19.700000+00:00", + "2020-12-07T02:01:03.230000+00:00", + "2020-11-27T03:03:57.166000+00:00", + "2020-11-20T00:31:02.803000+00:00", + "2020-12-10T00:44:32.141000+00:00", + "2020-11-06T03:30:11.523000+00:00", + "2020-12-08T01:31:33.785000+00:00", + "2020-12-03T04:42:29.095000+00:00", + "2020-11-30T03:07:50.342000+00:00", + "2020-12-10T03:57:58.215000+00:00", + 
"2020-12-08T05:30:35.820000+00:00", + "2020-10-19T01:48:20.788000+00:00", + "2020-12-10T02:03:52.708000+00:00", + "2020-11-02T00:49:49.169000+00:00", + "2020-10-23T02:48:08.535000+00:00", + "2020-12-04T07:08:18.268000+00:00", + "2020-12-09T05:49:49.132000+00:00", + "2020-11-19T04:33:54.243000+00:00", + "2020-11-20T08:31:18.007000+00:00", + "2020-12-24T09:34:06.934000+00:00", + "2020-11-12T05:14:22.824000+00:00", + "2020-10-01T02:31:37.319000+00:00", + "2020-12-03T06:09:53.307000+00:00", + "2020-12-08T06:22:48.990000+00:00", + "2020-10-02T00:35:57.966000+00:00", + "2020-10-23T01:46:33.848000+00:00", + "2020-12-11T00:42:31.593000+00:00", + "2020-12-08T01:58:02.205000+00:00", + "2020-12-04T07:31:58.866000+00:00", + "2020-10-23T07:14:02.472000+00:00", + "2020-12-09T00:36:57.747000+00:00", + "2020-11-23T02:12:15.138000+00:00", + "2020-12-01T05:56:25.276000+00:00", + "2020-10-19T00:36:24.880000+00:00", + "2020-10-01T02:14:30.144000+00:00", + "2020-10-06T04:09:04.132000+00:00", + "2020-11-04T07:11:18.875000+00:00", + "2020-12-07T02:01:23.937000+00:00", + "2020-11-25T00:29:34.048000+00:00", + "2020-10-20T04:34:33.072000+00:00", + "2020-12-07T09:28:58.928000+00:00", + "2020-12-10T06:14:55.197000+00:00", + "2020-11-24T01:59:39.104000+00:00", + "2020-12-10T10:30:17.368000+00:00", + "2020-12-04T03:52:11.968000+00:00", + "2020-11-03T01:36:12.954000+00:00", + "2020-12-08T20:44:30.147000+00:00", + "2020-11-30T02:53:09.083000+00:00", + "2020-10-23T07:15:19.894000+00:00", + "2020-10-12T01:51:02.464000+00:00", + "2020-11-03T00:35:56.866000+00:00", + "2020-11-02T03:00:22.222000+00:00", + "2020-12-08T07:56:01.181000+00:00", + "2020-11-09T01:54:10.680000+00:00", + "2020-12-09T02:20:15.699000+00:00", + "2020-12-10T03:39:00.227000+00:00", + "2020-12-08T06:04:10.336000+00:00", + "2020-12-09T05:14:08.617000+00:00", + "2020-12-23T09:36:08.353000+00:00", + "2020-10-15T05:07:24.672000+00:00", + "2020-12-10T00:37:21.558000+00:00", + "2020-10-20T03:04:21.659000+00:00", + 
"2020-12-08T07:53:40.113000+00:00", + "2020-10-07T03:03:36.133000+00:00", + "2020-10-28T01:58:35.294000+00:00", + "2020-11-26T00:29:40.992000+00:00", + "2020-11-17T07:17:47.484000+00:00", + "2020-12-02T02:29:02.429000+00:00", + "2020-10-29T05:26:54.912000+00:00", + "2020-12-02T22:59:57.295000+00:00", + "2020-10-13T04:45:07.662000+00:00", + "2020-12-18T17:12:04.145000+00:00", + "2020-10-15T02:09:30.964000+00:00", + "2020-11-13T08:00:46.351000+00:00", + "2020-12-01T06:14:27.962000+00:00", + "2020-12-10T07:02:35.739000+00:00", + "2020-10-26T00:26:09.410000+00:00", + "2020-11-04T00:58:53.122000+00:00", + "2020-10-26T06:21:04.196000+00:00", + "2020-12-10T07:07:19.553000+00:00", + "2020-11-25T00:49:47.126000+00:00", + "2020-11-18T04:22:49.488000+00:00", + "2020-11-27T01:33:55.500000+00:00", + "2020-12-22T09:45:55.961000+00:00", + "2020-10-26T01:29:08.608000+00:00", + "2020-12-08T02:45:05.088000+00:00", + "2020-10-20T01:49:48.227000+00:00", + "2020-12-08T01:05:07.944000+00:00", + "2020-12-09T07:20:54.278000+00:00", + "2020-10-12T00:32:18.880000+00:00", + "2020-11-26T02:04:06.845000+00:00", + "2020-11-27T00:42:50.071000+00:00", + "2020-12-09T00:42:15.741000+00:00", + "2020-10-13T01:33:56.576000+00:00", + "2020-12-09T00:55:26.653000+00:00", + "2020-12-02T00:34:06.686000+00:00", + "2020-11-25T04:24:02.933000+00:00", + "2020-10-20T00:35:19.784000+00:00", + "2020-12-08T07:18:00.879000+00:00", + "2020-10-05T05:31:24.831000+00:00", + "2020-11-26T06:10:56.539000+00:00", + "2020-10-16T00:59:50.730000+00:00", + "2020-10-05T00:44:54.638000+00:00", + "2020-11-24T03:32:10.726000+00:00", + "2020-10-29T02:14:36.912000+00:00", + "2020-10-28T04:32:29.960000+00:00", + "2020-12-03T02:27:55.773000+00:00", + "2020-10-13T03:09:38.953000+00:00", + "2020-11-03T00:32:33.060000+00:00", + "2020-11-23T00:28:42.098000+00:00", + "2020-10-05T07:23:56.871000+00:00", + "2020-12-08T07:41:30.994000+00:00", + "2020-10-13T01:53:14.768000+00:00", + "2020-11-19T07:33:12.511000+00:00", + 
"2020-11-19T02:22:58.970000+00:00", + "2020-12-03T01:02:12.893000+00:00", + "2020-12-10T09:19:53.009000+00:00", + "2020-12-03T01:57:58.432000+00:00", + "2020-11-12T01:53:16.920000+00:00", + "2020-12-10T05:08:26.115000+00:00", + "2020-12-03T03:19:39.822000+00:00", + "2020-11-25T06:51:58.033000+00:00", + "2020-11-03T06:11:51.922000+00:00", + "2020-10-27T04:51:49.250000+00:00", + "2020-12-01T07:40:33.026000+00:00", + "2020-11-24T00:39:57.593000+00:00", + "2020-12-11T07:22:37.229000+00:00", + "2020-11-06T01:49:14.557000+00:00", + "2020-12-09T00:46:22.474000+00:00", + "2020-10-09T00:07:27.401000+00:00", + "2020-10-21T04:56:50.213000+00:00", + "2020-12-09T00:38:12.761000+00:00", + "2020-10-19T03:09:40.210000+00:00", + "2020-12-08T07:42:26.163000+00:00", + "2020-12-09T04:41:45.743000+00:00", + "2020-12-10T00:38:34.572000+00:00", + "2020-10-06T04:23:17.821000+00:00", + "2020-10-01T04:41:04.450000+00:00", + "2020-10-29T01:59:03.954000+00:00", + "2020-12-08T03:35:22.720000+00:00", + "2020-12-10T02:17:41.809000+00:00", + "2020-11-23T07:45:27.508000+00:00", + "2020-12-09T07:32:59.237000+00:00", + "2020-11-16T04:30:45.959000+00:00", + "2020-10-26T02:57:43.771000+00:00", + "2020-12-10T02:57:23.162000+00:00", + "2020-11-27T04:06:45.649000+00:00", + "2020-12-08T00:20:19.678000+00:00", + "2020-10-15T01:44:03.511000+00:00", + "2020-10-14T01:42:51.911000+00:00", + "2020-10-30T00:42:28.296000+00:00", + "2020-10-26T00:37:39.903000+00:00", + "2020-10-26T05:10:04.027000+00:00", + "2020-10-14T01:46:26.356000+00:00", + "2020-10-05T06:03:47.887000+00:00", + "2020-10-22T03:10:26.276000+00:00", + "2020-12-08T05:41:34.740000+00:00", + "2020-12-07T06:59:30.009000+00:00", + "2020-10-06T05:41:06.471000+00:00", + "2020-11-27T02:12:42.102000+00:00", + "2020-12-08T01:53:47.265000+00:00", + "2020-10-27T05:22:02.063000+00:00", + "2020-12-08T04:12:44.753000+00:00", + "2020-10-13T00:37:29.576000+00:00", + "2020-12-02T03:46:17.210000+00:00", + "2020-12-08T00:33:43.254000+00:00", + 
"2020-12-09T03:34:43.573000+00:00", + "2020-10-07T00:40:13.702000+00:00", + "2020-12-10T00:21:20.069000+00:00", + "2020-12-03T18:40:50.995000+00:00", + "2020-12-09T15:11:47.332000+00:00", + "2020-12-26T05:24:05.862000+00:00", + "2020-10-06T00:43:30.492000+00:00", + "2020-11-02T05:03:21.859000+00:00", + "2020-10-09T03:10:19.759000+00:00", + "2020-12-10T06:32:56.543000+00:00", + "2020-10-14T00:31:14.753000+00:00", + "2020-10-15T03:07:59.357000+00:00", + "2020-10-09T02:05:42.975000+00:00", + "2020-11-10T04:06:00.132000+00:00", + "2020-11-19T06:10:05.185000+00:00", + "2020-10-07T04:57:02.723000+00:00", + "2020-11-10T05:20:38.400000+00:00", + "2020-12-01T03:16:28.604000+00:00", + "2020-12-11T00:22:57.205000+00:00", + "2020-12-08T00:40:31.488000+00:00", + "2020-12-09T03:57:54.146000+00:00", + "2020-11-02T07:45:42.838000+00:00", + "2020-10-16T03:43:52.975000+00:00", + "2020-12-07T02:34:45.335000+00:00", + "2020-12-10T05:26:59.077000+00:00", + "2020-10-06T02:51:09.557000+00:00", + "2020-12-03T17:36:44.267000+00:00", + "2020-12-09T07:15:59.846000+00:00", + "2020-12-09T04:20:24.180000+00:00", + "2020-10-12T00:29:41.477000+00:00", + "2020-10-08T03:09:24.761000+00:00", + "2020-11-10T00:44:33.074000+00:00", + "2020-10-06T00:44:53.643000+00:00", + "2020-10-29T00:40:45.821000+00:00", + "2020-10-22T01:40:49.877000+00:00", + "2020-10-05T00:45:40.224000+00:00", + "2020-12-04T05:56:11.632000+00:00", + "2020-12-09T03:12:40.086000+00:00", + "2020-12-07T01:16:29.015000+00:00", + "2020-10-13T00:32:41.200000+00:00", + "2020-12-11T05:10:03.095000+00:00", + "2020-10-14T04:47:45.211000+00:00", + "2020-11-09T03:37:34.513000+00:00", + "2020-11-09T00:41:45.173000+00:00", + "2020-12-08T01:50:10.568000+00:00", + "2020-11-24T01:51:28.207000+00:00", + "2020-12-10T01:52:37.083000+00:00", + "2020-12-22T11:47:29.012000+00:00", + "2020-10-01T01:14:36.461000+00:00", + "2020-12-07T07:24:21.357000+00:00", + "2020-11-05T04:56:52.777000+00:00", + "2020-12-11T05:15:34.655000+00:00", + 
"2020-11-05T03:01:48.301000+00:00", + "2020-11-11T03:56:13.111000+00:00", + "2020-12-09T06:11:25.359000+00:00", + "2020-12-09T04:28:58.267000+00:00", + "2020-10-30T07:19:59.994000+00:00", + "2020-10-07T06:09:58.118000+00:00", + "2020-11-04T02:32:20.006000+00:00", + "2020-12-08T00:34:59.437000+00:00", + "2020-12-10T03:32:11.013000+00:00", + "2020-10-27T02:55:23.288000+00:00", + "2020-11-06T01:46:33.352000+00:00", + "2020-12-07T02:28:40.834000+00:00", + "2020-11-04T07:18:51.293000+00:00", + "2020-10-08T04:37:16.253000+00:00", + "2020-12-09T08:26:09.172000+00:00", + "2020-12-09T06:41:51.112000+00:00", + "2020-11-03T02:49:25.793000+00:00", + "2020-11-06T00:38:50.208000+00:00", + "2020-10-21T01:39:42.463000+00:00", + "2020-12-08T02:04:35.965000+00:00", + "2020-12-27T12:39:28.524000+00:00", + "2020-11-20T05:49:25.708000+00:00", + "2020-12-04T06:25:21.013000+00:00", + "2020-11-16T00:26:54.233000+00:00", + "2020-11-03T04:40:55.485000+00:00", + "2020-11-25T08:24:26.620000+00:00", + "2020-10-28T00:33:46.915000+00:00", + "2020-12-10T06:01:53.737000+00:00", + "2020-11-04T05:13:14.840000+00:00", + "2020-11-18T05:44:57.361000+00:00", + "2020-12-07T00:36:46.723000+00:00", + "2020-11-16T02:42:38.500000+00:00", + "2020-11-30T00:27:02.900000+00:00", + "2020-12-10T02:04:37.554000+00:00", + "2020-12-09T02:02:01.646000+00:00", + "2020-12-07T03:42:54.669000+00:00", + "2020-12-27T03:56:52.049000+00:00", + "2020-11-09T00:38:44.818000+00:00", + "2020-12-07T06:47:43.421000+00:00", + "2020-12-10T04:13:00.551000+00:00", + "2020-12-26T15:17:09.480000+00:00", + "2020-11-05T02:09:35.869000+00:00", + "2020-12-08T02:01:25.880000+00:00", + "2020-11-19T00:47:12.305000+00:00", + "2020-10-09T01:56:49.349000+00:00", + "2020-10-15T00:32:01.688000+00:00", + "2020-10-27T02:41:01.334000+00:00", + "2020-11-18T02:12:25.671000+00:00", + "2020-10-07T04:58:13.870000+00:00", + "2020-12-07T02:20:06.535000+00:00", + "2020-10-12T03:10:36.308000+00:00", + "2020-10-29T03:18:20.673000+00:00", + 
"2020-12-09T08:06:54.526000+00:00", + "2020-11-12T03:17:05.140000+00:00", + "2020-10-01T01:15:09.993000+00:00", + "2020-11-11T00:51:14.004000+00:00", + "2020-11-26T01:43:11.269000+00:00", + "2020-11-24T00:43:00.335000+00:00", + "2020-11-30T04:44:39.496000+00:00", + "2020-10-23T00:30:18.009000+00:00", + "2020-10-16T02:12:55.314000+00:00", + "2020-12-02T06:24:05.665000+00:00", + "2021-01-20T10:24:17.206000+00:00", + "2020-11-05T00:36:01.999000+00:00", + "2020-12-02T02:41:16.199000+00:00", + "2020-12-03T00:30:22.897000+00:00", + "2020-12-10T03:39:20.103000+00:00", + "2020-11-11T02:04:59.354000+00:00", + "2020-12-08T06:38:19.302000+00:00", + "2020-12-01T00:35:19.625000+00:00", + "2020-11-11T05:45:47.110000+00:00", + "2020-10-28T00:35:44.890000+00:00", + "2020-12-07T04:16:41.797000+00:00", + "2020-11-26T02:16:11.427000+00:00", + "2020-11-23T02:32:21.420000+00:00", + "2020-11-30T03:20:48.098000+00:00", + "2020-12-09T00:51:34.359000+00:00", + "2020-12-21T15:37:42.646000+00:00", + "2020-12-04T02:14:35.901000+00:00", + "2020-12-04T04:47:02.247000+00:00", + "2020-10-07T00:37:11.738000+00:00", + "2020-11-10T01:59:01.119000+00:00", + "2020-10-02T02:48:15.987000+00:00", + "2020-10-22T01:52:32.093000+00:00", + "2020-11-23T00:21:54.782000+00:00", + "2020-12-08T06:25:58.310000+00:00", + "2020-10-02T01:45:44.168000+00:00", + "2020-11-09T02:00:09.022000+00:00", + "2020-11-30T01:55:34.221000+00:00", + "2020-12-07T00:32:11.199000+00:00", + "2020-12-06T17:32:11.180000+00:00", + "2020-11-13T05:32:08.198000+00:00", + "2020-12-08T07:42:30.666000+00:00", + "2020-10-30T00:41:50.497000+00:00", + "2020-10-21T00:36:51.459000+00:00", + "2020-10-22T00:30:26.343000+00:00", + "2020-10-14T00:33:42.972000+00:00", + "2020-11-26T02:58:59.509000+00:00", + "2020-10-08T04:50:07.561000+00:00", + "2020-10-21T02:47:27.539000+00:00", + "2020-11-12T00:38:10.297000+00:00", + "2020-11-20T00:59:49.538000+00:00", + "2020-12-09T01:59:11.916000+00:00", + "2020-12-11T04:13:54.427000+00:00", + 
"2020-10-02T03:18:52.565000+00:00", + "2020-12-07T04:29:04.602000+00:00", + "2020-12-06T18:46:28.549000+00:00", + "2020-10-19T00:30:26.334000+00:00", + "2020-12-10T02:10:10.657000+00:00", + "2020-12-08T00:49:04.595000+00:00", + "2020-10-14T02:59:56.059000+00:00", + "2020-10-27T03:50:08.036000+00:00", + "2020-11-25T06:06:07.910000+00:00", + "2020-10-21T02:17:39.198000+00:00", + "2020-10-02T01:44:17.197000+00:00", + "2020-12-08T05:04:35.419000+00:00", + "2020-10-23T00:25:01.106000+00:00", + "2020-12-10T01:37:48.987000+00:00", + "2020-10-08T05:35:12.525000+00:00", + "2020-10-28T02:41:42.671000+00:00", + "2020-10-22T00:32:37.607000+00:00", + "2020-10-27T06:18:42.715000+00:00", + "2020-11-27T02:22:36.165000+00:00", + "2020-12-20T12:04:18.314000+00:00", + "2020-11-24T05:33:20.884000+00:00", + "2020-12-10T00:32:51.611000+00:00", + "2020-10-15T00:30:54.576000+00:00", + "2020-10-19T05:15:33.712000+00:00", + "2020-11-06T00:34:06.186000+00:00", + "2020-10-02T00:34:50.494000+00:00", + "2020-11-25T01:47:57.398000+00:00", + "2020-12-26T13:27:41.589000+00:00", + "2020-10-22T06:11:02.945000+00:00", + "2020-12-03T02:00:33.684000+00:00", + "2020-11-18T00:39:17.492000+00:00", + "2020-11-19T00:31:54.080000+00:00", + "2020-10-16T00:32:26.207000+00:00" + ], + "xbins": { + "size": "D1" + }, + "y": [ + "0000-0002-7397-7977", + "0000-0003-4931-9736", + "0000-0001-8221-2303", + "0000-0001-6736-072X", + "0000-0002-8727-1246", + "0000-0001-6760-9521", + "0000-0001-9283-9441", + "0000-0002-4732-4729", + "0000-0002-9827-9374", + "0000-0002-6834-0023", + "0000-0002-2002-1963", + "0000-0002-6761-8230", + "0000-0003-2879-0537", + "0000-0002-8132-9689", + "0000-0002-8780-3628", + "0000-0002-1638-362X", + "0000-0002-2288-2476", + "0000-0003-1668-4649", + "0000-0002-0274-0892", + "0000-0002-2226-8564", + "0000-0002-4644-3793", + "0000-0002-1450-0757", + "0000-0003-0934-7898", + "0000-0001-8857-1227", + "0000-0001-7391-5859", + "0000-0001-7855-7181", + "0000-0003-2092-5417", + "0000-0003-2802-4779", + 
"0000-0003-4446-8089", + "0000-0002-4169-2694", + "0000-0003-3544-8879", + "0000-0003-4183-5576", + "0000-0002-8715-8892", + "0000-0003-2084-7316", + "0000-0002-9104-1662", + "0000-0002-4643-1215", + "0000-0003-3672-4863", + "0000-0003-3822-4088", + "0000-0001-8056-2510", + "0000-0003-0116-611X", + "0000-0002-7866-9236", + "0000-0002-9483-7383", + "0000-0001-6180-8810", + "0000-0003-0815-2327", + "0000-0001-8577-7780", + "0000-0001-9143-4151", + "0000-0003-2213-5611", + "0000-0003-4137-5282", + "0000-0001-8543-9183", + "0000-0002-1346-7860", + "0000-0001-9141-0715", + "0000-0002-8923-182X", + "0000-0001-5466-8100", + "0000-0002-4847-5422", + "0000-0002-8769-5698", + "0000-0003-4107-9766", + "0000-0002-6027-4105", + "0000-0001-5782-8739", + "0000-0002-5911-6433", + "0000-0001-8021-0961", + "0000-0001-6247-4330", + "0000-0002-4491-1901", + "0000-0002-7736-940X", + "0000-0001-6119-7669", + "0000-0003-3877-5373", + "0000-0002-8929-7212", + "0000-0002-2459-2675", + "0000-0002-8156-5059", + "0000-0002-7114-5886", + "0000-0001-5097-977X", + "0000-0003-3138-197X", + "0000-0003-4083-2496", + "0000-0002-0338-3890", + "0000-0001-9872-5998", + "0000-0003-1091-0852", + "0000-0002-8561-142X", + "0000-0002-6052-6368", + "0000-0002-2862-2552", + "0000-0003-1164-9246", + "0000-0002-0726-7555", + "0000-0002-7800-1463", + "0000-0003-4868-5507", + "0000-0002-2049-316X", + "0000-0002-1295-2055", + "0000-0001-9281-8579", + "0000-0002-7810-3574", + "0000-0002-4910-1078", + "0000-0002-7877-8643", + "0000-0002-4809-8129", + "0000-0003-0739-261X", + "0000-0002-4555-6171", + "0000-0002-4828-8969", + "0000-0001-7065-7567", + "0000-0002-1043-5679", + "0000-0001-8942-822X", + "0000-0002-6106-9883", + "0000-0003-4561-1406", + "0000-0001-8834-2336", + "0000-0001-9293-2603", + "0000-0002-2299-2931", + "0000-0003-0529-408X", + "0000-0001-6040-4697", + "0000-0002-1772-6567", + "0000-0002-7328-7845", + "0000-0002-3856-3242", + "0000-0003-2768-672X", + "0000-0002-2564-7148", + "0000-0003-3095-4430", + 
"0000-0003-4298-9059", + "0000-0002-1322-595X", + "0000-0002-6346-5062", + "0000-0002-7684-1346", + "0000-0002-0837-7668", + "0000-0002-8547-0647", + "0000-0001-9393-1805", + "0000-0002-9071-6023", + "0000-0002-5953-3958", + "0000-0002-1698-5831", + "0000-0002-1338-9604", + "0000-0002-9586-0999", + "0000-0003-1137-9039", + "0000-0003-3480-0367", + "0000-0002-1491-2583", + "0000-0002-7894-3856", + "0000-0003-4977-7817", + "0000-0002-3459-7437", + "0000-0002-3725-5483", + "0000-0002-5538-8140", + "0000-0002-6151-3200", + "0000-0002-7630-5682", + "0000-0002-7681-0021", + "0000-0003-4306-9019", + "0000-0001-5546-8688", + "0000-0003-4431-5437", + "0000-0001-5968-1718", + "0000-0002-4091-7791", + "0000-0002-4137-7205", + "0000-0003-0572-8757", + "0000-0002-4864-0758", + "0000-0003-0756-5509", + "0000-0003-4096-1634", + "0000-0002-2413-4439", + "0000-0002-4679-0998", + "0000-0002-4645-269X", + "0000-0002-5922-3001", + "0000-0001-7256-5228", + "0000-0002-5105-8515", + "0000-0003-0098-7936", + "0000-0003-1866-3411", + "0000-0001-6111-5547", + "0000-0003-3525-3940", + "0000-0002-3644-3503", + "0000-0002-6028-7627", + "0000-0003-3175-5572", + "0000-0002-8287-7222", + "0000-0003-1595-1389", + "0000-0001-8377-8843", + "0000-0002-6968-6931", + "0000-0001-9338-4573", + "0000-0001-9504-9306", + "0000-0003-0254-7141", + "0000-0001-8275-9325", + "0000-0002-0702-1256", + "0000-0003-4922-7292", + "0000-0002-4815-8953", + "0000-0003-4222-2069", + "0000-0002-7586-5477", + "0000-0001-7531-6702", + "0000-0002-1764-6137", + "0000-0002-1936-2859", + "0000-0002-1110-0694", + "0000-0002-4580-1528", + "0000-0003-2354-3537", + "0000-0002-1112-0425", + "0000-0003-0226-9700", + "0000-0002-0750-4004", + "0000-0003-0798-5503", + "0000-0002-5724-3092", + "0000-0002-8037-1148", + "0000-0002-9075-6957", + "0000-0003-0273-3948", + "0000-0001-8105-0541", + "0000-0001-5240-8946", + "0000-0002-1255-6722", + "0000-0002-2248-9076", + "0000-0002-2482-3031", + "0000-0001-6610-4716", + "0000-0002-2679-385X", + 
"0000-0001-7351-8749", + "0000-0003-1870-2011", + "0000-0002-5780-1605", + "0000-0001-9548-6661", + "0000-0001-7956-0921", + "0000-0001-6077-1083", + "0000-0002-4312-9694", + "0000-0003-2726-990X", + "0000-0002-2310-0990", + "0000-0003-1218-6052", + "0000-0003-4137-9291", + "0000-0002-7546-3240", + "0000-0001-9632-6459", + "0000-0002-0317-7042", + "0000-0002-7785-451X", + "0000-0001-5329-7467", + "0000-0002-1872-3667", + "0000-0003-1180-8753", + "0000-0003-0011-1520", + "0000-0002-7218-6057", + "0000-0003-0225-1324", + "0000-0001-8055-8992", + "0000-0002-3019-6791", + "0000-0003-2859-603X", + "0000-0001-9119-6026", + "0000-0002-0587-2233", + "0000-0002-9511-9754", + "0000-0003-1517-3898", + "0000-0003-3993-8541", + "0000-0003-0208-4394", + "0000-0003-2322-3859", + "0000-0002-5105-7504", + "0000-0002-6613-9166", + "0000-0001-5793-9375", + "0000-0002-8887-0646", + "0000-0001-5618-4820", + "0000-0001-8160-5658", + "0000-0003-4635-2733", + "0000-0002-4032-956X", + "0000-0001-8926-0922", + "0000-0002-4366-3395", + "0000-0001-6056-0964", + "0000-0002-3423-891X", + "0000-0002-1547-4204", + "0000-0002-4324-7121", + "0000-0001-7540-3339", + "0000-0001-8624-3027", + "0000-0002-3475-2388", + "0000-0001-5049-4721", + "0000-0002-0549-2812", + "0000-0003-4352-3234", + "0000-0002-0285-6940", + "0000-0001-5883-3337", + "0000-0001-7918-7071", + "0000-0003-0270-8849", + "0000-0002-1059-9753", + "0000-0002-7330-596X", + "0000-0001-8805-3681", + "0000-0003-0272-1883", + "0000-0002-3679-6886", + "0000-0003-3718-4774", + "0000-0002-5438-2716", + "0000-0002-4805-891X", + "0000-0003-1151-7112", + "0000-0002-1649-4661", + "0000-0003-2956-5278", + "0000-0002-8016-3111", + "0000-0003-2827-1227", + "0000-0003-2145-4717", + "0000-0003-2028-9510", + "0000-0003-4424-6881", + "0000-0001-7474-2988", + "0000-0002-2510-6501", + "0000-0003-3582-6294", + "0000-0002-1343-126X", + "0000-0002-4595-2588", + "0000-0002-9526-2876", + "0000-0003-1158-7057", + "0000-0002-3345-5154", + "0000-0002-8244-6924", + 
"0000-0003-4806-3951", + "0000-0002-7464-3688", + "0000-0003-2647-2359", + "0000-0001-6073-387X", + "0000-0001-8230-5606", + "0000-0001-9108-194X", + "0000-0002-1674-6427", + "0000-0002-6355-1382", + "0000-0003-3632-7673", + "0000-0002-5140-4932", + "0000-0003-2316-2347", + "0000-0003-2793-1027", + "0000-0002-8554-3738", + "0000-0001-6673-083X", + "0000-0001-7378-174X", + "0000-0002-1614-9163", + "0000-0002-8282-0330", + "0000-0003-3527-7336", + "0000-0001-7189-2147", + "0000-0002-3344-3903", + "0000-0003-1786-3489", + "0000-0001-7108-8036", + "0000-0003-4078-2193", + "0000-0001-9352-3676", + "0000-0001-9856-8531", + "0000-0002-9276-8003", + "0000-0003-0872-8082", + "0000-0002-7824-1395", + "0000-0001-5224-7353", + "0000-0002-5281-2110", + "0000-0002-3201-6256", + "0000-0003-3697-1370", + "0000-0002-7678-0520", + "0000-0002-3741-371X", + "0000-0002-8132-2356", + "0000-0001-8074-1025", + "0000-0002-7896-7268", + "0000-0002-5536-6005", + "0000-0002-5627-5594", + "0000-0002-2790-8196", + "0000-0002-4814-6303", + "0000-0003-3563-8111", + "0000-0002-3894-8185", + "0000-0001-6608-759X", + "0000-0001-6678-4133", + "0000-0001-6893-5775", + "0000-0002-9472-9307", + "0000-0003-2441-0736", + "0000-0002-8295-024X", + "0000-0002-4322-6590", + "0000-0001-9047-2156", + "0000-0002-9072-6328", + "0000-0002-1625-1774", + "0000-0002-2315-0242", + "0000-0001-6178-4337", + "0000-0003-1272-8819", + "0000-0003-1460-8458", + "0000-0003-4091-5443", + "0000-0001-6647-3570", + "0000-0002-0756-4381", + "0000-0002-7428-5307", + "0000-0002-7699-0579", + "0000-0002-3145-000X", + "0000-0002-9249-230X", + "0000-0002-0511-4600", + "0000-0003-1621-0435", + "0000-0002-6126-7150", + "0000-0002-8161-6469", + "0000-0002-9577-7263", + "0000-0003-4087-3992", + "0000-0002-1295-2450", + "0000-0002-5135-5341", + "0000-0002-7357-3104", + "0000-0002-2913-3400", + "0000-0003-0271-5678", + "0000-0002-3427-5032", + "0000-0002-2550-0496", + "0000-0001-6399-6536", + "0000-0001-7317-8000", + "0000-0002-0357-9789", + 
"0000-0002-8995-3531", + "0000-0003-4421-9234", + "0000-0002-6004-465X", + "0000-0002-8739-4473", + "0000-0002-5576-1865", + "0000-0002-6978-9068", + "0000-0001-9918-496X", + "0000-0003-2243-8460", + "0000-0003-2778-4266", + "0000-0003-4318-6821", + "0000-0002-7600-7845", + "0000-0002-8469-1288", + "0000-0003-4684-4148", + "0000-0002-7312-128X", + "0000-0001-5236-2001", + "0000-0001-8692-9651", + "0000-0002-8544-0202", + "0000-0002-8884-3592", + "0000-0002-1231-9903", + "0000-0003-0407-7913", + "0000-0001-7196-6679", + "0000-0002-1319-1358", + "0000-0001-7305-3240", + "0000-0002-9910-2765", + "0000-0003-4952-1491", + "0000-0002-7466-2256", + "0000-0002-9221-8730", + "0000-0001-9116-7882", + "0000-0002-8938-6004", + "0000-0002-8684-4321", + "0000-0003-0818-6607", + "0000-0001-6798-4401", + "0000-0001-6500-9047", + "0000-0002-2693-5627", + "0000-0002-7256-6340", + "0000-0002-0194-6362", + "0000-0002-5125-5092", + "0000-0002-7197-2355", + "0000-0002-4535-0958", + "0000-0001-6476-7321", + "0000-0003-3345-4757", + "0000-0003-4179-1155", + "0000-0002-5536-0156", + "0000-0002-6206-8444", + "0000-0001-6570-1594", + "0000-0002-9711-0067", + "0000-0001-5321-0392", + "0000-0003-2488-1263", + "0000-0003-2781-8198", + "0000-0002-3739-6681", + "0000-0003-3577-7963", + "0000-0003-3286-3073", + "0000-0001-8200-3333", + "0000-0002-5813-2765", + "0000-0002-6206-3563", + "0000-0003-3660-364X", + "0000-0002-4134-3566", + "0000-0001-7768-9799", + "0000-0001-5895-047X", + "0000-0002-4722-8681", + "0000-0001-9603-8908", + "0000-0002-0772-1586", + "0000-0001-6797-3964", + "0000-0002-6834-901X", + "0000-0002-1554-8306", + "0000-0002-1879-4262", + "0000-0002-9640-8136", + "0000-0002-6926-3752", + "0000-0002-3655-4713", + "0000-0002-8724-1020", + "0000-0002-4601-4569" + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + 
"type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 
0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + 
"ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + 
"ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": 
"white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "Activation distribution for bio \"more straightforward way to borrow the money you\"" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# Fragment of the duplicated 'car title loans' biography.\n", + "BIO_SNIPPET = 'more straightforward way to borrow the money you'\n", + "# Literal substring match: regex=False keeps the snippet from being parsed as a\n", + "# pattern, na=False maps missing biographies to False so the mask stays boolean.\n", + "dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET, regex=False, na=False)]\n", + "# .groupby(df.activation_date.dt.month)[['orcid']].count().sort_values('orcid', ascending=False)\n", + "\n", + "set_top_n(50)\n", + "# Count the matching profiles per activation timestamp bin.\n", + "data = [\n", + " go.Histogram(\n", + " x=dup_bios_df['activation_date'],\n", + " y=dup_bios_df['orcid'],\n", + " histfunc=\"count\"\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Activation distribution for bio \"%s\"' % BIO_SNIPPET,\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "# One-day bins. For date data Plotly expects the bin size in milliseconds\n", + "# (or 'M<n>' for months); 'D1' is not a documented value.\n", + "fig.update_traces(xbins_size=24 * 60 * 60 * 1000)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Dup bios with extended length**" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcid
biography
car title loans are a more straightforward way to borrow the money you need, without dealing with the hassles of a traditional bank loan. because they use the equity value of your qualifying vehicle to secure funding, they are a great borrowing option for customers with credit issues, or who need expedited funding. our customers know to turn to our local experts when they need auto title loans. our team is excited to get you your loan today!343
hi, how are you? it is really cool to find an entire community of people interested in the same thing you are.229
the sound and the fury is one of my all-time favorite novels but i have many.218
one of my passions is people watching but i dont get to do it as much as i would like.132
why hello there. i can not believe i didnt know this community existed sooner.131
......
mütter, die sich vor der entbindung für einen rückbildungskurs anmelden, belügen sich selbst.denn das gleicht den vorsätzen zu neujahr: gut gemeint,2
<p class=p__2>politically, taking on the affordable care act or not taking it on are both dangerous. while many citizens dont comprehend all that the2
insaat kalip yagi, kalip yag, plywood kalip yagi, ahsap kalip yagi alanlarinda profesyonel ve organik olarak imalat yapan sirketimiz musteri goruslerini son derece onemsemektedir.2
<p class=p__7>since life and medical insurance commissions are front-loaded, agents generally do not get a commission after the 3rd policy renewal.2
ive traveled to several countries and have several more to see. i have a lizard named tinky.2
\n", + "

1619 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " orcid\n", + "biography \n", + "car title loans are a more straightforward way ... 343\n", + "hi, how are you? it is really cool to find an e... 229\n", + "the sound and the fury is one of my all-time fa... 218\n", + "one of my passions is people watching but i don... 132\n", + "why hello there. i can not believe i didnt know... 131\n", + "... ...\n", + "mütter, die sich vor der entbindung für einen r... 2\n", + "

politically, taking on the afford... 2\n", + "insaat kalip yagi, kalip yag, plywood kalip yag... 2\n", + "

since life and medical insurance ... 2\n", + "ive traveled to several countries and have seve... 2\n", + "\n", + "[1619 rows x 1 columns]" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Duplicated bios whose text splits on single spaces into more than 10 tokens.\n", + "dup_bios.loc[\n", + "    dup_bios.index.str.split(' ').str.len() > 10\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Assign spam score from precanned library**" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": {}, + "outputs": [], + "source": [ + "# bios = df[df.biography.notna()][['orcid', 'biography']]" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "metadata": {}, + "outputs": [], + "source": [ + "# def score(bio):\n", + "# try:\n", + "# return antispam.score(bio)\n", + "# except: # if len(bio) < 3 the filter doesn't know how to handle that\n", + "# return -1" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "# bios['spam_score'] = bios.biography.apply(lambda bio: score(bio))" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [], + "source": [ + "# bios[bios.spam_score == -1] # these are artefacts (no scoring possible)" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [], 
+ "source": [ + "# bios.spam_score.replace(to_replace=-1, value=np.nan, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": {}, + "outputs": [], + "source": [ + "# bios.spam_score.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [], + "source": [ + "# bios[bios.spam_score > 0.99]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The spam-score approach leads nowhere." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Search for offensive words, sexually explicit content, etc.**" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# bios['profanity_score'] = profanity_check.predict_prob(bios.biography)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [], + "source": [ + "# bios[bios.profanity_score > 0.90]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Profanity detection leads nowhere either." 
] }, { @@ -19468,7 +22591,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 97, "metadata": {}, "outputs": [ { @@ -19499,9 +22622,8 @@ "n_education", "n_employment", "n_ext_work_source", - "spam_score", - "n_valid_employment", - "n_valid_education" + "n_valid_education", + "n_valid_employment" ], "xaxis": "x", "y": [ @@ -19520,370 +22642,332 @@ "n_education", "n_employment", "n_ext_work_source", - "spam_score", - "n_valid_employment", - "n_valid_education" + "n_valid_education", + "n_valid_employment" ], "yaxis": "y", "z": [ [ 1, - 0.9851287263502948, - 0.032912669844356764, - 0.030955329523625504, - 0.002797563048323001, - 0.01323756721452204, - 0.026255505701943696, - 0.05741408837128926, - 0.01569597153718797, - 0.040989235686940816, - 0.061568740672497534, - 0.03696794447170714, - 0.05352081277803731, - 0.04004869062451576, - 0.09668574568788427, - 0.020983848636741746, - 0.04230149069451306, - 0.0530823224770852 + 0.9649829131836175, + 0.07899833525811681, + 0.07259719921935885, + 0.0064613638682561435, + 0.030614701011724112, + 0.0606246420123506, + 0.1240658917743258, + 0.03267144560134065, + 0.10712349577355784, + 0.14475767748321952, + 0.10243044622702734, + 0.22284375415000315, + 0.2240544946507108, + 0.22551664240183317, + 0.21764209610282456, + 0.22363930402583765 ], [ - 0.9851287263502948, + 0.9649829131836175, 1, - 0.032893021887808044, - 0.030952716284260284, - 0.0027246992919826396, - 0.013197039191265785, - 0.026305729103779554, - 0.056657974238298256, - 0.013593908269146495, - 0.04075930430401838, - 0.06093122306222735, - 0.036427174310613826, - 0.05283100363748757, - 0.039617047413162375, - 0.09617813225153261, - 0.020779934453739678, - 0.04187757510946746, - 0.05252647681980516 + 0.08183974046700901, + 0.07518160639621203, + 0.0066860590291805974, + 0.031712353459948744, + 0.06277678931008057, + 0.12998640687807267, + 0.03226243840878624, + 0.11113100230411314, + 0.15032740706571793, + 0.10624021870034253, + 
0.23154024673662948, + 0.23250920740301625, + 0.2327233169990374, + 0.22610062830832708, + 0.2320611339644608 ], [ - 0.032912669844356764, - 0.032893021887808044, + 0.07899833525811681, + 0.08183974046700901, 1, - 0.9347845603401734, - 0.31289353882027604, - 0.3327034204536421, - 0.8256148606228672, - 0.24175475571155877, - 0.025014569456040185, - 0.15426190915221324, - 0.3477514643689929, - 0.13281808409522428, - 0.06673049906774123, - 0.14909994555141065, - 0.3919479197818211, - 0.11430255435331263, - 0.1530900744997813, - 0.04224623793246877 + 0.9378726254398347, + 0.3126299250047347, + 0.35108563893979355, + 0.8353346326813307, + 0.22240613268720477, + 0.03160640765461562, + 0.14838588295615024, + 0.37171182274445363, + 0.1506365739780303, + 0.13686193479055792, + 0.21343320832924814, + 0.4005951950706468, + 0.127757181278294, + 0.21359436822476655 ], [ - 0.030955329523625504, - 0.030952716284260284, - 0.9347845603401734, + 0.07259719921935885, + 0.07518160639621203, + 0.9378726254398347, 1, - 0.3551317964752695, - 0.3396714358684357, - 0.792393006061876, - 0.23812475099223263, - 0.021917404276208984, - 0.12119446342488618, - 0.3286894436814909, - 0.11073297656642778, - 0.048318069131114486, - 0.12097737688298277, - 0.37572530981238894, - 0.08706480424871708, - 0.1367349622160526, - 0.03356612824419668 + 0.35605399617723354, + 0.3624050122938972, + 0.8018196175347003, + 0.2178190759174422, + 0.028320630207299337, + 0.12192573243272048, + 0.3525468899622581, + 0.12916204989780206, + 0.11736450285212531, + 0.18470550214116468, + 0.3834831573219326, + 0.11103974478415263, + 0.18991605863836233 ], [ - 0.002797563048323001, - 0.0027246992919826396, - 0.31289353882027604, - 0.3551317964752695, + 0.0064613638682561435, + 0.0066860590291805974, + 0.3126299250047347, + 0.35605399617723354, 1, - -0.0006813987707150902, - 0.2282600668610334, - 0.01778638684849228, - 0.0013809757631778703, - 0.006444532015829912, - 0.004329998524016476, - 0.002683817677771984, - 
0.0013582777338929175, - 0.012499036555991529, - 0.01818525196078464, - 0.0005840365476183185, - 0.016598234185244483, - 0.0016583682855190094 + 0.0009072282179230706, + 0.2420914875526222, + 0.01776888092417036, + 0.002099309887982074, + 0.0064144255162447246, + 0.009269883208277058, + 0.005433864001670957, + 0.008619933999683011, + 0.015077339853222701, + 0.023203121401780318, + 0.008480130351469113, + 0.016480069731931876 ], [ - 0.01323756721452204, - 0.013197039191265785, - 0.3327034204536421, - 0.3396714358684357, - -0.0006813987707150902, + 0.030614701011724112, + 0.031712353459948744, + 0.35108563893979355, + 0.3624050122938972, + 0.0009072282179230706, 1, - 0.24340068611465254, - 0.1079192129721149, - 0.004399691864203764, - 0.04628651725406843, - 0.07481405788686601, - 0.042315521608991075, - 0.04004460862097079, - 0.08040739840894055, - 0.16125389614532118, - 0.02926424821268392, - 0.08624271168084971, - 0.02840212050111673 + 0.2570742999530523, + 0.0922145270760206, + 0.00898053907910667, + 0.04988227847309645, + 0.08759311081674451, + 0.04978801517329604, + 0.05364308187508679, + 0.09230230828045376, + 0.15718240355316795, + 0.05003145912334212, + 0.09365323185411886 ], [ - 0.026255505701943696, - 0.026305729103779554, - 0.8256148606228672, - 0.792393006061876, - 0.2282600668610334, - 0.24340068611465254, + 0.0606246420123506, + 0.06277678931008057, + 0.8353346326813307, + 0.8018196175347003, + 0.2420914875526222, + 0.2570742999530523, 1, - 0.19116755828206128, - 0.01771983580822778, - 0.1252448169099988, - 0.3322716889228379, - 0.09961147764401061, - 0.041174205561030665, - 0.10958484453530018, - 0.33496675669425113, - 0.09286341691577868, - 0.11434288833407659, - 0.025414430857047858 + 0.17311687493499073, + 0.022792492767692595, + 0.12058320737626094, + 0.3393283270986452, + 0.11149215572697663, + 0.09603598655375359, + 0.16336464942113507, + 0.32867917711898453, + 0.08988706485108988, + 0.1640726861251499 ], [ - 0.05741408837128926, - 
0.056657974238298256, - 0.24175475571155877, - 0.23812475099223263, - 0.01778638684849228, - 0.1079192129721149, - 0.19116755828206128, + 0.1240658917743258, + 0.12998640687807267, + 0.22240613268720477, + 0.2178190759174422, + 0.01776888092417036, + 0.0922145270760206, + 0.17311687493499073, 1, - 0.03224992209236746, - 0.11065458894869484, - 0.3192781208585223, - 0.10852551976660856, - 0.05957549731552794, - 0.10279567787680673, - 0.5003162103088351, - 0.04411190681146708, - 0.14858846866838135, - 0.05163986473435027 + 0.03648260346059746, + 0.08904343614326711, + 0.2970254304693113, + 0.11831548180884943, + 0.16253031328738454, + 0.19508445555508772, + 0.4800924470705352, + 0.15718163705271387, + 0.20674076197089802 ], [ - 0.01569597153718797, - 0.013593908269146495, - 0.025014569456040185, - 0.021917404276208984, - 0.0013809757631778703, - 0.004399691864203764, - 0.01771983580822778, - 0.03224992209236746, + 0.03267144560134065, + 0.03226243840878624, + 0.03160640765461562, + 0.028320630207299337, + 0.002099309887982074, + 0.00898053907910667, + 0.022792492767692595, + 0.03648260346059746, 1, - 0.08461189794646956, - 0.05362282810243895, - 0.07093322291648477, - 0.03791228294028205, - 0.0392669868393358, - 0.07145369872483175, - 0.049930352165612917, - 0.03549877798968587, - 0.03111432559287062 + 0.07143241126539773, + 0.06149968615329382, + 0.07528446624958736, + 0.0730249552881224, + 0.07128594621281013, + 0.07937133873035572, + 0.07063161008579438, + 0.06937988840793924 ], [ - 0.040989235686940816, - 0.04075930430401838, - 0.15426190915221324, - 0.12119446342488618, - 0.006444532015829912, - 0.04628651725406843, - 0.1252448169099988, - 0.11065458894869484, - 0.08461189794646956, + 0.10712349577355784, + 0.11113100230411314, + 0.14838588295615024, + 0.12192573243272048, + 0.0064144255162447246, + 0.04988227847309645, + 0.12058320737626094, + 0.08904343614326711, + 0.07143241126539773, 1, - 0.22021014568642164, - 0.3859318893600913, - 0.1423827468461567, - 
0.18096621138083624, - 0.24286823272263663, - 0.21228500020230104, - 0.1459962513695925, - 0.10496069983968939 + 0.2085344284826277, + 0.3756141239879568, + 0.20860391435209405, + 0.2439338448964409, + 0.2262491070521664, + 0.1960308197632984, + 0.22734610676272235 ], [ - 0.061568740672497534, - 0.06093122306222735, - 0.3477514643689929, - 0.3286894436814909, - 0.004329998524016476, - 0.07481405788686601, - 0.3322716889228379, - 0.3192781208585223, - 0.05362282810243895, - 0.22021014568642164, + 0.14475767748321952, + 0.15032740706571793, + 0.37171182274445363, + 0.3525468899622581, + 0.009269883208277058, + 0.08759311081674451, + 0.3393283270986452, + 0.2970254304693113, + 0.06149968615329382, + 0.2085344284826277, 1, - 0.21377525703111794, - 0.09573562665274538, - 0.1469607584988768, - 0.6666060438319036, - 0.11784342341053432, - 0.17059756897606349, - 0.07232857049616995 + 0.23998646957005906, + 0.2584672204668393, + 0.3193726129742757, + 0.6563247307005879, + 0.24730637320826065, + 0.32652990120768727 ], [ - 0.03696794447170714, - 0.036427174310613826, - 0.13281808409522428, - 0.11073297656642778, - 0.002683817677771984, - 0.042315521608991075, - 0.09961147764401061, - 0.10852551976660856, - 0.07093322291648477, - 0.3859318893600913, - 0.21377525703111794, + 0.10243044622702734, + 0.10624021870034253, + 0.1506365739780303, + 0.12916204989780206, + 0.005433864001670957, + 0.04978801517329604, + 0.11149215572697663, + 0.11831548180884943, + 0.07528446624958736, + 0.3756141239879568, + 0.23998646957005906, 1, - 0.15256142173955728, - 0.17722420130407404, - 0.22691037584731275, - 0.23230106942130674, - 0.14594541852321113, - 0.11454051858316387 + 0.28174315114239534, + 0.29513823401207673, + 0.2602571143552704, + 0.2671750926309919, + 0.27827299388502297 ], [ - 0.05352081277803731, - 0.05283100363748757, - 0.06673049906774123, - 0.048318069131114486, - 0.0013582777338929175, - 0.04004460862097079, - 0.041174205561030665, - 0.05957549731552794, - 
0.03791228294028205, - 0.1423827468461567, - 0.09573562665274538, - 0.15256142173955728, + 0.22284375415000315, + 0.23154024673662948, + 0.13686193479055792, + 0.11736450285212531, + 0.008619933999683011, + 0.05364308187508679, + 0.09603598655375359, + 0.16253031328738454, + 0.0730249552881224, + 0.20860391435209405, + 0.2584672204668393, + 0.28174315114239534, 1, - 0.35408552736376164, - 0.1432327734837157, - 0.13359305997797627, - 0.2749705812249971, - 0.7885246402196713 + 0.5935197907835382, + 0.34969846406582145, + 0.9580202466838004, + 0.5673220091683485 ], [ - 0.04004869062451576, - 0.039617047413162375, - 0.14909994555141065, - 0.12097737688298277, - 0.012499036555991529, - 0.08040739840894055, - 0.10958484453530018, - 0.10279567787680673, - 0.0392669868393358, - 0.18096621138083624, - 0.1469607584988768, - 0.17722420130407404, - 0.35408552736376164, + 0.2240544946507108, + 0.23250920740301625, + 0.21343320832924814, + 0.18470550214116468, + 0.015077339853222701, + 0.09230230828045376, + 0.16336464942113507, + 0.19508445555508772, + 0.07128594621281013, + 0.2439338448964409, + 0.3193726129742757, + 0.29513823401207673, + 0.5935197907835382, 1, - 0.1930653267280024, - 0.14496890815575308, - 0.7528620160953167, - 0.26673120117740184 + 0.4068774187637994, + 0.5657949950743488, + 0.9213956311003227 ], [ - 0.09668574568788427, - 0.09617813225153261, - 0.3919479197818211, - 0.37572530981238894, - 0.01818525196078464, - 0.16125389614532118, - 0.33496675669425113, - 0.5003162103088351, - 0.07145369872483175, - 0.24286823272263663, - 0.6666060438319036, - 0.22691037584731275, - 0.1432327734837157, - 0.1930653267280024, + 0.22551664240183317, + 0.2327233169990374, + 0.4005951950706468, + 0.3834831573219326, + 0.023203121401780318, + 0.15718240355316795, + 0.32867917711898453, + 0.4800924470705352, + 0.07937133873035572, + 0.2262491070521664, + 0.6563247307005879, + 0.2602571143552704, + 0.34969846406582145, + 0.4068774187637994, 1, - 0.1297554982537589, - 
0.22390994050434165, - 0.11768415981384495 + 0.33742739872409666, + 0.41450829231197867 ], [ - 0.020983848636741746, - 0.020779934453739678, - 0.11430255435331263, - 0.08706480424871708, - 0.0005840365476183185, - 0.02926424821268392, - 0.09286341691577868, - 0.04411190681146708, - 0.049930352165612917, - 0.21228500020230104, - 0.11784342341053432, - 0.23230106942130674, - 0.13359305997797627, - 0.14496890815575308, - 0.1297554982537589, + 0.21764209610282456, + 0.22610062830832708, + 0.127757181278294, + 0.11103974478415263, + 0.008480130351469113, + 0.05003145912334212, + 0.08988706485108988, + 0.15718163705271387, + 0.07063161008579438, + 0.1960308197632984, + 0.24730637320826065, + 0.2671750926309919, + 0.9580202466838004, + 0.5657949950743488, + 0.33742739872409666, 1, - 0.09868450111186694, - 0.09679171001982584 + 0.5703564611231601 ], [ - 0.04230149069451306, - 0.04187757510946746, - 0.1530900744997813, - 0.1367349622160526, - 0.016598234185244483, - 0.08624271168084971, - 0.11434288833407659, - 0.14858846866838135, - 0.03549877798968587, - 0.1459962513695925, - 0.17059756897606349, - 0.14594541852321113, - 0.2749705812249971, - 0.7528620160953167, - 0.22390994050434165, - 0.09868450111186694, - 1, - 0.34138499506636344 - ], - [ - 0.0530823224770852, - 0.05252647681980516, - 0.04224623793246877, - 0.03356612824419668, - 0.0016583682855190094, - 0.02840212050111673, - 0.025414430857047858, - 0.05163986473435027, - 0.03111432559287062, - 0.10496069983968939, - 0.07232857049616995, - 0.11454051858316387, - 0.7885246402196713, - 0.26673120117740184, - 0.11768415981384495, - 0.09679171001982584, - 0.34138499506636344, + 0.22363930402583765, + 0.2320611339644608, + 0.21359436822476655, + 0.18991605863836233, + 0.016480069731931876, + 0.09365323185411886, + 0.1640726861251499, + 0.20674076197089802, + 0.06937988840793924, + 0.22734610676272235, + 0.32652990120768727, + 0.27827299388502297, + 0.5673220091683485, + 0.9213956311003227, + 0.41450829231197867, + 
0.5703564611231601, 1 ] ] @@ -20764,9 +23848,9 @@ } }, "text/html": [ - "

" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "fig = px.imshow(df[df.label == True].select_dtypes(include=['bool','number']).fillna(-1).corr())\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 99, "metadata": {}, "outputs": [], "source": [ @@ -20822,682 +25201,16 @@ "# 'label']].to_pickle('../data/processed/features.pkl')" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Label speculation" - ] - }, { "cell_type": "code", - "execution_count": 75, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritativespam_score
170000-0002-0137-3066TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2017-07-25t04:34:17.338z2019-11-27t17:54:45.418z0000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
190000-0002-0461-9711TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN2[crossref]2015-08-18t12:42:01.797z2019-12-06t11:37:38.203z2000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
220000-0002-0761-9450TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN1[crossref]2020-05-13t17:15:28.405z2020-08-11t21:00:45.694z1000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
330000-0002-4447-9215TrueTrue<NA><NA><NA>NaN<NA>NaNNaNNaNNaN0NaN2017-07-24t09:37:50.242z2019-11-15t08:31:24.820z0000TrueNaNNaNNaN<NA><NA><NA><NA><NA><NA>NaNNaNFalseNaN
440000-0003-0426-4065TrueTrue<NA><NA><NA>[eliza i. gilbert]<NA>NaNNaNNaN[[, us fish and wildlife service, albuquerque,...0NaN2017-08-07t18:32:31.802z2020-04-08t16:48:55.732z0000TrueNaNNaNNaN<NA><NA><NA><NA><NA>1NaNNaNFalseNaN
.........................................................................................................
109896350000-0002-7340-9697TrueTruetawandamarandure<NA>NaN<NA>NaN[[scopus author id, 48261373600]][[animal science, msc sustainable agriculture,...[[lecturer, zimbabwe open university faculty o...7[scopus - elsevier]2015-11-05t08:52:08.743z2020-12-09t17:59:18.350z7007TrueNaNNaNNaN<NA><NA>1<NA>33[scopus - elsevier]1.0TrueNaN
109896360000-0002-2906-0299TrueTruetiffanymackay<NA>[tiffany russel sia]<NA>[microfluidics, gpc-1, gallium-67, pet/ct, oxy...[[researcherid, a-2121-2017]][[faculty of medicine, master in pharmaceutica...[[clinical project lead, minomic international...11[crossref, researcherid, tiffany mackay]2017-01-03t23:28:48.736z2020-12-09t17:12:20.326z11000TrueNaNNaN[oxytocin.com.au, linkedin.com]<NA>211324[crossref, researcherid]2.0TrueNaN
109896370000-0001-5896-2024TrueTruegiovanni, ltiscia<NA>NaN<NA>NaN[[scopus author id, 54948242800]]NaNNaN70[scopus - elsevier, tiscia giovanni, l, europe...2016-07-27t10:09:13.585z2020-12-07t22:23:05.706z6501752TrueNaNNaNNaN<NA><NA>1<NA><NA><NA>[scopus - elsevier, europe pubmed central, cro...3.0TrueNaN
109896430000-0003-2606-0936TrueTrueluangxu<NA>[xu lu-ang, lu lu]<NA>NaNNaNNaN[[post-doc, institute of biochemistry and cell...2[scopus - elsevier, crossref]2015-10-24t03:53:23.544z2020-11-19t09:23:48.896z2001TrueNaNNaNNaN<NA><NA><NA><NA><NA>1[scopus - elsevier, crossref]2.0TrueNaN
109896450000-0002-3800-6331TrueTruezacharycalamari<NA>NaN<NA>NaNNaN[[richard gilder graduate school, phd in compa...[[assistant professor, baruch college, city un...7[crossref metadata search, zachary t. calamari...2015-01-20t20:20:17.042z2020-11-21t19:48:36.221z7010TrueNaNNaNNaN<NA><NA><NA><NA>22[crossref metadata search, crossref]2.0TrueNaN
\n", - "

2075872 rows × 34 columns

\n", - "
" - ], - "text/plain": [ - " orcid verified_email verified_primary_email \\\n", - "17 0000-0002-0137-3066 True True \n", - "19 0000-0002-0461-9711 True True \n", - "22 0000-0002-0761-9450 True True \n", - "33 0000-0002-4447-9215 True True \n", - "44 0000-0003-0426-4065 True True \n", - "... ... ... ... \n", - "10989635 0000-0002-7340-9697 True True \n", - "10989636 0000-0002-2906-0299 True True \n", - "10989637 0000-0001-5896-2024 True True \n", - "10989643 0000-0003-2606-0936 True True \n", - "10989645 0000-0002-3800-6331 True True \n", - "\n", - " given_names family_name biography other_names \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 [eliza i. gilbert] \n", - "... ... ... ... ... \n", - "10989635 tawanda marandure NaN \n", - "10989636 tiffany mackay [tiffany russel sia] \n", - "10989637 giovanni, l tiscia NaN \n", - "10989643 luang xu [xu lu-ang, lu lu] \n", - "10989645 zachary calamari NaN \n", - "\n", - " primary_email keywords \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 NaN \n", - "... ... ... \n", - "10989635 NaN \n", - "10989636 [microfluidics, gpc-1, gallium-67, pet/ct, oxy... \n", - "10989637 NaN \n", - "10989643 NaN \n", - "10989645 NaN \n", - "\n", - " external_ids \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 NaN \n", - "... ... \n", - "10989635 [[scopus author id, 48261373600]] \n", - "10989636 [[researcherid, a-2121-2017]] \n", - "10989637 [[scopus author id, 54948242800]] \n", - "10989643 NaN \n", - "10989645 NaN \n", - "\n", - " education \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 NaN \n", - "... ... \n", - "10989635 [[animal science, msc sustainable agriculture,... \n", - "10989636 [[faculty of medicine, master in pharmaceutica... \n", - "10989637 NaN \n", - "10989643 NaN \n", - "10989645 [[richard gilder graduate school, phd in compa... 
\n", - "\n", - " employment n_works \\\n", - "17 NaN 0 \n", - "19 NaN 2 \n", - "22 NaN 1 \n", - "33 NaN 0 \n", - "44 [[, us fish and wildlife service, albuquerque,... 0 \n", - "... ... ... \n", - "10989635 [[lecturer, zimbabwe open university faculty o... 7 \n", - "10989636 [[clinical project lead, minomic international... 11 \n", - "10989637 NaN 70 \n", - "10989643 [[post-doc, institute of biochemistry and cell... 2 \n", - "10989645 [[assistant professor, baruch college, city un... 7 \n", - "\n", - " works_source \\\n", - "17 NaN \n", - "19 [crossref] \n", - "22 [crossref] \n", - "33 NaN \n", - "44 NaN \n", - "... ... \n", - "10989635 [scopus - elsevier] \n", - "10989636 [crossref, researcherid, tiffany mackay] \n", - "10989637 [scopus - elsevier, tiscia giovanni, l, europe... \n", - "10989643 [scopus - elsevier, crossref] \n", - "10989645 [crossref metadata search, zachary t. calamari... \n", - "\n", - " activation_date last_update_date n_doi n_arxiv \\\n", - "17 2017-07-25t04:34:17.338z 2019-11-27t17:54:45.418z 0 0 \n", - "19 2015-08-18t12:42:01.797z 2019-12-06t11:37:38.203z 2 0 \n", - "22 2020-05-13t17:15:28.405z 2020-08-11t21:00:45.694z 1 0 \n", - "33 2017-07-24t09:37:50.242z 2019-11-15t08:31:24.820z 0 0 \n", - "44 2017-08-07t18:32:31.802z 2020-04-08t16:48:55.732z 0 0 \n", - "... ... ... ... ... \n", - "10989635 2015-11-05t08:52:08.743z 2020-12-09t17:59:18.350z 7 0 \n", - "10989636 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n", - "10989637 2016-07-27t10:09:13.585z 2020-12-07t22:23:05.706z 65 0 \n", - "10989643 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n", - "10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n", - "\n", - " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n", - "17 0 0 True NaN NaN \n", - "19 0 0 True NaN NaN \n", - "22 0 0 True NaN NaN \n", - "33 0 0 True NaN NaN \n", - "44 0 0 True NaN NaN \n", - "... ... ... ... ... ... 
\n", - "10989635 0 7 True NaN NaN \n", - "10989636 0 0 True NaN NaN \n", - "10989637 17 52 True NaN NaN \n", - "10989643 0 1 True NaN NaN \n", - "10989645 1 0 True NaN NaN \n", - "\n", - " url_domains n_emails n_urls n_ids \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 NaN \n", - "... ... ... ... ... \n", - "10989635 NaN 1 \n", - "10989636 [oxytocin.com.au, linkedin.com] 2 1 \n", - "10989637 NaN 1 \n", - "10989643 NaN \n", - "10989645 NaN \n", - "\n", - " n_keywords n_education n_employment \\\n", - "17 \n", - "19 \n", - "22 \n", - "33 \n", - "44 1 \n", - "... ... ... ... \n", - "10989635 3 3 \n", - "10989636 13 2 4 \n", - "10989637 \n", - "10989643 1 \n", - "10989645 2 2 \n", - "\n", - " ext_works_source \\\n", - "17 NaN \n", - "19 NaN \n", - "22 NaN \n", - "33 NaN \n", - "44 NaN \n", - "... ... \n", - "10989635 [scopus - elsevier] \n", - "10989636 [crossref, researcherid] \n", - "10989637 [scopus - elsevier, europe pubmed central, cro... \n", - "10989643 [scopus - elsevier, crossref] \n", - "10989645 [crossref metadata search, crossref] \n", - "\n", - " n_ext_work_source authoritative spam_score \n", - "17 NaN False NaN \n", - "19 NaN False NaN \n", - "22 NaN False NaN \n", - "33 NaN False NaN \n", - "44 NaN False NaN \n", - "... ... ... ... 
\n", - "10989635 1.0 True NaN \n", - "10989636 2.0 True NaN \n", - "10989637 3.0 True NaN \n", - "10989643 2.0 True NaN \n", - "10989645 2.0 True NaN \n", - "\n", - "[2075872 rows x 34 columns]" - ] - }, - "execution_count": 75, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.label == 1]" - ] - }, - { - "cell_type": "code", - "execution_count": 76, + "execution_count": null, "metadata": {}, "outputs": [], - "source": [ - "# (df.n_works > 0) & (df.n_ids > 1)" - ] + "source": [] }, { "cell_type": "code", - "execution_count": 77, + "execution_count": 100, "metadata": {}, "outputs": [ { @@ -21505,59 +25218,53 @@ "output_type": "stream", "text": [ "\n", - "RangeIndex: 10989649 entries, 0 to 10989648\n", - "Data columns (total 34 columns):\n", - " # Column Dtype \n", - "--- ------ ----- \n", - " 0 orcid object \n", - " 1 verified_email bool \n", - " 2 verified_primary_email bool \n", - " 3 given_names string \n", - " 4 family_name string \n", - " 5 biography string \n", - " 6 other_names object \n", - " 7 primary_email string \n", - " 8 keywords object \n", - " 9 external_ids object \n", - " 10 education object \n", - " 11 employment object \n", - " 12 n_works Int16 \n", - " 13 works_source object \n", - " 14 activation_date string \n", - " 15 last_update_date string \n", - " 16 n_doi Int16 \n", - " 17 n_arxiv Int16 \n", - " 18 n_pmc Int16 \n", - " 19 n_other_pids Int16 \n", - " 20 label bool \n", - " 21 primary_email_domain object \n", - " 22 other_email_domains object \n", - " 23 url_domains object \n", - " 24 n_emails Int16 \n", - " 25 n_urls Int16 \n", - " 26 n_ids Int16 \n", - " 27 n_keywords Int16 \n", - " 28 n_education Int16 \n", - " 29 n_employment Int16 \n", - " 30 ext_works_source object \n", - " 31 n_ext_work_source float64\n", - " 32 authoritative object \n", - " 33 spam_score float64\n", - "dtypes: Int16(11), bool(3), float64(2), object(12), string(6)\n", - "memory usage: 2.0+ GB\n" + "Int64Index: 10989649 entries, 0 to 
10989648\n", + "Data columns (total 35 columns):\n", + " # Column Dtype \n", + "--- ------ ----- \n", + " 0 orcid object \n", + " 1 verified_email bool \n", + " 2 verified_primary_email bool \n", + " 3 given_names string \n", + " 4 family_name string \n", + " 5 biography string \n", + " 6 other_names object \n", + " 7 primary_email string \n", + " 8 keywords object \n", + " 9 external_ids object \n", + " 10 education object \n", + " 11 employment object \n", + " 12 n_works Int16 \n", + " 13 works_source object \n", + " 14 activation_date datetime64[ns, UTC]\n", + " 15 last_update_date datetime64[ns, UTC]\n", + " 16 n_doi Int16 \n", + " 17 n_arxiv Int16 \n", + " 18 n_pmc Int16 \n", + " 19 n_other_pids Int16 \n", + " 20 label bool \n", + " 21 primary_email_domain object \n", + " 22 other_email_domains object \n", + " 23 url_domains object \n", + " 24 n_emails Int16 \n", + " 25 n_urls Int16 \n", + " 26 n_ids Int16 \n", + " 27 n_keywords Int16 \n", + " 28 n_education Int16 \n", + " 29 n_employment Int16 \n", + " 30 ext_works_source object \n", + " 31 n_ext_work_source Int16 \n", + " 32 authoritative object \n", + " 33 n_valid_education float64 \n", + " 34 n_valid_employment float64 \n", + "dtypes: Int16(12), bool(3), datetime64[ns, UTC](2), float64(2), object(12), string(4)\n", + "memory usage: 2.1+ GB\n" ] } ], "source": [ "df.info()" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": {