1 "
]
},
"execution_count": 14,
@@ -2413,9 +2421,9 @@
}
},
"text/html": [
- ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# Text fragment shared by the duplicated biographies under investigation.\n",
+ "BIO_SNIPPET = 'really cool to find an entire community of people'\n",
+ "\n",
+ "# Match the snippet literally (regex=False) and treat missing biographies as\n",
+ "# non-matches (na=False) so the boolean mask never contains NaN.\n",
+ "bio_snippet_mask = df.biography.str.contains(BIO_SNIPPET, regex=False, na=False)\n",
+ "\n",
+ "# Explode url_domains so each entry gets its own row, then count the non-null\n",
+ "# orcid values per domain, largest count first.\n",
+ "dup_bios_df = (\n",
+ "    df[bio_snippet_mask]\n",
+ "    .explode('url_domains')\n",
+ "    .groupby('url_domains')[['orcid']]\n",
+ "    .count()\n",
+ "    .sort_values('orcid', ascending=False)\n",
+ ")\n",
+ "\n",
+ "# NOTE(review): set_top_n is defined elsewhere in the notebook; presumably it\n",
+ "# updates the TOP_N global used below - confirm.\n",
+ "set_top_n(50)\n",
+ "top_dup_bios_df = dup_bios_df[:TOP_N]\n",
+ "\n",
+ "data = [\n",
+ "    go.Bar(\n",
+ "        x=top_dup_bios_df.index,\n",
+ "        y=top_dup_bios_df['orcid']\n",
+ "    )\n",
+ "]\n",
+ "\n",
+ "layout = go.Layout(\n",
+ "    title='URL distribution for bio \"%s\"' % BIO_SNIPPET,\n",
+ "    xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
+ ")\n",
+ "fig = go.Figure(data=data, layout=layout)\n",
+ "plotly.offline.iplot(fig)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Dates of duplicate bios**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 130,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "application/vnd.plotly.v1+json": {
+ "config": {
+ "linkText": "Export to plot.ly",
+ "plotlyServerURL": "https://plot.ly",
+ "showLink": false
+ },
+ "data": [
+ {
+ "histfunc": "count",
+ "type": "histogram",
+ "x": [
+ "2020-11-06T06:10:20.070000+00:00",
+ "2020-11-13T01:04:19.859000+00:00",
+ "2020-11-05T00:38:21.096000+00:00",
+ "2020-12-08T05:38:30.786000+00:00",
+ "2020-12-10T08:54:56.127000+00:00",
+ "2020-11-20T09:11:08.356000+00:00",
+ "2020-12-10T05:26:14.534000+00:00",
+ "2020-12-04T02:41:11.756000+00:00",
+ "2020-11-26T04:16:15.824000+00:00",
+ "2020-10-12T04:58:17.220000+00:00",
+ "2020-12-08T00:35:36.543000+00:00",
+ "2020-11-30T01:30:22.357000+00:00",
+ "2020-10-19T01:51:35.391000+00:00",
+ "2020-12-07T04:43:46.569000+00:00",
+ "2020-11-12T06:33:38.112000+00:00",
+ "2020-10-20T05:55:09.939000+00:00",
+ "2020-11-18T02:15:05.122000+00:00",
+ "2020-12-04T00:53:17.885000+00:00",
+ "2020-10-16T02:03:07.922000+00:00",
+ "2020-12-07T01:05:49.858000+00:00",
+ "2020-12-09T09:51:39.412000+00:00",
+ "2020-10-16T05:25:52.218000+00:00",
+ "2021-01-04T15:49:35.727000+00:00",
+ "2020-12-21T13:42:08.792000+00:00",
+ "2020-12-07T05:45:44.145000+00:00",
+ "2020-12-01T06:01:05.133000+00:00",
+ "2020-12-10T03:52:43.220000+00:00",
+ "2020-12-08T10:18:31.859000+00:00",
+ "2020-12-22T13:49:16.908000+00:00",
+ "2020-12-04T04:35:16.628000+00:00",
+ "2020-12-04T06:16:50.173000+00:00",
+ "2020-11-23T01:24:52.965000+00:00",
+ "2020-12-18T13:09:47.463000+00:00",
+ "2020-12-09T03:17:51.528000+00:00",
+ "2020-10-01T00:37:22.535000+00:00",
+ "2020-12-07T04:11:41.932000+00:00",
+ "2020-12-10T02:07:49.921000+00:00",
+ "2020-12-07T02:03:15.901000+00:00",
+ "2020-12-08T01:10:25.988000+00:00",
+ "2020-12-09T02:50:31.195000+00:00",
+ "2020-12-10T02:31:54.901000+00:00",
+ "2020-12-04T06:49:59.988000+00:00",
+ "2020-12-20T14:35:28.033000+00:00",
+ "2020-11-13T02:36:00.122000+00:00",
+ "2020-12-02T01:51:22.669000+00:00",
+ "2020-11-18T00:37:25.591000+00:00",
+ "2020-11-05T01:53:15.027000+00:00",
+ "2020-10-09T00:29:08.619000+00:00",
+ "2020-10-28T01:35:34.572000+00:00",
+ "2020-10-05T04:42:51.206000+00:00",
+ "2020-12-11T07:30:16.966000+00:00",
+ "2020-11-27T05:05:54.359000+00:00",
+ "2020-12-04T01:13:35.987000+00:00",
+ "2020-12-04T02:52:36.139000+00:00",
+ "2020-12-09T04:01:45.997000+00:00",
+ "2020-12-07T07:27:31.433000+00:00",
+ "2020-11-02T01:46:58.533000+00:00",
+ "2020-11-16T01:34:33.962000+00:00",
+ "2020-10-08T00:35:13.677000+00:00",
+ "2020-12-07T01:01:40.495000+00:00",
+ "2020-10-08T00:40:42.151000+00:00",
+ "2020-12-09T16:53:58.895000+00:00",
+ "2020-12-03T04:17:44.994000+00:00",
+ "2020-10-29T00:32:03.420000+00:00",
+ "2020-10-21T00:33:19.700000+00:00",
+ "2020-12-07T02:01:03.230000+00:00",
+ "2020-11-27T03:03:57.166000+00:00",
+ "2020-11-20T00:31:02.803000+00:00",
+ "2020-12-10T00:44:32.141000+00:00",
+ "2020-11-06T03:30:11.523000+00:00",
+ "2020-12-08T01:31:33.785000+00:00",
+ "2020-12-03T04:42:29.095000+00:00",
+ "2020-11-30T03:07:50.342000+00:00",
+ "2020-12-10T03:57:58.215000+00:00",
+ "2020-12-08T05:30:35.820000+00:00",
+ "2020-10-19T01:48:20.788000+00:00",
+ "2020-12-10T02:03:52.708000+00:00",
+ "2020-11-02T00:49:49.169000+00:00",
+ "2020-10-23T02:48:08.535000+00:00",
+ "2020-12-04T07:08:18.268000+00:00",
+ "2020-12-09T05:49:49.132000+00:00",
+ "2020-11-19T04:33:54.243000+00:00",
+ "2020-11-20T08:31:18.007000+00:00",
+ "2020-12-24T09:34:06.934000+00:00",
+ "2020-11-12T05:14:22.824000+00:00",
+ "2020-10-01T02:31:37.319000+00:00",
+ "2020-12-03T06:09:53.307000+00:00",
+ "2020-12-08T06:22:48.990000+00:00",
+ "2020-10-02T00:35:57.966000+00:00",
+ "2020-10-23T01:46:33.848000+00:00",
+ "2020-12-11T00:42:31.593000+00:00",
+ "2020-12-08T01:58:02.205000+00:00",
+ "2020-12-04T07:31:58.866000+00:00",
+ "2020-10-23T07:14:02.472000+00:00",
+ "2020-12-09T00:36:57.747000+00:00",
+ "2020-11-23T02:12:15.138000+00:00",
+ "2020-12-01T05:56:25.276000+00:00",
+ "2020-10-19T00:36:24.880000+00:00",
+ "2020-10-01T02:14:30.144000+00:00",
+ "2020-10-06T04:09:04.132000+00:00",
+ "2020-11-04T07:11:18.875000+00:00",
+ "2020-12-07T02:01:23.937000+00:00",
+ "2020-11-25T00:29:34.048000+00:00",
+ "2020-10-20T04:34:33.072000+00:00",
+ "2020-12-07T09:28:58.928000+00:00",
+ "2020-12-10T06:14:55.197000+00:00",
+ "2020-11-24T01:59:39.104000+00:00",
+ "2020-12-10T10:30:17.368000+00:00",
+ "2020-12-04T03:52:11.968000+00:00",
+ "2020-11-03T01:36:12.954000+00:00",
+ "2020-12-08T20:44:30.147000+00:00",
+ "2020-11-30T02:53:09.083000+00:00",
+ "2020-10-23T07:15:19.894000+00:00",
+ "2020-10-12T01:51:02.464000+00:00",
+ "2020-11-03T00:35:56.866000+00:00",
+ "2020-11-02T03:00:22.222000+00:00",
+ "2020-12-08T07:56:01.181000+00:00",
+ "2020-11-09T01:54:10.680000+00:00",
+ "2020-12-09T02:20:15.699000+00:00",
+ "2020-12-10T03:39:00.227000+00:00",
+ "2020-12-08T06:04:10.336000+00:00",
+ "2020-12-09T05:14:08.617000+00:00",
+ "2020-12-23T09:36:08.353000+00:00",
+ "2020-10-15T05:07:24.672000+00:00",
+ "2020-12-10T00:37:21.558000+00:00",
+ "2020-10-20T03:04:21.659000+00:00",
+ "2020-12-08T07:53:40.113000+00:00",
+ "2020-10-07T03:03:36.133000+00:00",
+ "2020-10-28T01:58:35.294000+00:00",
+ "2020-11-26T00:29:40.992000+00:00",
+ "2020-11-17T07:17:47.484000+00:00",
+ "2020-12-02T02:29:02.429000+00:00",
+ "2020-10-29T05:26:54.912000+00:00",
+ "2020-12-02T22:59:57.295000+00:00",
+ "2020-10-13T04:45:07.662000+00:00",
+ "2020-12-18T17:12:04.145000+00:00",
+ "2020-10-15T02:09:30.964000+00:00",
+ "2020-11-13T08:00:46.351000+00:00",
+ "2020-12-01T06:14:27.962000+00:00",
+ "2020-12-10T07:02:35.739000+00:00",
+ "2020-10-26T00:26:09.410000+00:00",
+ "2020-11-04T00:58:53.122000+00:00",
+ "2020-10-26T06:21:04.196000+00:00",
+ "2020-12-10T07:07:19.553000+00:00",
+ "2020-11-25T00:49:47.126000+00:00",
+ "2020-11-18T04:22:49.488000+00:00",
+ "2020-11-27T01:33:55.500000+00:00",
+ "2020-12-22T09:45:55.961000+00:00",
+ "2020-10-26T01:29:08.608000+00:00",
+ "2020-12-08T02:45:05.088000+00:00",
+ "2020-10-20T01:49:48.227000+00:00",
+ "2020-12-08T01:05:07.944000+00:00",
+ "2020-12-09T07:20:54.278000+00:00",
+ "2020-10-12T00:32:18.880000+00:00",
+ "2020-11-26T02:04:06.845000+00:00",
+ "2020-11-27T00:42:50.071000+00:00",
+ "2020-12-09T00:42:15.741000+00:00",
+ "2020-10-13T01:33:56.576000+00:00",
+ "2020-12-09T00:55:26.653000+00:00",
+ "2020-12-02T00:34:06.686000+00:00",
+ "2020-11-25T04:24:02.933000+00:00",
+ "2020-10-20T00:35:19.784000+00:00",
+ "2020-12-08T07:18:00.879000+00:00",
+ "2020-10-05T05:31:24.831000+00:00",
+ "2020-11-26T06:10:56.539000+00:00",
+ "2020-10-16T00:59:50.730000+00:00",
+ "2020-10-05T00:44:54.638000+00:00",
+ "2020-11-24T03:32:10.726000+00:00",
+ "2020-10-29T02:14:36.912000+00:00",
+ "2020-10-28T04:32:29.960000+00:00",
+ "2020-12-03T02:27:55.773000+00:00",
+ "2020-10-13T03:09:38.953000+00:00",
+ "2020-11-03T00:32:33.060000+00:00",
+ "2020-11-23T00:28:42.098000+00:00",
+ "2020-10-05T07:23:56.871000+00:00",
+ "2020-12-08T07:41:30.994000+00:00",
+ "2020-10-13T01:53:14.768000+00:00",
+ "2020-11-19T07:33:12.511000+00:00",
+ "2020-11-19T02:22:58.970000+00:00",
+ "2020-12-03T01:02:12.893000+00:00",
+ "2020-12-10T09:19:53.009000+00:00",
+ "2020-12-03T01:57:58.432000+00:00",
+ "2020-11-12T01:53:16.920000+00:00",
+ "2020-12-10T05:08:26.115000+00:00",
+ "2020-12-03T03:19:39.822000+00:00",
+ "2020-11-25T06:51:58.033000+00:00",
+ "2020-11-03T06:11:51.922000+00:00",
+ "2020-10-27T04:51:49.250000+00:00",
+ "2020-12-01T07:40:33.026000+00:00",
+ "2020-11-24T00:39:57.593000+00:00",
+ "2020-12-11T07:22:37.229000+00:00",
+ "2020-11-06T01:49:14.557000+00:00",
+ "2020-12-09T00:46:22.474000+00:00",
+ "2020-10-09T00:07:27.401000+00:00",
+ "2020-10-21T04:56:50.213000+00:00",
+ "2020-12-09T00:38:12.761000+00:00",
+ "2020-10-19T03:09:40.210000+00:00",
+ "2020-12-08T07:42:26.163000+00:00",
+ "2020-12-09T04:41:45.743000+00:00",
+ "2020-12-10T00:38:34.572000+00:00",
+ "2020-10-06T04:23:17.821000+00:00",
+ "2020-10-01T04:41:04.450000+00:00",
+ "2020-10-29T01:59:03.954000+00:00",
+ "2020-12-08T03:35:22.720000+00:00",
+ "2020-12-10T02:17:41.809000+00:00",
+ "2020-11-23T07:45:27.508000+00:00",
+ "2020-12-09T07:32:59.237000+00:00",
+ "2020-11-16T04:30:45.959000+00:00",
+ "2020-10-26T02:57:43.771000+00:00",
+ "2020-12-10T02:57:23.162000+00:00",
+ "2020-11-27T04:06:45.649000+00:00",
+ "2020-12-08T00:20:19.678000+00:00",
+ "2020-10-15T01:44:03.511000+00:00",
+ "2020-10-14T01:42:51.911000+00:00",
+ "2020-10-30T00:42:28.296000+00:00",
+ "2020-10-26T00:37:39.903000+00:00",
+ "2020-10-26T05:10:04.027000+00:00",
+ "2020-10-14T01:46:26.356000+00:00",
+ "2020-10-05T06:03:47.887000+00:00",
+ "2020-10-22T03:10:26.276000+00:00",
+ "2020-12-08T05:41:34.740000+00:00",
+ "2020-12-07T06:59:30.009000+00:00",
+ "2020-10-06T05:41:06.471000+00:00",
+ "2020-11-27T02:12:42.102000+00:00",
+ "2020-12-08T01:53:47.265000+00:00",
+ "2020-10-27T05:22:02.063000+00:00",
+ "2020-12-08T04:12:44.753000+00:00",
+ "2020-10-13T00:37:29.576000+00:00",
+ "2020-12-02T03:46:17.210000+00:00",
+ "2020-12-08T00:33:43.254000+00:00",
+ "2020-12-09T03:34:43.573000+00:00",
+ "2020-10-07T00:40:13.702000+00:00",
+ "2020-12-10T00:21:20.069000+00:00",
+ "2020-12-03T18:40:50.995000+00:00",
+ "2020-12-09T15:11:47.332000+00:00",
+ "2020-12-26T05:24:05.862000+00:00",
+ "2020-10-06T00:43:30.492000+00:00",
+ "2020-11-02T05:03:21.859000+00:00",
+ "2020-10-09T03:10:19.759000+00:00",
+ "2020-12-10T06:32:56.543000+00:00",
+ "2020-10-14T00:31:14.753000+00:00",
+ "2020-10-15T03:07:59.357000+00:00",
+ "2020-10-09T02:05:42.975000+00:00",
+ "2020-11-10T04:06:00.132000+00:00",
+ "2020-11-19T06:10:05.185000+00:00",
+ "2020-10-07T04:57:02.723000+00:00",
+ "2020-11-10T05:20:38.400000+00:00",
+ "2020-12-01T03:16:28.604000+00:00",
+ "2020-12-11T00:22:57.205000+00:00",
+ "2020-12-08T00:40:31.488000+00:00",
+ "2020-12-09T03:57:54.146000+00:00",
+ "2020-11-02T07:45:42.838000+00:00",
+ "2020-10-16T03:43:52.975000+00:00",
+ "2020-12-07T02:34:45.335000+00:00",
+ "2020-12-10T05:26:59.077000+00:00",
+ "2020-10-06T02:51:09.557000+00:00",
+ "2020-12-03T17:36:44.267000+00:00",
+ "2020-12-09T07:15:59.846000+00:00",
+ "2020-12-09T04:20:24.180000+00:00",
+ "2020-10-12T00:29:41.477000+00:00",
+ "2020-10-08T03:09:24.761000+00:00",
+ "2020-11-10T00:44:33.074000+00:00",
+ "2020-10-06T00:44:53.643000+00:00",
+ "2020-10-29T00:40:45.821000+00:00",
+ "2020-10-22T01:40:49.877000+00:00",
+ "2020-10-05T00:45:40.224000+00:00",
+ "2020-12-04T05:56:11.632000+00:00",
+ "2020-12-09T03:12:40.086000+00:00",
+ "2020-12-07T01:16:29.015000+00:00",
+ "2020-10-13T00:32:41.200000+00:00",
+ "2020-12-11T05:10:03.095000+00:00",
+ "2020-10-14T04:47:45.211000+00:00",
+ "2020-11-09T03:37:34.513000+00:00",
+ "2020-11-09T00:41:45.173000+00:00",
+ "2020-12-08T01:50:10.568000+00:00",
+ "2020-11-24T01:51:28.207000+00:00",
+ "2020-12-10T01:52:37.083000+00:00",
+ "2020-12-22T11:47:29.012000+00:00",
+ "2020-10-01T01:14:36.461000+00:00",
+ "2020-12-07T07:24:21.357000+00:00",
+ "2020-11-05T04:56:52.777000+00:00",
+ "2020-12-11T05:15:34.655000+00:00",
+ "2020-11-05T03:01:48.301000+00:00",
+ "2020-11-11T03:56:13.111000+00:00",
+ "2020-12-09T06:11:25.359000+00:00",
+ "2020-12-09T04:28:58.267000+00:00",
+ "2020-10-30T07:19:59.994000+00:00",
+ "2020-10-07T06:09:58.118000+00:00",
+ "2020-11-04T02:32:20.006000+00:00",
+ "2020-12-08T00:34:59.437000+00:00",
+ "2020-12-10T03:32:11.013000+00:00",
+ "2020-10-27T02:55:23.288000+00:00",
+ "2020-11-06T01:46:33.352000+00:00",
+ "2020-12-07T02:28:40.834000+00:00",
+ "2020-11-04T07:18:51.293000+00:00",
+ "2020-10-08T04:37:16.253000+00:00",
+ "2020-12-09T08:26:09.172000+00:00",
+ "2020-12-09T06:41:51.112000+00:00",
+ "2020-11-03T02:49:25.793000+00:00",
+ "2020-11-06T00:38:50.208000+00:00",
+ "2020-10-21T01:39:42.463000+00:00",
+ "2020-12-08T02:04:35.965000+00:00",
+ "2020-12-27T12:39:28.524000+00:00",
+ "2020-11-20T05:49:25.708000+00:00",
+ "2020-12-04T06:25:21.013000+00:00",
+ "2020-11-16T00:26:54.233000+00:00",
+ "2020-11-03T04:40:55.485000+00:00",
+ "2020-11-25T08:24:26.620000+00:00",
+ "2020-10-28T00:33:46.915000+00:00",
+ "2020-12-10T06:01:53.737000+00:00",
+ "2020-11-04T05:13:14.840000+00:00",
+ "2020-11-18T05:44:57.361000+00:00",
+ "2020-12-07T00:36:46.723000+00:00",
+ "2020-11-16T02:42:38.500000+00:00",
+ "2020-11-30T00:27:02.900000+00:00",
+ "2020-12-10T02:04:37.554000+00:00",
+ "2020-12-09T02:02:01.646000+00:00",
+ "2020-12-07T03:42:54.669000+00:00",
+ "2020-12-27T03:56:52.049000+00:00",
+ "2020-11-09T00:38:44.818000+00:00",
+ "2020-12-07T06:47:43.421000+00:00",
+ "2020-12-10T04:13:00.551000+00:00",
+ "2020-12-26T15:17:09.480000+00:00",
+ "2020-11-05T02:09:35.869000+00:00",
+ "2020-12-08T02:01:25.880000+00:00",
+ "2020-11-19T00:47:12.305000+00:00",
+ "2020-10-09T01:56:49.349000+00:00",
+ "2020-10-15T00:32:01.688000+00:00",
+ "2020-10-27T02:41:01.334000+00:00",
+ "2020-11-18T02:12:25.671000+00:00",
+ "2020-10-07T04:58:13.870000+00:00",
+ "2020-12-07T02:20:06.535000+00:00",
+ "2020-10-12T03:10:36.308000+00:00",
+ "2020-10-29T03:18:20.673000+00:00",
+ "2020-12-09T08:06:54.526000+00:00",
+ "2020-11-12T03:17:05.140000+00:00",
+ "2020-10-01T01:15:09.993000+00:00",
+ "2020-11-11T00:51:14.004000+00:00",
+ "2020-11-26T01:43:11.269000+00:00",
+ "2020-11-24T00:43:00.335000+00:00",
+ "2020-11-30T04:44:39.496000+00:00",
+ "2020-10-23T00:30:18.009000+00:00",
+ "2020-10-16T02:12:55.314000+00:00",
+ "2020-12-02T06:24:05.665000+00:00",
+ "2021-01-20T10:24:17.206000+00:00",
+ "2020-11-05T00:36:01.999000+00:00",
+ "2020-12-02T02:41:16.199000+00:00",
+ "2020-12-03T00:30:22.897000+00:00",
+ "2020-12-10T03:39:20.103000+00:00",
+ "2020-11-11T02:04:59.354000+00:00",
+ "2020-12-08T06:38:19.302000+00:00",
+ "2020-12-01T00:35:19.625000+00:00",
+ "2020-11-11T05:45:47.110000+00:00",
+ "2020-10-28T00:35:44.890000+00:00",
+ "2020-12-07T04:16:41.797000+00:00",
+ "2020-11-26T02:16:11.427000+00:00",
+ "2020-11-23T02:32:21.420000+00:00",
+ "2020-11-30T03:20:48.098000+00:00",
+ "2020-12-09T00:51:34.359000+00:00",
+ "2020-12-21T15:37:42.646000+00:00",
+ "2020-12-04T02:14:35.901000+00:00",
+ "2020-12-04T04:47:02.247000+00:00",
+ "2020-10-07T00:37:11.738000+00:00",
+ "2020-11-10T01:59:01.119000+00:00",
+ "2020-10-02T02:48:15.987000+00:00",
+ "2020-10-22T01:52:32.093000+00:00",
+ "2020-11-23T00:21:54.782000+00:00",
+ "2020-12-08T06:25:58.310000+00:00",
+ "2020-10-02T01:45:44.168000+00:00",
+ "2020-11-09T02:00:09.022000+00:00",
+ "2020-11-30T01:55:34.221000+00:00",
+ "2020-12-07T00:32:11.199000+00:00",
+ "2020-12-06T17:32:11.180000+00:00",
+ "2020-11-13T05:32:08.198000+00:00",
+ "2020-12-08T07:42:30.666000+00:00",
+ "2020-10-30T00:41:50.497000+00:00",
+ "2020-10-21T00:36:51.459000+00:00",
+ "2020-10-22T00:30:26.343000+00:00",
+ "2020-10-14T00:33:42.972000+00:00",
+ "2020-11-26T02:58:59.509000+00:00",
+ "2020-10-08T04:50:07.561000+00:00",
+ "2020-10-21T02:47:27.539000+00:00",
+ "2020-11-12T00:38:10.297000+00:00",
+ "2020-11-20T00:59:49.538000+00:00",
+ "2020-12-09T01:59:11.916000+00:00",
+ "2020-12-11T04:13:54.427000+00:00",
+ "2020-10-02T03:18:52.565000+00:00",
+ "2020-12-07T04:29:04.602000+00:00",
+ "2020-12-06T18:46:28.549000+00:00",
+ "2020-10-19T00:30:26.334000+00:00",
+ "2020-12-10T02:10:10.657000+00:00",
+ "2020-12-08T00:49:04.595000+00:00",
+ "2020-10-14T02:59:56.059000+00:00",
+ "2020-10-27T03:50:08.036000+00:00",
+ "2020-11-25T06:06:07.910000+00:00",
+ "2020-10-21T02:17:39.198000+00:00",
+ "2020-10-02T01:44:17.197000+00:00",
+ "2020-12-08T05:04:35.419000+00:00",
+ "2020-10-23T00:25:01.106000+00:00",
+ "2020-12-10T01:37:48.987000+00:00",
+ "2020-10-08T05:35:12.525000+00:00",
+ "2020-10-28T02:41:42.671000+00:00",
+ "2020-10-22T00:32:37.607000+00:00",
+ "2020-10-27T06:18:42.715000+00:00",
+ "2020-11-27T02:22:36.165000+00:00",
+ "2020-12-20T12:04:18.314000+00:00",
+ "2020-11-24T05:33:20.884000+00:00",
+ "2020-12-10T00:32:51.611000+00:00",
+ "2020-10-15T00:30:54.576000+00:00",
+ "2020-10-19T05:15:33.712000+00:00",
+ "2020-11-06T00:34:06.186000+00:00",
+ "2020-10-02T00:34:50.494000+00:00",
+ "2020-11-25T01:47:57.398000+00:00",
+ "2020-12-26T13:27:41.589000+00:00",
+ "2020-10-22T06:11:02.945000+00:00",
+ "2020-12-03T02:00:33.684000+00:00",
+ "2020-11-18T00:39:17.492000+00:00",
+ "2020-11-19T00:31:54.080000+00:00",
+ "2020-10-16T00:32:26.207000+00:00"
+ ],
+ "xbins": {
+ "size": "D1"
+ },
+ "y": [
+ "0000-0002-7397-7977",
+ "0000-0003-4931-9736",
+ "0000-0001-8221-2303",
+ "0000-0001-6736-072X",
+ "0000-0002-8727-1246",
+ "0000-0001-6760-9521",
+ "0000-0001-9283-9441",
+ "0000-0002-4732-4729",
+ "0000-0002-9827-9374",
+ "0000-0002-6834-0023",
+ "0000-0002-2002-1963",
+ "0000-0002-6761-8230",
+ "0000-0003-2879-0537",
+ "0000-0002-8132-9689",
+ "0000-0002-8780-3628",
+ "0000-0002-1638-362X",
+ "0000-0002-2288-2476",
+ "0000-0003-1668-4649",
+ "0000-0002-0274-0892",
+ "0000-0002-2226-8564",
+ "0000-0002-4644-3793",
+ "0000-0002-1450-0757",
+ "0000-0003-0934-7898",
+ "0000-0001-8857-1227",
+ "0000-0001-7391-5859",
+ "0000-0001-7855-7181",
+ "0000-0003-2092-5417",
+ "0000-0003-2802-4779",
+ "0000-0003-4446-8089",
+ "0000-0002-4169-2694",
+ "0000-0003-3544-8879",
+ "0000-0003-4183-5576",
+ "0000-0002-8715-8892",
+ "0000-0003-2084-7316",
+ "0000-0002-9104-1662",
+ "0000-0002-4643-1215",
+ "0000-0003-3672-4863",
+ "0000-0003-3822-4088",
+ "0000-0001-8056-2510",
+ "0000-0003-0116-611X",
+ "0000-0002-7866-9236",
+ "0000-0002-9483-7383",
+ "0000-0001-6180-8810",
+ "0000-0003-0815-2327",
+ "0000-0001-8577-7780",
+ "0000-0001-9143-4151",
+ "0000-0003-2213-5611",
+ "0000-0003-4137-5282",
+ "0000-0001-8543-9183",
+ "0000-0002-1346-7860",
+ "0000-0001-9141-0715",
+ "0000-0002-8923-182X",
+ "0000-0001-5466-8100",
+ "0000-0002-4847-5422",
+ "0000-0002-8769-5698",
+ "0000-0003-4107-9766",
+ "0000-0002-6027-4105",
+ "0000-0001-5782-8739",
+ "0000-0002-5911-6433",
+ "0000-0001-8021-0961",
+ "0000-0001-6247-4330",
+ "0000-0002-4491-1901",
+ "0000-0002-7736-940X",
+ "0000-0001-6119-7669",
+ "0000-0003-3877-5373",
+ "0000-0002-8929-7212",
+ "0000-0002-2459-2675",
+ "0000-0002-8156-5059",
+ "0000-0002-7114-5886",
+ "0000-0001-5097-977X",
+ "0000-0003-3138-197X",
+ "0000-0003-4083-2496",
+ "0000-0002-0338-3890",
+ "0000-0001-9872-5998",
+ "0000-0003-1091-0852",
+ "0000-0002-8561-142X",
+ "0000-0002-6052-6368",
+ "0000-0002-2862-2552",
+ "0000-0003-1164-9246",
+ "0000-0002-0726-7555",
+ "0000-0002-7800-1463",
+ "0000-0003-4868-5507",
+ "0000-0002-2049-316X",
+ "0000-0002-1295-2055",
+ "0000-0001-9281-8579",
+ "0000-0002-7810-3574",
+ "0000-0002-4910-1078",
+ "0000-0002-7877-8643",
+ "0000-0002-4809-8129",
+ "0000-0003-0739-261X",
+ "0000-0002-4555-6171",
+ "0000-0002-4828-8969",
+ "0000-0001-7065-7567",
+ "0000-0002-1043-5679",
+ "0000-0001-8942-822X",
+ "0000-0002-6106-9883",
+ "0000-0003-4561-1406",
+ "0000-0001-8834-2336",
+ "0000-0001-9293-2603",
+ "0000-0002-2299-2931",
+ "0000-0003-0529-408X",
+ "0000-0001-6040-4697",
+ "0000-0002-1772-6567",
+ "0000-0002-7328-7845",
+ "0000-0002-3856-3242",
+ "0000-0003-2768-672X",
+ "0000-0002-2564-7148",
+ "0000-0003-3095-4430",
+ "0000-0003-4298-9059",
+ "0000-0002-1322-595X",
+ "0000-0002-6346-5062",
+ "0000-0002-7684-1346",
+ "0000-0002-0837-7668",
+ "0000-0002-8547-0647",
+ "0000-0001-9393-1805",
+ "0000-0002-9071-6023",
+ "0000-0002-5953-3958",
+ "0000-0002-1698-5831",
+ "0000-0002-1338-9604",
+ "0000-0002-9586-0999",
+ "0000-0003-1137-9039",
+ "0000-0003-3480-0367",
+ "0000-0002-1491-2583",
+ "0000-0002-7894-3856",
+ "0000-0003-4977-7817",
+ "0000-0002-3459-7437",
+ "0000-0002-3725-5483",
+ "0000-0002-5538-8140",
+ "0000-0002-6151-3200",
+ "0000-0002-7630-5682",
+ "0000-0002-7681-0021",
+ "0000-0003-4306-9019",
+ "0000-0001-5546-8688",
+ "0000-0003-4431-5437",
+ "0000-0001-5968-1718",
+ "0000-0002-4091-7791",
+ "0000-0002-4137-7205",
+ "0000-0003-0572-8757",
+ "0000-0002-4864-0758",
+ "0000-0003-0756-5509",
+ "0000-0003-4096-1634",
+ "0000-0002-2413-4439",
+ "0000-0002-4679-0998",
+ "0000-0002-4645-269X",
+ "0000-0002-5922-3001",
+ "0000-0001-7256-5228",
+ "0000-0002-5105-8515",
+ "0000-0003-0098-7936",
+ "0000-0003-1866-3411",
+ "0000-0001-6111-5547",
+ "0000-0003-3525-3940",
+ "0000-0002-3644-3503",
+ "0000-0002-6028-7627",
+ "0000-0003-3175-5572",
+ "0000-0002-8287-7222",
+ "0000-0003-1595-1389",
+ "0000-0001-8377-8843",
+ "0000-0002-6968-6931",
+ "0000-0001-9338-4573",
+ "0000-0001-9504-9306",
+ "0000-0003-0254-7141",
+ "0000-0001-8275-9325",
+ "0000-0002-0702-1256",
+ "0000-0003-4922-7292",
+ "0000-0002-4815-8953",
+ "0000-0003-4222-2069",
+ "0000-0002-7586-5477",
+ "0000-0001-7531-6702",
+ "0000-0002-1764-6137",
+ "0000-0002-1936-2859",
+ "0000-0002-1110-0694",
+ "0000-0002-4580-1528",
+ "0000-0003-2354-3537",
+ "0000-0002-1112-0425",
+ "0000-0003-0226-9700",
+ "0000-0002-0750-4004",
+ "0000-0003-0798-5503",
+ "0000-0002-5724-3092",
+ "0000-0002-8037-1148",
+ "0000-0002-9075-6957",
+ "0000-0003-0273-3948",
+ "0000-0001-8105-0541",
+ "0000-0001-5240-8946",
+ "0000-0002-1255-6722",
+ "0000-0002-2248-9076",
+ "0000-0002-2482-3031",
+ "0000-0001-6610-4716",
+ "0000-0002-2679-385X",
+ "0000-0001-7351-8749",
+ "0000-0003-1870-2011",
+ "0000-0002-5780-1605",
+ "0000-0001-9548-6661",
+ "0000-0001-7956-0921",
+ "0000-0001-6077-1083",
+ "0000-0002-4312-9694",
+ "0000-0003-2726-990X",
+ "0000-0002-2310-0990",
+ "0000-0003-1218-6052",
+ "0000-0003-4137-9291",
+ "0000-0002-7546-3240",
+ "0000-0001-9632-6459",
+ "0000-0002-0317-7042",
+ "0000-0002-7785-451X",
+ "0000-0001-5329-7467",
+ "0000-0002-1872-3667",
+ "0000-0003-1180-8753",
+ "0000-0003-0011-1520",
+ "0000-0002-7218-6057",
+ "0000-0003-0225-1324",
+ "0000-0001-8055-8992",
+ "0000-0002-3019-6791",
+ "0000-0003-2859-603X",
+ "0000-0001-9119-6026",
+ "0000-0002-0587-2233",
+ "0000-0002-9511-9754",
+ "0000-0003-1517-3898",
+ "0000-0003-3993-8541",
+ "0000-0003-0208-4394",
+ "0000-0003-2322-3859",
+ "0000-0002-5105-7504",
+ "0000-0002-6613-9166",
+ "0000-0001-5793-9375",
+ "0000-0002-8887-0646",
+ "0000-0001-5618-4820",
+ "0000-0001-8160-5658",
+ "0000-0003-4635-2733",
+ "0000-0002-4032-956X",
+ "0000-0001-8926-0922",
+ "0000-0002-4366-3395",
+ "0000-0001-6056-0964",
+ "0000-0002-3423-891X",
+ "0000-0002-1547-4204",
+ "0000-0002-4324-7121",
+ "0000-0001-7540-3339",
+ "0000-0001-8624-3027",
+ "0000-0002-3475-2388",
+ "0000-0001-5049-4721",
+ "0000-0002-0549-2812",
+ "0000-0003-4352-3234",
+ "0000-0002-0285-6940",
+ "0000-0001-5883-3337",
+ "0000-0001-7918-7071",
+ "0000-0003-0270-8849",
+ "0000-0002-1059-9753",
+ "0000-0002-7330-596X",
+ "0000-0001-8805-3681",
+ "0000-0003-0272-1883",
+ "0000-0002-3679-6886",
+ "0000-0003-3718-4774",
+ "0000-0002-5438-2716",
+ "0000-0002-4805-891X",
+ "0000-0003-1151-7112",
+ "0000-0002-1649-4661",
+ "0000-0003-2956-5278",
+ "0000-0002-8016-3111",
+ "0000-0003-2827-1227",
+ "0000-0003-2145-4717",
+ "0000-0003-2028-9510",
+ "0000-0003-4424-6881",
+ "0000-0001-7474-2988",
+ "0000-0002-2510-6501",
+ "0000-0003-3582-6294",
+ "0000-0002-1343-126X",
+ "0000-0002-4595-2588",
+ "0000-0002-9526-2876",
+ "0000-0003-1158-7057",
+ "0000-0002-3345-5154",
+ "0000-0002-8244-6924",
+ "0000-0003-4806-3951",
+ "0000-0002-7464-3688",
+ "0000-0003-2647-2359",
+ "0000-0001-6073-387X",
+ "0000-0001-8230-5606",
+ "0000-0001-9108-194X",
+ "0000-0002-1674-6427",
+ "0000-0002-6355-1382",
+ "0000-0003-3632-7673",
+ "0000-0002-5140-4932",
+ "0000-0003-2316-2347",
+ "0000-0003-2793-1027",
+ "0000-0002-8554-3738",
+ "0000-0001-6673-083X",
+ "0000-0001-7378-174X",
+ "0000-0002-1614-9163",
+ "0000-0002-8282-0330",
+ "0000-0003-3527-7336",
+ "0000-0001-7189-2147",
+ "0000-0002-3344-3903",
+ "0000-0003-1786-3489",
+ "0000-0001-7108-8036",
+ "0000-0003-4078-2193",
+ "0000-0001-9352-3676",
+ "0000-0001-9856-8531",
+ "0000-0002-9276-8003",
+ "0000-0003-0872-8082",
+ "0000-0002-7824-1395",
+ "0000-0001-5224-7353",
+ "0000-0002-5281-2110",
+ "0000-0002-3201-6256",
+ "0000-0003-3697-1370",
+ "0000-0002-7678-0520",
+ "0000-0002-3741-371X",
+ "0000-0002-8132-2356",
+ "0000-0001-8074-1025",
+ "0000-0002-7896-7268",
+ "0000-0002-5536-6005",
+ "0000-0002-5627-5594",
+ "0000-0002-2790-8196",
+ "0000-0002-4814-6303",
+ "0000-0003-3563-8111",
+ "0000-0002-3894-8185",
+ "0000-0001-6608-759X",
+ "0000-0001-6678-4133",
+ "0000-0001-6893-5775",
+ "0000-0002-9472-9307",
+ "0000-0003-2441-0736",
+ "0000-0002-8295-024X",
+ "0000-0002-4322-6590",
+ "0000-0001-9047-2156",
+ "0000-0002-9072-6328",
+ "0000-0002-1625-1774",
+ "0000-0002-2315-0242",
+ "0000-0001-6178-4337",
+ "0000-0003-1272-8819",
+ "0000-0003-1460-8458",
+ "0000-0003-4091-5443",
+ "0000-0001-6647-3570",
+ "0000-0002-0756-4381",
+ "0000-0002-7428-5307",
+ "0000-0002-7699-0579",
+ "0000-0002-3145-000X",
+ "0000-0002-9249-230X",
+ "0000-0002-0511-4600",
+ "0000-0003-1621-0435",
+ "0000-0002-6126-7150",
+ "0000-0002-8161-6469",
+ "0000-0002-9577-7263",
+ "0000-0003-4087-3992",
+ "0000-0002-1295-2450",
+ "0000-0002-5135-5341",
+ "0000-0002-7357-3104",
+ "0000-0002-2913-3400",
+ "0000-0003-0271-5678",
+ "0000-0002-3427-5032",
+ "0000-0002-2550-0496",
+ "0000-0001-6399-6536",
+ "0000-0001-7317-8000",
+ "0000-0002-0357-9789",
+ "0000-0002-8995-3531",
+ "0000-0003-4421-9234",
+ "0000-0002-6004-465X",
+ "0000-0002-8739-4473",
+ "0000-0002-5576-1865",
+ "0000-0002-6978-9068",
+ "0000-0001-9918-496X",
+ "0000-0003-2243-8460",
+ "0000-0003-2778-4266",
+ "0000-0003-4318-6821",
+ "0000-0002-7600-7845",
+ "0000-0002-8469-1288",
+ "0000-0003-4684-4148",
+ "0000-0002-7312-128X",
+ "0000-0001-5236-2001",
+ "0000-0001-8692-9651",
+ "0000-0002-8544-0202",
+ "0000-0002-8884-3592",
+ "0000-0002-1231-9903",
+ "0000-0003-0407-7913",
+ "0000-0001-7196-6679",
+ "0000-0002-1319-1358",
+ "0000-0001-7305-3240",
+ "0000-0002-9910-2765",
+ "0000-0003-4952-1491",
+ "0000-0002-7466-2256",
+ "0000-0002-9221-8730",
+ "0000-0001-9116-7882",
+ "0000-0002-8938-6004",
+ "0000-0002-8684-4321",
+ "0000-0003-0818-6607",
+ "0000-0001-6798-4401",
+ "0000-0001-6500-9047",
+ "0000-0002-2693-5627",
+ "0000-0002-7256-6340",
+ "0000-0002-0194-6362",
+ "0000-0002-5125-5092",
+ "0000-0002-7197-2355",
+ "0000-0002-4535-0958",
+ "0000-0001-6476-7321",
+ "0000-0003-3345-4757",
+ "0000-0003-4179-1155",
+ "0000-0002-5536-0156",
+ "0000-0002-6206-8444",
+ "0000-0001-6570-1594",
+ "0000-0002-9711-0067",
+ "0000-0001-5321-0392",
+ "0000-0003-2488-1263",
+ "0000-0003-2781-8198",
+ "0000-0002-3739-6681",
+ "0000-0003-3577-7963",
+ "0000-0003-3286-3073",
+ "0000-0001-8200-3333",
+ "0000-0002-5813-2765",
+ "0000-0002-6206-3563",
+ "0000-0003-3660-364X",
+ "0000-0002-4134-3566",
+ "0000-0001-7768-9799",
+ "0000-0001-5895-047X",
+ "0000-0002-4722-8681",
+ "0000-0001-9603-8908",
+ "0000-0002-0772-1586",
+ "0000-0001-6797-3964",
+ "0000-0002-6834-901X",
+ "0000-0002-1554-8306",
+ "0000-0002-1879-4262",
+ "0000-0002-9640-8136",
+ "0000-0002-6926-3752",
+ "0000-0002-3655-4713",
+ "0000-0002-8724-1020",
+ "0000-0002-4601-4569"
+ ]
+ }
+ ],
+ "layout": {
+ "template": {
+ "data": {
+ "bar": [
+ {
+ "error_x": {
+ "color": "#2a3f5f"
+ },
+ "error_y": {
+ "color": "#2a3f5f"
+ },
+ "marker": {
+ "line": {
+ "color": "#E5ECF6",
+ "width": 0.5
+ }
+ },
+ "type": "bar"
+ }
+ ],
+ "barpolar": [
+ {
+ "marker": {
+ "line": {
+ "color": "#E5ECF6",
+ "width": 0.5
+ }
+ },
+ "type": "barpolar"
+ }
+ ],
+ "carpet": [
+ {
+ "aaxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "white",
+ "linecolor": "white",
+ "minorgridcolor": "white",
+ "startlinecolor": "#2a3f5f"
+ },
+ "baxis": {
+ "endlinecolor": "#2a3f5f",
+ "gridcolor": "white",
+ "linecolor": "white",
+ "minorgridcolor": "white",
+ "startlinecolor": "#2a3f5f"
+ },
+ "type": "carpet"
+ }
+ ],
+ "choropleth": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "choropleth"
+ }
+ ],
+ "contour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "contour"
+ }
+ ],
+ "contourcarpet": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "contourcarpet"
+ }
+ ],
+ "heatmap": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "heatmap"
+ }
+ ],
+ "heatmapgl": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "heatmapgl"
+ }
+ ],
+ "histogram": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "histogram"
+ }
+ ],
+ "histogram2d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2d"
+ }
+ ],
+ "histogram2dcontour": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "histogram2dcontour"
+ }
+ ],
+ "mesh3d": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "type": "mesh3d"
+ }
+ ],
+ "parcoords": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "parcoords"
+ }
+ ],
+ "pie": [
+ {
+ "automargin": true,
+ "type": "pie"
+ }
+ ],
+ "scatter": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatter"
+ }
+ ],
+ "scatter3d": [
+ {
+ "line": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatter3d"
+ }
+ ],
+ "scattercarpet": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattercarpet"
+ }
+ ],
+ "scattergeo": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergeo"
+ }
+ ],
+ "scattergl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattergl"
+ }
+ ],
+ "scattermapbox": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scattermapbox"
+ }
+ ],
+ "scatterpolar": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolar"
+ }
+ ],
+ "scatterpolargl": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterpolargl"
+ }
+ ],
+ "scatterternary": [
+ {
+ "marker": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "type": "scatterternary"
+ }
+ ],
+ "surface": [
+ {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ },
+ "colorscale": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "type": "surface"
+ }
+ ],
+ "table": [
+ {
+ "cells": {
+ "fill": {
+ "color": "#EBF0F8"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "header": {
+ "fill": {
+ "color": "#C8D4E3"
+ },
+ "line": {
+ "color": "white"
+ }
+ },
+ "type": "table"
+ }
+ ]
+ },
+ "layout": {
+ "annotationdefaults": {
+ "arrowcolor": "#2a3f5f",
+ "arrowhead": 0,
+ "arrowwidth": 1
+ },
+ "autotypenumbers": "strict",
+ "coloraxis": {
+ "colorbar": {
+ "outlinewidth": 0,
+ "ticks": ""
+ }
+ },
+ "colorscale": {
+ "diverging": [
+ [
+ 0,
+ "#8e0152"
+ ],
+ [
+ 0.1,
+ "#c51b7d"
+ ],
+ [
+ 0.2,
+ "#de77ae"
+ ],
+ [
+ 0.3,
+ "#f1b6da"
+ ],
+ [
+ 0.4,
+ "#fde0ef"
+ ],
+ [
+ 0.5,
+ "#f7f7f7"
+ ],
+ [
+ 0.6,
+ "#e6f5d0"
+ ],
+ [
+ 0.7,
+ "#b8e186"
+ ],
+ [
+ 0.8,
+ "#7fbc41"
+ ],
+ [
+ 0.9,
+ "#4d9221"
+ ],
+ [
+ 1,
+ "#276419"
+ ]
+ ],
+ "sequential": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ],
+ "sequentialminus": [
+ [
+ 0,
+ "#0d0887"
+ ],
+ [
+ 0.1111111111111111,
+ "#46039f"
+ ],
+ [
+ 0.2222222222222222,
+ "#7201a8"
+ ],
+ [
+ 0.3333333333333333,
+ "#9c179e"
+ ],
+ [
+ 0.4444444444444444,
+ "#bd3786"
+ ],
+ [
+ 0.5555555555555556,
+ "#d8576b"
+ ],
+ [
+ 0.6666666666666666,
+ "#ed7953"
+ ],
+ [
+ 0.7777777777777778,
+ "#fb9f3a"
+ ],
+ [
+ 0.8888888888888888,
+ "#fdca26"
+ ],
+ [
+ 1,
+ "#f0f921"
+ ]
+ ]
+ },
+ "colorway": [
+ "#636efa",
+ "#EF553B",
+ "#00cc96",
+ "#ab63fa",
+ "#FFA15A",
+ "#19d3f3",
+ "#FF6692",
+ "#B6E880",
+ "#FF97FF",
+ "#FECB52"
+ ],
+ "font": {
+ "color": "#2a3f5f"
+ },
+ "geo": {
+ "bgcolor": "white",
+ "lakecolor": "white",
+ "landcolor": "#E5ECF6",
+ "showlakes": true,
+ "showland": true,
+ "subunitcolor": "white"
+ },
+ "hoverlabel": {
+ "align": "left"
+ },
+ "hovermode": "closest",
+ "mapbox": {
+ "style": "light"
+ },
+ "paper_bgcolor": "white",
+ "plot_bgcolor": "#E5ECF6",
+ "polar": {
+ "angularaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "bgcolor": "#E5ECF6",
+ "radialaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ }
+ },
+ "scene": {
+ "xaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ },
+ "yaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ },
+ "zaxis": {
+ "backgroundcolor": "#E5ECF6",
+ "gridcolor": "white",
+ "gridwidth": 2,
+ "linecolor": "white",
+ "showbackground": true,
+ "ticks": "",
+ "zerolinecolor": "white"
+ }
+ },
+ "shapedefaults": {
+ "line": {
+ "color": "#2a3f5f"
+ }
+ },
+ "ternary": {
+ "aaxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "baxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ },
+ "bgcolor": "#E5ECF6",
+ "caxis": {
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": ""
+ }
+ },
+ "title": {
+ "x": 0.05
+ },
+ "xaxis": {
+ "automargin": true,
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "white",
+ "zerolinewidth": 2
+ },
+ "yaxis": {
+ "automargin": true,
+ "gridcolor": "white",
+ "linecolor": "white",
+ "ticks": "",
+ "title": {
+ "standoff": 15
+ },
+ "zerolinecolor": "white",
+ "zerolinewidth": 2
+ }
+ }
+ },
+ "title": {
+ "text": "Activation distribution for bio \"more straightforward way to borrow the money you\""
+ },
+ "xaxis": {
+ "tickangle": 45,
+ "tickfont": {
+ "size": 12
+ }
+ }
+ }
+ },
+ "text/html": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "BIO_SNIPPET = 'more straightforward way to borrow the money you'\n",
+ "dup_bios_df = df[df.biography.str.contains(BIO_SNIPPET)]\n",
+ "# .groupby(df.activation_date.dt.month)[['orcid']].count().sort_values('orcid', ascending=False)\n",
+ "\n",
+ "set_top_n(50)\n",
+ "data = [\n",
+ " go.Histogram(\n",
+ " x=dup_bios_df['activation_date'],\n",
+ " y=dup_bios_df['orcid'],\n",
+ " histfunc=\"count\"\n",
+ " )\n",
+ "]\n",
+ "\n",
+ "layout = go.Layout(\n",
+ " title='Activation distribution for bio \"%s\"' % BIO_SNIPPET,\n",
+ " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n",
+ ")\n",
+ "fig = go.Figure(data=data, layout=layout)\n",
+ "fig.update_traces(xbins_size='D1')\n",
+ "plotly.offline.iplot(fig)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Dup bios longer than 10 words**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 87,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " orcid | \n",
+ "
\n",
+ " \n",
+ " biography | \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " car title loans are a more straightforward way to borrow the money you need, without dealing with the hassles of a traditional bank loan. because they use the equity value of your qualifying vehicle to secure funding, they are a great borrowing option for customers with credit issues, or who need expedited funding. our customers know to turn to our local experts when they need auto title loans. our team is excited to get you your loan today! | \n",
+ " 343 | \n",
+ "
\n",
+ " \n",
+ " hi, how are you? it is really cool to find an entire community of people interested in the same thing you are. | \n",
+ " 229 | \n",
+ "
\n",
+ " \n",
+ " the sound and the fury is one of my all-time favorite novels but i have many. | \n",
+ " 218 | \n",
+ "
\n",
+ " \n",
+ " one of my passions is people watching but i dont get to do it as much as i would like. | \n",
+ " 132 | \n",
+ "
\n",
+ " \n",
+ " why hello there. i can not believe i didnt know this community existed sooner. | \n",
+ " 131 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " mütter, die sich vor der entbindung für einen rückbildungskurs anmelden, belügen sich selbst.denn das gleicht den vorsätzen zu neujahr: gut gemeint, | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " <p class=p__2>politically, taking on the affordable care act or not taking it on are both dangerous. while many citizens dont comprehend all that the | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " insaat kalip yagi, kalip yag, plywood kalip yagi, ahsap kalip yagi alanlarinda profesyonel ve organik olarak imalat yapan sirketimiz musteri goruslerini son derece onemsemektedir. | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " <p class=p__7>since life and medical insurance commissions are front-loaded, agents generally do not get a commission after the 3rd policy renewal. | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " ive traveled to several countries and have several more to see. i have a lizard named tinky. | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1619 rows × 1 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " orcid\n",
+ "biography \n",
+ "car title loans are a more straightforward way ... 343\n",
+ "hi, how are you? it is really cool to find an e... 229\n",
+ "the sound and the fury is one of my all-time fa... 218\n",
+ "one of my passions is people watching but i don... 132\n",
+ "why hello there. i can not believe i didnt know... 131\n",
+ "... ...\n",
+ "mütter, die sich vor der entbindung für einen r... 2\n",
+ "politically, taking on the afford... 2\n",
+ "insaat kalip yagi, kalip yag, plywood kalip yag... 2\n",
+ "
since life and medical insurance ... 2\n",
+ "ive traveled to several countries and have seve... 2\n",
+ "\n",
+ "[1619 rows x 1 columns]"
+ ]
+ },
+ "execution_count": 87,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "dup_bios[dup_bios.index.str.split(' ').str.len() > 10]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Assign spam score from precanned library**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 88,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios = df[df.biography.notna()][['orcid', 'biography']]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 89,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# def score(bio):\n",
+ "# try:\n",
+ "# return antispam.score(bio)\n",
+ "# except: # if len(bio) < 3 the filter doesn't know how to handle that\n",
+ "# return -1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 90,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios['spam_score'] = bios.biography.apply(lambda bio: score(bio))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 91,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios[bios.spam_score == -1] # these are artefacts (no scoring possible)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 92,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios.spam_score.replace(to_replace=-1, value=np.nan, inplace=True)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 93,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios.spam_score.describe()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 94,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios[bios.spam_score > 0.99]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Spam scoring goes nowhere."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Search for offensive words, sexually explicit content, etc.**"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 95,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios['profanity_score'] = profanity_check.predict_prob(bios.biography)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 96,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# bios[bios.profanity_score > 0.90]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Profanity detection goes nowhere either."
]
},
{
@@ -19468,7 +22591,7 @@
},
{
"cell_type": "code",
- "execution_count": 136,
+ "execution_count": 97,
"metadata": {},
"outputs": [
{
@@ -19499,9 +22622,8 @@
"n_education",
"n_employment",
"n_ext_work_source",
- "spam_score",
- "n_valid_employment",
- "n_valid_education"
+ "n_valid_education",
+ "n_valid_employment"
],
"xaxis": "x",
"y": [
@@ -19520,370 +22642,332 @@
"n_education",
"n_employment",
"n_ext_work_source",
- "spam_score",
- "n_valid_employment",
- "n_valid_education"
+ "n_valid_education",
+ "n_valid_employment"
],
"yaxis": "y",
"z": [
[
1,
- 0.9851287263502948,
- 0.032912669844356764,
- 0.030955329523625504,
- 0.002797563048323001,
- 0.01323756721452204,
- 0.026255505701943696,
- 0.05741408837128926,
- 0.01569597153718797,
- 0.040989235686940816,
- 0.061568740672497534,
- 0.03696794447170714,
- 0.05352081277803731,
- 0.04004869062451576,
- 0.09668574568788427,
- 0.020983848636741746,
- 0.04230149069451306,
- 0.0530823224770852
+ 0.9649829131836175,
+ 0.07899833525811681,
+ 0.07259719921935885,
+ 0.0064613638682561435,
+ 0.030614701011724112,
+ 0.0606246420123506,
+ 0.1240658917743258,
+ 0.03267144560134065,
+ 0.10712349577355784,
+ 0.14475767748321952,
+ 0.10243044622702734,
+ 0.22284375415000315,
+ 0.2240544946507108,
+ 0.22551664240183317,
+ 0.21764209610282456,
+ 0.22363930402583765
],
[
- 0.9851287263502948,
+ 0.9649829131836175,
1,
- 0.032893021887808044,
- 0.030952716284260284,
- 0.0027246992919826396,
- 0.013197039191265785,
- 0.026305729103779554,
- 0.056657974238298256,
- 0.013593908269146495,
- 0.04075930430401838,
- 0.06093122306222735,
- 0.036427174310613826,
- 0.05283100363748757,
- 0.039617047413162375,
- 0.09617813225153261,
- 0.020779934453739678,
- 0.04187757510946746,
- 0.05252647681980516
+ 0.08183974046700901,
+ 0.07518160639621203,
+ 0.0066860590291805974,
+ 0.031712353459948744,
+ 0.06277678931008057,
+ 0.12998640687807267,
+ 0.03226243840878624,
+ 0.11113100230411314,
+ 0.15032740706571793,
+ 0.10624021870034253,
+ 0.23154024673662948,
+ 0.23250920740301625,
+ 0.2327233169990374,
+ 0.22610062830832708,
+ 0.2320611339644608
],
[
- 0.032912669844356764,
- 0.032893021887808044,
+ 0.07899833525811681,
+ 0.08183974046700901,
1,
- 0.9347845603401734,
- 0.31289353882027604,
- 0.3327034204536421,
- 0.8256148606228672,
- 0.24175475571155877,
- 0.025014569456040185,
- 0.15426190915221324,
- 0.3477514643689929,
- 0.13281808409522428,
- 0.06673049906774123,
- 0.14909994555141065,
- 0.3919479197818211,
- 0.11430255435331263,
- 0.1530900744997813,
- 0.04224623793246877
+ 0.9378726254398347,
+ 0.3126299250047347,
+ 0.35108563893979355,
+ 0.8353346326813307,
+ 0.22240613268720477,
+ 0.03160640765461562,
+ 0.14838588295615024,
+ 0.37171182274445363,
+ 0.1506365739780303,
+ 0.13686193479055792,
+ 0.21343320832924814,
+ 0.4005951950706468,
+ 0.127757181278294,
+ 0.21359436822476655
],
[
- 0.030955329523625504,
- 0.030952716284260284,
- 0.9347845603401734,
+ 0.07259719921935885,
+ 0.07518160639621203,
+ 0.9378726254398347,
1,
- 0.3551317964752695,
- 0.3396714358684357,
- 0.792393006061876,
- 0.23812475099223263,
- 0.021917404276208984,
- 0.12119446342488618,
- 0.3286894436814909,
- 0.11073297656642778,
- 0.048318069131114486,
- 0.12097737688298277,
- 0.37572530981238894,
- 0.08706480424871708,
- 0.1367349622160526,
- 0.03356612824419668
+ 0.35605399617723354,
+ 0.3624050122938972,
+ 0.8018196175347003,
+ 0.2178190759174422,
+ 0.028320630207299337,
+ 0.12192573243272048,
+ 0.3525468899622581,
+ 0.12916204989780206,
+ 0.11736450285212531,
+ 0.18470550214116468,
+ 0.3834831573219326,
+ 0.11103974478415263,
+ 0.18991605863836233
],
[
- 0.002797563048323001,
- 0.0027246992919826396,
- 0.31289353882027604,
- 0.3551317964752695,
+ 0.0064613638682561435,
+ 0.0066860590291805974,
+ 0.3126299250047347,
+ 0.35605399617723354,
1,
- -0.0006813987707150902,
- 0.2282600668610334,
- 0.01778638684849228,
- 0.0013809757631778703,
- 0.006444532015829912,
- 0.004329998524016476,
- 0.002683817677771984,
- 0.0013582777338929175,
- 0.012499036555991529,
- 0.01818525196078464,
- 0.0005840365476183185,
- 0.016598234185244483,
- 0.0016583682855190094
+ 0.0009072282179230706,
+ 0.2420914875526222,
+ 0.01776888092417036,
+ 0.002099309887982074,
+ 0.0064144255162447246,
+ 0.009269883208277058,
+ 0.005433864001670957,
+ 0.008619933999683011,
+ 0.015077339853222701,
+ 0.023203121401780318,
+ 0.008480130351469113,
+ 0.016480069731931876
],
[
- 0.01323756721452204,
- 0.013197039191265785,
- 0.3327034204536421,
- 0.3396714358684357,
- -0.0006813987707150902,
+ 0.030614701011724112,
+ 0.031712353459948744,
+ 0.35108563893979355,
+ 0.3624050122938972,
+ 0.0009072282179230706,
1,
- 0.24340068611465254,
- 0.1079192129721149,
- 0.004399691864203764,
- 0.04628651725406843,
- 0.07481405788686601,
- 0.042315521608991075,
- 0.04004460862097079,
- 0.08040739840894055,
- 0.16125389614532118,
- 0.02926424821268392,
- 0.08624271168084971,
- 0.02840212050111673
+ 0.2570742999530523,
+ 0.0922145270760206,
+ 0.00898053907910667,
+ 0.04988227847309645,
+ 0.08759311081674451,
+ 0.04978801517329604,
+ 0.05364308187508679,
+ 0.09230230828045376,
+ 0.15718240355316795,
+ 0.05003145912334212,
+ 0.09365323185411886
],
[
- 0.026255505701943696,
- 0.026305729103779554,
- 0.8256148606228672,
- 0.792393006061876,
- 0.2282600668610334,
- 0.24340068611465254,
+ 0.0606246420123506,
+ 0.06277678931008057,
+ 0.8353346326813307,
+ 0.8018196175347003,
+ 0.2420914875526222,
+ 0.2570742999530523,
1,
- 0.19116755828206128,
- 0.01771983580822778,
- 0.1252448169099988,
- 0.3322716889228379,
- 0.09961147764401061,
- 0.041174205561030665,
- 0.10958484453530018,
- 0.33496675669425113,
- 0.09286341691577868,
- 0.11434288833407659,
- 0.025414430857047858
+ 0.17311687493499073,
+ 0.022792492767692595,
+ 0.12058320737626094,
+ 0.3393283270986452,
+ 0.11149215572697663,
+ 0.09603598655375359,
+ 0.16336464942113507,
+ 0.32867917711898453,
+ 0.08988706485108988,
+ 0.1640726861251499
],
[
- 0.05741408837128926,
- 0.056657974238298256,
- 0.24175475571155877,
- 0.23812475099223263,
- 0.01778638684849228,
- 0.1079192129721149,
- 0.19116755828206128,
+ 0.1240658917743258,
+ 0.12998640687807267,
+ 0.22240613268720477,
+ 0.2178190759174422,
+ 0.01776888092417036,
+ 0.0922145270760206,
+ 0.17311687493499073,
1,
- 0.03224992209236746,
- 0.11065458894869484,
- 0.3192781208585223,
- 0.10852551976660856,
- 0.05957549731552794,
- 0.10279567787680673,
- 0.5003162103088351,
- 0.04411190681146708,
- 0.14858846866838135,
- 0.05163986473435027
+ 0.03648260346059746,
+ 0.08904343614326711,
+ 0.2970254304693113,
+ 0.11831548180884943,
+ 0.16253031328738454,
+ 0.19508445555508772,
+ 0.4800924470705352,
+ 0.15718163705271387,
+ 0.20674076197089802
],
[
- 0.01569597153718797,
- 0.013593908269146495,
- 0.025014569456040185,
- 0.021917404276208984,
- 0.0013809757631778703,
- 0.004399691864203764,
- 0.01771983580822778,
- 0.03224992209236746,
+ 0.03267144560134065,
+ 0.03226243840878624,
+ 0.03160640765461562,
+ 0.028320630207299337,
+ 0.002099309887982074,
+ 0.00898053907910667,
+ 0.022792492767692595,
+ 0.03648260346059746,
1,
- 0.08461189794646956,
- 0.05362282810243895,
- 0.07093322291648477,
- 0.03791228294028205,
- 0.0392669868393358,
- 0.07145369872483175,
- 0.049930352165612917,
- 0.03549877798968587,
- 0.03111432559287062
+ 0.07143241126539773,
+ 0.06149968615329382,
+ 0.07528446624958736,
+ 0.0730249552881224,
+ 0.07128594621281013,
+ 0.07937133873035572,
+ 0.07063161008579438,
+ 0.06937988840793924
],
[
- 0.040989235686940816,
- 0.04075930430401838,
- 0.15426190915221324,
- 0.12119446342488618,
- 0.006444532015829912,
- 0.04628651725406843,
- 0.1252448169099988,
- 0.11065458894869484,
- 0.08461189794646956,
+ 0.10712349577355784,
+ 0.11113100230411314,
+ 0.14838588295615024,
+ 0.12192573243272048,
+ 0.0064144255162447246,
+ 0.04988227847309645,
+ 0.12058320737626094,
+ 0.08904343614326711,
+ 0.07143241126539773,
1,
- 0.22021014568642164,
- 0.3859318893600913,
- 0.1423827468461567,
- 0.18096621138083624,
- 0.24286823272263663,
- 0.21228500020230104,
- 0.1459962513695925,
- 0.10496069983968939
+ 0.2085344284826277,
+ 0.3756141239879568,
+ 0.20860391435209405,
+ 0.2439338448964409,
+ 0.2262491070521664,
+ 0.1960308197632984,
+ 0.22734610676272235
],
[
- 0.061568740672497534,
- 0.06093122306222735,
- 0.3477514643689929,
- 0.3286894436814909,
- 0.004329998524016476,
- 0.07481405788686601,
- 0.3322716889228379,
- 0.3192781208585223,
- 0.05362282810243895,
- 0.22021014568642164,
+ 0.14475767748321952,
+ 0.15032740706571793,
+ 0.37171182274445363,
+ 0.3525468899622581,
+ 0.009269883208277058,
+ 0.08759311081674451,
+ 0.3393283270986452,
+ 0.2970254304693113,
+ 0.06149968615329382,
+ 0.2085344284826277,
1,
- 0.21377525703111794,
- 0.09573562665274538,
- 0.1469607584988768,
- 0.6666060438319036,
- 0.11784342341053432,
- 0.17059756897606349,
- 0.07232857049616995
+ 0.23998646957005906,
+ 0.2584672204668393,
+ 0.3193726129742757,
+ 0.6563247307005879,
+ 0.24730637320826065,
+ 0.32652990120768727
],
[
- 0.03696794447170714,
- 0.036427174310613826,
- 0.13281808409522428,
- 0.11073297656642778,
- 0.002683817677771984,
- 0.042315521608991075,
- 0.09961147764401061,
- 0.10852551976660856,
- 0.07093322291648477,
- 0.3859318893600913,
- 0.21377525703111794,
+ 0.10243044622702734,
+ 0.10624021870034253,
+ 0.1506365739780303,
+ 0.12916204989780206,
+ 0.005433864001670957,
+ 0.04978801517329604,
+ 0.11149215572697663,
+ 0.11831548180884943,
+ 0.07528446624958736,
+ 0.3756141239879568,
+ 0.23998646957005906,
1,
- 0.15256142173955728,
- 0.17722420130407404,
- 0.22691037584731275,
- 0.23230106942130674,
- 0.14594541852321113,
- 0.11454051858316387
+ 0.28174315114239534,
+ 0.29513823401207673,
+ 0.2602571143552704,
+ 0.2671750926309919,
+ 0.27827299388502297
],
[
- 0.05352081277803731,
- 0.05283100363748757,
- 0.06673049906774123,
- 0.048318069131114486,
- 0.0013582777338929175,
- 0.04004460862097079,
- 0.041174205561030665,
- 0.05957549731552794,
- 0.03791228294028205,
- 0.1423827468461567,
- 0.09573562665274538,
- 0.15256142173955728,
+ 0.22284375415000315,
+ 0.23154024673662948,
+ 0.13686193479055792,
+ 0.11736450285212531,
+ 0.008619933999683011,
+ 0.05364308187508679,
+ 0.09603598655375359,
+ 0.16253031328738454,
+ 0.0730249552881224,
+ 0.20860391435209405,
+ 0.2584672204668393,
+ 0.28174315114239534,
1,
- 0.35408552736376164,
- 0.1432327734837157,
- 0.13359305997797627,
- 0.2749705812249971,
- 0.7885246402196713
+ 0.5935197907835382,
+ 0.34969846406582145,
+ 0.9580202466838004,
+ 0.5673220091683485
],
[
- 0.04004869062451576,
- 0.039617047413162375,
- 0.14909994555141065,
- 0.12097737688298277,
- 0.012499036555991529,
- 0.08040739840894055,
- 0.10958484453530018,
- 0.10279567787680673,
- 0.0392669868393358,
- 0.18096621138083624,
- 0.1469607584988768,
- 0.17722420130407404,
- 0.35408552736376164,
+ 0.2240544946507108,
+ 0.23250920740301625,
+ 0.21343320832924814,
+ 0.18470550214116468,
+ 0.015077339853222701,
+ 0.09230230828045376,
+ 0.16336464942113507,
+ 0.19508445555508772,
+ 0.07128594621281013,
+ 0.2439338448964409,
+ 0.3193726129742757,
+ 0.29513823401207673,
+ 0.5935197907835382,
1,
- 0.1930653267280024,
- 0.14496890815575308,
- 0.7528620160953167,
- 0.26673120117740184
+ 0.4068774187637994,
+ 0.5657949950743488,
+ 0.9213956311003227
],
[
- 0.09668574568788427,
- 0.09617813225153261,
- 0.3919479197818211,
- 0.37572530981238894,
- 0.01818525196078464,
- 0.16125389614532118,
- 0.33496675669425113,
- 0.5003162103088351,
- 0.07145369872483175,
- 0.24286823272263663,
- 0.6666060438319036,
- 0.22691037584731275,
- 0.1432327734837157,
- 0.1930653267280024,
+ 0.22551664240183317,
+ 0.2327233169990374,
+ 0.4005951950706468,
+ 0.3834831573219326,
+ 0.023203121401780318,
+ 0.15718240355316795,
+ 0.32867917711898453,
+ 0.4800924470705352,
+ 0.07937133873035572,
+ 0.2262491070521664,
+ 0.6563247307005879,
+ 0.2602571143552704,
+ 0.34969846406582145,
+ 0.4068774187637994,
1,
- 0.1297554982537589,
- 0.22390994050434165,
- 0.11768415981384495
+ 0.33742739872409666,
+ 0.41450829231197867
],
[
- 0.020983848636741746,
- 0.020779934453739678,
- 0.11430255435331263,
- 0.08706480424871708,
- 0.0005840365476183185,
- 0.02926424821268392,
- 0.09286341691577868,
- 0.04411190681146708,
- 0.049930352165612917,
- 0.21228500020230104,
- 0.11784342341053432,
- 0.23230106942130674,
- 0.13359305997797627,
- 0.14496890815575308,
- 0.1297554982537589,
+ 0.21764209610282456,
+ 0.22610062830832708,
+ 0.127757181278294,
+ 0.11103974478415263,
+ 0.008480130351469113,
+ 0.05003145912334212,
+ 0.08988706485108988,
+ 0.15718163705271387,
+ 0.07063161008579438,
+ 0.1960308197632984,
+ 0.24730637320826065,
+ 0.2671750926309919,
+ 0.9580202466838004,
+ 0.5657949950743488,
+ 0.33742739872409666,
1,
- 0.09868450111186694,
- 0.09679171001982584
+ 0.5703564611231601
],
[
- 0.04230149069451306,
- 0.04187757510946746,
- 0.1530900744997813,
- 0.1367349622160526,
- 0.016598234185244483,
- 0.08624271168084971,
- 0.11434288833407659,
- 0.14858846866838135,
- 0.03549877798968587,
- 0.1459962513695925,
- 0.17059756897606349,
- 0.14594541852321113,
- 0.2749705812249971,
- 0.7528620160953167,
- 0.22390994050434165,
- 0.09868450111186694,
- 1,
- 0.34138499506636344
- ],
- [
- 0.0530823224770852,
- 0.05252647681980516,
- 0.04224623793246877,
- 0.03356612824419668,
- 0.0016583682855190094,
- 0.02840212050111673,
- 0.025414430857047858,
- 0.05163986473435027,
- 0.03111432559287062,
- 0.10496069983968939,
- 0.07232857049616995,
- 0.11454051858316387,
- 0.7885246402196713,
- 0.26673120117740184,
- 0.11768415981384495,
- 0.09679171001982584,
- 0.34138499506636344,
+ 0.22363930402583765,
+ 0.2320611339644608,
+ 0.21359436822476655,
+ 0.18991605863836233,
+ 0.016480069731931876,
+ 0.09365323185411886,
+ 0.1640726861251499,
+ 0.20674076197089802,
+ 0.06937988840793924,
+ 0.22734610676272235,
+ 0.32652990120768727,
+ 0.27827299388502297,
+ 0.5673220091683485,
+ 0.9213956311003227,
+ 0.41450829231197867,
+ 0.5703564611231601,
1
]
]
@@ -20764,9 +23848,9 @@
}
},
"text/html": [
- "
"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "fig = px.imshow(df[df.label == True].select_dtypes(include=['bool','number']).fillna(-1).corr())\n",
+ "fig.show()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 99,
"metadata": {},
"outputs": [],
"source": [
@@ -20822,682 +25201,16 @@
"# 'label']].to_pickle('../data/processed/features.pkl')"
]
},
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Label speculation"
- ]
- },
{
"cell_type": "code",
- "execution_count": 75,
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " orcid | \n",
- " verified_email | \n",
- " verified_primary_email | \n",
- " given_names | \n",
- " family_name | \n",
- " biography | \n",
- " other_names | \n",
- " primary_email | \n",
- " keywords | \n",
- " external_ids | \n",
- " education | \n",
- " employment | \n",
- " n_works | \n",
- " works_source | \n",
- " activation_date | \n",
- " last_update_date | \n",
- " n_doi | \n",
- " n_arxiv | \n",
- " n_pmc | \n",
- " n_other_pids | \n",
- " label | \n",
- " primary_email_domain | \n",
- " other_email_domains | \n",
- " url_domains | \n",
- " n_emails | \n",
- " n_urls | \n",
- " n_ids | \n",
- " n_keywords | \n",
- " n_education | \n",
- " n_employment | \n",
- " ext_works_source | \n",
- " n_ext_work_source | \n",
- " authoritative | \n",
- " spam_score | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 17 | \n",
- " 0000-0002-0137-3066 | \n",
- " True | \n",
- " True | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " 2017-07-25t04:34:17.338z | \n",
- " 2019-11-27t17:54:45.418z | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " False | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 19 | \n",
- " 0000-0002-0461-9711 | \n",
- " True | \n",
- " True | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 2 | \n",
- " [crossref] | \n",
- " 2015-08-18t12:42:01.797z | \n",
- " 2019-12-06t11:37:38.203z | \n",
- " 2 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " False | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 22 | \n",
- " 0000-0002-0761-9450 | \n",
- " True | \n",
- " True | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 1 | \n",
- " [crossref] | \n",
- " 2020-05-13t17:15:28.405z | \n",
- " 2020-08-11t21:00:45.694z | \n",
- " 1 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " False | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 33 | \n",
- " 0000-0002-4447-9215 | \n",
- " True | \n",
- " True | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " 0 | \n",
- " NaN | \n",
- " 2017-07-24t09:37:50.242z | \n",
- " 2019-11-15t08:31:24.820z | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " False | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 44 | \n",
- " 0000-0003-0426-4065 | \n",
- " True | \n",
- " True | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [eliza i. gilbert] | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " [[, us fish and wildlife service, albuquerque,... | \n",
- " 0 | \n",
- " NaN | \n",
- " 2017-08-07t18:32:31.802z | \n",
- " 2020-04-08t16:48:55.732z | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 1 | \n",
- " NaN | \n",
- " NaN | \n",
- " False | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " 10989635 | \n",
- " 0000-0002-7340-9697 | \n",
- " True | \n",
- " True | \n",
- " tawanda | \n",
- " marandure | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " [[scopus author id, 48261373600]] | \n",
- " [[animal science, msc sustainable agriculture,... | \n",
- " [[lecturer, zimbabwe open university faculty o... | \n",
- " 7 | \n",
- " [scopus - elsevier] | \n",
- " 2015-11-05t08:52:08.743z | \n",
- " 2020-12-09t17:59:18.350z | \n",
- " 7 | \n",
- " 0 | \n",
- " 0 | \n",
- " 7 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 1 | \n",
- " <NA> | \n",
- " 3 | \n",
- " 3 | \n",
- " [scopus - elsevier] | \n",
- " 1.0 | \n",
- " True | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 10989636 | \n",
- " 0000-0002-2906-0299 | \n",
- " True | \n",
- " True | \n",
- " tiffany | \n",
- " mackay | \n",
- " <NA> | \n",
- " [tiffany russel sia] | \n",
- " <NA> | \n",
- " [microfluidics, gpc-1, gallium-67, pet/ct, oxy... | \n",
- " [[researcherid, a-2121-2017]] | \n",
- " [[faculty of medicine, master in pharmaceutica... | \n",
- " [[clinical project lead, minomic international... | \n",
- " 11 | \n",
- " [crossref, researcherid, tiffany mackay] | \n",
- " 2017-01-03t23:28:48.736z | \n",
- " 2020-12-09t17:12:20.326z | \n",
- " 11 | \n",
- " 0 | \n",
- " 0 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " [oxytocin.com.au, linkedin.com] | \n",
- " <NA> | \n",
- " 2 | \n",
- " 1 | \n",
- " 13 | \n",
- " 2 | \n",
- " 4 | \n",
- " [crossref, researcherid] | \n",
- " 2.0 | \n",
- " True | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 10989637 | \n",
- " 0000-0001-5896-2024 | \n",
- " True | \n",
- " True | \n",
- " giovanni, l | \n",
- " tiscia | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " [[scopus author id, 54948242800]] | \n",
- " NaN | \n",
- " NaN | \n",
- " 70 | \n",
- " [scopus - elsevier, tiscia giovanni, l, europe... | \n",
- " 2016-07-27t10:09:13.585z | \n",
- " 2020-12-07t22:23:05.706z | \n",
- " 65 | \n",
- " 0 | \n",
- " 17 | \n",
- " 52 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 1 | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " [scopus - elsevier, europe pubmed central, cro... | \n",
- " 3.0 | \n",
- " True | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 10989643 | \n",
- " 0000-0003-2606-0936 | \n",
- " True | \n",
- " True | \n",
- " luang | \n",
- " xu | \n",
- " <NA> | \n",
- " [xu lu-ang, lu lu] | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " [[post-doc, institute of biochemistry and cell... | \n",
- " 2 | \n",
- " [scopus - elsevier, crossref] | \n",
- " 2015-10-24t03:53:23.544z | \n",
- " 2020-11-19t09:23:48.896z | \n",
- " 2 | \n",
- " 0 | \n",
- " 0 | \n",
- " 1 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 1 | \n",
- " [scopus - elsevier, crossref] | \n",
- " 2.0 | \n",
- " True | \n",
- " NaN | \n",
- "
\n",
- " \n",
- " 10989645 | \n",
- " 0000-0002-3800-6331 | \n",
- " True | \n",
- " True | \n",
- " zachary | \n",
- " calamari | \n",
- " <NA> | \n",
- " NaN | \n",
- " <NA> | \n",
- " NaN | \n",
- " NaN | \n",
- " [[richard gilder graduate school, phd in compa... | \n",
- " [[assistant professor, baruch college, city un... | \n",
- " 7 | \n",
- " [crossref metadata search, zachary t. calamari... | \n",
- " 2015-01-20t20:20:17.042z | \n",
- " 2020-11-21t19:48:36.221z | \n",
- " 7 | \n",
- " 0 | \n",
- " 1 | \n",
- " 0 | \n",
- " True | \n",
- " NaN | \n",
- " NaN | \n",
- " NaN | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " <NA> | \n",
- " 2 | \n",
- " 2 | \n",
- " [crossref metadata search, crossref] | \n",
- " 2.0 | \n",
- " True | \n",
- " NaN | \n",
- "
\n",
- " \n",
- "
\n",
- "
2075872 rows × 34 columns
\n",
- "
"
- ],
- "text/plain": [
- " orcid verified_email verified_primary_email \\\n",
- "17 0000-0002-0137-3066 True True \n",
- "19 0000-0002-0461-9711 True True \n",
- "22 0000-0002-0761-9450 True True \n",
- "33 0000-0002-4447-9215 True True \n",
- "44 0000-0003-0426-4065 True True \n",
- "... ... ... ... \n",
- "10989635 0000-0002-7340-9697 True True \n",
- "10989636 0000-0002-2906-0299 True True \n",
- "10989637 0000-0001-5896-2024 True True \n",
- "10989643 0000-0003-2606-0936 True True \n",
- "10989645 0000-0002-3800-6331 True True \n",
- "\n",
- " given_names family_name biography other_names \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 [eliza i. gilbert] \n",
- "... ... ... ... ... \n",
- "10989635 tawanda marandure NaN \n",
- "10989636 tiffany mackay [tiffany russel sia] \n",
- "10989637 giovanni, l tiscia NaN \n",
- "10989643 luang xu [xu lu-ang, lu lu] \n",
- "10989645 zachary calamari NaN \n",
- "\n",
- " primary_email keywords \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... ... \n",
- "10989635 NaN \n",
- "10989636 [microfluidics, gpc-1, gallium-67, pet/ct, oxy... \n",
- "10989637 NaN \n",
- "10989643 NaN \n",
- "10989645 NaN \n",
- "\n",
- " external_ids \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... \n",
- "10989635 [[scopus author id, 48261373600]] \n",
- "10989636 [[researcherid, a-2121-2017]] \n",
- "10989637 [[scopus author id, 54948242800]] \n",
- "10989643 NaN \n",
- "10989645 NaN \n",
- "\n",
- " education \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... \n",
- "10989635 [[animal science, msc sustainable agriculture,... \n",
- "10989636 [[faculty of medicine, master in pharmaceutica... \n",
- "10989637 NaN \n",
- "10989643 NaN \n",
- "10989645 [[richard gilder graduate school, phd in compa... \n",
- "\n",
- " employment n_works \\\n",
- "17 NaN 0 \n",
- "19 NaN 2 \n",
- "22 NaN 1 \n",
- "33 NaN 0 \n",
- "44 [[, us fish and wildlife service, albuquerque,... 0 \n",
- "... ... ... \n",
- "10989635 [[lecturer, zimbabwe open university faculty o... 7 \n",
- "10989636 [[clinical project lead, minomic international... 11 \n",
- "10989637 NaN 70 \n",
- "10989643 [[post-doc, institute of biochemistry and cell... 2 \n",
- "10989645 [[assistant professor, baruch college, city un... 7 \n",
- "\n",
- " works_source \\\n",
- "17 NaN \n",
- "19 [crossref] \n",
- "22 [crossref] \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... \n",
- "10989635 [scopus - elsevier] \n",
- "10989636 [crossref, researcherid, tiffany mackay] \n",
- "10989637 [scopus - elsevier, tiscia giovanni, l, europe... \n",
- "10989643 [scopus - elsevier, crossref] \n",
- "10989645 [crossref metadata search, zachary t. calamari... \n",
- "\n",
- " activation_date last_update_date n_doi n_arxiv \\\n",
- "17 2017-07-25t04:34:17.338z 2019-11-27t17:54:45.418z 0 0 \n",
- "19 2015-08-18t12:42:01.797z 2019-12-06t11:37:38.203z 2 0 \n",
- "22 2020-05-13t17:15:28.405z 2020-08-11t21:00:45.694z 1 0 \n",
- "33 2017-07-24t09:37:50.242z 2019-11-15t08:31:24.820z 0 0 \n",
- "44 2017-08-07t18:32:31.802z 2020-04-08t16:48:55.732z 0 0 \n",
- "... ... ... ... ... \n",
- "10989635 2015-11-05t08:52:08.743z 2020-12-09t17:59:18.350z 7 0 \n",
- "10989636 2017-01-03t23:28:48.736z 2020-12-09t17:12:20.326z 11 0 \n",
- "10989637 2016-07-27t10:09:13.585z 2020-12-07t22:23:05.706z 65 0 \n",
- "10989643 2015-10-24t03:53:23.544z 2020-11-19t09:23:48.896z 2 0 \n",
- "10989645 2015-01-20t20:20:17.042z 2020-11-21t19:48:36.221z 7 0 \n",
- "\n",
- " n_pmc n_other_pids label primary_email_domain other_email_domains \\\n",
- "17 0 0 True NaN NaN \n",
- "19 0 0 True NaN NaN \n",
- "22 0 0 True NaN NaN \n",
- "33 0 0 True NaN NaN \n",
- "44 0 0 True NaN NaN \n",
- "... ... ... ... ... ... \n",
- "10989635 0 7 True NaN NaN \n",
- "10989636 0 0 True NaN NaN \n",
- "10989637 17 52 True NaN NaN \n",
- "10989643 0 1 True NaN NaN \n",
- "10989645 1 0 True NaN NaN \n",
- "\n",
- " url_domains n_emails n_urls n_ids \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... ... ... ... \n",
- "10989635 NaN 1 \n",
- "10989636 [oxytocin.com.au, linkedin.com] 2 1 \n",
- "10989637 NaN 1 \n",
- "10989643 NaN \n",
- "10989645 NaN \n",
- "\n",
- " n_keywords n_education n_employment \\\n",
- "17 \n",
- "19 \n",
- "22 \n",
- "33 \n",
- "44 1 \n",
- "... ... ... ... \n",
- "10989635 3 3 \n",
- "10989636 13 2 4 \n",
- "10989637 \n",
- "10989643 1 \n",
- "10989645 2 2 \n",
- "\n",
- " ext_works_source \\\n",
- "17 NaN \n",
- "19 NaN \n",
- "22 NaN \n",
- "33 NaN \n",
- "44 NaN \n",
- "... ... \n",
- "10989635 [scopus - elsevier] \n",
- "10989636 [crossref, researcherid] \n",
- "10989637 [scopus - elsevier, europe pubmed central, cro... \n",
- "10989643 [scopus - elsevier, crossref] \n",
- "10989645 [crossref metadata search, crossref] \n",
- "\n",
- " n_ext_work_source authoritative spam_score \n",
- "17 NaN False NaN \n",
- "19 NaN False NaN \n",
- "22 NaN False NaN \n",
- "33 NaN False NaN \n",
- "44 NaN False NaN \n",
- "... ... ... ... \n",
- "10989635 1.0 True NaN \n",
- "10989636 2.0 True NaN \n",
- "10989637 3.0 True NaN \n",
- "10989643 2.0 True NaN \n",
- "10989645 2.0 True NaN \n",
- "\n",
- "[2075872 rows x 34 columns]"
- ]
- },
- "execution_count": 75,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df[df.label == 1]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 76,
+ "execution_count": null,
"metadata": {},
"outputs": [],
- "source": [
- "# (df.n_works > 0) & (df.n_ids > 1)"
- ]
+ "source": []
},
{
"cell_type": "code",
- "execution_count": 77,
+ "execution_count": 100,
"metadata": {},
"outputs": [
{
@@ -21505,59 +25218,53 @@
"output_type": "stream",
"text": [
"\n",
- "RangeIndex: 10989649 entries, 0 to 10989648\n",
- "Data columns (total 34 columns):\n",
- " # Column Dtype \n",
- "--- ------ ----- \n",
- " 0 orcid object \n",
- " 1 verified_email bool \n",
- " 2 verified_primary_email bool \n",
- " 3 given_names string \n",
- " 4 family_name string \n",
- " 5 biography string \n",
- " 6 other_names object \n",
- " 7 primary_email string \n",
- " 8 keywords object \n",
- " 9 external_ids object \n",
- " 10 education object \n",
- " 11 employment object \n",
- " 12 n_works Int16 \n",
- " 13 works_source object \n",
- " 14 activation_date string \n",
- " 15 last_update_date string \n",
- " 16 n_doi Int16 \n",
- " 17 n_arxiv Int16 \n",
- " 18 n_pmc Int16 \n",
- " 19 n_other_pids Int16 \n",
- " 20 label bool \n",
- " 21 primary_email_domain object \n",
- " 22 other_email_domains object \n",
- " 23 url_domains object \n",
- " 24 n_emails Int16 \n",
- " 25 n_urls Int16 \n",
- " 26 n_ids Int16 \n",
- " 27 n_keywords Int16 \n",
- " 28 n_education Int16 \n",
- " 29 n_employment Int16 \n",
- " 30 ext_works_source object \n",
- " 31 n_ext_work_source float64\n",
- " 32 authoritative object \n",
- " 33 spam_score float64\n",
- "dtypes: Int16(11), bool(3), float64(2), object(12), string(6)\n",
- "memory usage: 2.0+ GB\n"
+ "Int64Index: 10989649 entries, 0 to 10989648\n",
+ "Data columns (total 35 columns):\n",
+ " # Column Dtype \n",
+ "--- ------ ----- \n",
+ " 0 orcid object \n",
+ " 1 verified_email bool \n",
+ " 2 verified_primary_email bool \n",
+ " 3 given_names string \n",
+ " 4 family_name string \n",
+ " 5 biography string \n",
+ " 6 other_names object \n",
+ " 7 primary_email string \n",
+ " 8 keywords object \n",
+ " 9 external_ids object \n",
+ " 10 education object \n",
+ " 11 employment object \n",
+ " 12 n_works Int16 \n",
+ " 13 works_source object \n",
+ " 14 activation_date datetime64[ns, UTC]\n",
+ " 15 last_update_date datetime64[ns, UTC]\n",
+ " 16 n_doi Int16 \n",
+ " 17 n_arxiv Int16 \n",
+ " 18 n_pmc Int16 \n",
+ " 19 n_other_pids Int16 \n",
+ " 20 label bool \n",
+ " 21 primary_email_domain object \n",
+ " 22 other_email_domains object \n",
+ " 23 url_domains object \n",
+ " 24 n_emails Int16 \n",
+ " 25 n_urls Int16 \n",
+ " 26 n_ids Int16 \n",
+ " 27 n_keywords Int16 \n",
+ " 28 n_education Int16 \n",
+ " 29 n_employment Int16 \n",
+ " 30 ext_works_source object \n",
+ " 31 n_ext_work_source Int16 \n",
+ " 32 authoritative object \n",
+ " 33 n_valid_education float64 \n",
+ " 34 n_valid_employment float64 \n",
+ "dtypes: Int16(12), bool(3), datetime64[ns, UTC](2), float64(2), object(12), string(4)\n",
+ "memory usage: 2.1+ GB\n"
]
}
],
"source": [
"df.info()"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {