{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "subsequent-cornell",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       " \n",
       " "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "import glob\n",
    "import os\n",
    "\n",
    "import pandas as pd\n",
    "import ast\n",
    "import tldextract\n",
    "import numpy\n",
    "\n",
    "import plotly\n",
    "from plotly.offline import iplot, init_notebook_mode\n",
    "import plotly.graph_objs as go\n",
    "import plotly.express as px\n",
    "\n",
    "init_notebook_mode(connected=True)\n",
    "\n",
    "# Module-level settings; update both together through set_top_n().\n",
    "TOP_N = 0\n",
    "TOP_RANGE = [0, 0]\n",
    "\n",
    "\n",
    "def set_top_n(n):\n",
    "    \"\"\"Set the global TOP_N to n and recompute the global TOP_RANGE.\n",
    "\n",
    "    TOP_RANGE becomes [-0.5, n - 0.5]: the positions 0 .. n - 1 with\n",
    "    half a unit of padding on each side. Presumably used as an axis\n",
    "    range by later plotting cells -- confirm against the callers.\n",
    "    \"\"\"\n",
    "    global TOP_N, TOP_RANGE\n",
    "    TOP_N = n\n",
    "    TOP_RANGE = [-.5, n - 1 + .5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "hydraulic-baker",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Directory holding the dataset.pkl.* parts. The default is the original\n",
    "# author's local checkout; set FAKE_ORCID_DATA_DIR to run elsewhere.\n",
    "PROCESSED_DATA_DIR = os.environ.get(\n",
    "    'FAKE_ORCID_DATA_DIR',\n",
    "    '/Users/miriam.baglioni/Develop/Gitea/fake-orcid-analysis-v2/fake-orcid-analysis/data/processed',\n",
    ")\n",
    "\n",
    "# glob.glob returns matches in arbitrary order; sort them so every run\n",
    "# sees the parts in the same order.\n",
    "parts = sorted(glob.glob(os.path.join(PROCESSED_DATA_DIR, 'dataset.pkl.*')))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "lesbian-routine",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | orcid | \n", "verified_email | \n", "verified_primary_email | \n", "given_names | \n", "family_name | \n", "biography | \n", "other_names | \n", "urls | \n", "primary_email | \n", "other_emails | \n", "... | \n", "employment | \n", "n_works | \n", "works_source | \n", "activation_date | \n", "last_update_date | \n", "n_doi | \n", "n_arxiv | \n", "n_pmc | \n", "n_other_pids | \n", "label | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
10000000 | \n", "0000-0002-7790-0483 | \n", "1 | \n", "0 | \n", "abel | \n", "elias | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "0 | \n", "NaN | \n", "2020-09-16t16:51:54.155z | \n", "2020-09-16t17:00:08.451z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
10000001 | \n", "0000-0001-6368-0531 | \n", "0 | \n", "0 | \n", "abelardo | \n", "ramirez | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "0 | \n", "NaN | \n", "2017-05-10t19:28:13.217z | \n", "2017-05-10t19:28:17.315z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
10000002 | \n", "0000-0001-8149-4900 | \n", "1 | \n", "1 | \n", "abelardo | \n", "mancinas | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "[[profesor investigador, instituto tecnológico... | \n", "0 | \n", "NaN | \n", "2018-10-15t21:46:52.162z | \n", "2020-01-13t03:33:47.645z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
10000003 | \n", "0000-0002-8684-2422 | \n", "0 | \n", "0 | \n", "abera | \n", "nigussie | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "0 | \n", "NaN | \n", "2020-09-23t08:36:17.451z | \n", "2020-09-23t08:36:17.450z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
10000004 | \n", "0000-0003-4814-7872 | \n", "1 | \n", "1 | \n", "abhijeet | \n", "singh | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "NaN | \n", "0 | \n", "NaN | \n", "2018-05-01t22:43:17.407z | \n", "2018-10-06t22:21:54.024z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "
5 rows × 23 columns
\n", "\n", " | ext_works_source | \n", "orcid | \n", "
---|---|---|
0 | \n", "crossref | \n", "1460841 | \n", "
1 | \n", "scopus - elsevier | \n", "902231 | \n", "
2 | \n", "crossref metadata search | \n", "297684 | \n", "
3 | \n", "multidisciplinary digital publishing institute | \n", "281664 | \n", "
4 | \n", "europe pubmed central | \n", "181605 | \n", "
... | \n", "... | \n", "... | \n", "
337 | \n", "uta - oa journal global insight | \n", "3 | \n", "
338 | \n", "francis crick institute | \n", "3 | \n", "
339 | \n", "anna | \n", "3 | \n", "
340 | \n", "santos | \n", "3 | \n", "
341 | \n", "universitäts- und stadtbibliothek köln | \n", "3 | \n", "
342 rows × 2 columns
\n", "\n", " | orcid | \n", "verified_email | \n", "verified_primary_email | \n", "given_names | \n", "family_name | \n", "biography | \n", "other_names | \n", "urls | \n", "primary_email | \n", "other_emails | \n", "... | \n", "activation_date | \n", "last_update_date | \n", "n_doi | \n", "n_arxiv | \n", "n_pmc | \n", "n_other_pids | \n", "label | \n", "ext_works_source | \n", "n_ext_work_source | \n", "authoritative | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "0000-0002-7790-0483 | \n", "1 | \n", "0 | \n", "abel | \n", "elias | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2020-09-16t16:51:54.155z | \n", "2020-09-16t17:00:08.451z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
1 | \n", "0000-0001-6368-0531 | \n", "0 | \n", "0 | \n", "abelardo | \n", "ramirez | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2017-05-10t19:28:13.217z | \n", "2017-05-10t19:28:17.315z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
2 | \n", "0000-0001-8149-4900 | \n", "1 | \n", "1 | \n", "abelardo | \n", "mancinas | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2018-10-15t21:46:52.162z | \n", "2020-01-13t03:33:47.645z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
3 | \n", "0000-0002-8684-2422 | \n", "0 | \n", "0 | \n", "abera | \n", "nigussie | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2020-09-23t08:36:17.451z | \n", "2020-09-23t08:36:17.450z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
4 | \n", "0000-0003-4814-7872 | \n", "1 | \n", "1 | \n", "abhijeet | \n", "singh | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2018-05-01t22:43:17.407z | \n", "2018-10-06t22:21:54.024z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
10989644 | \n", "0000-0001-7468-9881 | \n", "1 | \n", "1 | \n", "abeer | \n", "elbaroudi | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2020-02-06t15:04:42.485z | \n", "2020-02-06t15:16:45.537z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
10989645 | \n", "0000-0003-0081-4285 | \n", "1 | \n", "1 | \n", "abeer | \n", "sohrab | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2020-05-12t22:39:26.356z | \n", "2020-05-12t22:41:45.239z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
10989646 | \n", "0000-0003-2004-3457 | \n", "0 | \n", "0 | \n", "abeer | \n", "abdelmaksoud | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2019-12-19t23:09:12.579z | \n", "2019-12-19t23:09:12.798z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
10989647 | \n", "0000-0003-2841-9754 | \n", "1 | \n", "1 | \n", "abeer | \n", "al-ghazali | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2019-06-02t18:35:32.973z | \n", "2019-08-05t14:54:41.796z | \n", "2 | \n", "0 | \n", "0 | \n", "2 | \n", "1 | \n", "[crossref metadata search] | \n", "1 | \n", "True | \n", "
10989648 | \n", "0000-0002-3675-6876 | \n", "0 | \n", "0 | \n", "abegail | \n", "palos-simbre | \n", "NaN | \n", "[gail] | \n", "NaN | \n", "NaN | \n", "NaN | \n", "... | \n", "2017-02-10t16:38:52.988z | \n", "2019-12-11t01:37:15.405z | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "0 | \n", "[] | \n", "0 | \n", "False | \n", "
10989649 rows × 26 columns
\n", "