From b5e99701b1e1a103085933a44fb7d2a21189d6ec Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Tue, 23 Mar 2021 19:03:37 +0100 Subject: [PATCH] adding preprocessing with make --- notebooks/01-Exploration.ipynb | 8973 ++++++++++++++++---------------- src/data/make_dataset.py | 41 +- 2 files changed, 4549 insertions(+), 4465 deletions(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 0f8d755..4efffc5 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -89,12 +89,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Anomalies ORCiD profile" + "Notable anomalies:" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notable fake ORCID iDs for explorative purposes:" + "Notable fake ORCID iDs:" ] }, { @@ -133,25 +133,10 @@ "Load the dataset" ] }, - { - "cell_type": "code", - "execution_count": 163, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n", - " names=['orcid', 'claimed','verified_email', 'verified_primary_email', \n", - " 'given_names', 'family_name', 'biography', 'other_names', 'urls', \n", - " 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', \n", - " 'employment', 'n_works', 'works_source'])" - ] - }, { "cell_type": "code", "execution_count": 6, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -184,360 +169,175 @@ " other_names\n", " urls\n", " primary_email\n", - " other_emails\n", - " keywords\n", - " external_ids\n", - " education\n", + " ...\n", " employment\n", " n_works\n", " works_source\n", + " activation_date\n", + " last_update_date\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " \n", " \n", " \n", " \n", - " 7552\n", - " 0000-0001-7831-7567\n", - " 1\n", - " 1\n", - " 1\n", - " Vahab\n", - " Vahdat\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[\"Scopus Author ID\", \"57193490305\"], [\"Scopus...\n", - " [[\"Industrial Engineering\", \"PhD\", \"Northeaste...\n", - " [[\"Post-doctorate fellow\", \"Harvard Medical Sc...\n", - " 25\n", - " [\"Vahab Vahdat\", \"Scopus - Elsevier\", \"Multidi...\n", - " \n", - " \n", - " 8416\n", - " 0000-0001-8161-1345\n", + " 0\n", + " 0000-0001-5009-2052\n", " 1\n", " 1\n", " 1\n", - " AYFER\n", - " TEKIN ATACAN\n", - " NaN\n", - " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " ...\n", " NaN\n", " 0\n", " NaN\n", + " 2019-06-05t20:25:43.066z\n", + " 2019-12-11t03:57:41.741z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", - " 16498\n", - " 0000-0002-1133-1505\n", - " 1\n", - " 1\n", - " 1\n", - " Xianrong\n", - " Lai\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[\"Scopus Author ID\", \"15769435500\"]]\n", - " [[\"Department of pharmacy\", \"Bachelor of Tradi...\n", - " [[\"Associate Research, Professor\", \"Chengdu Un...\n", - " 115\n", - " [\"Xianrong Lai\", \"Scopus - Elsevier\", \"Crossref\"]\n", - " \n", - " \n", - " 16830\n", - " 0000-0002-1257-5536\n", + " 1\n", + " 0000-0001-5943-0732\n", " 1\n", " 1\n", " 1\n", - " Alexandra\n", - " Zimmer\n", - " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " ...\n", " NaN\n", - " [[\"Research assistent\", \"Fraunhofer-Institut f...\n", " 0\n", " NaN\n", + " 2015-08-18t13:10:42.871z\n", + " 2016-06-15t01:05:19.986z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", - " 18835\n", - " 0000-0002-2026-4156\n", - " 1\n", - " 1\n", - " 1\n", - " Fatma\n", - " Sri Wahyuni\n", - " NaN\n", - " [\"Ayu\"]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[\"ResearcherID\", \"C-5194-2015\"], [\"Scopus Aut...\n", - " [[\"Biosains\", \"PHD\", \"Universiti Putra Malaysi...\n", - " [[\"Lecturer\", \"Universitas Andalas\", \"Padang\",...\n", - " 27\n", - " [\"Publons\", \"Crossref Metadata Search\", \"Scopu...\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 10733293\n", - " 0000-0002-9887-7788\n", + " 2\n", + " 0000-0001-6083-622x\n", " 1\n", " 1\n", " 1\n", - " Markéta\n", - " Laštůvková\n", - " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " ...\n", " NaN\n", - " [[\"\", \"VSB - Technical University of Ostrava\",...\n", " 0\n", " NaN\n", + " 2019-01-21t10:55:27.997z\n", + " 2019-01-28t16:24:02.199z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", - " 10737258\n", - " 0000-0003-1367-8104\n", - " 1\n", - " 1\n", - " 1\n", - " LORENA\n", - " GUTIÉRREZ GARCÍA\n", - " NaN\n", - " NaN\n", - " [[\"LinkedIn\", \"https://www.linkedin.com/in/lor...\n", - " lorenagg@unex.es\n", - " NaN\n", - " [\"Agroecolog\\u00eda, Bot\\u00e1nica, Did\\u00e1c...\n", - " [[\"ResearcherID\", \"AAE-6316-2021\"]]\n", - " [[\"\", \"M\\u00e1ster en Formaci\\u00f3n del profe...\n", - " [[\"PCI\", \"Universidad de Extremadura - Campus ...\n", - " 14\n", - " [\"Multidisciplinary Digital Publishing Institu...\n", - " \n", - " \n", - " 10738308\n", - " 0000-0003-1741-3437\n", + " 3\n", + " 0000-0001-6262-5709\n", " 1\n", " 1\n", " 1\n", - " Xing\n", - " Liu\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", - " [[\"ResearcherID\", \"S-3053-2017\"]]\n", - " NaN\n", + " ...\n", " NaN\n", " 0\n", " NaN\n", + " 2015-08-18t14:29:39.440z\n", + " 2017-06-21t07:18:20.787z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", - " 10741460\n", - " 0000-0003-2909-8585\n", + " 4\n", + " 0000-0001-6616-4890\n", " 1\n", " 1\n", " 1\n", - " Yusuf\n", - " Özcan\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", " NaN\n", + " ...\n", " NaN\n", - " [[\"\\u0130lahiyat Fak\\u00fcltesi\", \"Doktora\", \"...\n", - " [[\"Research Assistant\", \"\\u00c7ukurova Univers...\n", " 0\n", " NaN\n", - " \n", - " \n", - " 10745078\n", - " 0000-0003-4259-5324\n", - " 1\n", - " 1\n", - " 1\n", - " P Rama Mohan\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[\"Scopus Author ID\", \"24776757000\"]]\n", - " [[\"EEE Department\", \"Ph.D. (Power Electronics ...\n", - " [[\"Associate Professor\", \"RGM College of Engin...\n", - " 21\n", - " [\"Scopus - Elsevier\", \"P Rama Mohan\"]\n", + " 2015-08-13t01:59:51.802z\n", + " 2016-06-15t01:05:21.373z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", "\n", - "

2418 rows × 17 columns

\n", + "

5 rows × 24 columns

\n", "" ], "text/plain": [ - " orcid claimed verified_email \\\n", - "7552 0000-0001-7831-7567 1 1 \n", - "8416 0000-0001-8161-1345 1 1 \n", - "16498 0000-0002-1133-1505 1 1 \n", - "16830 0000-0002-1257-5536 1 1 \n", - "18835 0000-0002-2026-4156 1 1 \n", - "... ... ... ... \n", - "10733293 0000-0002-9887-7788 1 1 \n", - "10737258 0000-0003-1367-8104 1 1 \n", - "10738308 0000-0003-1741-3437 1 1 \n", - "10741460 0000-0003-2909-8585 1 1 \n", - "10745078 0000-0003-4259-5324 1 1 \n", - "\n", - " verified_primary_email given_names family_name biography \\\n", - "7552 1 Vahab Vahdat NaN \n", - "8416 1 AYFER TEKIN ATACAN NaN \n", - "16498 1 Xianrong Lai NaN \n", - "16830 1 Alexandra Zimmer NaN \n", - "18835 1 Fatma Sri Wahyuni NaN \n", - "... ... ... ... ... \n", - "10733293 1 Markéta Laštůvková NaN \n", - "10737258 1 LORENA GUTIÉRREZ GARCÍA NaN \n", - "10738308 1 Xing Liu NaN \n", - "10741460 1 Yusuf Özcan NaN \n", - "10745078 1 P Rama Mohan NaN NaN \n", - "\n", - " other_names urls \\\n", - "7552 NaN NaN \n", - "8416 NaN NaN \n", - "16498 NaN NaN \n", - "16830 NaN NaN \n", - "18835 [\"Ayu\"] NaN \n", - "... ... ... \n", - "10733293 NaN NaN \n", - "10737258 NaN [[\"LinkedIn\", \"https://www.linkedin.com/in/lor... \n", - "10738308 NaN NaN \n", - "10741460 NaN NaN \n", - "10745078 NaN NaN \n", - "\n", - " primary_email other_emails \\\n", - "7552 NaN NaN \n", - "8416 NaN NaN \n", - "16498 NaN NaN \n", - "16830 NaN NaN \n", - "18835 NaN NaN \n", - "... ... ... \n", - "10733293 NaN NaN \n", - "10737258 lorenagg@unex.es NaN \n", - "10738308 NaN NaN \n", - "10741460 NaN NaN \n", - "10745078 NaN NaN \n", - "\n", - " keywords \\\n", - "7552 NaN \n", - "8416 NaN \n", - "16498 NaN \n", - "16830 NaN \n", - "18835 NaN \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [\"Agroecolog\\u00eda, Bot\\u00e1nica, Did\\u00e1c... \n", - "10738308 NaN \n", - "10741460 NaN \n", - "10745078 NaN \n", - "\n", - " external_ids \\\n", - "7552 [[\"Scopus Author ID\", \"57193490305\"], [\"Scopus... \n", - "8416 NaN \n", - "16498 [[\"Scopus Author ID\", \"15769435500\"]] \n", - "16830 NaN \n", - "18835 [[\"ResearcherID\", \"C-5194-2015\"], [\"Scopus Aut... \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [[\"ResearcherID\", \"AAE-6316-2021\"]] \n", - "10738308 [[\"ResearcherID\", \"S-3053-2017\"]] \n", - "10741460 NaN \n", - "10745078 [[\"Scopus Author ID\", \"24776757000\"]] \n", - "\n", - " education \\\n", - "7552 [[\"Industrial Engineering\", \"PhD\", \"Northeaste... \n", - "8416 NaN \n", - "16498 [[\"Department of pharmacy\", \"Bachelor of Tradi... \n", - "16830 NaN \n", - "18835 [[\"Biosains\", \"PHD\", \"Universiti Putra Malaysi... \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [[\"\", \"M\\u00e1ster en Formaci\\u00f3n del profe... \n", - "10738308 NaN \n", - "10741460 [[\"\\u0130lahiyat Fak\\u00fcltesi\", \"Doktora\", \"... \n", - "10745078 [[\"EEE Department\", \"Ph.D. (Power Electronics ... \n", - "\n", - " employment n_works \\\n", - "7552 [[\"Post-doctorate fellow\", \"Harvard Medical Sc... 25 \n", - "8416 NaN 0 \n", - "16498 [[\"Associate Research, Professor\", \"Chengdu Un... 115 \n", - "16830 [[\"Research assistent\", \"Fraunhofer-Institut f... 0 \n", - "18835 [[\"Lecturer\", \"Universitas Andalas\", \"Padang\",... 27 \n", - "... ... ... \n", - "10733293 [[\"\", \"VSB - Technical University of Ostrava\",... 0 \n", - "10737258 [[\"PCI\", \"Universidad de Extremadura - Campus ... 14 \n", - "10738308 NaN 0 \n", - "10741460 [[\"Research Assistant\", \"\\u00c7ukurova Univers... 0 \n", - "10745078 [[\"Associate Professor\", \"RGM College of Engin... 21 \n", - "\n", - " works_source \n", - "7552 [\"Vahab Vahdat\", \"Scopus - Elsevier\", \"Multidi... \n", - "8416 NaN \n", - "16498 [\"Xianrong Lai\", \"Scopus - Elsevier\", \"Crossref\"] \n", - "16830 NaN \n", - "18835 [\"Publons\", \"Crossref Metadata Search\", \"Scopu... \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [\"Multidisciplinary Digital Publishing Institu... \n", - "10738308 NaN \n", - "10741460 NaN \n", - "10745078 [\"Scopus - Elsevier\", \"P Rama Mohan\"] \n", - "\n", - "[2418 rows x 17 columns]" + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0001-5009-2052 1 1 1 \n", + "1 0000-0001-5943-0732 1 1 1 \n", + "2 0000-0001-6083-622x 1 1 1 \n", + "3 0000-0001-6262-5709 1 1 1 \n", + "4 0000-0001-6616-4890 1 1 1 \n", + "\n", + " given_names family_name biography other_names urls primary_email ... \\\n", + "0 NaN NaN NaN NaN NaN NaN ... \n", + "1 NaN NaN NaN NaN NaN NaN ... \n", + "2 NaN NaN NaN NaN NaN NaN ... \n", + "3 NaN NaN NaN NaN NaN NaN ... \n", + "4 NaN NaN NaN NaN NaN NaN ... \n", + "\n", + " employment n_works works_source activation_date \\\n", + "0 NaN 0 NaN 2019-06-05t20:25:43.066z \n", + "1 NaN 0 NaN 2015-08-18t13:10:42.871z \n", + "2 NaN 0 NaN 2019-01-21t10:55:27.997z \n", + "3 NaN 0 NaN 2015-08-18t14:29:39.440z \n", + "4 NaN 0 NaN 2015-08-13t01:59:51.802z \n", + "\n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \n", + "0 2019-12-11t03:57:41.741z 0 0 0 0 0 \n", + "1 2016-06-15t01:05:19.986z 0 0 0 0 0 \n", + "2 2019-01-28t16:24:02.199z 0 0 0 0 0 \n", + "3 2017-06-21t07:18:20.787z 0 0 0 0 0 \n", + "4 2016-06-15t01:05:21.373z 0 0 0 0 0 \n", + "\n", + "[5 rows x 24 columns]" ] }, "execution_count": 6, @@ -546,107 +346,20 @@ } ], "source": [ - "df[df.duplicated()]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop_duplicates(inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Basic column manipulation (interpret columns as lists when necessary)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df['other_names'] = df[df.other_names.notna()]['other_names'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "metadata": {}, - "outputs": [], - "source": [ - "df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "df['education'] = df[df.education.notna()]['education'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "df['employment'] = df[df.employment.notna()]['employment'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))" + "df = pd.read_pickle('../data/processed/dataset.pkl')\n", + "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's how it looks like" + "Notable profiles inspection" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -680,169 +393,92 @@ " other_names\n", " urls\n", " primary_email\n", - " other_emails\n", - " keywords\n", - " external_ids\n", - " education\n", + " ...\n", " employment\n", " n_works\n", " works_source\n", + " activation_date\n", + " last_update_date\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0000-0001-5000-2053\n", + " 1575869\n", + " 0000-0002-5193-7851\n", " 1\n", - " 0\n", - " 0\n", - " Jorge\n", - " Jaramillo Sanchez\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0\n", - " NaN\n", - " \n", - " \n", - " 1\n", - " 0000-0001-5000-6548\n", " 1\n", - " 0\n", - " 0\n", - " Wiseman\n", - " Bekelesi\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 1\n", + " andrea\n", + " mannocci\n", + " data scientist & researcher; scholarly knowled...\n", " NaN\n", + " [[personal website, https://andremann.github.i...\n", + " andrea.mannocci@isti.cnr.it\n", + " ...\n", + " [[research associate, istituto di scienza e te...\n", + " 37\n", + " [scopus - elsevier, crossref metadata search, ...\n", + " 2017-09-12t14:28:33.467z\n", + " 2021-03-09t08:32:47.840z\n", + " 34\n", " 0\n", - " NaN\n", - " \n", - " \n", - " 2\n", - " 0000-0001-5000-7962\n", - " 1\n", - " 1\n", + " 0\n", + " 60\n", " 1\n", - " ALICE\n", - " INDIMULI\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0\n", - " NaN\n", - " \n", - " \n", - " 3\n", - " 0000-0001-5000-8586\n", - " 1\n", - " 0\n", - " 0\n", - " shim\n", - " ji yun\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0\n", - " NaN\n", - " \n", - " \n", - " 4\n", - " 0000-0001-5001-0256\n", - " 1\n", - " 0\n", - " 0\n", - " Sandro\n", - " Caramaschi\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0\n", - " NaN\n", " \n", " \n", "\n", + "

1 rows × 24 columns

\n", "" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "0 0000-0001-5000-2053 1 0 0 \n", - "1 0000-0001-5000-6548 1 0 0 \n", - "2 0000-0001-5000-7962 1 1 1 \n", - "3 0000-0001-5000-8586 1 0 0 \n", - "4 0000-0001-5001-0256 1 0 0 \n", - "\n", - " given_names family_name biography other_names urls primary_email \\\n", - "0 Jorge Jaramillo Sanchez NaN NaN NaN NaN \n", - "1 Wiseman Bekelesi NaN NaN NaN NaN \n", - "2 ALICE INDIMULI NaN NaN NaN NaN \n", - "3 shim ji yun NaN NaN NaN NaN \n", - "4 Sandro Caramaschi NaN NaN NaN NaN \n", - "\n", - " other_emails keywords external_ids education employment n_works \\\n", - "0 NaN NaN NaN NaN NaN 0 \n", - "1 NaN NaN NaN NaN NaN 0 \n", - "2 NaN NaN NaN NaN NaN 0 \n", - "3 NaN NaN NaN NaN NaN 0 \n", - "4 NaN NaN NaN NaN NaN 0 \n", - "\n", - " works_source \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " orcid claimed verified_email verified_primary_email \\\n", + "1575869 0000-0002-5193-7851 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "1575869 andrea mannocci \n", + "\n", + " biography other_names \\\n", + "1575869 data scientist & researcher; scholarly knowled... NaN \n", + "\n", + " urls \\\n", + "1575869 [[personal website, https://andremann.github.i... \n", + "\n", + " primary_email ... \\\n", + "1575869 andrea.mannocci@isti.cnr.it ... \n", + "\n", + " employment n_works \\\n", + "1575869 [[research associate, istituto di scienza e te... 37 \n", + "\n", + " works_source \\\n", + "1575869 [scopus - elsevier, crossref metadata search, ... \n", + "\n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "1575869 2017-09-12t14:28:33.467z 2021-03-09t08:32:47.840z 34 0 \n", + "\n", + " n_pmc n_other_pids label \n", + "1575869 0 60 1 \n", + "\n", + "[1 rows x 24 columns]" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add labels to the dataframe based on their presence in OpenAIRE" + "df[df['orcid'] == AM]" ] }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -867,105 +503,218 @@ " \n", " \n", " orcid\n", + " claimed\n", + " verified_email\n", + " verified_primary_email\n", + " given_names\n", + " family_name\n", + " biography\n", + " other_names\n", + " urls\n", + " primary_email\n", + " ...\n", + " employment\n", + " n_works\n", + " works_source\n", + " activation_date\n", + " last_update_date\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " \n", " \n", " \n", " \n", - " 0\n", - " 0000-0002-4292-8072\n", - " \n", - " \n", - " 1\n", - " 0000-0003-3026-718x\n", - " \n", - " \n", - " 2\n", - " 0000-0002-4568-8748\n", - " \n", - " \n", - " 3\n", - " 0000-0002-1380-859x\n", - " \n", - " \n", - " 4\n", - " 0000-0001-7526-5095\n", - " \n", - " \n", - " ...\n", + " 6819986\n", + " 0000-0001-6997-9470\n", + " 1\n", + " 1\n", + " 1\n", + " other\n", + " whatsapp\n", + " NaN\n", + " NaN\n", + " [[otherwhatsapp, https://otherwhatsapp.com/], ...\n", + " NaN\n", " ...\n", - " \n", - " \n", - " 2934111\n", - " 0000-0002-2394-5387\n", - " \n", - " \n", - " 2934112\n", - " 0000-0002-6125-3170\n", - " \n", - " \n", - " 2934113\n", - " 0000-0002-7062-5845\n", - " \n", - " \n", - " 2934114\n", - " 0000-0001-8677-5162\n", - " \n", - " \n", - " 2934115\n", - " 0000-0002-0240-8021\n", + " NaN\n", + " 0\n", + " NaN\n", + " 2020-10-07t10:37:12.237z\n", + " 2020-10-08t02:32:03.935z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " \n", " \n", "\n", - "

2934116 rows × 1 columns

\n", + "

1 rows × 24 columns

\n", "" ], "text/plain": [ - " orcid\n", - "0 0000-0002-4292-8072\n", - "1 0000-0003-3026-718x\n", - "2 0000-0002-4568-8748\n", - "3 0000-0002-1380-859x\n", - "4 0000-0001-7526-5095\n", - "... ...\n", - "2934111 0000-0002-2394-5387\n", - "2934112 0000-0002-6125-3170\n", - "2934113 0000-0002-7062-5845\n", - "2934114 0000-0001-8677-5162\n", - "2934115 0000-0002-0240-8021\n", - "\n", - "[2934116 rows x 1 columns]" + " orcid claimed verified_email verified_primary_email \\\n", + "6819986 0000-0001-6997-9470 1 1 1 \n", + "\n", + " given_names family_name biography other_names \\\n", + "6819986 other whatsapp NaN NaN \n", + "\n", + " urls primary_email ... \\\n", + "6819986 [[otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n", + "\n", + " employment n_works works_source activation_date \\\n", + "6819986 NaN 0 NaN 2020-10-07t10:37:12.237z \n", + "\n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \n", + "6819986 2020-10-08t02:32:03.935z 0 0 0 0 0 \n", + "\n", + "[1 rows x 24 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == WHATSAPP]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orcid 10916574\n", + "claimed 10916574\n", + "verified_email 10916574\n", + "verified_primary_email 10916574\n", + "given_names 10886150\n", + "family_name 10601571\n", + "biography 348649\n", + "other_names 551482\n", + "urls 707687\n", + "primary_email 123851\n", + "other_emails 48306\n", + "keywords 646400\n", + "external_ids 1301959\n", + "education 2430233\n", + "employment 2665092\n", + "n_works 10916574\n", + "works_source 2721431\n", + "activation_date 10916574\n", + "last_update_date 10916574\n", + "n_doi 10916574\n", + "n_arxiv 10916574\n", + "n_pmc 10916574\n", + "n_other_pids 10916574\n", + "label 10916574\n", + "dtype: int64" ] }, - "execution_count": 97, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "openaire_orcid = pd.read_csv('../data/raw/orcid_openaire.txt', header=None, names=['orcid'])\n", - "openaire_orcid" + "df.count()" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 10916574\n", + "unique 10916574\n", + "top 0000-0002-5454-7613\n", + "freq 1\n", + "Name: orcid, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df['label'] = df.orcid.isin(openaire_orcid['orcid'])\n", - "df[\"label\"] = df[\"label\"].astype(int)" + "df['orcid'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notable profiles inspection" + "## Primary email" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 123851\n", + "unique 123848\n", + "top patrick.davey@monash.edu\n", + "freq 2\n", + "Name: primary_email, dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dupe emails" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6347224 maykin@owasp.org\n", + "7027865 patrick.davey@monash.edu\n", + "9529005 opercin@erbakan.edu.tr\n", + "Name: primary_email, dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]" ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1002,83 +751,106 @@ " ...\n", " n_works\n", " works_source\n", - " primary_email_domain\n", - " other_email_domains\n", - " n_emails\n", - " url_domains\n", - " n_urls\n", - " n_ids\n", - " n_keywords\n", + " activation_date\n", + " last_update_date\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", " label\n", + " primary_email_domain\n", " \n", " \n", " \n", " \n", - " 8840413\n", - " 0000-0002-5193-7851\n", + " 4450046\n", + " 0000-0001-9855-1676\n", " 1\n", " 1\n", " 1\n", - " Andrea\n", - " Mannocci\n", + " maykin\n", + " warasart\n", " NaN\n", " NaN\n", - " [[Personal website, https://andremann.github.i...\n", - " andrea.mannocci@isti.cnr.it\n", - " ...\n", - " 37\n", - " [Scopus - Elsevier, Crossref Metadata Search, ...\n", - " isti.cnr.it\n", " NaN\n", + " maykin@owasp.org\n", + " ...\n", + " 0\n", " NaN\n", - " [github.io, twitter.com, linkedin.com]\n", - " 3.0\n", - " 1.0\n", - " 5.0\n", + " 2020-10-23t17:51:51.925z\n", + " 2021-01-01t15:00:52.053z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " owasp.org\n", + " \n", + " \n", + " 6347224\n", + " 0000-0002-0836-2271\n", " 1\n", + " 1\n", + " 1\n", + " maykin\n", + " warasart\n", + " NaN\n", + " NaN\n", + " NaN\n", + " maykin@owasp.org\n", + " ...\n", + " 0\n", + " NaN\n", + " 2020-09-15t04:43:55.709z\n", + " 2020-09-15t05:17:28.509z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " owasp.org\n", " \n", " \n", "\n", - "

1 rows × 25 columns

\n", + "

2 rows × 25 columns

\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", - "8840413 0000-0002-5193-7851 1 1 1 \n", + "4450046 0000-0001-9855-1676 1 1 1 \n", + "6347224 0000-0002-0836-2271 1 1 1 \n", "\n", - " given_names family_name biography other_names \\\n", - "8840413 Andrea Mannocci NaN NaN \n", + " given_names family_name biography other_names urls primary_email \\\n", + "4450046 maykin warasart NaN NaN NaN maykin@owasp.org \n", + "6347224 maykin warasart NaN NaN NaN maykin@owasp.org \n", "\n", - " urls \\\n", - "8840413 [[Personal website, https://andremann.github.i... \n", - "\n", - " primary_email ... n_works \\\n", - "8840413 andrea.mannocci@isti.cnr.it ... 37 \n", - "\n", - " works_source \\\n", - "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... \n", + " ... n_works works_source activation_date \\\n", + "4450046 ... 0 NaN 2020-10-23t17:51:51.925z \n", + "6347224 ... 0 NaN 2020-09-15t04:43:55.709z \n", "\n", - " primary_email_domain other_email_domains n_emails \\\n", - "8840413 isti.cnr.it NaN NaN \n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", + "4450046 2021-01-01t15:00:52.053z 0 0 0 0 0 \n", + "6347224 2020-09-15t05:17:28.509z 0 0 0 0 0 \n", "\n", - " url_domains n_urls n_ids n_keywords label \n", - "8840413 [github.io, twitter.com, linkedin.com] 3.0 1.0 5.0 1 \n", + " primary_email_domain \n", + "4450046 owasp.org \n", + "6347224 owasp.org \n", "\n", - "[1 rows x 25 columns]" + "[2 rows x 25 columns]" ] }, - "execution_count": 111, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df['orcid'] == AM]" + "df[df['primary_email'] == 'maykin@owasp.org']" ] }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1115,125 +887,106 @@ " ...\n", " n_works\n", " works_source\n", - " primary_email_domain\n", - " other_email_domains\n", - " n_emails\n", - " url_domains\n", - " n_urls\n", - " n_ids\n", - " n_keywords\n", + " activation_date\n", + " last_update_date\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", " label\n", + " primary_email_domain\n", " \n", " \n", " \n", " \n", - " 9517099\n", - " 0000-0001-6997-9470\n", + " 6840791\n", + " 0000-0002-2232-9638\n", " 1\n", " 1\n", " 1\n", - " other\n", - " whatsapp\n", + " osman\n", + " perçin\n", " NaN\n", " NaN\n", - " [[Otherwhatsapp, https://otherwhatsapp.com/], ...\n", " NaN\n", + " opercin@erbakan.edu.tr\n", " ...\n", " 0\n", " NaN\n", + " 2015-01-12t13:47:55.549z\n", + " 2020-01-27t07:38:24.269z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " erbakan.edu.tr\n", + " \n", + " \n", + " 9529005\n", + " 0000-0003-0033-0918\n", + " 1\n", + " 1\n", + " 1\n", + " osman\n", + " perçin\n", " NaN\n", " NaN\n", " NaN\n", - " [otherwhatsapp.com, im-creator.com, facebook.c...\n", - " 27.0\n", + " opercin@erbakan.edu.tr\n", + " ...\n", + " 0\n", " NaN\n", - " 4.0\n", + " 2015-10-13t05:47:12.014z\n", + " 2020-12-25t13:52:03.976z\n", + " 0\n", " 0\n", + " 0\n", + " 0\n", + " 0\n", + " erbakan.edu.tr\n", " \n", " \n", "\n", - "

1 rows × 25 columns

\n", + "

2 rows × 25 columns

\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", - "9517099 0000-0001-6997-9470 1 1 1 \n", - "\n", - " given_names family_name biography other_names \\\n", - "9517099 other whatsapp NaN NaN \n", + "6840791 0000-0002-2232-9638 1 1 1 \n", + "9529005 0000-0003-0033-0918 1 1 1 \n", "\n", - " urls primary_email ... \\\n", - "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n", + " given_names family_name biography other_names urls \\\n", + "6840791 osman perçin NaN NaN NaN \n", + "9529005 osman perçin NaN NaN NaN \n", "\n", - " n_works works_source primary_email_domain other_email_domains \\\n", - "9517099 0 NaN NaN NaN \n", + " primary_email ... n_works works_source \\\n", + "6840791 opercin@erbakan.edu.tr ... 0 NaN \n", + "9529005 opercin@erbakan.edu.tr ... 0 NaN \n", "\n", - " n_emails url_domains n_urls \\\n", - "9517099 NaN [otherwhatsapp.com, im-creator.com, facebook.c... 27.0 \n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "6840791 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z 0 0 \n", + "9529005 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z 0 0 \n", "\n", - " n_ids n_keywords label \n", - "9517099 NaN 4.0 0 \n", + " n_pmc n_other_pids label primary_email_domain \n", + "6840791 0 0 0 erbakan.edu.tr \n", + "9529005 0 0 0 erbakan.edu.tr \n", "\n", - "[1 rows x 25 columns]" - ] - }, - "execution_count": 112, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == WHATSAPP]" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "orcid 10744621\n", - "claimed 10744621\n", - "verified_email 10744621\n", - "verified_primary_email 10744621\n", - "given_names 10716788\n", - "family_name 10437093\n", - "biography 333885\n", - "other_names 544550\n", - "urls 688262\n", - "primary_email 121476\n", - "other_emails 47470\n", - "keywords 638634\n", - "external_ids 1285292\n", - "education 2402439\n", - "employment 2626669\n", - "n_works 10744621\n", - "works_source 2671905\n", - "primary_email_domain 121476\n", - "other_email_domains 47470\n", - "n_emails 47470\n", - "url_domains 688262\n", - "n_urls 688262\n", - "n_ids 1285292\n", - "n_keywords 638634\n", - "label 10744621\n", - "dtype: int64" + "[2 rows x 25 columns]" ] }, - "execution_count": 103, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.count()" + "df[df['primary_email'] == 'opercin@erbakan.edu.tr']" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1270,171 +1023,139 @@ " ...\n", " n_works\n", " works_source\n", - " primary_email_domain\n", - " other_email_domains\n", - " n_emails\n", - " url_domains\n", - " n_urls\n", - " n_ids\n", - " n_keywords\n", + " activation_date\n", + " last_update_date\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", " label\n", + " primary_email_domain\n", " \n", " \n", " \n", " \n", - " 4595263\n", - " 0000-0002-5154-6404\n", + " 944993\n", + " 0000-0002-9158-1757\n", " 1\n", " 1\n", " 1\n", - " Olusola\n", - " Bamisile\n", - " NaN\n", + " patrick\n", + " davey\n", " NaN\n", " NaN\n", " NaN\n", + " patrick.davey@monash.edu\n", " ...\n", - " 3\n", - " [Multidisciplinary Digital Publishing Institut...\n", - " NaN\n", - " NaN\n", - " NaN\n", + " 0\n", " NaN\n", + " 2019-05-09t23:01:02.170z\n", + " 2019-08-20t03:00:17.844z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " monash.edu\n", + " \n", + " \n", + " 7027865\n", + " 0000-0002-8774-0030\n", + " 1\n", + " 1\n", + " 1\n", + " patrick\n", + " davey\n", " NaN\n", " NaN\n", " NaN\n", - " False\n", + " patrick.davey@monash.edu\n", + " ...\n", + " 1\n", + " [crossref]\n", + " 2018-09-11t10:47:10.997z\n", + " 2021-02-09t06:21:44.138z\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " monash.edu\n", " \n", " \n", "\n", - "

1 rows × 25 columns

\n", + "

2 rows × 25 columns

\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", - "4595263 0000-0002-5154-6404 1 1 1 \n", + "944993 0000-0002-9158-1757 1 1 1 \n", + "7027865 0000-0002-8774-0030 1 1 1 \n", "\n", - " given_names family_name biography other_names urls primary_email ... \\\n", - "4595263 Olusola Bamisile NaN NaN NaN NaN ... \n", + " given_names family_name biography other_names urls \\\n", + "944993 patrick davey NaN NaN NaN \n", + "7027865 patrick davey NaN NaN NaN \n", "\n", - " n_works works_source \\\n", - "4595263 3 [Multidisciplinary Digital Publishing Institut... \n", + " primary_email ... n_works works_source \\\n", + "944993 patrick.davey@monash.edu ... 0 NaN \n", + "7027865 patrick.davey@monash.edu ... 1 [crossref] \n", "\n", - " primary_email_domain other_email_domains n_emails url_domains n_urls \\\n", - "4595263 NaN NaN NaN NaN NaN \n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "944993 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z 0 0 \n", + "7027865 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z 1 0 \n", "\n", - " n_ids n_keywords label \n", - "4595263 NaN NaN False \n", + " n_pmc n_other_pids label primary_email_domain \n", + "944993 0 0 0 monash.edu \n", + "7027865 0 0 1 monash.edu \n", "\n", - "[1 rows x 25 columns]" + "[2 rows x 25 columns]" ] }, - "execution_count": 104, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df['orcid'] == '0000-0002-5154-6404']" + "df[df['primary_email'] == 'patrick.davey@monash.edu']" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "df.drop(index=4595264, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 10744621\n", - "unique 10744621\n", - "top 0000-0001-8644-5622\n", - "freq 1\n", - "Name: orcid, dtype: object" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['orcid'].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Primary email" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121476\n", - "unique 121473\n", - "top patrick.davey@monash.edu\n", - "freq 2\n", - "Name: primary_email, dtype: object" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['primary_email'].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dupe emails" + "df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7483666 maykin@owasp.org\n", - "9068234 opercin@erbakan.edu.tr\n", - "10246485 patrick.davey@monash.edu\n", - "Name: primary_email, dtype: object" + "count 123851\n", + "unique 17089\n", + "top gmail.com\n", + "freq 26540\n", + "Name: primary_email_domain, dtype: object" ] }, - "execution_count": 24, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]" + "df['primary_email_domain'].describe()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1459,489 +1180,93 @@ " \n", " \n", " orcid\n", - " claimed\n", - " verified_email\n", - " verified_primary_email\n", - " given_names\n", - " family_name\n", - " biography\n", - " other_names\n", - " urls\n", - " primary_email\n", - " other_emails\n", - " keywords\n", - " external_ids\n", - " education\n", - " employment\n", - " n_works\n", - " works_source\n", + " \n", + " \n", + " primary_email_domain\n", + " \n", " \n", " \n", " \n", " \n", - " 3776350\n", - " 0000-0002-0836-2271\n", - " 1\n", + " gmail.com\n", + " 26540\n", + " \n", + " \n", + " hotmail.com\n", + " 3769\n", + " \n", + " \n", + " yahoo.com\n", + " 2614\n", + " \n", + " \n", + " 163.com\n", + " 2109\n", + " \n", + " \n", + " yuhs.ac\n", + " 1132\n", + " \n", + " \n", + " ...\n", + " ...\n", + " \n", + " \n", + " imean-biotech.com\n", " 1\n", + " \n", + " \n", + " imec.msu.ru\n", " 1\n", - " Maykin\n", - " Warasart\n", - " NaN\n", - " NaN\n", - " NaN\n", - " maykin@owasp.org\n", - " [maykin@dga.or.th]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0\n", - " NaN\n", " \n", " \n", - " 7483666\n", - " 0000-0001-9855-1676\n", + " imedea.uib-csic.es\n", " 1\n", + " \n", + " \n", + " imes.uni-hannover.de\n", " 1\n", + " \n", + " \n", + " zzuli.edu.cn\n", " 1\n", - " Maykin\n", - " Warasart\n", - " NaN\n", - " NaN\n", - " NaN\n", - " maykin@owasp.org\n", - " [maykin@dga.or.th, maykin@ieee.org]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 0\n", - " NaN\n", " \n", " \n", "\n", + "

17089 rows × 1 columns

\n", "" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "3776350 0000-0002-0836-2271 1 1 1 \n", - "7483666 0000-0001-9855-1676 1 1 1 \n", - "\n", - " given_names family_name biography other_names urls primary_email \\\n", - "3776350 Maykin Warasart NaN NaN NaN maykin@owasp.org \n", - "7483666 Maykin Warasart NaN NaN NaN maykin@owasp.org \n", - "\n", - " other_emails keywords external_ids education \\\n", - "3776350 [maykin@dga.or.th] NaN NaN NaN \n", - "7483666 [maykin@dga.or.th, maykin@ieee.org] NaN NaN NaN \n", + " orcid\n", + "primary_email_domain \n", + "gmail.com 26540\n", + "hotmail.com 3769\n", + "yahoo.com 2614\n", + "163.com 2109\n", + "yuhs.ac 1132\n", + "... ...\n", + "imean-biotech.com 1\n", + "imec.msu.ru 1\n", + "imedea.uib-csic.es 1\n", + "imes.uni-hannover.de 1\n", + "zzuli.edu.cn 1\n", "\n", - " employment n_works works_source \n", - "3776350 NaN 0 NaN \n", - "7483666 NaN 0 NaN " + "[17089 rows x 1 columns]" ] }, - "execution_count": 25, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df['primary_email'] == 'maykin@owasp.org']" + "primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)\n", + "primary_emails" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
39950320000-0002-2232-9638111OsmanPerçinNaNNaNNaNopercin@erbakan.edu.trNaNNaNNaNNaNNaN0NaN
90682340000-0003-0033-0918111OsmanPERÇİNNaNNaNNaNopercin@erbakan.edu.trNaNNaNNaNNaN[[, Necmettin Erbakan University, Konya, , TR,...0NaN
\n", - "
" - ], - "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "3995032 0000-0002-2232-9638 1 1 1 \n", - "9068234 0000-0003-0033-0918 1 1 1 \n", - "\n", - " given_names family_name biography other_names urls \\\n", - "3995032 Osman Perçin NaN NaN NaN \n", - "9068234 Osman PERÇİN NaN NaN NaN \n", - "\n", - " primary_email other_emails keywords external_ids education \\\n", - "3995032 opercin@erbakan.edu.tr NaN NaN NaN NaN \n", - "9068234 opercin@erbakan.edu.tr NaN NaN NaN NaN \n", - "\n", - " employment n_works \\\n", - "3995032 NaN 0 \n", - "9068234 [[, Necmettin Erbakan University, Konya, , TR,... 0 \n", - "\n", - " works_source \n", - "3995032 NaN \n", - "9068234 NaN " - ] - }, - "execution_count": 26, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['primary_email'] == 'opercin@erbakan.edu.tr']" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_emailother_emailskeywordsexternal_idseducationemploymentn_worksworks_source
50877450000-0002-8774-0030111PatrickDaveyNaNNaNNaNpatrick.davey@monash.eduNaNNaNNaNNaN[[PhD Student, Monash University, Melbourne, V...1[Crossref]
102464850000-0002-9158-1757111PatrickDaveyNaNNaNNaNpatrick.davey@monash.eduNaN[Radiopharmaceuticals, Inorganic Chemistry, Bi...NaNNaN[[PhD Student, Monash University, Melbourne, ,...0NaN
\n", - "
" - ], - "text/plain": [ - " orcid claimed verified_email \\\n", - "5087745 0000-0002-8774-0030 1 1 \n", - "10246485 0000-0002-9158-1757 1 1 \n", - "\n", - " verified_primary_email given_names family_name biography \\\n", - "5087745 1 Patrick Davey NaN \n", - "10246485 1 Patrick Davey NaN \n", - "\n", - " other_names urls primary_email other_emails \\\n", - "5087745 NaN NaN patrick.davey@monash.edu NaN \n", - "10246485 NaN NaN patrick.davey@monash.edu NaN \n", - "\n", - " keywords external_ids \\\n", - "5087745 NaN NaN \n", - "10246485 [Radiopharmaceuticals, Inorganic Chemistry, Bi... NaN \n", - "\n", - " education employment \\\n", - "5087745 NaN [[PhD Student, Monash University, Melbourne, V... \n", - "10246485 NaN [[PhD Student, Monash University, Melbourne, ,... \n", - "\n", - " n_works works_source \n", - "5087745 1 [Crossref] \n", - "10246485 0 NaN " - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['primary_email'] == 'patrick.davey@monash.edu']" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121476\n", - "unique 17047\n", - "top gmail.com\n", - "freq 25892\n", - "Name: primary_email_domain, dtype: object" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['primary_email_domain'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcid
primary_email_domain
gmail.com25892
hotmail.com3674
yahoo.com2578
163.com2067
yuhs.ac1124
......
iiap.gob.pe1
iiap.org.pe1
iibb.csic.es1
iic.hokudai.ac.jp1
zzuli.edu.cn1
\n", - "

17047 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " orcid\n", - "primary_email_domain \n", - "gmail.com 25892\n", - "hotmail.com 3674\n", - "yahoo.com 2578\n", - "163.com 2067\n", - "yuhs.ac 1124\n", - "... ...\n", - "iiap.gob.pe 1\n", - "iiap.org.pe 1\n", - "iibb.csic.es 1\n", - "iic.hokudai.ac.jp 1\n", - "zzuli.edu.cn 1\n", - "\n", - "[17047 rows x 1 columns]" - ] - }, - "execution_count": 30, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)\n", - "primary_emails" - ] - }, - { - "cell_type": "code", - "execution_count": 91, + "execution_count": 28, "metadata": {}, "outputs": [ { @@ -1967,8 +1292,8 @@ "bu.edu", "usgs.gov", "mail.ru", - "usp.br", "yahoo.com.br", + "usp.br", "ua.pt", "umich.edu", "ust.hk", @@ -1980,44 +1305,44 @@ "unesp.br", "ucl.ac.uk", "ieee.org", - "stcatz.ox.ac.uk", - "st-annes.ox.ac.uk", "naver.com", + "st-annes.ox.ac.uk", + "stcatz.ox.ac.uk", "yahoo.fr", "ucm.es", "live.com" ], "y": [ - 25892, - 3674, - 2578, - 2067, - 1124, - 1035, - 914, - 755, - 626, + 26540, + 3769, + 2614, + 2109, + 1132, + 1056, + 940, + 762, + 630, 584, - 564, - 455, - 454, - 291, + 575, + 458, + 457, + 300, 290, - 278, - 249, + 277, + 258, + 247, 242, - 237, - 234, - 220, - 214, + 235, + 225, + 218, + 207, 204, - 203, - 185, + 187, + 184, 184, - 182, 172, - 166, - 159 + 171, + 163 ] } ], @@ -2844,9 +2169,9 @@ } }, "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "set_top_n(30)\n", + "data = [\n", + " go.Bar(\n", + " x=grouped_urls.sort_values(by=['orcid'], ascending=False).index[:TOP_N],\n", + " y=grouped_urls.sort_values(by=['orcid'], ascending=False)['orcid'][:TOP_N]\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Top %s URL domains' % TOP_N,\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_email...n_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsn_emailsurl_domainsn_urls
3824970000-0002-9025-8632111buycannabisdispensarywe procure and deliver premium cannabis strain...[we procure and deliver premium cannabis strai...[[find your cannabis & marijuana dispensary , ...NaN...00000NaNNaNNaN[goowonderland.com, goowonderland.com, goowond...81.0
9118110000-0002-4062-3603111juan de diosbeltrán mancillajuan de dios beltrán mancilla (*) filósofo aut...[juan de dios beltrán mancilla, filósofo autod...[[01.- juan de dios beltrán mancilla. teoría o...NaN...00070NaNNaNNaN[yumpu.com, ijopm.org, google.com, blogspot.co...69.0
11361290000-0002-1929-6054111franklin américocanaza choquedocente-investigador social. maestrando en der...[franklin américo canaza-choque , franklin a. ...[[consejo nacional de ciencia, tecnología e in...leo_123fa@hotmail.com...2900331hotmail.com[gmail.com, gmail.com, hotmail.com, baldwin.ed...5.0[concytec.gob.pe, redalyc.org, redalyc.org, un...61.0
31026860000-0003-2593-7134111aanjaelaniall my papers can be downloaded from portal:re...[jaelani, a., jaelani, aan][[microsoft academic research, https://academi...aan_jaelani@syekhnurjati.ac.id...88001931syekhnurjati.ac.id[gmail.com]1.0[microsoft.com, twitter.com, academia.edu, aca...67.0
68689320000-0002-5710-4041111ryszardromaniukprofessor of electronics and communications en...[r.romaniuk, r.s.romaniuk, ryszard romaniuk, r...[[scholar google, http://scholar.google.pl/cit...rrom@ise.pw.edu.pl...122125017421ise.pw.edu.pl[ise.pw.edu.pl, elka.pw.edu.pl, cern.ch]3.0[google.pl, publons.com, scopus.com, mendeley....114.0
80889870000-0002-9965-2425111jaroslawspychalajaroslaw spychala has received a doctoral degr...[jaroslaw jozef spychala][[resume, http://www.biowebspin.com/wp-content...NaN...1500291NaNNaNNaN[biowebspin.com, biowebspin.com, google.com, l...73.0
86583550000-0002-3920-7389111а.гусевsurname, name gusev alexander leonidovichdate...[alexander l. gusev , alexander leonidovich gu...[[a.l. gusev alternative energy and ecology, ...NaN...3700211NaNNaNNaN[youtube.com, isjaee.com, researchgate.net, re...111.0
87788640000-0002-3997-5070111dr. parameshacharib ddr. parameshachari b dacm distinguished speake...[dr. parameshachari b d][[gsssietw,mysuru, http://geethashishu.in/], [...NaN...4700481NaNNaNNaN[geethashishu.in, geethashishu.in, acm.org, go...71.0
99801640000-0003-4948-9268111gustavoduperrégustavo norberto duperré graduated in arts and...[gustavo norberto duperré, duperré, g. n., gus...[[gis in cultural heritage - icomos românia, h...gustavo.duperre@usal.edu.ar...1300340usal.edu.arNaNNaN[icomos.ro, unirioja.es, unirioja.es, unc.edu....61.0
100245010000-0003-2407-3557111abdulazizabdul aziz was born on may 25, 1973, in brebes...[abdul aziz, aziz, abdul, aziz, a., aziz, abd,...[[google scholar, https://scholar.google.com/c...NaN...1900771NaNNaNNaN[google.com, syekhnurjati.ac.id, orcid.org, bl...59.0
100911650000-0003-2183-8112111pelayo munhozoleapós-doutorado em gestão ambiental pela univers...[ munhoz, pelayo olea, olea, pelayo, olea, p...[[currículo lattes, http://lattes.cnpq.br/6209...NaN...797015821NaNNaNNaN[cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c...61.0
105232050000-0003-2450-090x111eduardbabulakprofessor eduard babulak is accomplished inter...[professor eduard babulak][[honorary chair, chief mentor & senior adviso...NaN...199011741NaNNaNNaN[worldassessmentcouncil.org, spseke.sk, bcs.or...114.0
106960590000-0002-6938-9638111adolfocatral sanabriamy education is in computer science, mathemati...NaN[[researchgate adolfo catral , https://www.res...NaN...202200161NaNNaNNaN[researchgate.net, youtube.com, linkedin.com, ...152.0
\n", + "

13 rows × 29 columns

\n", + "
" + ], + "text/plain": [ + " orcid claimed verified_email \\\n", + "382497 0000-0002-9025-8632 1 1 \n", + "911811 0000-0002-4062-3603 1 1 \n", + "1136129 0000-0002-1929-6054 1 1 \n", + "3102686 0000-0003-2593-7134 1 1 \n", + "6868932 0000-0002-5710-4041 1 1 \n", + "8088987 0000-0002-9965-2425 1 1 \n", + "8658355 0000-0002-3920-7389 1 1 \n", + "8778864 0000-0002-3997-5070 1 1 \n", + "9980164 0000-0003-4948-9268 1 1 \n", + "10024501 0000-0003-2407-3557 1 1 \n", + "10091165 0000-0003-2183-8112 1 1 \n", + "10523205 0000-0003-2450-090x 1 1 \n", + "10696059 0000-0002-6938-9638 1 1 \n", + "\n", + " verified_primary_email given_names family_name \\\n", + "382497 1 buycannabis dispensary \n", + "911811 1 juan de dios beltrán mancilla \n", + "1136129 1 franklin américo canaza choque \n", + "3102686 1 aan jaelani \n", + "6868932 1 ryszard romaniuk \n", + "8088987 1 jaroslaw spychala \n", + "8658355 1 а. гусев \n", + "8778864 1 dr. parameshachari b d \n", + "9980164 1 gustavo duperré \n", + "10024501 1 abdul aziz \n", + "10091165 1 pelayo munhoz olea \n", + "10523205 1 eduard babulak \n", + "10696059 1 adolfo catral sanabria \n", "\n", - "// Listen for the clearing of the current output cell\n", - "var outputEl = gd.closest('.output');\n", - "if (outputEl) {{\n", - " x.observe(outputEl, {childList: true});\n", - "}}\n", + " biography \\\n", + "382497 we procure and deliver premium cannabis strain... \n", + "911811 juan de dios beltrán mancilla (*) filósofo aut... \n", + "1136129 docente-investigador social. maestrando en der... \n", + "3102686 all my papers can be downloaded from portal:re... \n", + "6868932 professor of electronics and communications en... \n", + "8088987 jaroslaw spychala has received a doctoral degr... \n", + "8658355 surname, name gusev alexander leonidovichdate... \n", + "8778864 dr. parameshachari b dacm distinguished speake... \n", + "9980164 gustavo norberto duperré graduated in arts and... \n", + "10024501 abdul aziz was born on may 25, 1973, in brebes... \n", + "10091165 pós-doutorado em gestão ambiental pela univers... \n", + "10523205 professor eduard babulak is accomplished inter... \n", + "10696059 my education is in computer science, mathemati... \n", "\n", - " }) }; }); " + " other_names \\\n", + "382497 [we procure and deliver premium cannabis strai... \n", + "911811 [juan de dios beltrán mancilla, filósofo autod... \n", + "1136129 [franklin américo canaza-choque , franklin a. ... \n", + "3102686 [jaelani, a., jaelani, aan] \n", + "6868932 [r.romaniuk, r.s.romaniuk, ryszard romaniuk, r... \n", + "8088987 [jaroslaw jozef spychala] \n", + "8658355 [alexander l. gusev , alexander leonidovich gu... \n", + "8778864 [dr. parameshachari b d] \n", + "9980164 [gustavo norberto duperré, duperré, g. n., gus... \n", + "10024501 [abdul aziz, aziz, abdul, aziz, a., aziz, abd,... \n", + "10091165 [ munhoz, pelayo olea, olea, pelayo, olea, p... \n", + "10523205 [professor eduard babulak] \n", + "10696059 NaN \n", + "\n", + " urls \\\n", + "382497 [[find your cannabis & marijuana dispensary , ... \n", + "911811 [[01.- juan de dios beltrán mancilla. teoría o... \n", + "1136129 [[consejo nacional de ciencia, tecnología e in... \n", + "3102686 [[microsoft academic research, https://academi... \n", + "6868932 [[scholar google, http://scholar.google.pl/cit... \n", + "8088987 [[resume, http://www.biowebspin.com/wp-content... \n", + "8658355 [[a.l. gusev alternative energy and ecology, ... \n", + "8778864 [[gsssietw,mysuru, http://geethashishu.in/], [... \n", + "9980164 [[gis in cultural heritage - icomos românia, h... \n", + "10024501 [[google scholar, https://scholar.google.com/c... \n", + "10091165 [[currículo lattes, http://lattes.cnpq.br/6209... \n", + "10523205 [[honorary chair, chief mentor & senior adviso... \n", + "10696059 [[researchgate adolfo catral , https://www.res... \n", + "\n", + " primary_email ... n_doi n_arxiv n_pmc \\\n", + "382497 NaN ... 0 0 0 \n", + "911811 NaN ... 0 0 0 \n", + "1136129 leo_123fa@hotmail.com ... 29 0 0 \n", + "3102686 aan_jaelani@syekhnurjati.ac.id ... 88 0 0 \n", + "6868932 rrom@ise.pw.edu.pl ... 1221 25 0 \n", + "8088987 NaN ... 15 0 0 \n", + "8658355 NaN ... 37 0 0 \n", + "8778864 NaN ... 47 0 0 \n", + "9980164 gustavo.duperre@usal.edu.ar ... 13 0 0 \n", + "10024501 NaN ... 19 0 0 \n", + "10091165 NaN ... 797 0 1 \n", + "10523205 NaN ... 199 0 1 \n", + "10696059 NaN ... 2022 0 0 \n", + "\n", + " n_other_pids label primary_email_domain \\\n", + "382497 0 0 NaN \n", + "911811 7 0 NaN \n", + "1136129 33 1 hotmail.com \n", + "3102686 193 1 syekhnurjati.ac.id \n", + "6868932 1742 1 ise.pw.edu.pl \n", + "8088987 29 1 NaN \n", + "8658355 21 1 NaN \n", + "8778864 48 1 NaN \n", + "9980164 34 0 usal.edu.ar \n", + "10024501 77 1 NaN \n", + "10091165 582 1 NaN \n", + "10523205 174 1 NaN \n", + "10696059 16 1 NaN \n", + "\n", + " other_email_domains n_emails \\\n", + "382497 NaN NaN \n", + "911811 NaN NaN \n", + "1136129 [gmail.com, gmail.com, hotmail.com, baldwin.ed... 5.0 \n", + "3102686 [gmail.com] 1.0 \n", + "6868932 [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] 3.0 \n", + "8088987 NaN NaN \n", + "8658355 NaN NaN \n", + "8778864 NaN NaN \n", + "9980164 NaN NaN \n", + "10024501 NaN NaN \n", + "10091165 NaN NaN \n", + "10523205 NaN NaN \n", + "10696059 NaN NaN \n", + "\n", + " url_domains n_urls \n", + "382497 [goowonderland.com, goowonderland.com, goowond... 81.0 \n", + "911811 [yumpu.com, ijopm.org, google.com, blogspot.co... 69.0 \n", + "1136129 [concytec.gob.pe, redalyc.org, redalyc.org, un... 61.0 \n", + "3102686 [microsoft.com, twitter.com, academia.edu, aca... 67.0 \n", + "6868932 [google.pl, publons.com, scopus.com, mendeley.... 114.0 \n", + "8088987 [biowebspin.com, biowebspin.com, google.com, l... 73.0 \n", + "8658355 [youtube.com, isjaee.com, researchgate.net, re... 111.0 \n", + "8778864 [geethashishu.in, geethashishu.in, acm.org, go... 71.0 \n", + "9980164 [icomos.ro, unirioja.es, unirioja.es, unc.edu.... 61.0 \n", + "10024501 [google.com, syekhnurjati.ac.id, orcid.org, bl... 59.0 \n", + "10091165 [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... 61.0 \n", + "10523205 [worldassessmentcouncil.org, spseke.sk, bcs.or... 114.0 \n", + "10696059 [researchgate.net, youtube.com, linkedin.com, ... 152.0 \n", + "\n", + "[13 rows x 29 columns]" ] }, + "execution_count": 45, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } - ], - "source": [ - "set_top_n(30)\n", - "data = [\n", - " go.Bar(\n", - " x=grouped_urls.sort_values(by=['orcid'], ascending=False).index[:TOP_N],\n", - " y=grouped_urls.sort_values(by=['orcid'], ascending=False)['orcid'][:TOP_N]\n", - " )\n", - "]\n", - "\n", - "layout = go.Layout(\n", - " title='Top %s URL domains' % TOP_N,\n", - " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", - ")\n", - "fig = go.Figure(data=data, layout=layout)\n", - "plotly.offline.iplot(fig)" + ], + "source": [ + "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]" ] }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 46, "metadata": { "scrolled": true }, @@ -8095,11 +7864,11 @@ " urls\n", " primary_email\n", " ...\n", - " external_ids\n", - " education\n", - " employment\n", - " n_works\n", - " works_source\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " primary_email_domain\n", " other_email_domains\n", " n_emails\n", @@ -8109,533 +7878,407 @@ " \n", " \n", " \n", - " 482862\n", - " 0000-0003-4948-9268\n", + " 97666\n", + " 0000-0002-7843-8497\n", " 1\n", " 1\n", " 1\n", - " Gustavo\n", - " Duperré\n", - " Gustavo Norberto Duperré graduated in Arts and...\n", - " [Gustavo Norberto Duperré, Duperré, G. N.]\n", - " [[Gis in Cultural Heritage - ICOMOS România, h...\n", - " gustavo.duperre@usal.edu.ar\n", + " davi\n", + " barbosa\n", + " pesquisador na área sociojurídica, professor, ...\n", + " [professor davi barbosa delmont]\n", + " [[plataforma de cursos ideia criativa, https:/...\n", + " NaN\n", " ...\n", - " [[Scopus Author ID, 57195936346], [ResearcherI...\n", - " [[Programme in History, History of Art and Ter...\n", - " [[Titular Professor, Dirección General de Cult...\n", - " 13\n", - " [Gustavo Duperré, Scopus - Elsevier, Publons, ...\n", - " usal.edu.ar\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " NaN\n", " NaN\n", - " [icomos.ro, unirioja.es, unirioja.es, unc.edu....\n", - " 51.0\n", + " NaN\n", + " [eadplataforma.com, facebook.com, youtube.com,...\n", + " 39.0\n", " \n", " \n", - " 554859\n", - " 0000-0002-1929-6054\n", + " 200670\n", + " 0000-0003-1554-1531\n", " 1\n", " 1\n", " 1\n", - " Franklin Américo\n", - " Canaza Choque\n", - " Docente-Investigador Social. Maestrando en Der...\n", - " [Franklin Américo Canaza-Choque , Franklin A. ...\n", - " [[Consejo Nacional de Ciencia, Tecnología e In...\n", - " Leo_123fa@hotmail.com\n", + " katarzyna\n", + " ochman\n", + " katarzyna ochman [kataˈʐɨna ˈɔxman] is assista...\n", + " [[kataˈʐɨna ˈɔxman], catharina ochman, cathari...\n", + " [[researchgate, https://www.researchgate.net/p...\n", + " NaN\n", " ...\n", - " [[ResearcherID, P-8613-2018], [Loop profile, 8...\n", - " [[Facultad de Ciencias de la Educación , Maest...\n", - " [[Investigador Social, Universidad Católica de...\n", - " 38\n", - " [ResearcherID, BASE - Bielefeld Academic Searc...\n", - " hotmail.com\n", - " [gmail.com, gmail.com, hotmail.com, baldwin.ed...\n", - " 5.0\n", - " [concytec.gob.pe, redalyc.org, redalyc.org, un...\n", - " 61.0\n", - " \n", - " \n", - " 1381092\n", - " 0000-0002-9025-8632\n", - " 1\n", " 1\n", + " 0\n", + " 0\n", + " 0\n", " 1\n", - " buycannabis\n", - " dispensary\n", - " We procure and deliver premium cannabis strain...\n", - " [We procure and deliver premium cannabis strai...\n", - " [[find your cannabis & marijuana dispensary , ...\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 10\n", - " [goowonderland dispensary]\n", " NaN\n", " NaN\n", " NaN\n", - " [goowonderland.com, goowonderland.com, goowond...\n", - " 81.0\n", + " [researchgate.net, academia.edu, facebook.com,...\n", + " 11.0\n", " \n", " \n", - " 2679353\n", - " 0000-0003-2407-3557\n", + " 210325\n", + " 0000-0003-3080-4643\n", " 1\n", " 1\n", " 1\n", - " Abdul\n", - " Aziz\n", - " Abdul Aziz was born on May 25, 1973, in Brebes...\n", - " [Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,...\n", - " [[Google Scholar, https://scholar.google.com/c...\n", - " NaN\n", + " graham\n", + " dawson\n", + " science and engineering faculty (sef) libraria...\n", + " [ graham colin dawson, g.c. dawson]\n", + " [[qut home page, https://www.library.qut.edu.a...\n", + " g.dawson@qut.edu.au\n", " ...\n", - " NaN\n", - " [[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak...\n", - " [[Assisten Professor/Dr, Institut Agama Islam ...\n", - " 72\n", - " [BASE - Bielefeld Academic Search Engine, Abdu...\n", - " NaN\n", + " 0\n", + " 0\n", + " 0\n", + " 6\n", + " 1\n", + " qut.edu.au\n", " NaN\n", " NaN\n", - " [google.com, syekhnurjati.ac.id, orcid.org, bl...\n", - " 59.0\n", + " [qut.edu.au, qut.edu.au, google.com.au, resear...\n", + " 11.0\n", " \n", " \n", - " 3354430\n", - " 0000-0002-3920-7389\n", + " 218947\n", + " 0000-0003-3193-030x\n", " 1\n", " 1\n", " 1\n", - " А.\n", - " Гусев\n", - " Surname, Name Gusev Alexander LeonidovichDate...\n", - " [Alexander L. Gusev , Alexander Leonidovich Gu...\n", - " [[A.L. Gusev Alternative Energy and Ecology, ...\n", - " NaN\n", + " juan pablo\n", + " wolff mejia\n", + " aspirante a maestría en derecho y negocios int...\n", + " [juan pablo wolff, pablo wolff mejia, juan p. ...\n", + " [[twitter, https://twitter.com/pablomejiam], [...\n", + " juanpmejia@ulasallista.edu.co\n", " ...\n", - " [[ResearcherID, F-8048-2014], [Scopus Author I...\n", - " [[Chemical technology and cryogenic-vacuum tec...\n", - " [[General Director, Scientific Technical Centr...\n", - " 472\n", - " [Publons, DataCite, Scopus - Elsevier, A.L. Gu...\n", - " NaN\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " ulasallista.edu.co\n", " NaN\n", " NaN\n", - " [youtube.com, isjaee.com, researchgate.net, re...\n", - " 111.0\n", + " [twitter.com, youtube.com, google.com, linkedi...\n", + " 11.0\n", " \n", " \n", - " 4004281\n", - " 0000-0002-5710-4041\n", + " 261974\n", + " 0000-0002-5341-6531\n", " 1\n", " 1\n", " 1\n", - " Ryszard\n", - " Romaniuk\n", - " Professor of Electronics and Communications En...\n", - " [R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R...\n", - " [[Scholar Google, http://scholar.google.pl/cit...\n", - " rrom@ise.pw.edu.pl\n", + " trent\n", + " hammond\n", + " mr trent hammond is an honorary research fello...\n", + " [trent ernest hammond (t.e.hammond)]\n", + " [[academic support masters, http://trenthammon...\n", + " trent.hammond@academicsupportmasters.com.au\n", " ...\n", - " [[ISNI, 0000000071432485], [ResearcherID, B-91...\n", - " [[Faculty of Electronics and Information Techn...\n", - " [[Professor, Institute Director, Politechnika ...\n", - " 5008\n", - " [INSPIRE-HEP, ResearcherID, ISNI2ORCID search ...\n", - " ise.pw.edu.pl\n", - " [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch]\n", - " 3.0\n", - " [google.pl, publons.com, scopus.com, mendeley....\n", - " 114.0\n", - " \n", - " \n", - " 4022480\n", - " 0000-0003-2450-090X\n", " 1\n", + " 0\n", + " 0\n", " 1\n", " 1\n", - " Eduard\n", - " Babulak\n", - " Professor Eduard Babulak is accomplished inter...\n", - " [Professor Eduard Babulak]\n", - " [[Honorary Chair, Chief Mentor & Senior Adviso...\n", - " NaN\n", - " ...\n", - " [[Scopus Author ID, 6506867432], [ResearcherID...\n", - " [[Information Technology, Doctor Habilitated (...\n", - " [[Consultant, Horizon 2020 Framework Programme...\n", - " 274\n", - " [The Lens, BASE - Bielefeld Academic Search En...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [worldassessmentcouncil.org, spseke.sk, bcs.or...\n", - " 114.0\n", + " academicsupportmasters.com.au\n", + " [health.nsw.gov.au, csu.edu.au, sociologist.co...\n", + " 5.0\n", + " [wix.com, academia.edu, researchgate.net, rese...\n", + " 12.0\n", " \n", " \n", - " 6335357\n", - " 0000-0003-2593-7134\n", - " 1\n", - " 1\n", - " 1\n", - " Aan\n", - " Jaelani\n", - " All my papers can be downloaded from portal:Re...\n", - " [Jaelani, A., Jaelani, Aan]\n", - " [[Microsoft Academic Research, https://academi...\n", - " aan_jaelani@syekhnurjati.ac.id\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", " ...\n", - " [[Scopus Author ID, 57195963463], [Loop profil...\n", - " [[Post Graduate, S3/Dr, Universitas Islam Nege...\n", - " [[Dr, Institut Agama Islam Negeri Syekh Nurjat...\n", - " 79\n", - " [Publons, Aan Jaelani, Scopus - Elsevier, Dime...\n", - " syekhnurjati.ac.id\n", - " [gmail.com]\n", - " 1.0\n", - " [microsoft.com, twitter.com, academia.edu, aca...\n", - " 67.0\n", " \n", " \n", - " 6489838\n", - " 0000-0002-9965-2425\n", + " 10405738\n", + " 0000-0002-3374-5709\n", " 1\n", " 1\n", " 1\n", - " Jaroslaw\n", - " Spychala\n", - " Jaroslaw Spychala has received a doctoral degr...\n", - " [Jaroslaw Jozef Spychala]\n", - " [[RESUME, http://www.biowebspin.com/wp-content...\n", + " guillermo\n", + " ortiz\n", + " médico, internista, neumólogo, intensivista, e...\n", + " [guillermo ortiz-ruiz]\n", + " [[elsevier, https://www.elsevier.com/], [asoci...\n", " NaN\n", " ...\n", - " [[Scopus Author ID, 7006745874]]\n", - " [[Department of Chemistry, Postdoctoral Associ...\n", - " [[Assistant Professor, Adam Mickiewicz Univers...\n", - " 29\n", - " [Scopus - Elsevier]\n", + " 62\n", + " 0\n", + " 0\n", + " 88\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [biowebspin.com, biowebspin.com, google.com, l...\n", - " 73.0\n", + " [elsevier.com, amci.org.co, springer.com, revi...\n", + " 12.0\n", " \n", " \n", - " 7570584\n", - " 0000-0003-2183-8112\n", + " 10472264\n", + " 0000-0001-7228-5680\n", " 1\n", " 1\n", " 1\n", - " Pelayo Munhoz\n", - " Olea\n", - " Pós-Doutorado em Gestão Ambiental pela Univers...\n", - " [ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P...\n", - " [[Currículo Lattes, http://lattes.cnpq.br/6209...\n", + " text\n", + " protocol\n", + " NaN\n", + " NaN\n", + " [[about, https://about.me/textprotocol], [gith...\n", " NaN\n", " ...\n", - " [[Scopus Author ID, 55175503300], [ResearcherI...\n", - " [[, Postdoctoral in Environmental Sustainabili...\n", - " [[Professor, Universidade Federal do Rio Grand...\n", - " 1105\n", - " [The Lens, Pelayo Munhoz Olea, Dimensions, BAS...\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c...\n", - " 61.0\n", + " [about.me, github.com, gitlab.com, gravatar.co...\n", + " 12.0\n", " \n", " \n", - " 10240510\n", - " 0000-0002-6938-9638\n", + " 10785961\n", + " 0000-0002-3064-0194\n", " 1\n", " 1\n", " 1\n", - " Adolfo\n", - " Catral Sanabria\n", - " My education is in computer science, mathemati...\n", + " leonardo fernando\n", + " cruz basso\n", " NaN\n", - " [[ResearchGate Adolfo Catral , https://www.res...\n", " NaN\n", + " [[papers-1, https://www.researchgate.net/profi...\n", + " leonardofernando.basso@mackenzie.br\n", " ...\n", - " [[Loop profile, 747193]]\n", - " [[Education, Capacitación para la enseñanza en...\n", - " NaN\n", - " 2023\n", - " [BASE - Bielefeld Academic Search Engine, Data...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [researchgate.net, youtube.com, linkedin.com, ...\n", - " 152.0\n", + " 5\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " mackenzie.br\n", + " [mackenzie.br]\n", + " 1.0\n", + " [researchgate.net, ssrn.com, cnpq.br, google.c...\n", + " 17.0\n", " \n", " \n", - " 10448304\n", - " 0000-0002-4062-3603\n", + " 10845645\n", + " 0000-0003-1047-4229\n", " 1\n", " 1\n", " 1\n", - " JUAN DE DIOS\n", - " BELTRÁN MANCILLA\n", - " JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut...\n", - " [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD...\n", - " [[01.- Juan de Dios Beltrán Mancilla. Teoría O...\n", + " bayu\n", + " sakti\n", + " bayu purbha saktisaya adalah bayu purbha sakti...\n", + " [bayu purbha sakti]\n", + " [[osf, http://osf.io/qe2ug], [inarxiv, https:/...\n", " NaN\n", " ...\n", - " NaN\n", - " [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR...\n", - " [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2...\n", - " 11\n", - " [JUAN DE DIOS BELTR´´ÁN MANCILLA]\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " [yumpu.com, ijopm.org, google.com, blogspot.co...\n", - " 69.0\n", + " [osf.io, osf.io, academia.edu, mendeley.com, f...\n", + " 12.0\n", " \n", " \n", - " 10663894\n", - " 0000-0002-3997-5070\n", + " 10896059\n", + " 0000-0003-4836-7074\n", " 1\n", " 1\n", " 1\n", - " Dr. Parameshachari\n", - " B D\n", - " Dr. Parameshachari B DACM Distinguished Speake...\n", - " [Dr. PARAMESHACHARI B D]\n", - " [[GSSSIETW,MYSURU, http://geethashishu.in/], [...\n", + " karla haydee\n", + " ortiz palafox\n", + " karla haydee ortíz palafoxmiembro del sistema ...\n", + " [karla palafox]\n", + " [[opinión día del maestro, http://www.cronicaj...\n", " NaN\n", " ...\n", - " [[ResearcherID, F-7045-2018], [Scopus Author I...\n", - " [[Electronics and Communication Engineering, P...\n", - " [[ACM Distinguished Speaker (Volunteer), Assoc...\n", - " 93\n", - " [Publons, Multidisciplinary Digital Publishing...\n", + " 0\n", + " 0\n", + " 0\n", + " 2\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " [geethashishu.in, geethashishu.in, acm.org, go...\n", - " 71.0\n", + " [cronicajalisco.com, youtube.com, tlaquepaque....\n", + " 22.0\n", " \n", " \n", "\n", - "

13 rows × 22 columns

\n", + "

141 rows × 29 columns

\n", "" ], "text/plain": [ " orcid claimed verified_email \\\n", - "482862 0000-0003-4948-9268 1 1 \n", - "554859 0000-0002-1929-6054 1 1 \n", - "1381092 0000-0002-9025-8632 1 1 \n", - "2679353 0000-0003-2407-3557 1 1 \n", - "3354430 0000-0002-3920-7389 1 1 \n", - "4004281 0000-0002-5710-4041 1 1 \n", - "4022480 0000-0003-2450-090X 1 1 \n", - "6335357 0000-0003-2593-7134 1 1 \n", - "6489838 0000-0002-9965-2425 1 1 \n", - "7570584 0000-0003-2183-8112 1 1 \n", - "10240510 0000-0002-6938-9638 1 1 \n", - "10448304 0000-0002-4062-3603 1 1 \n", - "10663894 0000-0002-3997-5070 1 1 \n", - "\n", - " verified_primary_email given_names family_name \\\n", - "482862 1 Gustavo Duperré \n", - "554859 1 Franklin Américo Canaza Choque \n", - "1381092 1 buycannabis dispensary \n", - "2679353 1 Abdul Aziz \n", - "3354430 1 А. Гусев \n", - "4004281 1 Ryszard Romaniuk \n", - "4022480 1 Eduard Babulak \n", - "6335357 1 Aan Jaelani \n", - "6489838 1 Jaroslaw Spychala \n", - "7570584 1 Pelayo Munhoz Olea \n", - "10240510 1 Adolfo Catral Sanabria \n", - "10448304 1 JUAN DE DIOS BELTRÁN MANCILLA \n", - "10663894 1 Dr. Parameshachari B D \n", + "97666 0000-0002-7843-8497 1 1 \n", + "200670 0000-0003-1554-1531 1 1 \n", + "210325 0000-0003-3080-4643 1 1 \n", + "218947 0000-0003-3193-030x 1 1 \n", + "261974 0000-0002-5341-6531 1 1 \n", + "... ... ... ... \n", + "10405738 0000-0002-3374-5709 1 1 \n", + "10472264 0000-0001-7228-5680 1 1 \n", + "10785961 0000-0002-3064-0194 1 1 \n", + "10845645 0000-0003-1047-4229 1 1 \n", + "10896059 0000-0003-4836-7074 1 1 \n", + "\n", + " verified_primary_email given_names family_name \\\n", + "97666 1 davi barbosa \n", + "200670 1 katarzyna ochman \n", + "210325 1 graham dawson \n", + "218947 1 juan pablo wolff mejia \n", + "261974 1 trent hammond \n", + "... ... ... ... \n", + "10405738 1 guillermo ortiz \n", + "10472264 1 text protocol \n", + "10785961 1 leonardo fernando cruz basso \n", + "10845645 1 bayu sakti \n", + "10896059 1 karla haydee ortiz palafox \n", "\n", " biography \\\n", - "482862 Gustavo Norberto Duperré graduated in Arts and... \n", - "554859 Docente-Investigador Social. Maestrando en Der... \n", - "1381092 We procure and deliver premium cannabis strain... \n", - "2679353 Abdul Aziz was born on May 25, 1973, in Brebes... \n", - "3354430 Surname, Name Gusev Alexander LeonidovichDate... \n", - "4004281 Professor of Electronics and Communications En... \n", - "4022480 Professor Eduard Babulak is accomplished inter... \n", - "6335357 All my papers can be downloaded from portal:Re... \n", - "6489838 Jaroslaw Spychala has received a doctoral degr... \n", - "7570584 Pós-Doutorado em Gestão Ambiental pela Univers... \n", - "10240510 My education is in computer science, mathemati... \n", - "10448304 JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... \n", - "10663894 Dr. Parameshachari B DACM Distinguished Speake... \n", + "97666 pesquisador na área sociojurídica, professor, ... \n", + "200670 katarzyna ochman [kataˈʐɨna ˈɔxman] is assista... \n", + "210325 science and engineering faculty (sef) libraria... \n", + "218947 aspirante a maestría en derecho y negocios int... \n", + "261974 mr trent hammond is an honorary research fello... \n", + "... ... \n", + "10405738 médico, internista, neumólogo, intensivista, e... \n", + "10472264 NaN \n", + "10785961 NaN \n", + "10845645 bayu purbha saktisaya adalah bayu purbha sakti... \n", + "10896059 karla haydee ortíz palafoxmiembro del sistema ... \n", "\n", " other_names \\\n", - "482862 [Gustavo Norberto Duperré, Duperré, G. N.] \n", - "554859 [Franklin Américo Canaza-Choque , Franklin A. ... \n", - "1381092 [We procure and deliver premium cannabis strai... \n", - "2679353 [Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,... \n", - "3354430 [Alexander L. Gusev , Alexander Leonidovich Gu... \n", - "4004281 [R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R... \n", - "4022480 [Professor Eduard Babulak] \n", - "6335357 [Jaelani, A., Jaelani, Aan] \n", - "6489838 [Jaroslaw Jozef Spychala] \n", - "7570584 [ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P... \n", - "10240510 NaN \n", - "10448304 [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... \n", - "10663894 [Dr. PARAMESHACHARI B D] \n", + "97666 [professor davi barbosa delmont] \n", + "200670 [[kataˈʐɨna ˈɔxman], catharina ochman, cathari... \n", + "210325 [ graham colin dawson, g.c. dawson] \n", + "218947 [juan pablo wolff, pablo wolff mejia, juan p. ... \n", + "261974 [trent ernest hammond (t.e.hammond)] \n", + "... ... \n", + "10405738 [guillermo ortiz-ruiz] \n", + "10472264 NaN \n", + "10785961 NaN \n", + "10845645 [bayu purbha sakti] \n", + "10896059 [karla palafox] \n", "\n", " urls \\\n", - "482862 [[Gis in Cultural Heritage - ICOMOS România, h... \n", - "554859 [[Consejo Nacional de Ciencia, Tecnología e In... \n", - "1381092 [[find your cannabis & marijuana dispensary , ... \n", - "2679353 [[Google Scholar, https://scholar.google.com/c... \n", - "3354430 [[A.L. Gusev Alternative Energy and Ecology, ... \n", - "4004281 [[Scholar Google, http://scholar.google.pl/cit... \n", - "4022480 [[Honorary Chair, Chief Mentor & Senior Adviso... \n", - "6335357 [[Microsoft Academic Research, https://academi... \n", - "6489838 [[RESUME, http://www.biowebspin.com/wp-content... \n", - "7570584 [[Currículo Lattes, http://lattes.cnpq.br/6209... \n", - "10240510 [[ResearchGate Adolfo Catral , https://www.res... \n", - "10448304 [[01.- Juan de Dios Beltrán Mancilla. Teoría O... \n", - "10663894 [[GSSSIETW,MYSURU, http://geethashishu.in/], [... \n", - "\n", - " primary_email ... \\\n", - "482862 gustavo.duperre@usal.edu.ar ... \n", - "554859 Leo_123fa@hotmail.com ... \n", - "1381092 NaN ... \n", - "2679353 NaN ... \n", - "3354430 NaN ... \n", - "4004281 rrom@ise.pw.edu.pl ... \n", - "4022480 NaN ... \n", - "6335357 aan_jaelani@syekhnurjati.ac.id ... \n", - "6489838 NaN ... \n", - "7570584 NaN ... \n", - "10240510 NaN ... \n", - "10448304 NaN ... \n", - "10663894 NaN ... \n", - "\n", - " external_ids \\\n", - "482862 [[Scopus Author ID, 57195936346], [ResearcherI... \n", - "554859 [[ResearcherID, P-8613-2018], [Loop profile, 8... \n", - "1381092 NaN \n", - "2679353 NaN \n", - "3354430 [[ResearcherID, F-8048-2014], [Scopus Author I... \n", - "4004281 [[ISNI, 0000000071432485], [ResearcherID, B-91... \n", - "4022480 [[Scopus Author ID, 6506867432], [ResearcherID... \n", - "6335357 [[Scopus Author ID, 57195963463], [Loop profil... \n", - "6489838 [[Scopus Author ID, 7006745874]] \n", - "7570584 [[Scopus Author ID, 55175503300], [ResearcherI... \n", - "10240510 [[Loop profile, 747193]] \n", - "10448304 NaN \n", - "10663894 [[ResearcherID, F-7045-2018], [Scopus Author I... \n", - "\n", - " education \\\n", - "482862 [[Programme in History, History of Art and Ter... \n", - "554859 [[Facultad de Ciencias de la Educación , Maest... \n", - "1381092 NaN \n", - "2679353 [[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak... \n", - "3354430 [[Chemical technology and cryogenic-vacuum tec... \n", - "4004281 [[Faculty of Electronics and Information Techn... \n", - "4022480 [[Information Technology, Doctor Habilitated (... \n", - "6335357 [[Post Graduate, S3/Dr, Universitas Islam Nege... \n", - "6489838 [[Department of Chemistry, Postdoctoral Associ... \n", - "7570584 [[, Postdoctoral in Environmental Sustainabili... \n", - "10240510 [[Education, Capacitación para la enseñanza en... \n", - "10448304 [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... \n", - "10663894 [[Electronics and Communication Engineering, P... \n", - "\n", - " employment n_works \\\n", - "482862 [[Titular Professor, Dirección General de Cult... 13 \n", - "554859 [[Investigador Social, Universidad Católica de... 38 \n", - "1381092 NaN 10 \n", - "2679353 [[Assisten Professor/Dr, Institut Agama Islam ... 72 \n", - "3354430 [[General Director, Scientific Technical Centr... 472 \n", - "4004281 [[Professor, Institute Director, Politechnika ... 5008 \n", - "4022480 [[Consultant, Horizon 2020 Framework Programme... 274 \n", - "6335357 [[Dr, Institut Agama Islam Negeri Syekh Nurjat... 79 \n", - "6489838 [[Assistant Professor, Adam Mickiewicz Univers... 29 \n", - "7570584 [[Professor, Universidade Federal do Rio Grand... 1105 \n", - "10240510 NaN 2023 \n", - "10448304 [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 \n", - "10663894 [[ACM Distinguished Speaker (Volunteer), Assoc... 93 \n", - "\n", - " works_source \\\n", - "482862 [Gustavo Duperré, Scopus - Elsevier, Publons, ... \n", - "554859 [ResearcherID, BASE - Bielefeld Academic Searc... \n", - "1381092 [goowonderland dispensary] \n", - "2679353 [BASE - Bielefeld Academic Search Engine, Abdu... \n", - "3354430 [Publons, DataCite, Scopus - Elsevier, A.L. Gu... \n", - "4004281 [INSPIRE-HEP, ResearcherID, ISNI2ORCID search ... \n", - "4022480 [The Lens, BASE - Bielefeld Academic Search En... \n", - "6335357 [Publons, Aan Jaelani, Scopus - Elsevier, Dime... \n", - "6489838 [Scopus - Elsevier] \n", - "7570584 [The Lens, Pelayo Munhoz Olea, Dimensions, BAS... \n", - "10240510 [BASE - Bielefeld Academic Search Engine, Data... \n", - "10448304 [JUAN DE DIOS BELTR´´ÁN MANCILLA] \n", - "10663894 [Publons, Multidisciplinary Digital Publishing... \n", - "\n", - " primary_email_domain \\\n", - "482862 usal.edu.ar \n", - "554859 hotmail.com \n", - "1381092 NaN \n", - "2679353 NaN \n", - "3354430 NaN \n", - "4004281 ise.pw.edu.pl \n", - "4022480 NaN \n", - "6335357 syekhnurjati.ac.id \n", - "6489838 NaN \n", - "7570584 NaN \n", - "10240510 NaN \n", - "10448304 NaN \n", - "10663894 NaN \n", + "97666 [[plataforma de cursos ideia criativa, https:/... \n", + "200670 [[researchgate, https://www.researchgate.net/p... \n", + "210325 [[qut home page, https://www.library.qut.edu.a... \n", + "218947 [[twitter, https://twitter.com/pablomejiam], [... \n", + "261974 [[academic support masters, http://trenthammon... \n", + "... ... \n", + "10405738 [[elsevier, https://www.elsevier.com/], [asoci... \n", + "10472264 [[about, https://about.me/textprotocol], [gith... \n", + "10785961 [[papers-1, https://www.researchgate.net/profi... \n", + "10845645 [[osf, http://osf.io/qe2ug], [inarxiv, https:/... \n", + "10896059 [[opinión día del maestro, http://www.cronicaj... \n", + "\n", + " primary_email ... n_doi n_arxiv \\\n", + "97666 NaN ... 0 0 \n", + "200670 NaN ... 1 0 \n", + "210325 g.dawson@qut.edu.au ... 0 0 \n", + "218947 juanpmejia@ulasallista.edu.co ... 0 0 \n", + "261974 trent.hammond@academicsupportmasters.com.au ... 1 0 \n", + "... ... ... ... ... \n", + "10405738 NaN ... 62 0 \n", + "10472264 NaN ... 0 0 \n", + "10785961 leonardofernando.basso@mackenzie.br ... 5 0 \n", + "10845645 NaN ... 0 0 \n", + "10896059 NaN ... 0 0 \n", + "\n", + " n_pmc n_other_pids label primary_email_domain \\\n", + "97666 0 0 0 NaN \n", + "200670 0 0 1 NaN \n", + "210325 0 6 1 qut.edu.au \n", + "218947 0 0 1 ulasallista.edu.co \n", + "261974 0 1 1 academicsupportmasters.com.au \n", + "... ... ... ... ... \n", + "10405738 0 88 0 NaN \n", + "10472264 0 0 0 NaN \n", + "10785961 0 0 1 mackenzie.br \n", + "10845645 0 0 1 NaN \n", + "10896059 0 2 1 NaN \n", "\n", " other_email_domains n_emails \\\n", - "482862 NaN NaN \n", - "554859 [gmail.com, gmail.com, hotmail.com, baldwin.ed... 5.0 \n", - "1381092 NaN NaN \n", - "2679353 NaN NaN \n", - "3354430 NaN NaN \n", - "4004281 [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] 3.0 \n", - "4022480 NaN NaN \n", - "6335357 [gmail.com] 1.0 \n", - "6489838 NaN NaN \n", - "7570584 NaN NaN \n", - "10240510 NaN NaN \n", - "10448304 NaN NaN \n", - "10663894 NaN NaN \n", + "97666 NaN NaN \n", + "200670 NaN NaN \n", + "210325 NaN NaN \n", + "218947 NaN NaN \n", + "261974 [health.nsw.gov.au, csu.edu.au, sociologist.co... 5.0 \n", + "... ... ... \n", + "10405738 NaN NaN \n", + "10472264 NaN NaN \n", + "10785961 [mackenzie.br] 1.0 \n", + "10845645 NaN NaN \n", + "10896059 NaN NaN \n", "\n", " url_domains n_urls \n", - "482862 [icomos.ro, unirioja.es, unirioja.es, unc.edu.... 51.0 \n", - "554859 [concytec.gob.pe, redalyc.org, redalyc.org, un... 61.0 \n", - "1381092 [goowonderland.com, goowonderland.com, goowond... 81.0 \n", - "2679353 [google.com, syekhnurjati.ac.id, orcid.org, bl... 59.0 \n", - "3354430 [youtube.com, isjaee.com, researchgate.net, re... 111.0 \n", - "4004281 [google.pl, publons.com, scopus.com, mendeley.... 114.0 \n", - "4022480 [worldassessmentcouncil.org, spseke.sk, bcs.or... 114.0 \n", - "6335357 [microsoft.com, twitter.com, academia.edu, aca... 67.0 \n", - "6489838 [biowebspin.com, biowebspin.com, google.com, l... 73.0 \n", - "7570584 [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... 61.0 \n", - "10240510 [researchgate.net, youtube.com, linkedin.com, ... 152.0 \n", - "10448304 [yumpu.com, ijopm.org, google.com, blogspot.co... 69.0 \n", - "10663894 [geethashishu.in, geethashishu.in, acm.org, go... 71.0 \n", - "\n", - "[13 rows x 22 columns]" + "97666 [eadplataforma.com, facebook.com, youtube.com,... 39.0 \n", + "200670 [researchgate.net, academia.edu, facebook.com,... 11.0 \n", + "210325 [qut.edu.au, qut.edu.au, google.com.au, resear... 11.0 \n", + "218947 [twitter.com, youtube.com, google.com, linkedi... 11.0 \n", + "261974 [wix.com, academia.edu, researchgate.net, rese... 12.0 \n", + "... ... ... \n", + "10405738 [elsevier.com, amci.org.co, springer.com, revi... 12.0 \n", + "10472264 [about.me, github.com, gitlab.com, gravatar.co... 12.0 \n", + "10785961 [researchgate.net, ssrn.com, cnpq.br, google.c... 17.0 \n", + "10845645 [osf.io, osf.io, academia.edu, mendeley.com, f... 12.0 \n", + "10896059 [cronicajalisco.com, youtube.com, tlaquepaque.... 22.0 \n", + "\n", + "[141 rows x 29 columns]" ] }, - "execution_count": 48, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]" + "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 47, "metadata": { "scrolled": true }, @@ -8672,11 +8315,11 @@ " urls\n", " primary_email\n", " ...\n", - " external_ids\n", - " education\n", - " employment\n", - " n_works\n", - " works_source\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " primary_email_domain\n", " other_email_domains\n", " n_emails\n", @@ -8686,124 +8329,124 @@ " \n", " \n", " \n", - " 45566\n", - " 0000-0003-1948-3180\n", + " 0\n", + " 0000-0002-7843-8497\n", " 1\n", " 1\n", " 1\n", - " Mark\n", - " Katz\n", - " Mark N. Katz is a professor of government and ...\n", - " NaN\n", - " [[Adjusting to Change: American Foreign Policy...\n", + " davi\n", + " barbosa\n", + " pesquisador na área sociojurídica, professor, ...\n", + " [professor davi barbosa delmont]\n", + " [[plataforma de cursos ideia criativa, https:/...\n", " NaN\n", " ...\n", - " [[Scopus Author ID, 25649901800]]\n", - " [[Political Science, Ph.D., Massachusetts Inst...\n", - " [[Professor of Government and Politics, George...\n", - " 58\n", - " [Scopus - Elsevier]\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [wordpress.com, marknkatz.com, gmu.edu, atlant...\n", - " 16.0\n", + " [eadplataforma.com, facebook.com, youtube.com,...\n", + " 39.0\n", " \n", " \n", - " 72674\n", - " 0000-0002-2000-8339\n", + " 1\n", + " 0000-0003-1554-1531\n", " 1\n", " 1\n", " 1\n", - " Phòng khám tư nhân Hà Nội\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[Sức khỏe, https://onhealth.vn/], [Khám phụ k...\n", + " katarzyna\n", + " ochman\n", + " katarzyna ochman [kataˈʐɨna ˈɔxman] is assista...\n", + " [[kataˈʐɨna ˈɔxman], catharina ochman, cathari...\n", + " [[researchgate, https://www.researchgate.net/p...\n", " NaN\n", " ...\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " 4\n", - " [Phòng khám tư nhân Hà Nội]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [onhealth.vn, onhealth.vn, onhealth.vn, onheal...\n", - " 49.0\n", + " [researchgate.net, academia.edu, facebook.com,...\n", + " 11.0\n", " \n", " \n", - " 172820\n", - " 0000-0001-9293-2224\n", + " 2\n", + " 0000-0003-3080-4643\n", " 1\n", " 1\n", " 1\n", - " Juan Carlos\n", - " Garcia Hoyos\n", - " My name is Juan Carlos García Hoyos. I was bor...\n", - " [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /...\n", - " [[Air Force Office of Scientific Research (WRI...\n", - " NaN\n", + " graham\n", + " dawson\n", + " science and engineering faculty (sef) libraria...\n", + " [ graham colin dawson, g.c. dawson]\n", + " [[qut home page, https://www.library.qut.edu.a...\n", + " g.dawson@qut.edu.au\n", " ...\n", - " NaN\n", - " [[Faculty of Philosophy, History - Ph.D., Char...\n", - " [[responsible for the Project Service Level Ag...\n", - " 20\n", - " [Juan Carlos Garcia Hoyos]\n", - " NaN\n", + " 0\n", + " 0\n", + " 0\n", + " 6\n", + " 1\n", + " qut.edu.au\n", " NaN\n", " NaN\n", - " [af.mil, gst.com, govtribe.com, sbir.gov, open...\n", - " 28.0\n", + " [qut.edu.au, qut.edu.au, google.com.au, resear...\n", + " 11.0\n", " \n", " \n", - " 209505\n", - " 0000-0003-3045-0056\n", + " 3\n", + " 0000-0003-3193-030x\n", " 1\n", " 1\n", " 1\n", - " Ananda\n", - " Majumdar\n", - " I am Ananda Majumdar, Child Care Educator at B...\n", - " NaN\n", - " [[Migration Scholar and Ananda , https://grfdt...\n", - " NaN\n", + " juan pablo\n", + " wolff mejia\n", + " aspirante a maestría en derecho y negocios int...\n", + " [juan pablo wolff, pablo wolff mejia, juan p. ...\n", + " [[twitter, https://twitter.com/pablomejiam], [...\n", + " juanpmejia@ulasallista.edu.co\n", " ...\n", - " NaN\n", - " [[Education , B.Ed. After Degree , University ...\n", - " [[General Coordinator- University of Alberta C...\n", - " 43\n", - " [Ananda Majumdar]\n", - " NaN\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " ulasallista.edu.co\n", " NaN\n", " NaN\n", - " [grfdt.com, linkedin.com, academia.edu, resear...\n", - " 24.0\n", + " [twitter.com, youtube.com, google.com, linkedi...\n", + " 11.0\n", " \n", " \n", - " 259877\n", - " 0000-0003-1815-5732\n", + " 4\n", + " 0000-0002-5341-6531\n", " 1\n", " 1\n", " 1\n", - " JAS\n", - " (Jurnal Akuntansi Syariah)\n", - " JAS (Jurnal Akuntansi Syariah) published in pr...\n", - " NaN\n", - " [[Website, https://ejournal.stiesyariahbengkal...\n", - " NaN\n", + " trent\n", + " hammond\n", + " mr trent hammond is an honorary research fello...\n", + " [trent ernest hammond (t.e.hammond)]\n", + " [[academic support masters, http://trenthammon...\n", + " trent.hammond@academicsupportmasters.com.au\n", " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 67\n", - " [JAS (Jurnal Akuntansi Syariah)]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [stiesyariahbengkalis.ac.id, lipi.go.id, cross...\n", - " 17.0\n", + " 1\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " academicsupportmasters.com.au\n", + " [health.nsw.gov.au, csu.edu.au, sociologist.co...\n", + " 5.0\n", + " [wix.com, academia.edu, researchgate.net, rese...\n", + " 12.0\n", " \n", " \n", " ...\n", @@ -8830,302 +8473,264 @@ " ...\n", " \n", " \n", - " 10494820\n", - " 0000-0002-1324-7171\n", + " 136\n", + " 0000-0002-3374-5709\n", " 1\n", " 1\n", " 1\n", - " Vanesa Natalia\n", - " Rodriguez\n", - " Nombre y Apellido: Vanesa Natalia Rodriguez. ...\n", - " [Vanesa Rodriguez, Vanesa N. Rodriguez]\n", - " [[De rufianes y franchutas Representaciones y ...\n", + " guillermo\n", + " ortiz\n", + " médico, internista, neumólogo, intensivista, e...\n", + " [guillermo ortiz-ruiz]\n", + " [[elsevier, https://www.elsevier.com/], [asoci...\n", " NaN\n", " ...\n", - " NaN\n", - " [[, Maestría en Ciencias Sociales con Mención ...\n", - " [[Profesora, Universidad Nacional de La Matanz...\n", - " 7\n", - " [Vanesa Natalia Rodriguez]\n", + " 62\n", + " 0\n", + " 0\n", + " 88\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [unlam.edu.ar, unirioja.es, amazon.fr, abebook...\n", - " 19.0\n", + " [elsevier.com, amci.org.co, springer.com, revi...\n", + " 12.0\n", " \n", " \n", - " 10495806\n", - " 0000-0002-1700-8311\n", + " 137\n", + " 0000-0001-7228-5680\n", " 1\n", " 1\n", " 1\n", - " Fix-IT\n", - " Rite\n", + " text\n", + " protocol\n", " NaN\n", - " [Best Heating & Plumbing Company]\n", - " [[Website, https://fix-itrite.com], [Muckrack,...\n", - " NaN\n", - " ...\n", " NaN\n", + " [[about, https://about.me/textprotocol], [gith...\n", " NaN\n", - " NaN\n", - " 1\n", - " [Fix-It Rite]\n", + " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [fix-itrite.com, muckrack.com, tumblr.com, dri...\n", - " 11.0\n", + " [about.me, github.com, gitlab.com, gravatar.co...\n", + " 12.0\n", " \n", " \n", - " 10633545\n", - " 0000-0003-2676-4431\n", + " 138\n", + " 0000-0002-3064-0194\n", " 1\n", " 1\n", " 1\n", - " Benny\n", - " Soewandi\n", + " leonardo fernando\n", + " cruz basso\n", " NaN\n", - " [Benny Soewandi]\n", - " [[Conservation Efforts as a Result of Theoreti...\n", " NaN\n", + " [[papers-1, https://www.researchgate.net/profi...\n", + " leonardofernando.basso@mackenzie.br\n", " ...\n", - " NaN\n", - " NaN\n", - " [[Membership, Paguyuban Pelestarian Budaya Ban...\n", - " 2\n", - " [Benny Soewandi]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [wordpress.com, wordpress.com, linkedin.com, f...\n", - " 11.0\n", + " 5\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " mackenzie.br\n", + " [mackenzie.br]\n", + " 1.0\n", + " [researchgate.net, ssrn.com, cnpq.br, google.c...\n", + " 17.0\n", " \n", " \n", - " 10648241\n", - " 0000-0001-8157-0600\n", + " 139\n", + " 0000-0003-1047-4229\n", " 1\n", " 1\n", " 1\n", - " Bijan\n", - " Yavar\n", - " Senior Research Assistant and Phd Student in O...\n", - " [B. Yavar, Yavar Bijan]\n", - " [[Web of Science (Pub) Researcher ID: A-3544-2...\n", + " bayu\n", + " sakti\n", + " bayu purbha saktisaya adalah bayu purbha sakti...\n", + " [bayu purbha sakti]\n", + " [[osf, http://osf.io/qe2ug], [inarxiv, https:/...\n", " NaN\n", " ...\n", - " [[Scopus Author ID, 56556873600]]\n", - " NaN\n", - " NaN\n", - " 6\n", - " [Scopus - Elsevier]\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " [publons.com, articulate.com, zenodo.org, orci...\n", - " 15.0\n", + " [osf.io, osf.io, academia.edu, mendeley.com, f...\n", + " 12.0\n", " \n", " \n", - " 10679699\n", - " 0000-0002-9874-1450\n", + " 140\n", + " 0000-0003-4836-7074\n", " 1\n", " 1\n", " 1\n", - " FENGZHI\n", - " WU\n", - " NaN\n", - " NaN\n", - " [[A Systematic Study on the Dynamic Softening ...\n", + " karla haydee\n", + " ortiz palafox\n", + " karla haydee ortíz palafoxmiembro del sistema ...\n", + " [karla palafox]\n", + " [[opinión día del maestro, http://www.cronicaj...\n", " NaN\n", " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 2\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " 3\n", - " [FENGZHI WU]\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [springer.com, sciencedirect.com, sciencedirec...\n", - " 23.0\n", + " [cronicajalisco.com, youtube.com, tlaquepaque....\n", + " 22.0\n", " \n", " \n", "\n", - "

139 rows × 22 columns

\n", + "

141 rows × 29 columns

\n", "" ], "text/plain": [ - " orcid claimed verified_email \\\n", - "45566 0000-0003-1948-3180 1 1 \n", - "72674 0000-0002-2000-8339 1 1 \n", - "172820 0000-0001-9293-2224 1 1 \n", - "209505 0000-0003-3045-0056 1 1 \n", - "259877 0000-0003-1815-5732 1 1 \n", - "... ... ... ... \n", - "10494820 0000-0002-1324-7171 1 1 \n", - "10495806 0000-0002-1700-8311 1 1 \n", - "10633545 0000-0003-2676-4431 1 1 \n", - "10648241 0000-0001-8157-0600 1 1 \n", - "10679699 0000-0002-9874-1450 1 1 \n", - "\n", - " verified_primary_email given_names \\\n", - "45566 1 Mark \n", - "72674 1 Phòng khám tư nhân Hà Nội \n", - "172820 1 Juan Carlos \n", - "209505 1 Ananda \n", - "259877 1 JAS \n", - "... ... ... \n", - "10494820 1 Vanesa Natalia \n", - "10495806 1 Fix-IT \n", - "10633545 1 Benny \n", - "10648241 1 Bijan \n", - "10679699 1 FENGZHI \n", - "\n", - " family_name \\\n", - "45566 Katz \n", - "72674 NaN \n", - "172820 Garcia Hoyos \n", - "209505 Majumdar \n", - "259877 (Jurnal Akuntansi Syariah) \n", - "... ... \n", - "10494820 Rodriguez \n", - "10495806 Rite \n", - "10633545 Soewandi \n", - "10648241 Yavar \n", - "10679699 WU \n", - "\n", - " biography \\\n", - "45566 Mark N. Katz is a professor of government and ... \n", - "72674 NaN \n", - "172820 My name is Juan Carlos García Hoyos. I was bor... \n", - "209505 I am Ananda Majumdar, Child Care Educator at B... \n", - "259877 JAS (Jurnal Akuntansi Syariah) published in pr... \n", - "... ... \n", - "10494820 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", - "10495806 NaN \n", - "10633545 NaN \n", - "10648241 Senior Research Assistant and Phd Student in O... \n", - "10679699 NaN \n", + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0002-7843-8497 1 1 1 \n", + "1 0000-0003-1554-1531 1 1 1 \n", + "2 0000-0003-3080-4643 1 1 1 \n", + "3 0000-0003-3193-030x 1 1 1 \n", + "4 0000-0002-5341-6531 1 1 1 \n", + ".. ... ... ... ... \n", + "136 0000-0002-3374-5709 1 1 1 \n", + "137 0000-0001-7228-5680 1 1 1 \n", + "138 0000-0002-3064-0194 1 1 1 \n", + "139 0000-0003-1047-4229 1 1 1 \n", + "140 0000-0003-4836-7074 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "0 davi barbosa \n", + "1 katarzyna ochman \n", + "2 graham dawson \n", + "3 juan pablo wolff mejia \n", + "4 trent hammond \n", + ".. ... ... \n", + "136 guillermo ortiz \n", + "137 text protocol \n", + "138 leonardo fernando cruz basso \n", + "139 bayu sakti \n", + "140 karla haydee ortiz palafox \n", "\n", - " other_names \\\n", - "45566 NaN \n", - "72674 NaN \n", - "172820 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", - "209505 NaN \n", - "259877 NaN \n", - "... ... \n", - "10494820 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", - "10495806 [Best Heating & Plumbing Company] \n", - "10633545 [Benny Soewandi] \n", - "10648241 [B. Yavar, Yavar Bijan] \n", - "10679699 NaN \n", + " biography \\\n", + "0 pesquisador na área sociojurídica, professor, ... \n", + "1 katarzyna ochman [kataˈʐɨna ˈɔxman] is assista... \n", + "2 science and engineering faculty (sef) libraria... \n", + "3 aspirante a maestría en derecho y negocios int... \n", + "4 mr trent hammond is an honorary research fello... \n", + ".. ... \n", + "136 médico, internista, neumólogo, intensivista, e... \n", + "137 NaN \n", + "138 NaN \n", + "139 bayu purbha saktisaya adalah bayu purbha sakti... \n", + "140 karla haydee ortíz palafoxmiembro del sistema ... \n", "\n", - " urls primary_email \\\n", - "45566 [[Adjusting to Change: American Foreign Policy... NaN \n", - "72674 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN \n", - "172820 [[Air Force Office of Scientific Research (WRI... NaN \n", - "209505 [[Migration Scholar and Ananda , https://grfdt... NaN \n", - "259877 [[Website, https://ejournal.stiesyariahbengkal... NaN \n", - "... ... ... \n", - "10494820 [[De rufianes y franchutas Representaciones y ... NaN \n", - "10495806 [[Website, https://fix-itrite.com], [Muckrack,... NaN \n", - "10633545 [[Conservation Efforts as a Result of Theoreti... NaN \n", - "10648241 [[Web of Science (Pub) Researcher ID: A-3544-2... NaN \n", - "10679699 [[A Systematic Study on the Dynamic Softening ... NaN \n", - "\n", - " ... external_ids \\\n", - "45566 ... [[Scopus Author ID, 25649901800]] \n", - "72674 ... NaN \n", - "172820 ... NaN \n", - "209505 ... NaN \n", - "259877 ... NaN \n", - "... ... ... \n", - "10494820 ... NaN \n", - "10495806 ... NaN \n", - "10633545 ... NaN \n", - "10648241 ... [[Scopus Author ID, 56556873600]] \n", - "10679699 ... NaN \n", - "\n", - " education \\\n", - "45566 [[Political Science, Ph.D., Massachusetts Inst... \n", - "72674 NaN \n", - "172820 [[Faculty of Philosophy, History - Ph.D., Char... \n", - "209505 [[Education , B.Ed. After Degree , University ... \n", - "259877 NaN \n", - "... ... \n", - "10494820 [[, Maestría en Ciencias Sociales con Mención ... \n", - "10495806 NaN \n", - "10633545 NaN \n", - "10648241 NaN \n", - "10679699 NaN \n", - "\n", - " employment n_works \\\n", - "45566 [[Professor of Government and Politics, George... 58 \n", - "72674 NaN 4 \n", - "172820 [[responsible for the Project Service Level Ag... 20 \n", - "209505 [[General Coordinator- University of Alberta C... 43 \n", - "259877 NaN 67 \n", - "... ... ... \n", - "10494820 [[Profesora, Universidad Nacional de La Matanz... 7 \n", - "10495806 NaN 1 \n", - "10633545 [[Membership, Paguyuban Pelestarian Budaya Ban... 2 \n", - "10648241 NaN 6 \n", - "10679699 NaN 3 \n", - "\n", - " works_source primary_email_domain \\\n", - "45566 [Scopus - Elsevier] NaN \n", - "72674 [Phòng khám tư nhân Hà Nội] NaN \n", - "172820 [Juan Carlos Garcia Hoyos] NaN \n", - "209505 [Ananda Majumdar] NaN \n", - "259877 [JAS (Jurnal Akuntansi Syariah)] NaN \n", - "... ... ... \n", - "10494820 [Vanesa Natalia Rodriguez] NaN \n", - "10495806 [Fix-It Rite] NaN \n", - "10633545 [Benny Soewandi] NaN \n", - "10648241 [Scopus - Elsevier] NaN \n", - "10679699 [FENGZHI WU] NaN \n", - "\n", - " other_email_domains n_emails \\\n", - "45566 NaN NaN \n", - "72674 NaN NaN \n", - "172820 NaN NaN \n", - "209505 NaN NaN \n", - "259877 NaN NaN \n", - "... ... ... \n", - "10494820 NaN NaN \n", - "10495806 NaN NaN \n", - "10633545 NaN NaN \n", - "10648241 NaN NaN \n", - "10679699 NaN NaN \n", + " other_names \\\n", + "0 [professor davi barbosa delmont] \n", + "1 [[kataˈʐɨna ˈɔxman], catharina ochman, cathari... \n", + "2 [ graham colin dawson, g.c. dawson] \n", + "3 [juan pablo wolff, pablo wolff mejia, juan p. ... \n", + "4 [trent ernest hammond (t.e.hammond)] \n", + ".. ... \n", + "136 [guillermo ortiz-ruiz] \n", + "137 NaN \n", + "138 NaN \n", + "139 [bayu purbha sakti] \n", + "140 [karla palafox] \n", + "\n", + " urls \\\n", + "0 [[plataforma de cursos ideia criativa, https:/... \n", + "1 [[researchgate, https://www.researchgate.net/p... \n", + "2 [[qut home page, https://www.library.qut.edu.a... \n", + "3 [[twitter, https://twitter.com/pablomejiam], [... \n", + "4 [[academic support masters, http://trenthammon... \n", + ".. ... \n", + "136 [[elsevier, https://www.elsevier.com/], [asoci... \n", + "137 [[about, https://about.me/textprotocol], [gith... \n", + "138 [[papers-1, https://www.researchgate.net/profi... \n", + "139 [[osf, http://osf.io/qe2ug], [inarxiv, https:/... \n", + "140 [[opinión día del maestro, http://www.cronicaj... \n", + "\n", + " primary_email ... n_doi n_arxiv n_pmc \\\n", + "0 NaN ... 0 0 0 \n", + "1 NaN ... 1 0 0 \n", + "2 g.dawson@qut.edu.au ... 0 0 0 \n", + "3 juanpmejia@ulasallista.edu.co ... 0 0 0 \n", + "4 trent.hammond@academicsupportmasters.com.au ... 1 0 0 \n", + ".. ... ... ... ... ... \n", + "136 NaN ... 62 0 0 \n", + "137 NaN ... 0 0 0 \n", + "138 leonardofernando.basso@mackenzie.br ... 5 0 0 \n", + "139 NaN ... 0 0 0 \n", + "140 NaN ... 0 0 0 \n", + "\n", + " n_other_pids label primary_email_domain \\\n", + "0 0 0 NaN \n", + "1 0 1 NaN \n", + "2 6 1 qut.edu.au \n", + "3 0 1 ulasallista.edu.co \n", + "4 1 1 academicsupportmasters.com.au \n", + ".. ... ... ... \n", + "136 88 0 NaN \n", + "137 0 0 NaN \n", + "138 0 1 mackenzie.br \n", + "139 0 1 NaN \n", + "140 2 1 NaN \n", + "\n", + " other_email_domains n_emails \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 [health.nsw.gov.au, csu.edu.au, sociologist.co... 5.0 \n", + ".. ... ... \n", + "136 NaN NaN \n", + "137 NaN NaN \n", + "138 [mackenzie.br] 1.0 \n", + "139 NaN NaN \n", + "140 NaN NaN \n", "\n", - " url_domains n_urls \n", - "45566 [wordpress.com, marknkatz.com, gmu.edu, atlant... 16.0 \n", - "72674 [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0 \n", - "172820 [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0 \n", - "209505 [grfdt.com, linkedin.com, academia.edu, resear... 24.0 \n", - "259877 [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0 \n", - "... ... ... \n", - "10494820 [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0 \n", - "10495806 [fix-itrite.com, muckrack.com, tumblr.com, dri... 11.0 \n", - "10633545 [wordpress.com, wordpress.com, linkedin.com, f... 11.0 \n", - "10648241 [publons.com, articulate.com, zenodo.org, orci... 15.0 \n", - "10679699 [springer.com, sciencedirect.com, sciencedirec... 23.0 \n", + " url_domains n_urls \n", + "0 [eadplataforma.com, facebook.com, youtube.com,... 39.0 \n", + "1 [researchgate.net, academia.edu, facebook.com,... 11.0 \n", + "2 [qut.edu.au, qut.edu.au, google.com.au, resear... 11.0 \n", + "3 [twitter.com, youtube.com, google.com, linkedi... 11.0 \n", + "4 [wix.com, academia.edu, researchgate.net, rese... 12.0 \n", + ".. ... ... \n", + "136 [elsevier.com, amci.org.co, springer.com, revi... 12.0 \n", + "137 [about.me, github.com, gitlab.com, gravatar.co... 12.0 \n", + "138 [researchgate.net, ssrn.com, cnpq.br, google.c... 17.0 \n", + "139 [osf.io, osf.io, academia.edu, mendeley.com, f... 12.0 \n", + "140 [cronicajalisco.com, youtube.com, tlaquepaque.... 22.0 \n", "\n", - "[139 rows x 22 columns]" + "[141 rows x 29 columns]" ] }, - "execution_count": 49, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" + "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", + "exploded_sources" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 48, "metadata": { "scrolled": true }, @@ -9162,11 +8767,11 @@ " urls\n", " primary_email\n", " ...\n", - " external_ids\n", - " education\n", - " employment\n", - " n_works\n", - " works_source\n", + " n_doi\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " primary_email_domain\n", " other_email_domains\n", " n_emails\n", @@ -9177,123 +8782,123 @@ " \n", " \n", " 0\n", - " 0000-0003-1948-3180\n", + " 0000-0002-7843-8497\n", " 1\n", " 1\n", " 1\n", - " Mark\n", - " Katz\n", - " Mark N. Katz is a professor of government and ...\n", - " NaN\n", - " [[Adjusting to Change: American Foreign Policy...\n", + " davi\n", + " barbosa\n", + " pesquisador na área sociojurídica, professor, ...\n", + " [professor davi barbosa delmont]\n", + " [[plataforma de cursos ideia criativa, https:/...\n", " NaN\n", " ...\n", - " [[Scopus Author ID, 25649901800]]\n", - " [[Political Science, Ph.D., Massachusetts Inst...\n", - " [[Professor of Government and Politics, George...\n", - " 58\n", - " Scopus - Elsevier\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [wordpress.com, marknkatz.com, gmu.edu, atlant...\n", - " 16.0\n", + " [eadplataforma.com, facebook.com, youtube.com,...\n", + " 39.0\n", " \n", " \n", " 1\n", - " 0000-0002-2000-8339\n", + " 0000-0003-1554-1531\n", " 1\n", " 1\n", " 1\n", - " Phòng khám tư nhân Hà Nội\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[Sức khỏe, https://onhealth.vn/], [Khám phụ k...\n", + " katarzyna\n", + " ochman\n", + " katarzyna ochman [kataˈʐɨna ˈɔxman] is assista...\n", + " [[kataˈʐɨna ˈɔxman], catharina ochman, cathari...\n", + " [[researchgate, https://www.researchgate.net/p...\n", " NaN\n", " ...\n", + " 1\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " 4\n", - " Phòng khám tư nhân Hà Nội\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [onhealth.vn, onhealth.vn, onhealth.vn, onheal...\n", - " 49.0\n", + " [researchgate.net, academia.edu, facebook.com,...\n", + " 11.0\n", " \n", " \n", - " 2\n", - " 0000-0001-9293-2224\n", + " 3\n", + " 0000-0003-3193-030x\n", " 1\n", " 1\n", " 1\n", - " Juan Carlos\n", - " Garcia Hoyos\n", - " My name is Juan Carlos García Hoyos. I was bor...\n", - " [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /...\n", - " [[Air Force Office of Scientific Research (WRI...\n", - " NaN\n", + " juan pablo\n", + " wolff mejia\n", + " aspirante a maestría en derecho y negocios int...\n", + " [juan pablo wolff, pablo wolff mejia, juan p. ...\n", + " [[twitter, https://twitter.com/pablomejiam], [...\n", + " juanpmejia@ulasallista.edu.co\n", " ...\n", - " NaN\n", - " [[Faculty of Philosophy, History - Ph.D., Char...\n", - " [[responsible for the Project Service Level Ag...\n", - " 20\n", - " Juan Carlos Garcia Hoyos\n", - " NaN\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " ulasallista.edu.co\n", " NaN\n", " NaN\n", - " [af.mil, gst.com, govtribe.com, sbir.gov, open...\n", - " 28.0\n", + " [twitter.com, youtube.com, google.com, linkedi...\n", + " 11.0\n", " \n", " \n", - " 3\n", - " 0000-0003-3045-0056\n", + " 4\n", + " 0000-0002-5341-6531\n", " 1\n", " 1\n", " 1\n", - " Ananda\n", - " Majumdar\n", - " I am Ananda Majumdar, Child Care Educator at B...\n", - " NaN\n", - " [[Migration Scholar and Ananda , https://grfdt...\n", - " NaN\n", + " trent\n", + " hammond\n", + " mr trent hammond is an honorary research fello...\n", + " [trent ernest hammond (t.e.hammond)]\n", + " [[academic support masters, http://trenthammon...\n", + " trent.hammond@academicsupportmasters.com.au\n", " ...\n", - " NaN\n", - " [[Education , B.Ed. After Degree , University ...\n", - " [[General Coordinator- University of Alberta C...\n", - " 43\n", - " Ananda Majumdar\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [grfdt.com, linkedin.com, academia.edu, resear...\n", - " 24.0\n", + " 1\n", + " 0\n", + " 0\n", + " 1\n", + " 1\n", + " academicsupportmasters.com.au\n", + " [health.nsw.gov.au, csu.edu.au, sociologist.co...\n", + " 5.0\n", + " [wix.com, academia.edu, researchgate.net, rese...\n", + " 12.0\n", " \n", " \n", - " 4\n", - " 0000-0003-1815-5732\n", + " 5\n", + " 0000-0001-5295-2271\n", " 1\n", " 1\n", " 1\n", - " JAS\n", - " (Jurnal Akuntansi Syariah)\n", - " JAS (Jurnal Akuntansi Syariah) published in pr...\n", + " antoniy\n", + " moysey\n", " NaN\n", - " [[Website, https://ejournal.stiesyariahbengkal...\n", " NaN\n", + " [[academic journals database, http://journalda...\n", + " antoniimoisei@bsmu.edu.ua\n", " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", + " bsmu.edu.ua\n", " NaN\n", " NaN\n", - " NaN\n", - " 67\n", - " JAS (Jurnal Akuntansi Syariah)\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [stiesyariahbengkalis.ac.id, lipi.go.id, cross...\n", - " 17.0\n", + " [journaldatabase.info, nplu.org, acls.org, ind...\n", + " 21.0\n", " \n", " \n", " ...\n", @@ -9320,262 +8925,314 @@ " ...\n", " \n", " \n", - " 134\n", - " 0000-0002-1324-7171\n", + " 135\n", + " 0000-0002-8125-0081\n", " 1\n", " 1\n", " 1\n", - " Vanesa Natalia\n", - " Rodriguez\n", - " Nombre y Apellido: Vanesa Natalia Rodriguez. ...\n", - " [Vanesa Rodriguez, Vanesa N. Rodriguez]\n", - " [[De rufianes y franchutas Representaciones y ...\n", + " issam\n", + " bencheikh\n", " NaN\n", - " ...\n", + " [issame1982, دكتور عصام بن الشيخ]\n", + " [[my blog web site, http://issame1982.blogspot...\n", " NaN\n", - " [[, Maestría en Ciencias Sociales con Mención ...\n", - " [[Profesora, Universidad Nacional de La Matanz...\n", - " 7\n", - " Vanesa Natalia Rodriguez\n", + " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " [unlam.edu.ar, unirioja.es, amazon.fr, abebook...\n", - " 19.0\n", + " [blogspot.com, researchgate.net, google.com, l...\n", + " 12.0\n", " \n", " \n", - " 135\n", - " 0000-0002-1700-8311\n", + " 136\n", + " 0000-0002-3374-5709\n", " 1\n", " 1\n", " 1\n", - " Fix-IT\n", - " Rite\n", - " NaN\n", - " [Best Heating & Plumbing Company]\n", - " [[Website, https://fix-itrite.com], [Muckrack,...\n", + " guillermo\n", + " ortiz\n", + " médico, internista, neumólogo, intensivista, e...\n", + " [guillermo ortiz-ruiz]\n", + " [[elsevier, https://www.elsevier.com/], [asoci...\n", " NaN\n", " ...\n", + " 62\n", + " 0\n", + " 0\n", + " 88\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " 1\n", - " Fix-It Rite\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [fix-itrite.com, muckrack.com, tumblr.com, dri...\n", - " 11.0\n", + " [elsevier.com, amci.org.co, springer.com, revi...\n", + " 12.0\n", " \n", " \n", - " 136\n", - " 0000-0003-2676-4431\n", + " 137\n", + " 0000-0001-7228-5680\n", " 1\n", " 1\n", " 1\n", - " Benny\n", - " Soewandi\n", + " text\n", + " protocol\n", " NaN\n", - " [Benny Soewandi]\n", - " [[Conservation Efforts as a Result of Theoreti...\n", - " NaN\n", - " ...\n", " NaN\n", + " [[about, https://about.me/textprotocol], [gith...\n", " NaN\n", - " [[Membership, Paguyuban Pelestarian Budaya Ban...\n", - " 2\n", - " Benny Soewandi\n", + " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", " NaN\n", " NaN\n", " NaN\n", - " [wordpress.com, wordpress.com, linkedin.com, f...\n", - " 11.0\n", + " [about.me, github.com, gitlab.com, gravatar.co...\n", + " 12.0\n", " \n", " \n", - " 137\n", - " 0000-0001-8157-0600\n", + " 139\n", + " 0000-0003-1047-4229\n", " 1\n", " 1\n", " 1\n", - " Bijan\n", - " Yavar\n", - " Senior Research Assistant and Phd Student in O...\n", - " [B. Yavar, Yavar Bijan]\n", - " [[Web of Science (Pub) Researcher ID: A-3544-2...\n", + " bayu\n", + " sakti\n", + " bayu purbha saktisaya adalah bayu purbha sakti...\n", + " [bayu purbha sakti]\n", + " [[osf, http://osf.io/qe2ug], [inarxiv, https:/...\n", " NaN\n", " ...\n", - " [[Scopus Author ID, 56556873600]]\n", - " NaN\n", - " NaN\n", - " 6\n", - " Scopus - Elsevier\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " [publons.com, articulate.com, zenodo.org, orci...\n", - " 15.0\n", + " [osf.io, osf.io, academia.edu, mendeley.com, f...\n", + " 12.0\n", " \n", " \n", - " 138\n", - " 0000-0002-9874-1450\n", + " 140\n", + " 0000-0003-4836-7074\n", " 1\n", " 1\n", " 1\n", - " FENGZHI\n", - " WU\n", - " NaN\n", - " NaN\n", - " [[A Systematic Study on the Dynamic Softening ...\n", + " karla haydee\n", + " ortiz palafox\n", + " karla haydee ortíz palafoxmiembro del sistema ...\n", + " [karla palafox]\n", + " [[opinión día del maestro, http://www.cronicaj...\n", " NaN\n", " ...\n", + " 0\n", + " 0\n", + " 0\n", + " 2\n", + " 1\n", " NaN\n", " NaN\n", " NaN\n", - " 3\n", - " FENGZHI WU\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [springer.com, sciencedirect.com, sciencedirec...\n", - " 23.0\n", + " [cronicajalisco.com, youtube.com, tlaquepaque....\n", + " 22.0\n", " \n", " \n", "\n", - "

139 rows × 22 columns

\n", + "

115 rows × 29 columns

\n", "" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "0 0000-0003-1948-3180 1 1 1 \n", - "1 0000-0002-2000-8339 1 1 1 \n", - "2 0000-0001-9293-2224 1 1 1 \n", - "3 0000-0003-3045-0056 1 1 1 \n", - "4 0000-0003-1815-5732 1 1 1 \n", - ".. ... ... ... ... \n", - "134 0000-0002-1324-7171 1 1 1 \n", - "135 0000-0002-1700-8311 1 1 1 \n", - "136 0000-0003-2676-4431 1 1 1 \n", - "137 0000-0001-8157-0600 1 1 1 \n", - "138 0000-0002-9874-1450 1 1 1 \n", - "\n", - " given_names family_name \\\n", - "0 Mark Katz \n", - "1 Phòng khám tư nhân Hà Nội NaN \n", - "2 Juan Carlos Garcia Hoyos \n", - "3 Ananda Majumdar \n", - "4 JAS (Jurnal Akuntansi Syariah) \n", - ".. ... ... \n", - "134 Vanesa Natalia Rodriguez \n", - "135 Fix-IT Rite \n", - "136 Benny Soewandi \n", - "137 Bijan Yavar \n", - "138 FENGZHI WU \n", - "\n", - " biography \\\n", - "0 Mark N. Katz is a professor of government and ... \n", - "1 NaN \n", - "2 My name is Juan Carlos García Hoyos. I was bor... \n", - "3 I am Ananda Majumdar, Child Care Educator at B... \n", - "4 JAS (Jurnal Akuntansi Syariah) published in pr... \n", - ".. ... \n", - "134 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", - "135 NaN \n", - "136 NaN \n", - "137 Senior Research Assistant and Phd Student in O... \n", - "138 NaN \n", - "\n", - " other_names \\\n", - "0 NaN \n", - "1 NaN \n", - "2 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "134 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", - "135 [Best Heating & Plumbing Company] \n", - "136 [Benny Soewandi] \n", - "137 [B. Yavar, Yavar Bijan] \n", - "138 NaN \n", - "\n", - " urls primary_email ... \\\n", - "0 [[Adjusting to Change: American Foreign Policy... NaN ... \n", - "1 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN ... \n", - "2 [[Air Force Office of Scientific Research (WRI... NaN ... \n", - "3 [[Migration Scholar and Ananda , https://grfdt... NaN ... \n", - "4 [[Website, https://ejournal.stiesyariahbengkal... NaN ... \n", - ".. ... ... ... \n", - "134 [[De rufianes y franchutas Representaciones y ... NaN ... \n", - "135 [[Website, https://fix-itrite.com], [Muckrack,... NaN ... \n", - "136 [[Conservation Efforts as a Result of Theoreti... NaN ... \n", - "137 [[Web of Science (Pub) Researcher ID: A-3544-2... NaN ... \n", - "138 [[A Systematic Study on the Dynamic Softening ... NaN ... \n", - "\n", - " external_ids \\\n", - "0 [[Scopus Author ID, 25649901800]] \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "134 NaN \n", - "135 NaN \n", - "136 NaN \n", - "137 [[Scopus Author ID, 56556873600]] \n", - "138 NaN \n", - "\n", - " education \\\n", - "0 [[Political Science, Ph.D., Massachusetts Inst... \n", - "1 NaN \n", - "2 [[Faculty of Philosophy, History - Ph.D., Char... \n", - "3 [[Education , B.Ed. After Degree , University ... \n", - "4 NaN \n", - ".. ... \n", - "134 [[, Maestría en Ciencias Sociales con Mención ... \n", - "135 NaN \n", - "136 NaN \n", - "137 NaN \n", - "138 NaN \n", - "\n", - " employment n_works \\\n", - "0 [[Professor of Government and Politics, George... 58 \n", - "1 NaN 4 \n", - "2 [[responsible for the Project Service Level Ag... 20 \n", - "3 [[General Coordinator- University of Alberta C... 43 \n", - "4 NaN 67 \n", - ".. ... ... \n", - "134 [[Profesora, Universidad Nacional de La Matanz... 7 \n", - "135 NaN 1 \n", - "136 [[Membership, Paguyuban Pelestarian Budaya Ban... 2 \n", - "137 NaN 6 \n", - "138 NaN 3 \n", - "\n", - " works_source primary_email_domain other_email_domains \\\n", - "0 Scopus - Elsevier NaN NaN \n", - "1 Phòng khám tư nhân Hà Nội NaN NaN \n", - "2 Juan Carlos Garcia Hoyos NaN NaN \n", - "3 Ananda Majumdar NaN NaN \n", - "4 JAS (Jurnal Akuntansi Syariah) NaN NaN \n", - ".. ... ... ... \n", - "134 Vanesa Natalia Rodriguez NaN NaN \n", - "135 Fix-It Rite NaN NaN \n", - "136 Benny Soewandi NaN NaN \n", - "137 Scopus - Elsevier NaN NaN \n", - "138 FENGZHI WU NaN NaN \n", - "\n", - " n_emails url_domains n_urls \n", - "0 NaN [wordpress.com, marknkatz.com, gmu.edu, atlant... 16.0 \n", - "1 NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0 \n", - "2 NaN [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0 \n", - "3 NaN [grfdt.com, linkedin.com, academia.edu, resear... 24.0 \n", - "4 NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0 \n", - ".. ... ... ... \n", - "134 NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0 \n", - "135 NaN [fix-itrite.com, muckrack.com, tumblr.com, dri... 11.0 \n", - "136 NaN [wordpress.com, wordpress.com, linkedin.com, f... 11.0 \n", - "137 NaN [publons.com, articulate.com, zenodo.org, orci... 15.0 \n", - "138 NaN [springer.com, sciencedirect.com, sciencedirec... 23.0 \n", - "\n", - "[139 rows x 22 columns]" + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0002-7843-8497 1 1 1 \n", + "1 0000-0003-1554-1531 1 1 1 \n", + "3 0000-0003-3193-030x 1 1 1 \n", + "4 0000-0002-5341-6531 1 1 1 \n", + "5 0000-0001-5295-2271 1 1 1 \n", + ".. ... ... ... ... \n", + "135 0000-0002-8125-0081 1 1 1 \n", + "136 0000-0002-3374-5709 1 1 1 \n", + "137 0000-0001-7228-5680 1 1 1 \n", + "139 0000-0003-1047-4229 1 1 1 \n", + "140 0000-0003-4836-7074 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "0 davi barbosa \n", + "1 katarzyna ochman \n", + "3 juan pablo wolff mejia \n", + "4 trent hammond \n", + "5 antoniy moysey \n", + ".. ... ... \n", + "135 issam bencheikh \n", + "136 guillermo ortiz \n", + "137 text protocol \n", + "139 bayu sakti \n", + "140 karla haydee ortiz palafox \n", + "\n", + " biography \\\n", + "0 pesquisador na área sociojurídica, professor, ... \n", + "1 katarzyna ochman [kataˈʐɨna ˈɔxman] is assista... \n", + "3 aspirante a maestría en derecho y negocios int... \n", + "4 mr trent hammond is an honorary research fello... \n", + "5 NaN \n", + ".. ... \n", + "135 NaN \n", + "136 médico, internista, neumólogo, intensivista, e... \n", + "137 NaN \n", + "139 bayu purbha saktisaya adalah bayu purbha sakti... \n", + "140 karla haydee ortíz palafoxmiembro del sistema ... \n", + "\n", + " other_names \\\n", + "0 [professor davi barbosa delmont] \n", + "1 [[kataˈʐɨna ˈɔxman], catharina ochman, cathari... \n", + "3 [juan pablo wolff, pablo wolff mejia, juan p. ... \n", + "4 [trent ernest hammond (t.e.hammond)] \n", + "5 NaN \n", + ".. ... \n", + "135 [issame1982, دكتور عصام بن الشيخ] \n", + "136 [guillermo ortiz-ruiz] \n", + "137 NaN \n", + "139 [bayu purbha sakti] \n", + "140 [karla palafox] \n", + "\n", + " urls \\\n", + "0 [[plataforma de cursos ideia criativa, https:/... \n", + "1 [[researchgate, https://www.researchgate.net/p... \n", + "3 [[twitter, https://twitter.com/pablomejiam], [... \n", + "4 [[academic support masters, http://trenthammon... \n", + "5 [[academic journals database, http://journalda... \n", + ".. ... \n", + "135 [[my blog web site, http://issame1982.blogspot... \n", + "136 [[elsevier, https://www.elsevier.com/], [asoci... \n", + "137 [[about, https://about.me/textprotocol], [gith... \n", + "139 [[osf, http://osf.io/qe2ug], [inarxiv, https:/... \n", + "140 [[opinión día del maestro, http://www.cronicaj... \n", + "\n", + " primary_email ... n_doi n_arxiv n_pmc \\\n", + "0 NaN ... 0 0 0 \n", + "1 NaN ... 1 0 0 \n", + "3 juanpmejia@ulasallista.edu.co ... 0 0 0 \n", + "4 trent.hammond@academicsupportmasters.com.au ... 1 0 0 \n", + "5 antoniimoisei@bsmu.edu.ua ... 0 0 0 \n", + ".. ... ... ... ... ... \n", + "135 NaN ... 0 0 0 \n", + "136 NaN ... 62 0 0 \n", + "137 NaN ... 0 0 0 \n", + "139 NaN ... 0 0 0 \n", + "140 NaN ... 0 0 0 \n", + "\n", + " n_other_pids label primary_email_domain \\\n", + "0 0 0 NaN \n", + "1 0 1 NaN \n", + "3 0 1 ulasallista.edu.co \n", + "4 1 1 academicsupportmasters.com.au \n", + "5 0 1 bsmu.edu.ua \n", + ".. ... ... ... \n", + "135 0 1 NaN \n", + "136 88 0 NaN \n", + "137 0 0 NaN \n", + "139 0 1 NaN \n", + "140 2 1 NaN \n", + "\n", + " other_email_domains n_emails \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "3 NaN NaN \n", + "4 [health.nsw.gov.au, csu.edu.au, sociologist.co... 5.0 \n", + "5 NaN NaN \n", + ".. ... ... \n", + "135 NaN NaN \n", + "136 NaN NaN \n", + "137 NaN NaN \n", + "139 NaN NaN \n", + "140 NaN NaN \n", + "\n", + " url_domains n_urls \n", + "0 [eadplataforma.com, facebook.com, youtube.com,... 39.0 \n", + "1 [researchgate.net, academia.edu, facebook.com,... 11.0 \n", + "3 [twitter.com, youtube.com, google.com, linkedi... 11.0 \n", + "4 [wix.com, academia.edu, researchgate.net, rese... 12.0 \n", + "5 [journaldatabase.info, nplu.org, acls.org, ind... 21.0 \n", + ".. ... ... \n", + "135 [blogspot.com, researchgate.net, google.com, l... 12.0 \n", + "136 [elsevier.com, amci.org.co, springer.com, revi... 12.0 \n", + "137 [about.me, github.com, gitlab.com, gravatar.co... 12.0 \n", + "139 [osf.io, osf.io, academia.edu, mendeley.com, f... 12.0 \n", + "140 [cronicajalisco.com, youtube.com, tlaquepaque.... 22.0 \n", + "\n", + "[115 rows x 29 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Works source" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Paste from Miriam" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## External IDs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "External IDs should come from reliable sources. ORCiD registrants cannot add them freely." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1.301959e+06\n", + "mean 1.358640e+00\n", + "std 6.635087e-01\n", + "min 1.000000e+00\n", + "25% 1.000000e+00\n", + "50% 1.000000e+00\n", + "75% 2.000000e+00\n", + "max 8.000000e+01\n", + "Name: n_ids, dtype: float64" ] }, "execution_count": 50, @@ -9584,16 +9241,13 @@ } ], "source": [ - "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", - "exploded_sources" + "df.n_ids.describe()" ] }, { "cell_type": "code", "execution_count": 51, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -9627,640 +9281,1322 @@ " urls\n", " primary_email\n", " ...\n", - " external_ids\n", - " education\n", - " employment\n", - " n_works\n", - " works_source\n", + " n_arxiv\n", + " n_pmc\n", + " n_other_pids\n", + " label\n", " primary_email_domain\n", " other_email_domains\n", " n_emails\n", " url_domains\n", " n_urls\n", + " n_ids\n", " \n", " \n", " \n", " \n", - " 1\n", - " 0000-0002-2000-8339\n", - " 1\n", - " 1\n", - " 1\n", - " Phòng khám tư nhân Hà Nội\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[Sức khỏe, https://onhealth.vn/], [Khám phụ k...\n", - " NaN\n", - " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 4\n", - " Phòng khám tư nhân Hà Nội\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [onhealth.vn, onhealth.vn, onhealth.vn, onheal...\n", - " 49.0\n", - " \n", - " \n", - " 2\n", - " 0000-0001-9293-2224\n", - " 1\n", - " 1\n", - " 1\n", - " Juan Carlos\n", - " Garcia Hoyos\n", - " My name is Juan Carlos García Hoyos. I was bor...\n", - " [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /...\n", - " [[Air Force Office of Scientific Research (WRI...\n", - " NaN\n", - " ...\n", - " NaN\n", - " [[Faculty of Philosophy, History - Ph.D., Char...\n", - " [[responsible for the Project Service Level Ag...\n", - " 20\n", - " Juan Carlos Garcia Hoyos\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [af.mil, gst.com, govtribe.com, sbir.gov, open...\n", - " 28.0\n", - " \n", - " \n", - " 3\n", - " 0000-0003-3045-0056\n", + " 7253330\n", + " 0000-0002-9554-6633\n", " 1\n", " 1\n", " 1\n", - " Ananda\n", - " Majumdar\n", - " I am Ananda Majumdar, Child Care Educator at B...\n", - " NaN\n", - " [[Migration Scholar and Ananda , https://grfdt...\n", - " NaN\n", - " ...\n", + " john a\n", + " williams\n", " NaN\n", - " [[Education , B.Ed. After Degree , University ...\n", - " [[General Coordinator- University of Alberta C...\n", - " 43\n", - " Ananda Majumdar\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [grfdt.com, linkedin.com, academia.edu, resear...\n", - " 24.0\n", - " \n", - " \n", - " 4\n", - " 0000-0003-1815-5732\n", - " 1\n", - " 1\n", - " 1\n", - " JAS\n", - " (Jurnal Akuntansi Syariah)\n", - " JAS (Jurnal Akuntansi Syariah) published in pr...\n", " NaN\n", - " [[Website, https://ejournal.stiesyariahbengkal...\n", + " [[aston university profile page, https://resea...\n", " NaN\n", " ...\n", - " NaN\n", - " NaN\n", - " NaN\n", - " 67\n", - " JAS (Jurnal Akuntansi Syariah)\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [stiesyariahbengkalis.ac.id, lipi.go.id, cross...\n", - " 17.0\n", - " \n", - " \n", - " 5\n", - " 0000-0002-4379-6454\n", - " 1\n", - " 1\n", + " 0\n", + " 0\n", + " 208\n", " 1\n", - " Caroline Wanjiru\n", - " Kariuki\n", - " Caroline holds a PhD in Economics from Curtin ...\n", - " NaN\n", - " [[Scopus Profile, https://www.scopus.com/dashb...\n", - " NaN\n", - " ...\n", - " NaN\n", - " [[Economics, Doctor of Philosophy , Curtin Uni...\n", - " [[Director, Educational Development, Strathmor...\n", - " 4\n", - " Caroline Wanjiru Kariuki\n", " NaN\n", " NaN\n", " NaN\n", - " [scopus.com, mendeley.com, publons.com, resear...\n", - " 13.0\n", + " [aston.ac.uk]\n", + " 1.0\n", + " 80.0\n", " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", + " \n", + "\n", + "

1 rows × 30 columns

\n", + "" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "7253330 0000-0002-9554-6633 1 1 1 \n", + "\n", + " given_names family_name biography other_names \\\n", + "7253330 john a williams NaN NaN \n", + "\n", + " urls primary_email ... \\\n", + "7253330 [[aston university profile page, https://resea... NaN ... \n", + "\n", + " n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "7253330 0 0 208 1 NaN \n", + "\n", + " other_email_domains n_emails url_domains n_urls n_ids \n", + "7253330 NaN NaN [aston.ac.uk] 1.0 80.0 \n", + "\n", + "[1 rows x 30 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.n_ids == df.n_ids.max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
orcidexternal_idsprovider
1320000-0001-6352-7086111SusanHawthorneSusan is a poet, novelist, publisher and Sansk...[S. Hawthorne, Susan C. C. Hawthorne][[Spinifex Press, http://www.spinifexpress.com...NaN...[[ResearcherID, K-6039-2018]][[School of Asian Studies, Honours Sanskrit, A...[[Adjunct Professor, James Cook University, To...352Susan HawthorneNaNNaNNaN[spinifexpress.com.au, linkedin.com, twitter.c...12.070000-0001-7463-977x[loop profile, 371409]loop profile
1330000-0002-4062-3603111JUAN DE DIOSBELTRÁN MANCILLAJUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut...[Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD...[[01.- Juan de Dios Beltrán Mancilla. Teoría O...NaN...NaN[[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR...[[INSPECTOR GENERAL JORNADA VESPERTINA // De 2...11JUAN DE DIOS BELTR´´ÁN MANCILLANaNNaNNaN[yumpu.com, ijopm.org, google.com, blogspot.co...69.090000-0001-8718-0056[scopus author id, 55466912100]scopus author id
1340000-0002-1324-7171111Vanesa NataliaRodriguezNombre y Apellido: Vanesa Natalia Rodriguez. ...[Vanesa Rodriguez, Vanesa N. Rodriguez][[De rufianes y franchutas Representaciones y ...NaN...NaN[[, Maestría en Ciencias Sociales con Mención ...[[Profesora, Universidad Nacional de La Matanz...7Vanesa Natalia RodriguezNaNNaNNaN[unlam.edu.ar, unirioja.es, amazon.fr, abebook...19.0100000-0001-8718-0056[scopus author id, 7102015452]scopus author id
1360000-0003-2676-4431111BennySoewandiNaN[Benny Soewandi][[Conservation Efforts as a Result of Theoreti...NaN...NaNNaN[[Membership, Paguyuban Pelestarian Budaya Ban...2Benny SoewandiNaNNaNNaN[wordpress.com, wordpress.com, linkedin.com, f...11.0140000-0001-9708-5570[researcherid, p-5112-2015]researcherid
1380000-0002-9874-1450111FENGZHIWUNaNNaN[[A Systematic Study on the Dynamic Softening ...NaN...NaNNaNNaN3FENGZHI WUNaNNaNNaN[springer.com, sciencedirect.com, sciencedirec...23.0150000-0001-9708-5570[scopus author id, 42062216900]scopus author id
\n", - "

108 rows × 22 columns

\n", "
" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "1 0000-0002-2000-8339 1 1 1 \n", - "2 0000-0001-9293-2224 1 1 1 \n", - "3 0000-0003-3045-0056 1 1 1 \n", - "4 0000-0003-1815-5732 1 1 1 \n", - "5 0000-0002-4379-6454 1 1 1 \n", - ".. ... ... ... ... \n", - "132 0000-0001-6352-7086 1 1 1 \n", - "133 0000-0002-4062-3603 1 1 1 \n", - "134 0000-0002-1324-7171 1 1 1 \n", - "136 0000-0003-2676-4431 1 1 1 \n", - "138 0000-0002-9874-1450 1 1 1 \n", - "\n", - " given_names family_name \\\n", - "1 Phòng khám tư nhân Hà Nội NaN \n", - "2 Juan Carlos Garcia Hoyos \n", - "3 Ananda Majumdar \n", - "4 JAS (Jurnal Akuntansi Syariah) \n", - "5 Caroline Wanjiru Kariuki \n", - ".. ... ... \n", - "132 Susan Hawthorne \n", - "133 JUAN DE DIOS BELTRÁN MANCILLA \n", - "134 Vanesa Natalia Rodriguez \n", - "136 Benny Soewandi \n", - "138 FENGZHI WU \n", - "\n", - " biography \\\n", - "1 NaN \n", - "2 My name is Juan Carlos García Hoyos. I was bor... \n", - "3 I am Ananda Majumdar, Child Care Educator at B... \n", - "4 JAS (Jurnal Akuntansi Syariah) published in pr... \n", - "5 Caroline holds a PhD in Economics from Curtin ... \n", - ".. ... \n", - "132 Susan is a poet, novelist, publisher and Sansk... \n", - "133 JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... \n", - "134 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", - "136 NaN \n", - "138 NaN \n", - "\n", - " other_names \\\n", - "1 NaN \n", - "2 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", - "3 NaN \n", - "4 NaN \n", - "5 NaN \n", - ".. ... \n", - "132 [S. Hawthorne, Susan C. C. Hawthorne] \n", - "133 [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... \n", - "134 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", - "136 [Benny Soewandi] \n", - "138 NaN \n", - "\n", - " urls primary_email ... \\\n", - "1 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN ... \n", - "2 [[Air Force Office of Scientific Research (WRI... NaN ... \n", - "3 [[Migration Scholar and Ananda , https://grfdt... NaN ... \n", - "4 [[Website, https://ejournal.stiesyariahbengkal... NaN ... \n", - "5 [[Scopus Profile, https://www.scopus.com/dashb... NaN ... \n", - ".. ... ... ... \n", - "132 [[Spinifex Press, http://www.spinifexpress.com... NaN ... \n", - "133 [[01.- Juan de Dios Beltrán Mancilla. Teoría O... NaN ... \n", - "134 [[De rufianes y franchutas Representaciones y ... NaN ... \n", - "136 [[Conservation Efforts as a Result of Theoreti... NaN ... \n", - "138 [[A Systematic Study on the Dynamic Softening ... NaN ... \n", - "\n", - " external_ids \\\n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - "5 NaN \n", - ".. ... \n", - "132 [[ResearcherID, K-6039-2018]] \n", - "133 NaN \n", - "134 NaN \n", - "136 NaN \n", - "138 NaN \n", - "\n", - " education \\\n", - "1 NaN \n", - "2 [[Faculty of Philosophy, History - Ph.D., Char... \n", - "3 [[Education , B.Ed. After Degree , University ... \n", - "4 NaN \n", - "5 [[Economics, Doctor of Philosophy , Curtin Uni... \n", - ".. ... \n", - "132 [[School of Asian Studies, Honours Sanskrit, A... \n", - "133 [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... \n", - "134 [[, Maestría en Ciencias Sociales con Mención ... \n", - "136 NaN \n", - "138 NaN \n", + " orcid external_ids provider\n", + "7 0000-0001-7463-977x [loop profile, 371409] loop profile\n", + "9 0000-0001-8718-0056 [scopus author id, 55466912100] scopus author id\n", + "10 0000-0001-8718-0056 [scopus author id, 7102015452] scopus author id\n", + "14 0000-0001-9708-5570 [researcherid, p-5112-2015] researcherid\n", + "15 0000-0001-9708-5570 [scopus author id, 42062216900] scopus author id" + ] + }, + "execution_count": 54, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ids[ids.provider.notna()].head()" + ] + }, + { + "cell_type": "code", + "execution_count": 55, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "scopus author id", + "researcherid", + "loop profile", + "ciência id", + "researcher name resolver id", + "中国科学家在线", + "sciprofile", + "isni", + "gnd", + "pitt id", + "technical university of denmark cwis", + "researcher id", + "id dialnet", + "digital author id", + "scopus author id: ", + "authenticusid", + "hku researcherpage", + "uow scholars", + "cti vitae", + "scopus author id:", + "hkust profile", + "chalmers id", + "scopus id", + "iauthor", + "google scholar", + "digital author id (dai)", + "authid", + "dai", + "us epa vivo", + "scopus id", + "authenticus", + "smithsonian profiles", + "github", + "escientist", + "vivo cornell", + "researcherid:", + "id dialnet:", + "dialnet id", + "sciprofiles", + "kaken", + "une researcher id", + "researcherid: ", + "orcid", + "scienceopen", + "profile system identifier", + "orcid id", + "custom" + ], + "y": [ + 1030807, + 544825, + 117325, + 36666, + 7907, + 4804, + 4411, + 3075, + 2954, + 2674, + 2483, + 1445, + 1168, + 1124, + 1077, + 869, + 741, + 646, + 581, + 548, + 522, + 430, + 254, + 212, + 200, + 177, + 175, + 155, + 146, + 127, + 83, + 61, + 51, + 49, + 46, + 39, + 7, + 6, + 5, + 5, + 4, + 3, + 2, + 1, + 1, + 1, + 1 + ] + } + ], + "layout": { + "template": { + "data": { + "bar": [ + { + "error_x": { + "color": "#2a3f5f" + }, + "error_y": { + "color": "#2a3f5f" + }, + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "bar" + } + ], + "barpolar": [ + { + "marker": { + "line": { + "color": "#E5ECF6", + "width": 0.5 + } + }, + "type": "barpolar" + } + ], + "carpet": [ + { + "aaxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "baxis": { + "endlinecolor": "#2a3f5f", + "gridcolor": "white", + "linecolor": "white", + "minorgridcolor": "white", + "startlinecolor": "#2a3f5f" + }, + "type": "carpet" + } + ], + "choropleth": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "choropleth" + } + ], + "contour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "contour" + } + ], + "contourcarpet": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "contourcarpet" + } + ], + "heatmap": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmap" + } + ], + "heatmapgl": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "heatmapgl" + } + ], + "histogram": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "histogram" + } + ], + "histogram2d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2d" + } + ], + "histogram2dcontour": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "histogram2dcontour" + } + ], + "mesh3d": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "type": "mesh3d" + } + ], + "parcoords": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "parcoords" + } + ], + "pie": [ + { + "automargin": true, + "type": "pie" + } + ], + "scatter": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter" + } + ], + "scatter3d": [ + { + "line": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatter3d" + } + ], + "scattercarpet": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattercarpet" + } + ], + "scattergeo": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergeo" + } + ], + "scattergl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattergl" + } + ], + "scattermapbox": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scattermapbox" + } + ], + "scatterpolar": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolar" + } + ], + "scatterpolargl": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterpolargl" + } + ], + "scatterternary": [ + { + "marker": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "type": "scatterternary" + } + ], + "surface": [ + { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + }, + "colorscale": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "type": "surface" + } + ], + "table": [ + { + "cells": { + "fill": { + "color": "#EBF0F8" + }, + "line": { + "color": "white" + } + }, + "header": { + "fill": { + "color": "#C8D4E3" + }, + "line": { + "color": "white" + } + }, + "type": "table" + } + ] + }, + "layout": { + "annotationdefaults": { + "arrowcolor": "#2a3f5f", + "arrowhead": 0, + "arrowwidth": 1 + }, + "autotypenumbers": "strict", + "coloraxis": { + "colorbar": { + "outlinewidth": 0, + "ticks": "" + } + }, + "colorscale": { + "diverging": [ + [ + 0, + "#8e0152" + ], + [ + 0.1, + "#c51b7d" + ], + [ + 0.2, + "#de77ae" + ], + [ + 0.3, + "#f1b6da" + ], + [ + 0.4, + "#fde0ef" + ], + [ + 0.5, + "#f7f7f7" + ], + [ + 0.6, + "#e6f5d0" + ], + [ + 0.7, + "#b8e186" + ], + [ + 0.8, + "#7fbc41" + ], + [ + 0.9, + "#4d9221" + ], + [ + 1, + "#276419" + ] + ], + "sequential": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ], + "sequentialminus": [ + [ + 0, + "#0d0887" + ], + [ + 0.1111111111111111, + "#46039f" + ], + [ + 0.2222222222222222, + "#7201a8" + ], + [ + 0.3333333333333333, + "#9c179e" + ], + [ + 0.4444444444444444, + "#bd3786" + ], + [ + 0.5555555555555556, + "#d8576b" + ], + [ + 0.6666666666666666, + "#ed7953" + ], + [ + 0.7777777777777778, + "#fb9f3a" + ], + [ + 0.8888888888888888, + "#fdca26" + ], + [ + 1, + "#f0f921" + ] + ] + }, + "colorway": [ + "#636efa", + "#EF553B", + "#00cc96", + "#ab63fa", + "#FFA15A", + "#19d3f3", + "#FF6692", + "#B6E880", + "#FF97FF", + "#FECB52" + ], + "font": { + "color": "#2a3f5f" + }, + "geo": { + "bgcolor": "white", + "lakecolor": "white", + "landcolor": "#E5ECF6", + "showlakes": true, + "showland": true, + "subunitcolor": "white" + }, + "hoverlabel": { + "align": "left" + }, + "hovermode": "closest", + "mapbox": { + "style": "light" + }, + "paper_bgcolor": "white", + "plot_bgcolor": "#E5ECF6", + "polar": { + "angularaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "radialaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "scene": { + "xaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "yaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + }, + "zaxis": { + "backgroundcolor": "#E5ECF6", + "gridcolor": "white", + "gridwidth": 2, + "linecolor": "white", + "showbackground": true, + "ticks": "", + "zerolinecolor": "white" + } + }, + "shapedefaults": { + "line": { + "color": "#2a3f5f" + } + }, + "ternary": { + "aaxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "baxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + }, + "bgcolor": "#E5ECF6", + "caxis": { + "gridcolor": "white", + "linecolor": "white", + "ticks": "" + } + }, + "title": { + "x": 0.05 + }, + "xaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + }, + "yaxis": { + "automargin": true, + "gridcolor": "white", + "linecolor": "white", + "ticks": "", + "title": { + "standoff": 15 + }, + "zerolinecolor": "white", + "zerolinewidth": 2 + } + } + }, + "title": { + "text": "IDs provided by providers" + }, + "xaxis": { + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" ] }, - "execution_count": 51, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" + "data = [\n", + " go.Bar(\n", + " x=ids.groupby('provider').count().sort_values('orcid', ascending=False).index,\n", + " y=ids.groupby('provider').count().sort_values('orcid', ascending=False)['orcid']\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='IDs provided by providers',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 56, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan, 'loop profile', 'scopus author id', 'researcherid',\n", + " 'scopus author id: ', 'gnd', 'isni', 'ciência id', 'pitt id',\n", + " 'id dialnet', 'technical university of denmark cwis',\n", + " 'researcher name resolver id', 'scopus author id:',\n", + " 'hkust profile', '中国科学家在线', 'cti vitae', 'escientist',\n", + " 'researcher id', 'sciprofile', 'digital author id', 'scopus id',\n", + " 'uow scholars', 'authenticusid', 'authenticus', 'authid',\n", + " 'hku researcherpage', 'chalmers id', 'iauthor', 'us epa vivo',\n", + " 'digital author id (dai)', 'vivo cornell', 'smithsonian profiles',\n", + " 'github', 'google scholar', 'scopus id', 'researcherid:', 'dai',\n", + " 'kaken', 'orcid id', 'dialnet id', 'profile system identifier',\n", + " 'sciprofiles', 'id dialnet:', 'researcherid: ', 'scienceopen',\n", + " 'une researcher id', 'custom', 'orcid'], dtype=object)" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "## Works source" + "pd.unique(ids['provider'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Paste from Miriam" + "## Keywords" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "## External IDs" + "This field is problematic as users can be nasty and put multiple keywords in one as opposed of having different keywords. Look this" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 57, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['data science ',\n", + " 'science of science',\n", + " 'scholarly knowledge mining',\n", + " 'open science',\n", + " 'research infrastructures']" + ] + }, + "execution_count": 57, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "External IDs should come from reliable sources. ORCiD registrants cannot add them freely." + "df[df['orcid'] == AM]['keywords'].values[0]" ] }, { - "cell_type": "code", - "execution_count": 52, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()" + "I did a good job. The following instead is dirty" ] }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 58, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "count 1.285292e+06\n", - "mean 1.357162e+00\n", - "std 6.607097e-01\n", - "min 1.000000e+00\n", - "25% 1.000000e+00\n", - "50% 1.000000e+00\n", - "75% 2.000000e+00\n", - "max 8.000000e+01\n", - "Name: n_ids, dtype: float64" + "['open access, open science, libraries, repositories, social web,']" ] }, - "execution_count": 53, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.n_ids.describe()" + "df[df['orcid'] == PP]['keywords'].values[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "So the keyword field needs some cleaning" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": {}, + "outputs": [], + "source": [ + "def fix_keywords(lst):\n", + " fixed = set()\n", + " for k in lst:\n", + " tokens = set(k.split(','))\n", + "# tokens.remove('')\n", + " for t in tokens:\n", + " fixed.add(str.strip(t))\n", + " fixed.discard('')\n", + " return list(fixed)" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": {}, + "outputs": [], + "source": [ + "df['fixed_keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))" ] }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 61, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidclaimedverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesurlsprimary_email...educationemploymentn_worksworks_sourceprimary_email_domainother_email_domainsn_emailsurl_domainsn_urlsn_ids
92287930000-0002-9554-6633111John AWilliamsNaNNaN[[Aston University profile page, https://resea...NaN...NaN[[, Aston University, Birmingham, , GB, 1722, ...91[Aston Research Explorer]NaNNaNNaN[aston.ac.uk]1.080.0
\n", - "

1 rows × 23 columns

\n", - "
" - ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "9228793 0000-0002-9554-6633 1 1 1 \n", - "\n", - " given_names family_name biography other_names \\\n", - "9228793 John A Williams NaN NaN \n", - "\n", - " urls primary_email ... \\\n", - "9228793 [[Aston University profile page, https://resea... NaN ... \n", - "\n", - " education employment n_works \\\n", - "9228793 NaN [[, Aston University, Birmingham, , GB, 1722, ... 91 \n", - "\n", - " works_source primary_email_domain other_email_domains \\\n", - "9228793 [Aston Research Explorer] NaN NaN \n", - "\n", - " n_emails url_domains n_urls n_ids \n", - "9228793 NaN [aston.ac.uk] 1.0 80.0 \n", - "\n", - "[1 rows x 23 columns]" + "['open science', 'repositories', 'open access', 'libraries', 'social web']" ] }, - "execution_count": 54, + "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df.n_ids == df.n_ids.max()]" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "metadata": {}, - "outputs": [], - "source": [ - "ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)" + "df[df['orcid'] == PP]['fixed_keywords'].values[0]" ] }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 62, "metadata": {}, "outputs": [], "source": [ - "ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])" + "df['n_keywords'] = df.keywords.str.len()" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 63, "metadata": {}, "outputs": [ { @@ -10285,66 +10621,99 @@ " \n", " \n", " orcid\n", - " external_ids\n", - " provider\n", + " n_keywords\n", " \n", " \n", " \n", " \n", - " 13\n", - " 0000-0001-5004-4608\n", - " [Scopus Author ID, 40661094300]\n", - " Scopus Author ID\n", + " 2851081\n", + " 0000-0002-0673-0341\n", + " 154.0\n", + " \n", + " \n", + " 7344151\n", + " 0000-0002-7060-4112\n", + " 141.0\n", + " \n", + " \n", + " 2235440\n", + " 0000-0002-6075-3501\n", + " 140.0\n", + " \n", + " \n", + " 2994233\n", + " 0000-0002-4071-0301\n", + " 118.0\n", + " \n", + " \n", + " 3971323\n", + " 0000-0002-9638-8091\n", + " 115.0\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 10916569\n", + " 0000-0001-5692-7639\n", + " NaN\n", " \n", " \n", - " 24\n", - " 0000-0001-5008-2479\n", - " [Scopus Author ID, 12789856200]\n", - " Scopus Author ID\n", + " 10916570\n", + " 0000-0003-1539-0999\n", + " NaN\n", " \n", " \n", - " 25\n", - " 0000-0001-5008-2479\n", - " [Ciência ID, 2F1C-479B-B071]\n", - " Ciência ID\n", + " 10916571\n", + " 0000-0003-2858-5509\n", + " NaN\n", " \n", " \n", - " 31\n", - " 0000-0001-5010-9539\n", - " [Loop profile, 1098977]\n", - " Loop profile\n", + " 10916572\n", + " 0000-0003-2438-9500\n", + " NaN\n", " \n", " \n", - " 42\n", - " 0000-0001-5013-6529\n", - " [Scopus Author ID, 8986698300]\n", - " Scopus Author ID\n", + " 10916573\n", + " 0000-0003-4119-4772\n", + " NaN\n", " \n", " \n", "\n", + "

10916574 rows × 2 columns

\n", "" ], "text/plain": [ - " orcid external_ids provider\n", - "13 0000-0001-5004-4608 [Scopus Author ID, 40661094300] Scopus Author ID\n", - "24 0000-0001-5008-2479 [Scopus Author ID, 12789856200] Scopus Author ID\n", - "25 0000-0001-5008-2479 [Ciência ID, 2F1C-479B-B071] Ciência ID\n", - "31 0000-0001-5010-9539 [Loop profile, 1098977] Loop profile\n", - "42 0000-0001-5013-6529 [Scopus Author ID, 8986698300] Scopus Author ID" + " orcid n_keywords\n", + "2851081 0000-0002-0673-0341 154.0\n", + "7344151 0000-0002-7060-4112 141.0\n", + "2235440 0000-0002-6075-3501 140.0\n", + "2994233 0000-0002-4071-0301 118.0\n", + "3971323 0000-0002-9638-8091 115.0\n", + "... ... ...\n", + "10916569 0000-0001-5692-7639 NaN\n", + "10916570 0000-0003-1539-0999 NaN\n", + "10916571 0000-0003-2858-5509 NaN\n", + "10916572 0000-0003-2438-9500 NaN\n", + "10916573 0000-0003-4119-4772 NaN\n", + "\n", + "[10916574 rows x 2 columns]" ] }, - "execution_count": 79, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ids[ids.provider.notna()].head()" + "df.sort_values('n_keywords', ascending=False)[['orcid', 'n_keywords']]" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -10359,108 +10728,208 @@ { "type": "bar", "x": [ - "Scopus Author ID", - "ResearcherID", - "Loop profile", - "Ciência ID", - "Researcher Name Resolver ID", - "中国科学家在线", - "ISNI", - "GND", - "Pitt ID", - "Technical University of Denmark CWIS", - "Sciprofile", - "Researcher ID", - "ID Dialnet", - "Digital author ID", - "Scopus Author ID: ", - "AuthenticusID", - "HKU ResearcherPage", - "UOW Scholars", - "CTI Vitae", - "Scopus Author ID:", - "HKUST Profile", - "Scopus author ID", - "Chalmers ID", - "Scopus ID", - "iAuthor", - "Google Scholar", - "AuthID", - "DAI", - "US EPA VIVO", - "Digital Author ID (DAI)", - "Scopus ID", - "Authenticus", - "Smithsonian Profiles", - "GitHub", - "eScientist", - "VIVO Cornell", - "ResearcherID:", - "Digital Author ID", - "Digital author ID (DAI)", - "ID Dialnet:", - "Dialnet ID", - "KAKEN", - "UNE Researcher ID", - "ResearcherID: ", - "ORCID", - "Custom", - "ORCID iD", - "ScienceOpen", - "ResearcherId", - "Profile system identifier" + "0000-0002-0673-0341", + "0000-0002-7060-4112", + "0000-0002-6075-3501", + "0000-0002-4071-0301", + "0000-0002-9638-8091", + "0000-0002-4235-4259", + "0000-0001-9462-5666", + "0000-0003-0076-6287", + "0000-0002-1878-9762", + "0000-0001-6537-7683", + "0000-0001-6307-6027", + "0000-0003-2273-9888", + "0000-0003-1799-0971", + "0000-0001-5287-1949", + "0000-0002-0937-7061", + "0000-0001-9715-9357", + "0000-0001-5696-1052", + "0000-0003-2998-5520", + "0000-0001-5869-2204", + "0000-0002-0156-3580", + "0000-0002-9625-6742", + "0000-0002-8401-8018", + "0000-0001-9985-1697", + "0000-0003-4246-8579", + "0000-0002-7710-0355", + "0000-0002-8083-7382", + "0000-0001-7654-5013", + "0000-0001-6939-3859", + "0000-0002-3061-3364", + "0000-0003-2509-2549", + "0000-0002-0463-0048", + "0000-0001-5230-715x", + "0000-0001-9336-6850", + "0000-0001-5458-7167", + "0000-0003-0209-180x", + "0000-0003-3584-6834", + "0000-0002-8227-5387", + "0000-0002-9381-2264", + "0000-0003-3340-6413", + "0000-0002-2935-1934", + "0000-0002-8644-8396", + "0000-0002-3123-3021", + "0000-0002-8659-6321", + "0000-0002-8449-2211", + "0000-0001-5167-7466", + "0000-0001-5637-1124", + "0000-0003-2532-2906", + "0000-0003-4673-1063", + "0000-0003-4608-3844", + "0000-0002-3532-043x", + "0000-0002-6347-9464", + "0000-0003-4505-3678", + "0000-0002-2683-4527", + "0000-0003-4374-6374", + "0000-0003-4511-7942", + "0000-0002-1103-9651", + "0000-0001-9280-6017", + "0000-0003-3720-1183", + "0000-0001-9586-0780", + "0000-0002-5306-7781", + "0000-0003-2218-1343", + "0000-0002-8499-1045", + "0000-0003-1863-0265", + "0000-0002-5539-1761", + "0000-0003-2550-1859", + "0000-0002-8072-1152", + "0000-0003-3342-6123", + "0000-0001-6861-9561", + "0000-0002-2252-672x", + "0000-0002-3597-3350", + "0000-0002-3907-3552", + "0000-0001-8689-185x", + "0000-0002-5274-7742", + "0000-0002-3186-8860", + "0000-0001-6843-9325", + "0000-0001-7133-7848", + "0000-0003-4486-2684", + "0000-0003-3343-5660", + "0000-0002-9014-2090", + "0000-0002-6282-0640", + "0000-0001-7857-4133", + "0000-0002-1294-2156", + "0000-0002-4432-3448", + "0000-0003-0097-4182", + "0000-0003-1245-7705", + "0000-0001-8445-412x", + "0000-0003-4153-6779", + "0000-0002-9125-6022", + "0000-0002-4598-2891", + "0000-0003-3387-3193", + "0000-0002-3866-6460", + "0000-0002-1411-3028", + "0000-0003-4283-2895", + "0000-0002-0211-7195", + "0000-0002-3898-9542", + "0000-0002-1545-7818", + "0000-0002-4963-9345", + "0000-0002-1770-9660", + "0000-0002-1960-5857", + "0000-0003-2054-477x" ], "y": [ - 1015175, - 543545, - 114316, - 33870, - 7810, - 4794, - 3054, - 2825, - 2672, - 2486, - 2441, - 1417, - 1167, - 1079, - 1076, - 847, - 740, - 644, - 581, - 549, - 521, - 501, - 430, - 232, - 212, - 200, - 175, - 153, - 146, - 135, - 127, + 154, + 141, + 140, + 118, + 115, + 104, + 98, + 94, + 92, + 91, + 88, + 86, + 84, 82, + 78, + 77, + 76, + 75, + 74, + 73, + 71, + 70, + 69, + 66, + 64, + 62, 61, + 60, + 58, + 57, + 56, + 54, + 53, + 53, + 52, + 51, 51, + 51, + 51, + 50, + 50, + 50, + 50, + 49, + 49, 49, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 47, + 47, + 46, 46, + 46, + 45, + 44, + 44, + 44, + 44, + 44, + 44, + 43, + 43, + 42, + 42, + 42, + 42, + 42, + 42, + 42, + 41, + 41, + 41, + 41, + 41, + 41, + 40, + 40, + 40, + 40, + 40, + 40, 39, - 35, - 34, - 7, - 6, - 5, - 4, - 3, - 2, - 1, - 1, - 1, - 1, - 1 + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 38, + 38, + 38, + 38 ] } ], @@ -11270,563 +11739,204 @@ "zerolinewidth": 2 } } - }, - "title": { - "text": "IDs provided by providers" - }, - "xaxis": { - "tickangle": 45, - "tickfont": { - "size": 12 - } - } - } - }, - "text/html": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "data = [\n", - " go.Bar(\n", - " x=ids.groupby('provider').count().sort_values('orcid', ascending=False).index,\n", - " y=ids.groupby('provider').count().sort_values('orcid', ascending=False)['orcid']\n", - " )\n", - "]\n", - "\n", - "layout = go.Layout(\n", - " title='IDs provided by providers',\n", - " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", - ")\n", - "fig = go.Figure(data=data, layout=layout)\n", - "plotly.offline.iplot(fig)" - ] - }, - { - "cell_type": "code", - "execution_count": 81, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "array([nan, 'Scopus Author ID', 'Ciência ID', 'Loop profile',\n", - " 'ResearcherID', 'Researcher Name Resolver ID', 'UOW Scholars',\n", - " '中国科学家在线', 'Pitt ID', 'AuthenticusID', 'Sciprofile', 'GND', 'ISNI',\n", - " 'HKU ResearcherPage', 'CTI Vitae', 'Researcher ID', 'ID Dialnet',\n", - " 'Digital author ID', 'HKUST Profile',\n", - " 'Technical University of Denmark CWIS', 'Scopus Author ID: ',\n", - " 'Digital Author ID (DAI)', 'Scopus Author ID:', 'Google Scholar',\n", - " 'AuthID', 'Digital Author ID', 'iAuthor', 'US EPA VIVO', 'GitHub',\n", - " 'Scopus author ID', 'Chalmers ID', 'Scopus ID', 'Authenticus',\n", - " 'VIVO Cornell', 'Scopus ID', 'ScienceOpen',\n", - " 'Smithsonian Profiles', 'ResearcherID:', 'DAI', 'eScientist',\n", - " 'KAKEN', 'Digital author ID (DAI)', 'ORCID', 'ID Dialnet:',\n", - " 'Dialnet ID', 'UNE Researcher ID', 'ResearcherID: ',\n", - " 'Profile system identifier', 'Custom', 'ResearcherId', 'ORCID iD'],\n", - " dtype=object)" - ] - }, - "execution_count": 81, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "pd.unique(ids['provider'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Keywords" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This field is problematic as users can be nasty and put multiple keywords in one as opposed of having different keywords. Look this" - ] - }, - { - "cell_type": "code", - "execution_count": 165, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['Data science ',\n", - " 'science of science',\n", - " 'scholarly knowledge mining',\n", - " 'open science',\n", - " 'research infrastructures']" - ] - }, - "execution_count": 165, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == AM]['keywords'].values[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "I did a good job. The following instead is dirty" - ] - }, - { - "cell_type": "code", - "execution_count": 166, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['open access, open science, libraries, repositories, social web,']" - ] - }, - "execution_count": 166, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == PP]['keywords'].values[0]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "So the keyword field needs some cleaning" - ] - }, - { - "cell_type": "code", - "execution_count": 197, - "metadata": {}, - "outputs": [], - "source": [ - "def fix_keywords(lst):\n", - " fixed = set()\n", - " for k in lst:\n", - " tokens = set(k.split(','))\n", - "# tokens.remove('')\n", - " for t in tokens:\n", - " fixed.add(str.strip(t))\n", - " fixed.discard('')\n", - " return list(fixed)" - ] - }, - { - "cell_type": "code", - "execution_count": 199, - "metadata": {}, - "outputs": [], - "source": [ - "df['fixed_keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 200, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['open science', 'open access', 'repositories', 'libraries', 'social web']" + }, + "title": { + "text": "Keywords provided by ORCiD" + }, + "xaxis": { + "range": [ + -0.5, + 99.5 + ], + "tickangle": 45, + "tickfont": { + "size": 12 + } + } + } + }, + "text/html": [ + "
" ] }, - "execution_count": 200, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "df[df['orcid'] == PP]['fixed_keywords'].values[0]" + "set_top_n(100)\n", + "data = [\n", + " go.Bar(\n", + " x=df.sort_values('n_keywords', ascending=False)['orcid'][:TOP_N],\n", + " y=df.sort_values('n_keywords', ascending=False)['n_keywords'][:TOP_N]\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Keywords provided by ORCiD',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12), range=TOP_RANGE)\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" ] }, { "cell_type": "code", - "execution_count": 201, + "execution_count": 65, "metadata": {}, "outputs": [], "source": [ - "df['n_keywords'] = df.keywords.str.len()" - ] - }, - { - "cell_type": "code", - "execution_count": 83, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
orcidn_keywords
16813100000-0002-0673-0341154.0
77176990000-0002-7060-4112141.0
45976740000-0002-6075-3501140.0
20665800000-0002-4071-0301118.0
35310300000-0002-9638-8091115.0
.........
107470350000-0003-4998-1551NaN
107470360000-0003-4998-4111NaN
107470370000-0003-4998-6045NaN
107470380000-0003-4998-8868NaN
107470390000-0003-4999-7916NaN
\n", - "

10744621 rows × 2 columns

\n", - "
" - ], - "text/plain": [ - " orcid n_keywords\n", - "1681310 0000-0002-0673-0341 154.0\n", - "7717699 0000-0002-7060-4112 141.0\n", - "4597674 0000-0002-6075-3501 140.0\n", - "2066580 0000-0002-4071-0301 118.0\n", - "3531030 0000-0002-9638-8091 115.0\n", - "... ... ...\n", - "10747035 0000-0003-4998-1551 NaN\n", - "10747036 0000-0003-4998-4111 NaN\n", - "10747037 0000-0003-4998-6045 NaN\n", - "10747038 0000-0003-4998-8868 NaN\n", - "10747039 0000-0003-4999-7916 NaN\n", - "\n", - "[10744621 rows x 2 columns]" - ] - }, - "execution_count": 83, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.sort_values('n_keywords', ascending=False)[['orcid', 'n_keywords']]" + "grouped_keywords = df[['orcid', 'keywords']]\\\n", + " .explode('keywords')\\\n", + " .reset_index(drop=True)\\\n", + " .groupby('keywords')\\\n", + " .count()\\\n", + " .sort_values('orcid', ascending=False)" ] }, { "cell_type": "code", - "execution_count": 205, + "execution_count": 66, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "linkText": "Export to plot.ly", - "plotlyServerURL": "https://plot.ly", - "showLink": false - }, - "data": [ - { - "type": "bar", - "x": [ - "0000-0002-0673-0341", - "0000-0002-7060-4112", - "0000-0002-6075-3501", - "0000-0002-4071-0301", - "0000-0002-9638-8091", - "0000-0002-4235-4259", - "0000-0001-9462-5666", - "0000-0003-0076-6287", - "0000-0002-1878-9762", - "0000-0001-6537-7683", - "0000-0001-6307-6027", - "0000-0003-2273-9888", - "0000-0003-1799-0971", - "0000-0001-5287-1949", - "0000-0002-0937-7061", - "0000-0001-9715-9357", - "0000-0001-5696-1052", - "0000-0003-2998-5520", - "0000-0001-5869-2204", - "0000-0002-0156-3580", - "0000-0002-9625-6742", - "0000-0002-8401-8018", - "0000-0001-9985-1697", - "0000-0003-4246-8579", - "0000-0002-7710-0355", - "0000-0002-8083-7382", - "0000-0001-7654-5013", - "0000-0001-6939-3859", - "0000-0002-3061-3364", - "0000-0003-2509-2549", - "0000-0002-0463-0048", - "0000-0001-5230-715X", - "0000-0001-9336-6850", - "0000-0001-5458-7167", - "0000-0003-0209-180X", - "0000-0003-3584-6834", - "0000-0002-9381-2264", - "0000-0002-8227-5387", - "0000-0003-3340-6413", - "0000-0002-2935-1934", - "0000-0002-3123-3021", - "0000-0002-8659-6321", - "0000-0002-8644-8396", - "0000-0001-5637-1124", - "0000-0002-8449-2211", - "0000-0001-5167-7466", - "0000-0003-4608-3844", - "0000-0002-2683-4527", - "0000-0002-6347-9464", - "0000-0003-4505-3678", - "0000-0002-3532-043X", - "0000-0003-2532-2906", - "0000-0003-4511-7942", - "0000-0003-4374-6374", - "0000-0003-3720-1183", - "0000-0002-1103-9651", - "0000-0001-9280-6017", - "0000-0003-4673-1063", - "0000-0001-9586-0780", - "0000-0003-2218-1343", - "0000-0003-2550-1859", - "0000-0002-5539-1761", - "0000-0002-8499-1045", - "0000-0002-5306-7781", - "0000-0003-1863-0265", - "0000-0003-3342-6123", - "0000-0002-8072-1152", - "0000-0002-3907-3552", - "0000-0001-6861-9561", - "0000-0002-5274-7742", - "0000-0002-3597-3350", - "0000-0002-3186-8860", - "0000-0002-2252-672X", - "0000-0001-6843-9325", - "0000-0002-9014-2090", - "0000-0003-4486-2684", - "0000-0003-3343-5660", - "0000-0001-8689-185X", - "0000-0001-7133-7848", - "0000-0002-6282-0640", - "0000-0002-1294-2156", - "0000-0001-7857-4133", - "0000-0003-1245-7705", - "0000-0002-4432-3448", - "0000-0003-0097-4182", - "0000-0001-8445-412X", - "0000-0002-3866-6460", - "0000-0003-4153-6779", - "0000-0002-1545-7818", - "0000-0003-3387-3193", - "0000-0002-4598-2891", - "0000-0002-3898-9542", - "0000-0002-0211-7195", - "0000-0003-4283-2895", - "0000-0002-1411-3028", - "0000-0002-9125-6022", - "0000-0003-2640-6757", - "0000-0002-1960-5857", - "0000-0002-4963-9345", - "0000-0002-1770-9660" + "outputs": [ + { + "data": { + "application/vnd.plotly.v1+json": { + "config": { + "linkText": "Export to plot.ly", + "plotlyServerURL": "https://plot.ly", + "showLink": false + }, + "data": [ + { + "type": "bar", + "x": [ + "machine learning", + "bioinformatics", + "molecular biology", + "artificial intelligence", + "education", + "epidemiology", + "neuroscience", + "public health", + "cancer", + "immunology", + "microbiology", + "ecology", + "remote sensing", + "genetics", + "climate change", + "deep learning", + "genomics", + "biochemistry", + "data science", + "computer vision", + "psychology", + "sustainability", + "biotechnology", + "nanotechnology", + "robotics", + "data mining", + "statistics", + "image processing", + "gis", + "nutrition", + "chemistry", + "optimization", + "computer science", + "marketing", + "biomaterials", + "nanomaterials", + "renewable energy", + "organic chemistry", + "electrochemistry", + "educação", + "diabetes", + "analytical chemistry", + "innovation", + "materials science", + "mass spectrometry", + "architecture", + "evolution", + "epigenetics", + "physics", + "biomechanics" ], "y": [ - 154, - 141, - 140, - 118, - 115, - 104, - 98, - 94, - 92, - 91, - 88, - 86, - 84, - 82, - 78, - 77, - 76, - 75, - 74, - 73, - 71, - 70, - 69, - 66, - 64, - 62, - 61, - 60, - 58, - 57, - 56, - 54, - 53, - 53, - 52, - 51, - 51, - 51, - 51, - 50, - 50, - 50, - 50, - 49, - 49, - 49, - 48, - 48, - 48, - 48, - 48, - 48, - 47, - 47, - 46, - 46, - 46, - 45, - 45, - 44, - 44, - 44, - 44, - 44, - 44, - 43, - 43, - 42, - 42, - 42, - 42, - 42, - 42, - 41, - 41, - 41, - 41, - 41, - 41, - 41, - 40, - 40, - 40, - 40, - 40, - 40, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 39, - 38, - 38, - 38, - 38 + 5090, + 3299, + 2377, + 2322, + 2218, + 2121, + 2010, + 1973, + 1956, + 1829, + 1805, + 1788, + 1688, + 1648, + 1621, + 1485, + 1451, + 1398, + 1363, + 1339, + 1331, + 1313, + 1298, + 1294, + 1149, + 1135, + 1134, + 1104, + 1075, + 1071, + 1064, + 1042, + 1037, + 1030, + 1013, + 1010, + 1005, + 977, + 977, + 959, + 958, + 953, + 937, + 917, + 910, + 895, + 892, + 884, + 883, + 882 ] } ], @@ -12638,13 +12748,9 @@ } }, "title": { - "text": "Keywords provided" + "text": "Top-50 keywords occurrence" }, "xaxis": { - "range": [ - -0.5, - 99.5 - ], "tickangle": 45, "tickfont": { "size": 12 @@ -12653,9 +12759,9 @@ } }, "text/html": [ - "