diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 0f8d755..4efffc5 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 73, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -89,12 +89,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Anomalies ORCiD profile" + "Notable anomalies:" ] }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -108,7 +108,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Notable fake ORCID iDs for explorative purposes:" + "Notable fake ORCID iDs:" ] }, { @@ -133,25 +133,10 @@ "Load the dataset" ] }, - { - "cell_type": "code", - "execution_count": 163, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n", - " names=['orcid', 'claimed','verified_email', 'verified_primary_email', \n", - " 'given_names', 'family_name', 'biography', 'other_names', 'urls', \n", - " 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', \n", - " 'employment', 'n_works', 'works_source'])" - ] - }, { "cell_type": "code", "execution_count": 6, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -184,360 +169,175 @@ "
2418 rows × 17 columns
\n", + "5 rows × 24 columns
\n", "" ], "text/plain": [ - " orcid claimed verified_email \\\n", - "7552 0000-0001-7831-7567 1 1 \n", - "8416 0000-0001-8161-1345 1 1 \n", - "16498 0000-0002-1133-1505 1 1 \n", - "16830 0000-0002-1257-5536 1 1 \n", - "18835 0000-0002-2026-4156 1 1 \n", - "... ... ... ... \n", - "10733293 0000-0002-9887-7788 1 1 \n", - "10737258 0000-0003-1367-8104 1 1 \n", - "10738308 0000-0003-1741-3437 1 1 \n", - "10741460 0000-0003-2909-8585 1 1 \n", - "10745078 0000-0003-4259-5324 1 1 \n", - "\n", - " verified_primary_email given_names family_name biography \\\n", - "7552 1 Vahab Vahdat NaN \n", - "8416 1 AYFER TEKIN ATACAN NaN \n", - "16498 1 Xianrong Lai NaN \n", - "16830 1 Alexandra Zimmer NaN \n", - "18835 1 Fatma Sri Wahyuni NaN \n", - "... ... ... ... ... \n", - "10733293 1 Markéta Laštůvková NaN \n", - "10737258 1 LORENA GUTIÉRREZ GARCÍA NaN \n", - "10738308 1 Xing Liu NaN \n", - "10741460 1 Yusuf Özcan NaN \n", - "10745078 1 P Rama Mohan NaN NaN \n", - "\n", - " other_names urls \\\n", - "7552 NaN NaN \n", - "8416 NaN NaN \n", - "16498 NaN NaN \n", - "16830 NaN NaN \n", - "18835 [\"Ayu\"] NaN \n", - "... ... ... \n", - "10733293 NaN NaN \n", - "10737258 NaN [[\"LinkedIn\", \"https://www.linkedin.com/in/lor... \n", - "10738308 NaN NaN \n", - "10741460 NaN NaN \n", - "10745078 NaN NaN \n", - "\n", - " primary_email other_emails \\\n", - "7552 NaN NaN \n", - "8416 NaN NaN \n", - "16498 NaN NaN \n", - "16830 NaN NaN \n", - "18835 NaN NaN \n", - "... ... ... \n", - "10733293 NaN NaN \n", - "10737258 lorenagg@unex.es NaN \n", - "10738308 NaN NaN \n", - "10741460 NaN NaN \n", - "10745078 NaN NaN \n", - "\n", - " keywords \\\n", - "7552 NaN \n", - "8416 NaN \n", - "16498 NaN \n", - "16830 NaN \n", - "18835 NaN \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [\"Agroecolog\\u00eda, Bot\\u00e1nica, Did\\u00e1c... \n", - "10738308 NaN \n", - "10741460 NaN \n", - "10745078 NaN \n", - "\n", - " external_ids \\\n", - "7552 [[\"Scopus Author ID\", \"57193490305\"], [\"Scopus... \n", - "8416 NaN \n", - "16498 [[\"Scopus Author ID\", \"15769435500\"]] \n", - "16830 NaN \n", - "18835 [[\"ResearcherID\", \"C-5194-2015\"], [\"Scopus Aut... \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [[\"ResearcherID\", \"AAE-6316-2021\"]] \n", - "10738308 [[\"ResearcherID\", \"S-3053-2017\"]] \n", - "10741460 NaN \n", - "10745078 [[\"Scopus Author ID\", \"24776757000\"]] \n", - "\n", - " education \\\n", - "7552 [[\"Industrial Engineering\", \"PhD\", \"Northeaste... \n", - "8416 NaN \n", - "16498 [[\"Department of pharmacy\", \"Bachelor of Tradi... \n", - "16830 NaN \n", - "18835 [[\"Biosains\", \"PHD\", \"Universiti Putra Malaysi... \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [[\"\", \"M\\u00e1ster en Formaci\\u00f3n del profe... \n", - "10738308 NaN \n", - "10741460 [[\"\\u0130lahiyat Fak\\u00fcltesi\", \"Doktora\", \"... \n", - "10745078 [[\"EEE Department\", \"Ph.D. (Power Electronics ... \n", - "\n", - " employment n_works \\\n", - "7552 [[\"Post-doctorate fellow\", \"Harvard Medical Sc... 25 \n", - "8416 NaN 0 \n", - "16498 [[\"Associate Research, Professor\", \"Chengdu Un... 115 \n", - "16830 [[\"Research assistent\", \"Fraunhofer-Institut f... 0 \n", - "18835 [[\"Lecturer\", \"Universitas Andalas\", \"Padang\",... 27 \n", - "... ... ... \n", - "10733293 [[\"\", \"VSB - Technical University of Ostrava\",... 0 \n", - "10737258 [[\"PCI\", \"Universidad de Extremadura - Campus ... 14 \n", - "10738308 NaN 0 \n", - "10741460 [[\"Research Assistant\", \"\\u00c7ukurova Univers... 0 \n", - "10745078 [[\"Associate Professor\", \"RGM College of Engin... 21 \n", - "\n", - " works_source \n", - "7552 [\"Vahab Vahdat\", \"Scopus - Elsevier\", \"Multidi... \n", - "8416 NaN \n", - "16498 [\"Xianrong Lai\", \"Scopus - Elsevier\", \"Crossref\"] \n", - "16830 NaN \n", - "18835 [\"Publons\", \"Crossref Metadata Search\", \"Scopu... \n", - "... ... \n", - "10733293 NaN \n", - "10737258 [\"Multidisciplinary Digital Publishing Institu... \n", - "10738308 NaN \n", - "10741460 NaN \n", - "10745078 [\"Scopus - Elsevier\", \"P Rama Mohan\"] \n", - "\n", - "[2418 rows x 17 columns]" + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0001-5009-2052 1 1 1 \n", + "1 0000-0001-5943-0732 1 1 1 \n", + "2 0000-0001-6083-622x 1 1 1 \n", + "3 0000-0001-6262-5709 1 1 1 \n", + "4 0000-0001-6616-4890 1 1 1 \n", + "\n", + " given_names family_name biography other_names urls primary_email ... \\\n", + "0 NaN NaN NaN NaN NaN NaN ... \n", + "1 NaN NaN NaN NaN NaN NaN ... \n", + "2 NaN NaN NaN NaN NaN NaN ... \n", + "3 NaN NaN NaN NaN NaN NaN ... \n", + "4 NaN NaN NaN NaN NaN NaN ... \n", + "\n", + " employment n_works works_source activation_date \\\n", + "0 NaN 0 NaN 2019-06-05t20:25:43.066z \n", + "1 NaN 0 NaN 2015-08-18t13:10:42.871z \n", + "2 NaN 0 NaN 2019-01-21t10:55:27.997z \n", + "3 NaN 0 NaN 2015-08-18t14:29:39.440z \n", + "4 NaN 0 NaN 2015-08-13t01:59:51.802z \n", + "\n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \n", + "0 2019-12-11t03:57:41.741z 0 0 0 0 0 \n", + "1 2016-06-15t01:05:19.986z 0 0 0 0 0 \n", + "2 2019-01-28t16:24:02.199z 0 0 0 0 0 \n", + "3 2017-06-21t07:18:20.787z 0 0 0 0 0 \n", + "4 2016-06-15t01:05:21.373z 0 0 0 0 0 \n", + "\n", + "[5 rows x 24 columns]" ] }, "execution_count": 6, @@ -546,107 +346,20 @@ } ], "source": [ - "df[df.duplicated()]" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "df.drop_duplicates(inplace=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Basic column manipulation (interpret columns as lists when necessary)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "df['other_names'] = df[df.other_names.notna()]['other_names'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 164, - "metadata": {}, - "outputs": [], - "source": [ - "df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "df['education'] = df[df.education.notna()]['education'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "df['employment'] = df[df.employment.notna()]['employment'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [], - "source": [ - "df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(lambda x: ast.literal_eval(x))" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [], - "source": [ - "df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))" + "df = pd.read_pickle('../data/processed/dataset.pkl')\n", + "df.head(5)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "That's how it looks like" + "Notable profiles inspection" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -680,169 +393,92 @@ "1 rows × 24 columns
\n", "" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "0 0000-0001-5000-2053 1 0 0 \n", - "1 0000-0001-5000-6548 1 0 0 \n", - "2 0000-0001-5000-7962 1 1 1 \n", - "3 0000-0001-5000-8586 1 0 0 \n", - "4 0000-0001-5001-0256 1 0 0 \n", - "\n", - " given_names family_name biography other_names urls primary_email \\\n", - "0 Jorge Jaramillo Sanchez NaN NaN NaN NaN \n", - "1 Wiseman Bekelesi NaN NaN NaN NaN \n", - "2 ALICE INDIMULI NaN NaN NaN NaN \n", - "3 shim ji yun NaN NaN NaN NaN \n", - "4 Sandro Caramaschi NaN NaN NaN NaN \n", - "\n", - " other_emails keywords external_ids education employment n_works \\\n", - "0 NaN NaN NaN NaN NaN 0 \n", - "1 NaN NaN NaN NaN NaN 0 \n", - "2 NaN NaN NaN NaN NaN 0 \n", - "3 NaN NaN NaN NaN NaN 0 \n", - "4 NaN NaN NaN NaN NaN 0 \n", - "\n", - " works_source \n", - "0 NaN \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN " + " orcid claimed verified_email verified_primary_email \\\n", + "1575869 0000-0002-5193-7851 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "1575869 andrea mannocci \n", + "\n", + " biography other_names \\\n", + "1575869 data scientist & researcher; scholarly knowled... NaN \n", + "\n", + " urls \\\n", + "1575869 [[personal website, https://andremann.github.i... \n", + "\n", + " primary_email ... \\\n", + "1575869 andrea.mannocci@isti.cnr.it ... \n", + "\n", + " employment n_works \\\n", + "1575869 [[research associate, istituto di scienza e te... 37 \n", + "\n", + " works_source \\\n", + "1575869 [scopus - elsevier, crossref metadata search, ... \n", + "\n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "1575869 2017-09-12t14:28:33.467z 2021-03-09t08:32:47.840z 34 0 \n", + "\n", + " n_pmc n_other_pids label \n", + "1575869 0 60 1 \n", + "\n", + "[1 rows x 24 columns]" ] }, - "execution_count": 16, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.head(5)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Add labels to the dataframe based on their presence in OpenAIRE" + "df[df['orcid'] == AM]" ] }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -867,105 +503,218 @@ "2934116 rows × 1 columns
\n", + "1 rows × 24 columns
\n", "" ], "text/plain": [ - " orcid\n", - "0 0000-0002-4292-8072\n", - "1 0000-0003-3026-718x\n", - "2 0000-0002-4568-8748\n", - "3 0000-0002-1380-859x\n", - "4 0000-0001-7526-5095\n", - "... ...\n", - "2934111 0000-0002-2394-5387\n", - "2934112 0000-0002-6125-3170\n", - "2934113 0000-0002-7062-5845\n", - "2934114 0000-0001-8677-5162\n", - "2934115 0000-0002-0240-8021\n", - "\n", - "[2934116 rows x 1 columns]" + " orcid claimed verified_email verified_primary_email \\\n", + "6819986 0000-0001-6997-9470 1 1 1 \n", + "\n", + " given_names family_name biography other_names \\\n", + "6819986 other whatsapp NaN NaN \n", + "\n", + " urls primary_email ... \\\n", + "6819986 [[otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n", + "\n", + " employment n_works works_source activation_date \\\n", + "6819986 NaN 0 NaN 2020-10-07t10:37:12.237z \n", + "\n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \n", + "6819986 2020-10-08t02:32:03.935z 0 0 0 0 0 \n", + "\n", + "[1 rows x 24 columns]" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df['orcid'] == WHATSAPP]" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "orcid 10916574\n", + "claimed 10916574\n", + "verified_email 10916574\n", + "verified_primary_email 10916574\n", + "given_names 10886150\n", + "family_name 10601571\n", + "biography 348649\n", + "other_names 551482\n", + "urls 707687\n", + "primary_email 123851\n", + "other_emails 48306\n", + "keywords 646400\n", + "external_ids 1301959\n", + "education 2430233\n", + "employment 2665092\n", + "n_works 10916574\n", + "works_source 2721431\n", + "activation_date 10916574\n", + "last_update_date 10916574\n", + "n_doi 10916574\n", + "n_arxiv 10916574\n", + "n_pmc 10916574\n", + "n_other_pids 10916574\n", + "label 10916574\n", + "dtype: int64" ] }, - "execution_count": 97, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "openaire_orcid = pd.read_csv('../data/raw/orcid_openaire.txt', header=None, names=['orcid'])\n", - "openaire_orcid" + "df.count()" ] }, { "cell_type": "code", - "execution_count": 110, + "execution_count": 10, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "count 10916574\n", + "unique 10916574\n", + "top 0000-0002-5454-7613\n", + "freq 1\n", + "Name: orcid, dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "df['label'] = df.orcid.isin(openaire_orcid['orcid'])\n", - "df[\"label\"] = df[\"label\"].astype(int)" + "df['orcid'].describe()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Notable profiles inspection" + "## Primary email" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 123851\n", + "unique 123848\n", + "top patrick.davey@monash.edu\n", + "freq 2\n", + "Name: primary_email, dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Dupe emails" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6347224 maykin@owasp.org\n", + "7027865 patrick.davey@monash.edu\n", + "9529005 opercin@erbakan.edu.tr\n", + "Name: primary_email, dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]" ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -1002,83 +751,106 @@ "1 rows × 25 columns
\n", + "2 rows × 25 columns
\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", - "8840413 0000-0002-5193-7851 1 1 1 \n", + "4450046 0000-0001-9855-1676 1 1 1 \n", + "6347224 0000-0002-0836-2271 1 1 1 \n", "\n", - " given_names family_name biography other_names \\\n", - "8840413 Andrea Mannocci NaN NaN \n", + " given_names family_name biography other_names urls primary_email \\\n", + "4450046 maykin warasart NaN NaN NaN maykin@owasp.org \n", + "6347224 maykin warasart NaN NaN NaN maykin@owasp.org \n", "\n", - " urls \\\n", - "8840413 [[Personal website, https://andremann.github.i... \n", - "\n", - " primary_email ... n_works \\\n", - "8840413 andrea.mannocci@isti.cnr.it ... 37 \n", - "\n", - " works_source \\\n", - "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... \n", + " ... n_works works_source activation_date \\\n", + "4450046 ... 0 NaN 2020-10-23t17:51:51.925z \n", + "6347224 ... 0 NaN 2020-09-15t04:43:55.709z \n", "\n", - " primary_email_domain other_email_domains n_emails \\\n", - "8840413 isti.cnr.it NaN NaN \n", + " last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n", + "4450046 2021-01-01t15:00:52.053z 0 0 0 0 0 \n", + "6347224 2020-09-15t05:17:28.509z 0 0 0 0 0 \n", "\n", - " url_domains n_urls n_ids n_keywords label \n", - "8840413 [github.io, twitter.com, linkedin.com] 3.0 1.0 5.0 1 \n", + " primary_email_domain \n", + "4450046 owasp.org \n", + "6347224 owasp.org \n", "\n", - "[1 rows x 25 columns]" + "[2 rows x 25 columns]" ] }, - "execution_count": 111, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df['orcid'] == AM]" + "df[df['primary_email'] == 'maykin@owasp.org']" ] }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 23, "metadata": {}, "outputs": [ { @@ -1115,125 +887,106 @@ "1 rows × 25 columns
\n", + "2 rows × 25 columns
\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", - "9517099 0000-0001-6997-9470 1 1 1 \n", - "\n", - " given_names family_name biography other_names \\\n", - "9517099 other whatsapp NaN NaN \n", + "6840791 0000-0002-2232-9638 1 1 1 \n", + "9529005 0000-0003-0033-0918 1 1 1 \n", "\n", - " urls primary_email ... \\\n", - "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n", + " given_names family_name biography other_names urls \\\n", + "6840791 osman perçin NaN NaN NaN \n", + "9529005 osman perçin NaN NaN NaN \n", "\n", - " n_works works_source primary_email_domain other_email_domains \\\n", - "9517099 0 NaN NaN NaN \n", + " primary_email ... n_works works_source \\\n", + "6840791 opercin@erbakan.edu.tr ... 0 NaN \n", + "9529005 opercin@erbakan.edu.tr ... 0 NaN \n", "\n", - " n_emails url_domains n_urls \\\n", - "9517099 NaN [otherwhatsapp.com, im-creator.com, facebook.c... 27.0 \n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "6840791 2015-01-12t13:47:55.549z 2020-01-27t07:38:24.269z 0 0 \n", + "9529005 2015-10-13t05:47:12.014z 2020-12-25t13:52:03.976z 0 0 \n", "\n", - " n_ids n_keywords label \n", - "9517099 NaN 4.0 0 \n", + " n_pmc n_other_pids label primary_email_domain \n", + "6840791 0 0 0 erbakan.edu.tr \n", + "9529005 0 0 0 erbakan.edu.tr \n", "\n", - "[1 rows x 25 columns]" - ] - }, - "execution_count": 112, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df['orcid'] == WHATSAPP]" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "orcid 10744621\n", - "claimed 10744621\n", - "verified_email 10744621\n", - "verified_primary_email 10744621\n", - "given_names 10716788\n", - "family_name 10437093\n", - "biography 333885\n", - "other_names 544550\n", - "urls 688262\n", - "primary_email 121476\n", - "other_emails 47470\n", - "keywords 638634\n", - "external_ids 1285292\n", - "education 2402439\n", - "employment 2626669\n", - "n_works 10744621\n", - "works_source 2671905\n", - "primary_email_domain 121476\n", - "other_email_domains 47470\n", - "n_emails 47470\n", - "url_domains 688262\n", - "n_urls 688262\n", - "n_ids 1285292\n", - "n_keywords 638634\n", - "label 10744621\n", - "dtype: int64" + "[2 rows x 25 columns]" ] }, - "execution_count": 103, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.count()" + "df[df['primary_email'] == 'opercin@erbakan.edu.tr']" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 24, "metadata": {}, "outputs": [ { @@ -1270,171 +1023,139 @@ "1 rows × 25 columns
\n", + "2 rows × 25 columns
\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", - "4595263 0000-0002-5154-6404 1 1 1 \n", + "944993 0000-0002-9158-1757 1 1 1 \n", + "7027865 0000-0002-8774-0030 1 1 1 \n", "\n", - " given_names family_name biography other_names urls primary_email ... \\\n", - "4595263 Olusola Bamisile NaN NaN NaN NaN ... \n", + " given_names family_name biography other_names urls \\\n", + "944993 patrick davey NaN NaN NaN \n", + "7027865 patrick davey NaN NaN NaN \n", "\n", - " n_works works_source \\\n", - "4595263 3 [Multidisciplinary Digital Publishing Institut... \n", + " primary_email ... n_works works_source \\\n", + "944993 patrick.davey@monash.edu ... 0 NaN \n", + "7027865 patrick.davey@monash.edu ... 1 [crossref] \n", "\n", - " primary_email_domain other_email_domains n_emails url_domains n_urls \\\n", - "4595263 NaN NaN NaN NaN NaN \n", + " activation_date last_update_date n_doi n_arxiv \\\n", + "944993 2019-05-09t23:01:02.170z 2019-08-20t03:00:17.844z 0 0 \n", + "7027865 2018-09-11t10:47:10.997z 2021-02-09t06:21:44.138z 1 0 \n", "\n", - " n_ids n_keywords label \n", - "4595263 NaN NaN False \n", + " n_pmc n_other_pids label primary_email_domain \n", + "944993 0 0 0 monash.edu \n", + "7027865 0 0 1 monash.edu \n", "\n", - "[1 rows x 25 columns]" + "[2 rows x 25 columns]" ] }, - "execution_count": 104, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df['orcid'] == '0000-0002-5154-6404']" + "df[df['primary_email'] == 'patrick.davey@monash.edu']" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ - "df.drop(index=4595264, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 10744621\n", - "unique 10744621\n", - "top 0000-0001-8644-5622\n", - "freq 1\n", - "Name: orcid, dtype: object" - ] - }, - "execution_count": 22, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['orcid'].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Primary email" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 121476\n", - "unique 121473\n", - "top patrick.davey@monash.edu\n", - "freq 2\n", - "Name: primary_email, dtype: object" - ] - }, - "execution_count": 23, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['primary_email'].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Dupe emails" + "df['primary_email_domain'] = df['primary_email'].apply(lambda x: x.split('@')[1] if pd.notna(x) else x)" ] }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7483666 maykin@owasp.org\n", - "9068234 opercin@erbakan.edu.tr\n", - "10246485 patrick.davey@monash.edu\n", - "Name: primary_email, dtype: object" + "count 123851\n", + "unique 17089\n", + "top gmail.com\n", + "freq 26540\n", + "Name: primary_email_domain, dtype: object" ] }, - "execution_count": 24, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df['primary_email'].dropna().loc[df['primary_email'].duplicated()]" + "df['primary_email_domain'].describe()" ] }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 27, "metadata": {}, "outputs": [ { @@ -1459,489 +1180,93 @@ "17089 rows × 1 columns
\n", "" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "3776350 0000-0002-0836-2271 1 1 1 \n", - "7483666 0000-0001-9855-1676 1 1 1 \n", - "\n", - " given_names family_name biography other_names urls primary_email \\\n", - "3776350 Maykin Warasart NaN NaN NaN maykin@owasp.org \n", - "7483666 Maykin Warasart NaN NaN NaN maykin@owasp.org \n", - "\n", - " other_emails keywords external_ids education \\\n", - "3776350 [maykin@dga.or.th] NaN NaN NaN \n", - "7483666 [maykin@dga.or.th, maykin@ieee.org] NaN NaN NaN \n", + " orcid\n", + "primary_email_domain \n", + "gmail.com 26540\n", + "hotmail.com 3769\n", + "yahoo.com 2614\n", + "163.com 2109\n", + "yuhs.ac 1132\n", + "... ...\n", + "imean-biotech.com 1\n", + "imec.msu.ru 1\n", + "imedea.uib-csic.es 1\n", + "imes.uni-hannover.de 1\n", + "zzuli.edu.cn 1\n", "\n", - " employment n_works works_source \n", - "3776350 NaN 0 NaN \n", - "7483666 NaN 0 NaN " + "[17089 rows x 1 columns]" ] }, - "execution_count": 25, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[df['primary_email'] == 'maykin@owasp.org']" + "primary_emails = df[['primary_email_domain', 'orcid']].groupby('primary_email_domain').count().sort_values('orcid', ascending=False)\n", + "primary_emails" ] }, { "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " | orcid | \n", - "claimed | \n", - "verified_email | \n", - "verified_primary_email | \n", - "given_names | \n", - "family_name | \n", - "biography | \n", - "other_names | \n", - "urls | \n", - "primary_email | \n", - "other_emails | \n", - "keywords | \n", - "external_ids | \n", - "education | \n", - "employment | \n", - "n_works | \n", - "works_source | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3995032 | \n", - "0000-0002-2232-9638 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Osman | \n", - "Perçin | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "opercin@erbakan.edu.tr | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "0 | \n", - "NaN | \n", - "
9068234 | \n", - "0000-0003-0033-0918 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Osman | \n", - "PERÇİN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "opercin@erbakan.edu.tr | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[[, Necmettin Erbakan University, Konya, , TR,... | \n", - "0 | \n", - "NaN | \n", - "
\n", - " | orcid | \n", - "claimed | \n", - "verified_email | \n", - "verified_primary_email | \n", - "given_names | \n", - "family_name | \n", - "biography | \n", - "other_names | \n", - "urls | \n", - "primary_email | \n", - "other_emails | \n", - "keywords | \n", - "external_ids | \n", - "education | \n", - "employment | \n", - "n_works | \n", - "works_source | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5087745 | \n", - "0000-0002-8774-0030 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Patrick | \n", - "Davey | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "patrick.davey@monash.edu | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[[PhD Student, Monash University, Melbourne, V... | \n", - "1 | \n", - "[Crossref] | \n", - "
10246485 | \n", - "0000-0002-9158-1757 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Patrick | \n", - "Davey | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "patrick.davey@monash.edu | \n", - "NaN | \n", - "[Radiopharmaceuticals, Inorganic Chemistry, Bi... | \n", - "NaN | \n", - "NaN | \n", - "[[PhD Student, Monash University, Melbourne, ,... | \n", - "0 | \n", - "NaN | \n", - "
\n", - " | orcid | \n", - "
---|---|
primary_email_domain | \n", - "\n", - " |
gmail.com | \n", - "25892 | \n", - "
hotmail.com | \n", - "3674 | \n", - "
yahoo.com | \n", - "2578 | \n", - "
163.com | \n", - "2067 | \n", - "
yuhs.ac | \n", - "1124 | \n", - "
... | \n", - "... | \n", - "
iiap.gob.pe | \n", - "1 | \n", - "
iiap.org.pe | \n", - "1 | \n", - "
iibb.csic.es | \n", - "1 | \n", - "
iic.hokudai.ac.jp | \n", - "1 | \n", - "
zzuli.edu.cn | \n", - "1 | \n", - "
17047 rows × 1 columns
\n", - "\n", + " | orcid | \n", + "claimed | \n", + "verified_email | \n", + "verified_primary_email | \n", + "given_names | \n", + "family_name | \n", + "biography | \n", + "other_names | \n", + "urls | \n", + "primary_email | \n", + "... | \n", + "n_doi | \n", + "n_arxiv | \n", + "n_pmc | \n", + "n_other_pids | \n", + "label | \n", + "primary_email_domain | \n", + "other_email_domains | \n", + "n_emails | \n", + "url_domains | \n", + "n_urls | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
382497 | \n", + "0000-0002-9025-8632 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "buycannabis | \n", + "dispensary | \n", + "we procure and deliver premium cannabis strain... | \n", + "[we procure and deliver premium cannabis strai... | \n", + "[[find your cannabis & marijuana dispensary , ... | \n", + "NaN | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[goowonderland.com, goowonderland.com, goowond... | \n", + "81.0 | \n", + "
911811 | \n", + "0000-0002-4062-3603 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "juan de dios | \n", + "beltrán mancilla | \n", + "juan de dios beltrán mancilla (*) filósofo aut... | \n", + "[juan de dios beltrán mancilla, filósofo autod... | \n", + "[[01.- juan de dios beltrán mancilla. teoría o... | \n", + "NaN | \n", + "... | \n", + "0 | \n", + "0 | \n", + "0 | \n", + "7 | \n", + "0 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[yumpu.com, ijopm.org, google.com, blogspot.co... | \n", + "69.0 | \n", + "
1136129 | \n", + "0000-0002-1929-6054 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "franklin américo | \n", + "canaza choque | \n", + "docente-investigador social. maestrando en der... | \n", + "[franklin américo canaza-choque , franklin a. ... | \n", + "[[consejo nacional de ciencia, tecnología e in... | \n", + "leo_123fa@hotmail.com | \n", + "... | \n", + "29 | \n", + "0 | \n", + "0 | \n", + "33 | \n", + "1 | \n", + "hotmail.com | \n", + "[gmail.com, gmail.com, hotmail.com, baldwin.ed... | \n", + "5.0 | \n", + "[concytec.gob.pe, redalyc.org, redalyc.org, un... | \n", + "61.0 | \n", + "
3102686 | \n", + "0000-0003-2593-7134 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "aan | \n", + "jaelani | \n", + "all my papers can be downloaded from portal:re... | \n", + "[jaelani, a., jaelani, aan] | \n", + "[[microsoft academic research, https://academi... | \n", + "aan_jaelani@syekhnurjati.ac.id | \n", + "... | \n", + "88 | \n", + "0 | \n", + "0 | \n", + "193 | \n", + "1 | \n", + "syekhnurjati.ac.id | \n", + "[gmail.com] | \n", + "1.0 | \n", + "[microsoft.com, twitter.com, academia.edu, aca... | \n", + "67.0 | \n", + "
6868932 | \n", + "0000-0002-5710-4041 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "ryszard | \n", + "romaniuk | \n", + "professor of electronics and communications en... | \n", + "[r.romaniuk, r.s.romaniuk, ryszard romaniuk, r... | \n", + "[[scholar google, http://scholar.google.pl/cit... | \n", + "rrom@ise.pw.edu.pl | \n", + "... | \n", + "1221 | \n", + "25 | \n", + "0 | \n", + "1742 | \n", + "1 | \n", + "ise.pw.edu.pl | \n", + "[ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] | \n", + "3.0 | \n", + "[google.pl, publons.com, scopus.com, mendeley.... | \n", + "114.0 | \n", + "
8088987 | \n", + "0000-0002-9965-2425 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "jaroslaw | \n", + "spychala | \n", + "jaroslaw spychala has received a doctoral degr... | \n", + "[jaroslaw jozef spychala] | \n", + "[[resume, http://www.biowebspin.com/wp-content... | \n", + "NaN | \n", + "... | \n", + "15 | \n", + "0 | \n", + "0 | \n", + "29 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[biowebspin.com, biowebspin.com, google.com, l... | \n", + "73.0 | \n", + "
8658355 | \n", + "0000-0002-3920-7389 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "а. | \n", + "гусев | \n", + "surname, name gusev alexander leonidovichdate... | \n", + "[alexander l. gusev , alexander leonidovich gu... | \n", + "[[a.l. gusev alternative energy and ecology, ... | \n", + "NaN | \n", + "... | \n", + "37 | \n", + "0 | \n", + "0 | \n", + "21 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[youtube.com, isjaee.com, researchgate.net, re... | \n", + "111.0 | \n", + "
8778864 | \n", + "0000-0002-3997-5070 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "dr. parameshachari | \n", + "b d | \n", + "dr. parameshachari b dacm distinguished speake... | \n", + "[dr. parameshachari b d] | \n", + "[[gsssietw,mysuru, http://geethashishu.in/], [... | \n", + "NaN | \n", + "... | \n", + "47 | \n", + "0 | \n", + "0 | \n", + "48 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[geethashishu.in, geethashishu.in, acm.org, go... | \n", + "71.0 | \n", + "
9980164 | \n", + "0000-0003-4948-9268 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "gustavo | \n", + "duperré | \n", + "gustavo norberto duperré graduated in arts and... | \n", + "[gustavo norberto duperré, duperré, g. n., gus... | \n", + "[[gis in cultural heritage - icomos românia, h... | \n", + "gustavo.duperre@usal.edu.ar | \n", + "... | \n", + "13 | \n", + "0 | \n", + "0 | \n", + "34 | \n", + "0 | \n", + "usal.edu.ar | \n", + "NaN | \n", + "NaN | \n", + "[icomos.ro, unirioja.es, unirioja.es, unc.edu.... | \n", + "61.0 | \n", + "
10024501 | \n", + "0000-0003-2407-3557 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "abdul | \n", + "aziz | \n", + "abdul aziz was born on may 25, 1973, in brebes... | \n", + "[abdul aziz, aziz, abdul, aziz, a., aziz, abd,... | \n", + "[[google scholar, https://scholar.google.com/c... | \n", + "NaN | \n", + "... | \n", + "19 | \n", + "0 | \n", + "0 | \n", + "77 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[google.com, syekhnurjati.ac.id, orcid.org, bl... | \n", + "59.0 | \n", + "
10091165 | \n", + "0000-0003-2183-8112 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "pelayo munhoz | \n", + "olea | \n", + "pós-doutorado em gestão ambiental pela univers... | \n", + "[ munhoz, pelayo olea, olea, pelayo, olea, p... | \n", + "[[currículo lattes, http://lattes.cnpq.br/6209... | \n", + "NaN | \n", + "... | \n", + "797 | \n", + "0 | \n", + "1 | \n", + "582 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... | \n", + "61.0 | \n", + "
10523205 | \n", + "0000-0003-2450-090x | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "eduard | \n", + "babulak | \n", + "professor eduard babulak is accomplished inter... | \n", + "[professor eduard babulak] | \n", + "[[honorary chair, chief mentor & senior adviso... | \n", + "NaN | \n", + "... | \n", + "199 | \n", + "0 | \n", + "1 | \n", + "174 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[worldassessmentcouncil.org, spseke.sk, bcs.or... | \n", + "114.0 | \n", + "
10696059 | \n", + "0000-0002-6938-9638 | \n", + "1 | \n", + "1 | \n", + "1 | \n", + "adolfo | \n", + "catral sanabria | \n", + "my education is in computer science, mathemati... | \n", + "NaN | \n", + "[[researchgate adolfo catral , https://www.res... | \n", + "NaN | \n", + "... | \n", + "2022 | \n", + "0 | \n", + "0 | \n", + "16 | \n", + "1 | \n", + "NaN | \n", + "NaN | \n", + "NaN | \n", + "[researchgate.net, youtube.com, linkedin.com, ... | \n", + "152.0 | \n", + "
13 rows × 29 columns
\n", + "13 rows × 22 columns
\n", + "141 rows × 29 columns
\n", "" ], "text/plain": [ " orcid claimed verified_email \\\n", - "482862 0000-0003-4948-9268 1 1 \n", - "554859 0000-0002-1929-6054 1 1 \n", - "1381092 0000-0002-9025-8632 1 1 \n", - "2679353 0000-0003-2407-3557 1 1 \n", - "3354430 0000-0002-3920-7389 1 1 \n", - "4004281 0000-0002-5710-4041 1 1 \n", - "4022480 0000-0003-2450-090X 1 1 \n", - "6335357 0000-0003-2593-7134 1 1 \n", - "6489838 0000-0002-9965-2425 1 1 \n", - "7570584 0000-0003-2183-8112 1 1 \n", - "10240510 0000-0002-6938-9638 1 1 \n", - "10448304 0000-0002-4062-3603 1 1 \n", - "10663894 0000-0002-3997-5070 1 1 \n", - "\n", - " verified_primary_email given_names family_name \\\n", - "482862 1 Gustavo Duperré \n", - "554859 1 Franklin Américo Canaza Choque \n", - "1381092 1 buycannabis dispensary \n", - "2679353 1 Abdul Aziz \n", - "3354430 1 А. Гусев \n", - "4004281 1 Ryszard Romaniuk \n", - "4022480 1 Eduard Babulak \n", - "6335357 1 Aan Jaelani \n", - "6489838 1 Jaroslaw Spychala \n", - "7570584 1 Pelayo Munhoz Olea \n", - "10240510 1 Adolfo Catral Sanabria \n", - "10448304 1 JUAN DE DIOS BELTRÁN MANCILLA \n", - "10663894 1 Dr. Parameshachari B D \n", + "97666 0000-0002-7843-8497 1 1 \n", + "200670 0000-0003-1554-1531 1 1 \n", + "210325 0000-0003-3080-4643 1 1 \n", + "218947 0000-0003-3193-030x 1 1 \n", + "261974 0000-0002-5341-6531 1 1 \n", + "... ... ... ... \n", + "10405738 0000-0002-3374-5709 1 1 \n", + "10472264 0000-0001-7228-5680 1 1 \n", + "10785961 0000-0002-3064-0194 1 1 \n", + "10845645 0000-0003-1047-4229 1 1 \n", + "10896059 0000-0003-4836-7074 1 1 \n", + "\n", + " verified_primary_email given_names family_name \\\n", + "97666 1 davi barbosa \n", + "200670 1 katarzyna ochman \n", + "210325 1 graham dawson \n", + "218947 1 juan pablo wolff mejia \n", + "261974 1 trent hammond \n", + "... ... ... ... \n", + "10405738 1 guillermo ortiz \n", + "10472264 1 text protocol \n", + "10785961 1 leonardo fernando cruz basso \n", + "10845645 1 bayu sakti \n", + "10896059 1 karla haydee ortiz palafox \n", "\n", " biography \\\n", - "482862 Gustavo Norberto Duperré graduated in Arts and... \n", - "554859 Docente-Investigador Social. Maestrando en Der... \n", - "1381092 We procure and deliver premium cannabis strain... \n", - "2679353 Abdul Aziz was born on May 25, 1973, in Brebes... \n", - "3354430 Surname, Name Gusev Alexander LeonidovichDate... \n", - "4004281 Professor of Electronics and Communications En... \n", - "4022480 Professor Eduard Babulak is accomplished inter... \n", - "6335357 All my papers can be downloaded from portal:Re... \n", - "6489838 Jaroslaw Spychala has received a doctoral degr... \n", - "7570584 Pós-Doutorado em Gestão Ambiental pela Univers... \n", - "10240510 My education is in computer science, mathemati... \n", - "10448304 JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... \n", - "10663894 Dr. Parameshachari B DACM Distinguished Speake... \n", + "97666 pesquisador na área sociojurídica, professor, ... \n", + "200670 katarzyna ochman [kataˈʐɨna ˈɔxman] is assista... \n", + "210325 science and engineering faculty (sef) libraria... \n", + "218947 aspirante a maestría en derecho y negocios int... \n", + "261974 mr trent hammond is an honorary research fello... \n", + "... ... \n", + "10405738 médico, internista, neumólogo, intensivista, e... \n", + "10472264 NaN \n", + "10785961 NaN \n", + "10845645 bayu purbha saktisaya adalah bayu purbha sakti... \n", + "10896059 karla haydee ortíz palafoxmiembro del sistema ... \n", "\n", " other_names \\\n", - "482862 [Gustavo Norberto Duperré, Duperré, G. N.] \n", - "554859 [Franklin Américo Canaza-Choque , Franklin A. ... \n", - "1381092 [We procure and deliver premium cannabis strai... \n", - "2679353 [Abdul Aziz, Aziz, Abdul, Aziz, A., Aziz, Abd,... \n", - "3354430 [Alexander L. Gusev , Alexander Leonidovich Gu... \n", - "4004281 [R.Romaniuk, R.S.Romaniuk, Ryszard Romaniuk, R... \n", - "4022480 [Professor Eduard Babulak] \n", - "6335357 [Jaelani, A., Jaelani, Aan] \n", - "6489838 [Jaroslaw Jozef Spychala] \n", - "7570584 [ Munhoz, Pelayo Olea, Olea, Pelayo, Olea, P... \n", - "10240510 NaN \n", - "10448304 [Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... \n", - "10663894 [Dr. PARAMESHACHARI B D] \n", + "97666 [professor davi barbosa delmont] \n", + "200670 [[kataˈʐɨna ˈɔxman], catharina ochman, cathari... \n", + "210325 [ graham colin dawson, g.c. dawson] \n", + "218947 [juan pablo wolff, pablo wolff mejia, juan p. ... \n", + "261974 [trent ernest hammond (t.e.hammond)] \n", + "... ... \n", + "10405738 [guillermo ortiz-ruiz] \n", + "10472264 NaN \n", + "10785961 NaN \n", + "10845645 [bayu purbha sakti] \n", + "10896059 [karla palafox] \n", "\n", " urls \\\n", - "482862 [[Gis in Cultural Heritage - ICOMOS România, h... \n", - "554859 [[Consejo Nacional de Ciencia, Tecnología e In... \n", - "1381092 [[find your cannabis & marijuana dispensary , ... \n", - "2679353 [[Google Scholar, https://scholar.google.com/c... \n", - "3354430 [[A.L. Gusev Alternative Energy and Ecology, ... \n", - "4004281 [[Scholar Google, http://scholar.google.pl/cit... \n", - "4022480 [[Honorary Chair, Chief Mentor & Senior Adviso... \n", - "6335357 [[Microsoft Academic Research, https://academi... \n", - "6489838 [[RESUME, http://www.biowebspin.com/wp-content... \n", - "7570584 [[Currículo Lattes, http://lattes.cnpq.br/6209... \n", - "10240510 [[ResearchGate Adolfo Catral , https://www.res... \n", - "10448304 [[01.- Juan de Dios Beltrán Mancilla. Teoría O... \n", - "10663894 [[GSSSIETW,MYSURU, http://geethashishu.in/], [... \n", - "\n", - " primary_email ... \\\n", - "482862 gustavo.duperre@usal.edu.ar ... \n", - "554859 Leo_123fa@hotmail.com ... \n", - "1381092 NaN ... \n", - "2679353 NaN ... \n", - "3354430 NaN ... \n", - "4004281 rrom@ise.pw.edu.pl ... \n", - "4022480 NaN ... \n", - "6335357 aan_jaelani@syekhnurjati.ac.id ... \n", - "6489838 NaN ... \n", - "7570584 NaN ... \n", - "10240510 NaN ... \n", - "10448304 NaN ... \n", - "10663894 NaN ... \n", - "\n", - " external_ids \\\n", - "482862 [[Scopus Author ID, 57195936346], [ResearcherI... \n", - "554859 [[ResearcherID, P-8613-2018], [Loop profile, 8... \n", - "1381092 NaN \n", - "2679353 NaN \n", - "3354430 [[ResearcherID, F-8048-2014], [Scopus Author I... \n", - "4004281 [[ISNI, 0000000071432485], [ResearcherID, B-91... \n", - "4022480 [[Scopus Author ID, 6506867432], [ResearcherID... \n", - "6335357 [[Scopus Author ID, 57195963463], [Loop profil... \n", - "6489838 [[Scopus Author ID, 7006745874]] \n", - "7570584 [[Scopus Author ID, 55175503300], [ResearcherI... \n", - "10240510 [[Loop profile, 747193]] \n", - "10448304 NaN \n", - "10663894 [[ResearcherID, F-7045-2018], [Scopus Author I... \n", - "\n", - " education \\\n", - "482862 [[Programme in History, History of Art and Ter... \n", - "554859 [[Facultad de Ciencias de la Educación , Maest... \n", - "1381092 NaN \n", - "2679353 [[Ilmu Ekonomi, Dr, Universitas Borobudur, Jak... \n", - "3354430 [[Chemical technology and cryogenic-vacuum tec... \n", - "4004281 [[Faculty of Electronics and Information Techn... \n", - "4022480 [[Information Technology, Doctor Habilitated (... \n", - "6335357 [[Post Graduate, S3/Dr, Universitas Islam Nege... \n", - "6489838 [[Department of Chemistry, Postdoctoral Associ... \n", - "7570584 [[, Postdoctoral in Environmental Sustainabili... \n", - "10240510 [[Education, Capacitación para la enseñanza en... \n", - "10448304 [[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... \n", - "10663894 [[Electronics and Communication Engineering, P... \n", - "\n", - " employment n_works \\\n", - "482862 [[Titular Professor, Dirección General de Cult... 13 \n", - "554859 [[Investigador Social, Universidad Católica de... 38 \n", - "1381092 NaN 10 \n", - "2679353 [[Assisten Professor/Dr, Institut Agama Islam ... 72 \n", - "3354430 [[General Director, Scientific Technical Centr... 472 \n", - "4004281 [[Professor, Institute Director, Politechnika ... 5008 \n", - "4022480 [[Consultant, Horizon 2020 Framework Programme... 274 \n", - "6335357 [[Dr, Institut Agama Islam Negeri Syekh Nurjat... 79 \n", - "6489838 [[Assistant Professor, Adam Mickiewicz Univers... 29 \n", - "7570584 [[Professor, Universidade Federal do Rio Grand... 1105 \n", - "10240510 NaN 2023 \n", - "10448304 [[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... 11 \n", - "10663894 [[ACM Distinguished Speaker (Volunteer), Assoc... 93 \n", - "\n", - " works_source \\\n", - "482862 [Gustavo Duperré, Scopus - Elsevier, Publons, ... \n", - "554859 [ResearcherID, BASE - Bielefeld Academic Searc... \n", - "1381092 [goowonderland dispensary] \n", - "2679353 [BASE - Bielefeld Academic Search Engine, Abdu... \n", - "3354430 [Publons, DataCite, Scopus - Elsevier, A.L. Gu... \n", - "4004281 [INSPIRE-HEP, ResearcherID, ISNI2ORCID search ... \n", - "4022480 [The Lens, BASE - Bielefeld Academic Search En... \n", - "6335357 [Publons, Aan Jaelani, Scopus - Elsevier, Dime... \n", - "6489838 [Scopus - Elsevier] \n", - "7570584 [The Lens, Pelayo Munhoz Olea, Dimensions, BAS... \n", - "10240510 [BASE - Bielefeld Academic Search Engine, Data... \n", - "10448304 [JUAN DE DIOS BELTR´´ÁN MANCILLA] \n", - "10663894 [Publons, Multidisciplinary Digital Publishing... \n", - "\n", - " primary_email_domain \\\n", - "482862 usal.edu.ar \n", - "554859 hotmail.com \n", - "1381092 NaN \n", - "2679353 NaN \n", - "3354430 NaN \n", - "4004281 ise.pw.edu.pl \n", - "4022480 NaN \n", - "6335357 syekhnurjati.ac.id \n", - "6489838 NaN \n", - "7570584 NaN \n", - "10240510 NaN \n", - "10448304 NaN \n", - "10663894 NaN \n", + "97666 [[plataforma de cursos ideia criativa, https:/... \n", + "200670 [[researchgate, https://www.researchgate.net/p... \n", + "210325 [[qut home page, https://www.library.qut.edu.a... \n", + "218947 [[twitter, https://twitter.com/pablomejiam], [... \n", + "261974 [[academic support masters, http://trenthammon... \n", + "... ... \n", + "10405738 [[elsevier, https://www.elsevier.com/], [asoci... \n", + "10472264 [[about, https://about.me/textprotocol], [gith... \n", + "10785961 [[papers-1, https://www.researchgate.net/profi... \n", + "10845645 [[osf, http://osf.io/qe2ug], [inarxiv, https:/... \n", + "10896059 [[opinión día del maestro, http://www.cronicaj... \n", + "\n", + " primary_email ... n_doi n_arxiv \\\n", + "97666 NaN ... 0 0 \n", + "200670 NaN ... 1 0 \n", + "210325 g.dawson@qut.edu.au ... 0 0 \n", + "218947 juanpmejia@ulasallista.edu.co ... 0 0 \n", + "261974 trent.hammond@academicsupportmasters.com.au ... 1 0 \n", + "... ... ... ... ... \n", + "10405738 NaN ... 62 0 \n", + "10472264 NaN ... 0 0 \n", + "10785961 leonardofernando.basso@mackenzie.br ... 5 0 \n", + "10845645 NaN ... 0 0 \n", + "10896059 NaN ... 0 0 \n", + "\n", + " n_pmc n_other_pids label primary_email_domain \\\n", + "97666 0 0 0 NaN \n", + "200670 0 0 1 NaN \n", + "210325 0 6 1 qut.edu.au \n", + "218947 0 0 1 ulasallista.edu.co \n", + "261974 0 1 1 academicsupportmasters.com.au \n", + "... ... ... ... ... \n", + "10405738 0 88 0 NaN \n", + "10472264 0 0 0 NaN \n", + "10785961 0 0 1 mackenzie.br \n", + "10845645 0 0 1 NaN \n", + "10896059 0 2 1 NaN \n", "\n", " other_email_domains n_emails \\\n", - "482862 NaN NaN \n", - "554859 [gmail.com, gmail.com, hotmail.com, baldwin.ed... 5.0 \n", - "1381092 NaN NaN \n", - "2679353 NaN NaN \n", - "3354430 NaN NaN \n", - "4004281 [ise.pw.edu.pl, elka.pw.edu.pl, cern.ch] 3.0 \n", - "4022480 NaN NaN \n", - "6335357 [gmail.com] 1.0 \n", - "6489838 NaN NaN \n", - "7570584 NaN NaN \n", - "10240510 NaN NaN \n", - "10448304 NaN NaN \n", - "10663894 NaN NaN \n", + "97666 NaN NaN \n", + "200670 NaN NaN \n", + "210325 NaN NaN \n", + "218947 NaN NaN \n", + "261974 [health.nsw.gov.au, csu.edu.au, sociologist.co... 5.0 \n", + "... ... ... \n", + "10405738 NaN NaN \n", + "10472264 NaN NaN \n", + "10785961 [mackenzie.br] 1.0 \n", + "10845645 NaN NaN \n", + "10896059 NaN NaN \n", "\n", " url_domains n_urls \n", - "482862 [icomos.ro, unirioja.es, unirioja.es, unc.edu.... 51.0 \n", - "554859 [concytec.gob.pe, redalyc.org, redalyc.org, un... 61.0 \n", - "1381092 [goowonderland.com, goowonderland.com, goowond... 81.0 \n", - "2679353 [google.com, syekhnurjati.ac.id, orcid.org, bl... 59.0 \n", - "3354430 [youtube.com, isjaee.com, researchgate.net, re... 111.0 \n", - "4004281 [google.pl, publons.com, scopus.com, mendeley.... 114.0 \n", - "4022480 [worldassessmentcouncil.org, spseke.sk, bcs.or... 114.0 \n", - "6335357 [microsoft.com, twitter.com, academia.edu, aca... 67.0 \n", - "6489838 [biowebspin.com, biowebspin.com, google.com, l... 73.0 \n", - "7570584 [cnpq.br, cnpq.br, cnpq.br, cnpq.br, publons.c... 61.0 \n", - "10240510 [researchgate.net, youtube.com, linkedin.com, ... 152.0 \n", - "10448304 [yumpu.com, ijopm.org, google.com, blogspot.co... 69.0 \n", - "10663894 [geethashishu.in, geethashishu.in, acm.org, go... 71.0 \n", - "\n", - "[13 rows x 22 columns]" + "97666 [eadplataforma.com, facebook.com, youtube.com,... 39.0 \n", + "200670 [researchgate.net, academia.edu, facebook.com,... 11.0 \n", + "210325 [qut.edu.au, qut.edu.au, google.com.au, resear... 11.0 \n", + "218947 [twitter.com, youtube.com, google.com, linkedi... 11.0 \n", + "261974 [wix.com, academia.edu, researchgate.net, rese... 12.0 \n", + "... ... ... \n", + "10405738 [elsevier.com, amci.org.co, springer.com, revi... 12.0 \n", + "10472264 [about.me, github.com, gitlab.com, gravatar.co... 12.0 \n", + "10785961 [researchgate.net, ssrn.com, cnpq.br, google.c... 17.0 \n", + "10845645 [osf.io, osf.io, academia.edu, mendeley.com, f... 12.0 \n", + "10896059 [cronicajalisco.com, youtube.com, tlaquepaque.... 22.0 \n", + "\n", + "[141 rows x 29 columns]" ] }, - "execution_count": 48, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[(df['url_domains'].str.len() > 50) & (df['n_works'] > 0)]" + "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" ] }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 47, "metadata": { "scrolled": true }, @@ -8672,11 +8315,11 @@ "139 rows × 22 columns
\n", + "141 rows × 29 columns
\n", "" ], "text/plain": [ - " orcid claimed verified_email \\\n", - "45566 0000-0003-1948-3180 1 1 \n", - "72674 0000-0002-2000-8339 1 1 \n", - "172820 0000-0001-9293-2224 1 1 \n", - "209505 0000-0003-3045-0056 1 1 \n", - "259877 0000-0003-1815-5732 1 1 \n", - "... ... ... ... \n", - "10494820 0000-0002-1324-7171 1 1 \n", - "10495806 0000-0002-1700-8311 1 1 \n", - "10633545 0000-0003-2676-4431 1 1 \n", - "10648241 0000-0001-8157-0600 1 1 \n", - "10679699 0000-0002-9874-1450 1 1 \n", - "\n", - " verified_primary_email given_names \\\n", - "45566 1 Mark \n", - "72674 1 Phòng khám tư nhân Hà Nội \n", - "172820 1 Juan Carlos \n", - "209505 1 Ananda \n", - "259877 1 JAS \n", - "... ... ... \n", - "10494820 1 Vanesa Natalia \n", - "10495806 1 Fix-IT \n", - "10633545 1 Benny \n", - "10648241 1 Bijan \n", - "10679699 1 FENGZHI \n", - "\n", - " family_name \\\n", - "45566 Katz \n", - "72674 NaN \n", - "172820 Garcia Hoyos \n", - "209505 Majumdar \n", - "259877 (Jurnal Akuntansi Syariah) \n", - "... ... \n", - "10494820 Rodriguez \n", - "10495806 Rite \n", - "10633545 Soewandi \n", - "10648241 Yavar \n", - "10679699 WU \n", - "\n", - " biography \\\n", - "45566 Mark N. Katz is a professor of government and ... \n", - "72674 NaN \n", - "172820 My name is Juan Carlos García Hoyos. I was bor... \n", - "209505 I am Ananda Majumdar, Child Care Educator at B... \n", - "259877 JAS (Jurnal Akuntansi Syariah) published in pr... \n", - "... ... \n", - "10494820 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", - "10495806 NaN \n", - "10633545 NaN \n", - "10648241 Senior Research Assistant and Phd Student in O... \n", - "10679699 NaN \n", + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0002-7843-8497 1 1 1 \n", + "1 0000-0003-1554-1531 1 1 1 \n", + "2 0000-0003-3080-4643 1 1 1 \n", + "3 0000-0003-3193-030x 1 1 1 \n", + "4 0000-0002-5341-6531 1 1 1 \n", + ".. ... ... ... ... \n", + "136 0000-0002-3374-5709 1 1 1 \n", + "137 0000-0001-7228-5680 1 1 1 \n", + "138 0000-0002-3064-0194 1 1 1 \n", + "139 0000-0003-1047-4229 1 1 1 \n", + "140 0000-0003-4836-7074 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "0 davi barbosa \n", + "1 katarzyna ochman \n", + "2 graham dawson \n", + "3 juan pablo wolff mejia \n", + "4 trent hammond \n", + ".. ... ... \n", + "136 guillermo ortiz \n", + "137 text protocol \n", + "138 leonardo fernando cruz basso \n", + "139 bayu sakti \n", + "140 karla haydee ortiz palafox \n", "\n", - " other_names \\\n", - "45566 NaN \n", - "72674 NaN \n", - "172820 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", - "209505 NaN \n", - "259877 NaN \n", - "... ... \n", - "10494820 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", - "10495806 [Best Heating & Plumbing Company] \n", - "10633545 [Benny Soewandi] \n", - "10648241 [B. Yavar, Yavar Bijan] \n", - "10679699 NaN \n", + " biography \\\n", + "0 pesquisador na área sociojurídica, professor, ... \n", + "1 katarzyna ochman [kataˈʐɨna ˈɔxman] is assista... \n", + "2 science and engineering faculty (sef) libraria... \n", + "3 aspirante a maestría en derecho y negocios int... \n", + "4 mr trent hammond is an honorary research fello... \n", + ".. ... \n", + "136 médico, internista, neumólogo, intensivista, e... \n", + "137 NaN \n", + "138 NaN \n", + "139 bayu purbha saktisaya adalah bayu purbha sakti... \n", + "140 karla haydee ortíz palafoxmiembro del sistema ... \n", "\n", - " urls primary_email \\\n", - "45566 [[Adjusting to Change: American Foreign Policy... NaN \n", - "72674 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN \n", - "172820 [[Air Force Office of Scientific Research (WRI... NaN \n", - "209505 [[Migration Scholar and Ananda , https://grfdt... NaN \n", - "259877 [[Website, https://ejournal.stiesyariahbengkal... NaN \n", - "... ... ... \n", - "10494820 [[De rufianes y franchutas Representaciones y ... NaN \n", - "10495806 [[Website, https://fix-itrite.com], [Muckrack,... NaN \n", - "10633545 [[Conservation Efforts as a Result of Theoreti... NaN \n", - "10648241 [[Web of Science (Pub) Researcher ID: A-3544-2... NaN \n", - "10679699 [[A Systematic Study on the Dynamic Softening ... NaN \n", - "\n", - " ... external_ids \\\n", - "45566 ... [[Scopus Author ID, 25649901800]] \n", - "72674 ... NaN \n", - "172820 ... NaN \n", - "209505 ... NaN \n", - "259877 ... NaN \n", - "... ... ... \n", - "10494820 ... NaN \n", - "10495806 ... NaN \n", - "10633545 ... NaN \n", - "10648241 ... [[Scopus Author ID, 56556873600]] \n", - "10679699 ... NaN \n", - "\n", - " education \\\n", - "45566 [[Political Science, Ph.D., Massachusetts Inst... \n", - "72674 NaN \n", - "172820 [[Faculty of Philosophy, History - Ph.D., Char... \n", - "209505 [[Education , B.Ed. After Degree , University ... \n", - "259877 NaN \n", - "... ... \n", - "10494820 [[, Maestría en Ciencias Sociales con Mención ... \n", - "10495806 NaN \n", - "10633545 NaN \n", - "10648241 NaN \n", - "10679699 NaN \n", - "\n", - " employment n_works \\\n", - "45566 [[Professor of Government and Politics, George... 58 \n", - "72674 NaN 4 \n", - "172820 [[responsible for the Project Service Level Ag... 20 \n", - "209505 [[General Coordinator- University of Alberta C... 43 \n", - "259877 NaN 67 \n", - "... ... ... \n", - "10494820 [[Profesora, Universidad Nacional de La Matanz... 7 \n", - "10495806 NaN 1 \n", - "10633545 [[Membership, Paguyuban Pelestarian Budaya Ban... 2 \n", - "10648241 NaN 6 \n", - "10679699 NaN 3 \n", - "\n", - " works_source primary_email_domain \\\n", - "45566 [Scopus - Elsevier] NaN \n", - "72674 [Phòng khám tư nhân Hà Nội] NaN \n", - "172820 [Juan Carlos Garcia Hoyos] NaN \n", - "209505 [Ananda Majumdar] NaN \n", - "259877 [JAS (Jurnal Akuntansi Syariah)] NaN \n", - "... ... ... \n", - "10494820 [Vanesa Natalia Rodriguez] NaN \n", - "10495806 [Fix-It Rite] NaN \n", - "10633545 [Benny Soewandi] NaN \n", - "10648241 [Scopus - Elsevier] NaN \n", - "10679699 [FENGZHI WU] NaN \n", - "\n", - " other_email_domains n_emails \\\n", - "45566 NaN NaN \n", - "72674 NaN NaN \n", - "172820 NaN NaN \n", - "209505 NaN NaN \n", - "259877 NaN NaN \n", - "... ... ... \n", - "10494820 NaN NaN \n", - "10495806 NaN NaN \n", - "10633545 NaN NaN \n", - "10648241 NaN NaN \n", - "10679699 NaN NaN \n", + " other_names \\\n", + "0 [professor davi barbosa delmont] \n", + "1 [[kataˈʐɨna ˈɔxman], catharina ochman, cathari... \n", + "2 [ graham colin dawson, g.c. dawson] \n", + "3 [juan pablo wolff, pablo wolff mejia, juan p. ... \n", + "4 [trent ernest hammond (t.e.hammond)] \n", + ".. ... \n", + "136 [guillermo ortiz-ruiz] \n", + "137 NaN \n", + "138 NaN \n", + "139 [bayu purbha sakti] \n", + "140 [karla palafox] \n", + "\n", + " urls \\\n", + "0 [[plataforma de cursos ideia criativa, https:/... \n", + "1 [[researchgate, https://www.researchgate.net/p... \n", + "2 [[qut home page, https://www.library.qut.edu.a... \n", + "3 [[twitter, https://twitter.com/pablomejiam], [... \n", + "4 [[academic support masters, http://trenthammon... \n", + ".. ... \n", + "136 [[elsevier, https://www.elsevier.com/], [asoci... \n", + "137 [[about, https://about.me/textprotocol], [gith... \n", + "138 [[papers-1, https://www.researchgate.net/profi... \n", + "139 [[osf, http://osf.io/qe2ug], [inarxiv, https:/... \n", + "140 [[opinión día del maestro, http://www.cronicaj... \n", + "\n", + " primary_email ... n_doi n_arxiv n_pmc \\\n", + "0 NaN ... 0 0 0 \n", + "1 NaN ... 1 0 0 \n", + "2 g.dawson@qut.edu.au ... 0 0 0 \n", + "3 juanpmejia@ulasallista.edu.co ... 0 0 0 \n", + "4 trent.hammond@academicsupportmasters.com.au ... 1 0 0 \n", + ".. ... ... ... ... ... \n", + "136 NaN ... 62 0 0 \n", + "137 NaN ... 0 0 0 \n", + "138 leonardofernando.basso@mackenzie.br ... 5 0 0 \n", + "139 NaN ... 0 0 0 \n", + "140 NaN ... 0 0 0 \n", + "\n", + " n_other_pids label primary_email_domain \\\n", + "0 0 0 NaN \n", + "1 0 1 NaN \n", + "2 6 1 qut.edu.au \n", + "3 0 1 ulasallista.edu.co \n", + "4 1 1 academicsupportmasters.com.au \n", + ".. ... ... ... \n", + "136 88 0 NaN \n", + "137 0 0 NaN \n", + "138 0 1 mackenzie.br \n", + "139 0 1 NaN \n", + "140 2 1 NaN \n", + "\n", + " other_email_domains n_emails \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 [health.nsw.gov.au, csu.edu.au, sociologist.co... 5.0 \n", + ".. ... ... \n", + "136 NaN NaN \n", + "137 NaN NaN \n", + "138 [mackenzie.br] 1.0 \n", + "139 NaN NaN \n", + "140 NaN NaN \n", "\n", - " url_domains n_urls \n", - "45566 [wordpress.com, marknkatz.com, gmu.edu, atlant... 16.0 \n", - "72674 [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0 \n", - "172820 [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0 \n", - "209505 [grfdt.com, linkedin.com, academia.edu, resear... 24.0 \n", - "259877 [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0 \n", - "... ... ... \n", - "10494820 [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0 \n", - "10495806 [fix-itrite.com, muckrack.com, tumblr.com, dri... 11.0 \n", - "10633545 [wordpress.com, wordpress.com, linkedin.com, f... 11.0 \n", - "10648241 [publons.com, articulate.com, zenodo.org, orci... 15.0 \n", - "10679699 [springer.com, sciencedirect.com, sciencedirec... 23.0 \n", + " url_domains n_urls \n", + "0 [eadplataforma.com, facebook.com, youtube.com,... 39.0 \n", + "1 [researchgate.net, academia.edu, facebook.com,... 11.0 \n", + "2 [qut.edu.au, qut.edu.au, google.com.au, resear... 11.0 \n", + "3 [twitter.com, youtube.com, google.com, linkedi... 11.0 \n", + "4 [wix.com, academia.edu, researchgate.net, rese... 12.0 \n", + ".. ... ... \n", + "136 [elsevier.com, amci.org.co, springer.com, revi... 12.0 \n", + "137 [about.me, github.com, gitlab.com, gravatar.co... 12.0 \n", + "138 [researchgate.net, ssrn.com, cnpq.br, google.c... 17.0 \n", + "139 [osf.io, osf.io, academia.edu, mendeley.com, f... 12.0 \n", + "140 [cronicajalisco.com, youtube.com, tlaquepaque.... 22.0 \n", "\n", - "[139 rows x 22 columns]" + "[141 rows x 29 columns]" ] }, - "execution_count": 49, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)]" + "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", + "exploded_sources" ] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 48, "metadata": { "scrolled": true }, @@ -9162,11 +8767,11 @@ "139 rows × 22 columns
\n", + "115 rows × 29 columns
\n", "" ], "text/plain": [ - " orcid claimed verified_email verified_primary_email \\\n", - "0 0000-0003-1948-3180 1 1 1 \n", - "1 0000-0002-2000-8339 1 1 1 \n", - "2 0000-0001-9293-2224 1 1 1 \n", - "3 0000-0003-3045-0056 1 1 1 \n", - "4 0000-0003-1815-5732 1 1 1 \n", - ".. ... ... ... ... \n", - "134 0000-0002-1324-7171 1 1 1 \n", - "135 0000-0002-1700-8311 1 1 1 \n", - "136 0000-0003-2676-4431 1 1 1 \n", - "137 0000-0001-8157-0600 1 1 1 \n", - "138 0000-0002-9874-1450 1 1 1 \n", - "\n", - " given_names family_name \\\n", - "0 Mark Katz \n", - "1 Phòng khám tư nhân Hà Nội NaN \n", - "2 Juan Carlos Garcia Hoyos \n", - "3 Ananda Majumdar \n", - "4 JAS (Jurnal Akuntansi Syariah) \n", - ".. ... ... \n", - "134 Vanesa Natalia Rodriguez \n", - "135 Fix-IT Rite \n", - "136 Benny Soewandi \n", - "137 Bijan Yavar \n", - "138 FENGZHI WU \n", - "\n", - " biography \\\n", - "0 Mark N. Katz is a professor of government and ... \n", - "1 NaN \n", - "2 My name is Juan Carlos García Hoyos. I was bor... \n", - "3 I am Ananda Majumdar, Child Care Educator at B... \n", - "4 JAS (Jurnal Akuntansi Syariah) published in pr... \n", - ".. ... \n", - "134 Nombre y Apellido: Vanesa Natalia Rodriguez. ... \n", - "135 NaN \n", - "136 NaN \n", - "137 Senior Research Assistant and Phd Student in O... \n", - "138 NaN \n", - "\n", - " other_names \\\n", - "0 NaN \n", - "1 NaN \n", - "2 [Juan Carlos Garcia Hoyos /, EXTRATERRANOVAS /... \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "134 [Vanesa Rodriguez, Vanesa N. Rodriguez] \n", - "135 [Best Heating & Plumbing Company] \n", - "136 [Benny Soewandi] \n", - "137 [B. Yavar, Yavar Bijan] \n", - "138 NaN \n", - "\n", - " urls primary_email ... \\\n", - "0 [[Adjusting to Change: American Foreign Policy... NaN ... \n", - "1 [[Sức khỏe, https://onhealth.vn/], [Khám phụ k... NaN ... \n", - "2 [[Air Force Office of Scientific Research (WRI... NaN ... \n", - "3 [[Migration Scholar and Ananda , https://grfdt... NaN ... \n", - "4 [[Website, https://ejournal.stiesyariahbengkal... NaN ... \n", - ".. ... ... ... \n", - "134 [[De rufianes y franchutas Representaciones y ... NaN ... \n", - "135 [[Website, https://fix-itrite.com], [Muckrack,... NaN ... \n", - "136 [[Conservation Efforts as a Result of Theoreti... NaN ... \n", - "137 [[Web of Science (Pub) Researcher ID: A-3544-2... NaN ... \n", - "138 [[A Systematic Study on the Dynamic Softening ... NaN ... \n", - "\n", - " external_ids \\\n", - "0 [[Scopus Author ID, 25649901800]] \n", - "1 NaN \n", - "2 NaN \n", - "3 NaN \n", - "4 NaN \n", - ".. ... \n", - "134 NaN \n", - "135 NaN \n", - "136 NaN \n", - "137 [[Scopus Author ID, 56556873600]] \n", - "138 NaN \n", - "\n", - " education \\\n", - "0 [[Political Science, Ph.D., Massachusetts Inst... \n", - "1 NaN \n", - "2 [[Faculty of Philosophy, History - Ph.D., Char... \n", - "3 [[Education , B.Ed. After Degree , University ... \n", - "4 NaN \n", - ".. ... \n", - "134 [[, Maestría en Ciencias Sociales con Mención ... \n", - "135 NaN \n", - "136 NaN \n", - "137 NaN \n", - "138 NaN \n", - "\n", - " employment n_works \\\n", - "0 [[Professor of Government and Politics, George... 58 \n", - "1 NaN 4 \n", - "2 [[responsible for the Project Service Level Ag... 20 \n", - "3 [[General Coordinator- University of Alberta C... 43 \n", - "4 NaN 67 \n", - ".. ... ... \n", - "134 [[Profesora, Universidad Nacional de La Matanz... 7 \n", - "135 NaN 1 \n", - "136 [[Membership, Paguyuban Pelestarian Budaya Ban... 2 \n", - "137 NaN 6 \n", - "138 NaN 3 \n", - "\n", - " works_source primary_email_domain other_email_domains \\\n", - "0 Scopus - Elsevier NaN NaN \n", - "1 Phòng khám tư nhân Hà Nội NaN NaN \n", - "2 Juan Carlos Garcia Hoyos NaN NaN \n", - "3 Ananda Majumdar NaN NaN \n", - "4 JAS (Jurnal Akuntansi Syariah) NaN NaN \n", - ".. ... ... ... \n", - "134 Vanesa Natalia Rodriguez NaN NaN \n", - "135 Fix-It Rite NaN NaN \n", - "136 Benny Soewandi NaN NaN \n", - "137 Scopus - Elsevier NaN NaN \n", - "138 FENGZHI WU NaN NaN \n", - "\n", - " n_emails url_domains n_urls \n", - "0 NaN [wordpress.com, marknkatz.com, gmu.edu, atlant... 16.0 \n", - "1 NaN [onhealth.vn, onhealth.vn, onhealth.vn, onheal... 49.0 \n", - "2 NaN [af.mil, gst.com, govtribe.com, sbir.gov, open... 28.0 \n", - "3 NaN [grfdt.com, linkedin.com, academia.edu, resear... 24.0 \n", - "4 NaN [stiesyariahbengkalis.ac.id, lipi.go.id, cross... 17.0 \n", - ".. ... ... ... \n", - "134 NaN [unlam.edu.ar, unirioja.es, amazon.fr, abebook... 19.0 \n", - "135 NaN [fix-itrite.com, muckrack.com, tumblr.com, dri... 11.0 \n", - "136 NaN [wordpress.com, wordpress.com, linkedin.com, f... 11.0 \n", - "137 NaN [publons.com, articulate.com, zenodo.org, orci... 15.0 \n", - "138 NaN [springer.com, sciencedirect.com, sciencedirec... 23.0 \n", - "\n", - "[139 rows x 22 columns]" + " orcid claimed verified_email verified_primary_email \\\n", + "0 0000-0002-7843-8497 1 1 1 \n", + "1 0000-0003-1554-1531 1 1 1 \n", + "3 0000-0003-3193-030x 1 1 1 \n", + "4 0000-0002-5341-6531 1 1 1 \n", + "5 0000-0001-5295-2271 1 1 1 \n", + ".. ... ... ... ... \n", + "135 0000-0002-8125-0081 1 1 1 \n", + "136 0000-0002-3374-5709 1 1 1 \n", + "137 0000-0001-7228-5680 1 1 1 \n", + "139 0000-0003-1047-4229 1 1 1 \n", + "140 0000-0003-4836-7074 1 1 1 \n", + "\n", + " given_names family_name \\\n", + "0 davi barbosa \n", + "1 katarzyna ochman \n", + "3 juan pablo wolff mejia \n", + "4 trent hammond \n", + "5 antoniy moysey \n", + ".. ... ... \n", + "135 issam bencheikh \n", + "136 guillermo ortiz \n", + "137 text protocol \n", + "139 bayu sakti \n", + "140 karla haydee ortiz palafox \n", + "\n", + " biography \\\n", + "0 pesquisador na área sociojurídica, professor, ... \n", + "1 katarzyna ochman [kataˈʐɨna ˈɔxman] is assista... \n", + "3 aspirante a maestría en derecho y negocios int... \n", + "4 mr trent hammond is an honorary research fello... \n", + "5 NaN \n", + ".. ... \n", + "135 NaN \n", + "136 médico, internista, neumólogo, intensivista, e... \n", + "137 NaN \n", + "139 bayu purbha saktisaya adalah bayu purbha sakti... \n", + "140 karla haydee ortíz palafoxmiembro del sistema ... \n", + "\n", + " other_names \\\n", + "0 [professor davi barbosa delmont] \n", + "1 [[kataˈʐɨna ˈɔxman], catharina ochman, cathari... \n", + "3 [juan pablo wolff, pablo wolff mejia, juan p. ... \n", + "4 [trent ernest hammond (t.e.hammond)] \n", + "5 NaN \n", + ".. ... \n", + "135 [issame1982, دكتور عصام بن الشيخ] \n", + "136 [guillermo ortiz-ruiz] \n", + "137 NaN \n", + "139 [bayu purbha sakti] \n", + "140 [karla palafox] \n", + "\n", + " urls \\\n", + "0 [[plataforma de cursos ideia criativa, https:/... \n", + "1 [[researchgate, https://www.researchgate.net/p... \n", + "3 [[twitter, https://twitter.com/pablomejiam], [... \n", + "4 [[academic support masters, http://trenthammon... \n", + "5 [[academic journals database, http://journalda... \n", + ".. ... \n", + "135 [[my blog web site, http://issame1982.blogspot... \n", + "136 [[elsevier, https://www.elsevier.com/], [asoci... \n", + "137 [[about, https://about.me/textprotocol], [gith... \n", + "139 [[osf, http://osf.io/qe2ug], [inarxiv, https:/... \n", + "140 [[opinión día del maestro, http://www.cronicaj... \n", + "\n", + " primary_email ... n_doi n_arxiv n_pmc \\\n", + "0 NaN ... 0 0 0 \n", + "1 NaN ... 1 0 0 \n", + "3 juanpmejia@ulasallista.edu.co ... 0 0 0 \n", + "4 trent.hammond@academicsupportmasters.com.au ... 1 0 0 \n", + "5 antoniimoisei@bsmu.edu.ua ... 0 0 0 \n", + ".. ... ... ... ... ... \n", + "135 NaN ... 0 0 0 \n", + "136 NaN ... 62 0 0 \n", + "137 NaN ... 0 0 0 \n", + "139 NaN ... 0 0 0 \n", + "140 NaN ... 0 0 0 \n", + "\n", + " n_other_pids label primary_email_domain \\\n", + "0 0 0 NaN \n", + "1 0 1 NaN \n", + "3 0 1 ulasallista.edu.co \n", + "4 1 1 academicsupportmasters.com.au \n", + "5 0 1 bsmu.edu.ua \n", + ".. ... ... ... \n", + "135 0 1 NaN \n", + "136 88 0 NaN \n", + "137 0 0 NaN \n", + "139 0 1 NaN \n", + "140 2 1 NaN \n", + "\n", + " other_email_domains n_emails \\\n", + "0 NaN NaN \n", + "1 NaN NaN \n", + "3 NaN NaN \n", + "4 [health.nsw.gov.au, csu.edu.au, sociologist.co... 5.0 \n", + "5 NaN NaN \n", + ".. ... ... \n", + "135 NaN NaN \n", + "136 NaN NaN \n", + "137 NaN NaN \n", + "139 NaN NaN \n", + "140 NaN NaN \n", + "\n", + " url_domains n_urls \n", + "0 [eadplataforma.com, facebook.com, youtube.com,... 39.0 \n", + "1 [researchgate.net, academia.edu, facebook.com,... 11.0 \n", + "3 [twitter.com, youtube.com, google.com, linkedi... 11.0 \n", + "4 [wix.com, academia.edu, researchgate.net, rese... 12.0 \n", + "5 [journaldatabase.info, nplu.org, acls.org, ind... 21.0 \n", + ".. ... ... \n", + "135 [blogspot.com, researchgate.net, google.com, l... 12.0 \n", + "136 [elsevier.com, amci.org.co, springer.com, revi... 12.0 \n", + "137 [about.me, github.com, gitlab.com, gravatar.co... 12.0 \n", + "139 [osf.io, osf.io, academia.edu, mendeley.com, f... 12.0 \n", + "140 [cronicajalisco.com, youtube.com, tlaquepaque.... 22.0 \n", + "\n", + "[115 rows x 29 columns]" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "exploded_sources[exploded_sources.apply(lambda x: x['works_source'].find(x['given_names']) >= 0, axis=1)]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Works source" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Paste from Miriam" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## External IDs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "External IDs should come from reliable sources. ORCiD registrants cannot add them freely." + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "metadata": {}, + "outputs": [], + "source": [ + "df['n_ids'] = df[df['external_ids'].notna()].external_ids.str.len()" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1.301959e+06\n", + "mean 1.358640e+00\n", + "std 6.635087e-01\n", + "min 1.000000e+00\n", + "25% 1.000000e+00\n", + "50% 1.000000e+00\n", + "75% 2.000000e+00\n", + "max 8.000000e+01\n", + "Name: n_ids, dtype: float64" ] }, "execution_count": 50, @@ -9584,16 +9241,13 @@ } ], "source": [ - "exploded_sources = df[(df['url_domains'].str.len() > 10) & (df['n_works'] > 0) & (df['works_source'].str.len() == 1)].explode('works_source').reset_index(drop=True)\n", - "exploded_sources" + "df.n_ids.describe()" ] }, { "cell_type": "code", "execution_count": 51, - "metadata": { - "scrolled": true - }, + "metadata": {}, "outputs": [ { "data": { @@ -9627,640 +9281,1322 @@ "1 rows × 30 columns
\n", + "" + ], + "text/plain": [ + " orcid claimed verified_email verified_primary_email \\\n", + "7253330 0000-0002-9554-6633 1 1 1 \n", + "\n", + " given_names family_name biography other_names \\\n", + "7253330 john a williams NaN NaN \n", + "\n", + " urls primary_email ... \\\n", + "7253330 [[aston university profile page, https://resea... NaN ... \n", + "\n", + " n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "7253330 0 0 208 1 NaN \n", + "\n", + " other_email_domains n_emails url_domains n_urls n_ids \n", + "7253330 NaN NaN [aston.ac.uk] 1.0 80.0 \n", + "\n", + "[1 rows x 30 columns]" + ] + }, + "execution_count": 51, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[df.n_ids == df.n_ids.max()]" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [], + "source": [ + "ids = df[['orcid', 'external_ids']].explode('external_ids').reset_index(drop=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": {}, + "outputs": [], + "source": [ + "ids['provider'] = ids[ids.external_ids.notna()]['external_ids'].apply(lambda x: x[0])" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + " | orcid | \n", + "external_ids | \n", + "provider | \n", "||||||||||||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
132 | \n", - "0000-0001-6352-7086 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Susan | \n", - "Hawthorne | \n", - "Susan is a poet, novelist, publisher and Sansk... | \n", - "[S. Hawthorne, Susan C. C. Hawthorne] | \n", - "[[Spinifex Press, http://www.spinifexpress.com... | \n", - "NaN | \n", - "... | \n", - "[[ResearcherID, K-6039-2018]] | \n", - "[[School of Asian Studies, Honours Sanskrit, A... | \n", - "[[Adjunct Professor, James Cook University, To... | \n", - "352 | \n", - "Susan Hawthorne | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[spinifexpress.com.au, linkedin.com, twitter.c... | \n", - "12.0 | \n", + "7 | \n", + "0000-0001-7463-977x | \n", + "[loop profile, 371409] | \n", + "loop profile | \n", "
133 | \n", - "0000-0002-4062-3603 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "JUAN DE DIOS | \n", - "BELTRÁN MANCILLA | \n", - "JUAN DE DIOS BELTRÁN MANCILLA (*) Filósofo aut... | \n", - "[Juan de Dios Beltrán Mancilla, FILÓSOFO AUTOD... | \n", - "[[01.- Juan de Dios Beltrán Mancilla. Teoría O... | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "[[, DIPLOMADO EN PRACTICAS DIRECTIVAS PARA OR... | \n", - "[[INSPECTOR GENERAL JORNADA VESPERTINA // De 2... | \n", - "11 | \n", - "JUAN DE DIOS BELTR´´ÁN MANCILLA | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[yumpu.com, ijopm.org, google.com, blogspot.co... | \n", - "69.0 | \n", + "9 | \n", + "0000-0001-8718-0056 | \n", + "[scopus author id, 55466912100] | \n", + "scopus author id | \n", "
134 | \n", - "0000-0002-1324-7171 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Vanesa Natalia | \n", - "Rodriguez | \n", - "Nombre y Apellido: Vanesa Natalia Rodriguez. ... | \n", - "[Vanesa Rodriguez, Vanesa N. Rodriguez] | \n", - "[[De rufianes y franchutas Representaciones y ... | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "[[, Maestría en Ciencias Sociales con Mención ... | \n", - "[[Profesora, Universidad Nacional de La Matanz... | \n", - "7 | \n", - "Vanesa Natalia Rodriguez | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[unlam.edu.ar, unirioja.es, amazon.fr, abebook... | \n", - "19.0 | \n", + "10 | \n", + "0000-0001-8718-0056 | \n", + "[scopus author id, 7102015452] | \n", + "scopus author id | \n", "
136 | \n", - "0000-0003-2676-4431 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "Benny | \n", - "Soewandi | \n", - "NaN | \n", - "[Benny Soewandi] | \n", - "[[Conservation Efforts as a Result of Theoreti... | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "NaN | \n", - "[[Membership, Paguyuban Pelestarian Budaya Ban... | \n", - "2 | \n", - "Benny Soewandi | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[wordpress.com, wordpress.com, linkedin.com, f... | \n", - "11.0 | \n", + "14 | \n", + "0000-0001-9708-5570 | \n", + "[researcherid, p-5112-2015] | \n", + "researcherid | \n", "
138 | \n", - "0000-0002-9874-1450 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "FENGZHI | \n", - "WU | \n", - "NaN | \n", - "NaN | \n", - "[[A Systematic Study on the Dynamic Softening ... | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "3 | \n", - "FENGZHI WU | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[springer.com, sciencedirect.com, sciencedirec... | \n", - "23.0 | \n", + "15 | \n", + "0000-0001-9708-5570 | \n", + "[scopus author id, 42062216900] | \n", + "scopus author id | \n", "
108 rows × 22 columns
\n", "\n", - " | orcid | \n", - "claimed | \n", - "verified_email | \n", - "verified_primary_email | \n", - "given_names | \n", - "family_name | \n", - "biography | \n", - "other_names | \n", - "urls | \n", - "primary_email | \n", - "... | \n", - "education | \n", - "employment | \n", - "n_works | \n", - "works_source | \n", - "primary_email_domain | \n", - "other_email_domains | \n", - "n_emails | \n", - "url_domains | \n", - "n_urls | \n", - "n_ids | \n", - "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
9228793 | \n", - "0000-0002-9554-6633 | \n", - "1 | \n", - "1 | \n", - "1 | \n", - "John A | \n", - "Williams | \n", - "NaN | \n", - "NaN | \n", - "[[Aston University profile page, https://resea... | \n", - "NaN | \n", - "... | \n", - "NaN | \n", - "[[, Aston University, Birmingham, , GB, 1722, ... | \n", - "91 | \n", - "[Aston Research Explorer] | \n", - "NaN | \n", - "NaN | \n", - "NaN | \n", - "[aston.ac.uk] | \n", - "1.0 | \n", - "80.0 | \n", - "
1 rows × 23 columns
\n", - "10916574 rows × 2 columns
\n", "" ], "text/plain": [ - " orcid external_ids provider\n", - "13 0000-0001-5004-4608 [Scopus Author ID, 40661094300] Scopus Author ID\n", - "24 0000-0001-5008-2479 [Scopus Author ID, 12789856200] Scopus Author ID\n", - "25 0000-0001-5008-2479 [Ciência ID, 2F1C-479B-B071] Ciência ID\n", - "31 0000-0001-5010-9539 [Loop profile, 1098977] Loop profile\n", - "42 0000-0001-5013-6529 [Scopus Author ID, 8986698300] Scopus Author ID" + " orcid n_keywords\n", + "2851081 0000-0002-0673-0341 154.0\n", + "7344151 0000-0002-7060-4112 141.0\n", + "2235440 0000-0002-6075-3501 140.0\n", + "2994233 0000-0002-4071-0301 118.0\n", + "3971323 0000-0002-9638-8091 115.0\n", + "... ... ...\n", + "10916569 0000-0001-5692-7639 NaN\n", + "10916570 0000-0003-1539-0999 NaN\n", + "10916571 0000-0003-2858-5509 NaN\n", + "10916572 0000-0003-2438-9500 NaN\n", + "10916573 0000-0003-4119-4772 NaN\n", + "\n", + "[10916574 rows x 2 columns]" ] }, - "execution_count": 79, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ids[ids.provider.notna()].head()" + "df.sort_values('n_keywords', ascending=False)[['orcid', 'n_keywords']]" ] }, { "cell_type": "code", - "execution_count": 92, + "execution_count": 64, "metadata": {}, "outputs": [ { @@ -10359,108 +10728,208 @@ { "type": "bar", "x": [ - "Scopus Author ID", - "ResearcherID", - "Loop profile", - "Ciência ID", - "Researcher Name Resolver ID", - "中国科学家在线", - "ISNI", - "GND", - "Pitt ID", - "Technical University of Denmark CWIS", - "Sciprofile", - "Researcher ID", - "ID Dialnet", - "Digital author ID", - "Scopus Author ID: ", - "AuthenticusID", - "HKU ResearcherPage", - "UOW Scholars", - "CTI Vitae", - "Scopus Author ID:", - "HKUST Profile", - "Scopus author ID", - "Chalmers ID", - "Scopus ID", - "iAuthor", - "Google Scholar", - "AuthID", - "DAI", - "US EPA VIVO", - "Digital Author ID (DAI)", - "Scopus ID", - "Authenticus", - "Smithsonian Profiles", - "GitHub", - "eScientist", - "VIVO Cornell", - "ResearcherID:", - "Digital Author ID", - "Digital author ID (DAI)", - "ID Dialnet:", - "Dialnet ID", - "KAKEN", - "UNE Researcher ID", - "ResearcherID: ", - "ORCID", - "Custom", - "ORCID iD", - "ScienceOpen", - "ResearcherId", - "Profile system identifier" + "0000-0002-0673-0341", + "0000-0002-7060-4112", + "0000-0002-6075-3501", + "0000-0002-4071-0301", + "0000-0002-9638-8091", + "0000-0002-4235-4259", + "0000-0001-9462-5666", + "0000-0003-0076-6287", + "0000-0002-1878-9762", + "0000-0001-6537-7683", + "0000-0001-6307-6027", + "0000-0003-2273-9888", + "0000-0003-1799-0971", + "0000-0001-5287-1949", + "0000-0002-0937-7061", + "0000-0001-9715-9357", + "0000-0001-5696-1052", + "0000-0003-2998-5520", + "0000-0001-5869-2204", + "0000-0002-0156-3580", + "0000-0002-9625-6742", + "0000-0002-8401-8018", + "0000-0001-9985-1697", + "0000-0003-4246-8579", + "0000-0002-7710-0355", + "0000-0002-8083-7382", + "0000-0001-7654-5013", + "0000-0001-6939-3859", + "0000-0002-3061-3364", + "0000-0003-2509-2549", + "0000-0002-0463-0048", + "0000-0001-5230-715x", + "0000-0001-9336-6850", + "0000-0001-5458-7167", + "0000-0003-0209-180x", + "0000-0003-3584-6834", + "0000-0002-8227-5387", + "0000-0002-9381-2264", + "0000-0003-3340-6413", + "0000-0002-2935-1934", + "0000-0002-8644-8396", + "0000-0002-3123-3021", + "0000-0002-8659-6321", + "0000-0002-8449-2211", + "0000-0001-5167-7466", + "0000-0001-5637-1124", + "0000-0003-2532-2906", + "0000-0003-4673-1063", + "0000-0003-4608-3844", + "0000-0002-3532-043x", + "0000-0002-6347-9464", + "0000-0003-4505-3678", + "0000-0002-2683-4527", + "0000-0003-4374-6374", + "0000-0003-4511-7942", + "0000-0002-1103-9651", + "0000-0001-9280-6017", + "0000-0003-3720-1183", + "0000-0001-9586-0780", + "0000-0002-5306-7781", + "0000-0003-2218-1343", + "0000-0002-8499-1045", + "0000-0003-1863-0265", + "0000-0002-5539-1761", + "0000-0003-2550-1859", + "0000-0002-8072-1152", + "0000-0003-3342-6123", + "0000-0001-6861-9561", + "0000-0002-2252-672x", + "0000-0002-3597-3350", + "0000-0002-3907-3552", + "0000-0001-8689-185x", + "0000-0002-5274-7742", + "0000-0002-3186-8860", + "0000-0001-6843-9325", + "0000-0001-7133-7848", + "0000-0003-4486-2684", + "0000-0003-3343-5660", + "0000-0002-9014-2090", + "0000-0002-6282-0640", + "0000-0001-7857-4133", + "0000-0002-1294-2156", + "0000-0002-4432-3448", + "0000-0003-0097-4182", + "0000-0003-1245-7705", + "0000-0001-8445-412x", + "0000-0003-4153-6779", + "0000-0002-9125-6022", + "0000-0002-4598-2891", + "0000-0003-3387-3193", + "0000-0002-3866-6460", + "0000-0002-1411-3028", + "0000-0003-4283-2895", + "0000-0002-0211-7195", + "0000-0002-3898-9542", + "0000-0002-1545-7818", + "0000-0002-4963-9345", + "0000-0002-1770-9660", + "0000-0002-1960-5857", + "0000-0003-2054-477x" ], "y": [ - 1015175, - 543545, - 114316, - 33870, - 7810, - 4794, - 3054, - 2825, - 2672, - 2486, - 2441, - 1417, - 1167, - 1079, - 1076, - 847, - 740, - 644, - 581, - 549, - 521, - 501, - 430, - 232, - 212, - 200, - 175, - 153, - 146, - 135, - 127, + 154, + 141, + 140, + 118, + 115, + 104, + 98, + 94, + 92, + 91, + 88, + 86, + 84, 82, + 78, + 77, + 76, + 75, + 74, + 73, + 71, + 70, + 69, + 66, + 64, + 62, 61, + 60, + 58, + 57, + 56, + 54, + 53, + 53, + 52, + 51, 51, + 51, + 51, + 50, + 50, + 50, + 50, + 49, + 49, 49, + 48, + 48, + 48, + 48, + 48, + 48, + 48, + 47, + 47, + 46, 46, + 46, + 45, + 44, + 44, + 44, + 44, + 44, + 44, + 43, + 43, + 42, + 42, + 42, + 42, + 42, + 42, + 42, + 41, + 41, + 41, + 41, + 41, + 41, + 40, + 40, + 40, + 40, + 40, + 40, 39, - 35, - 34, - 7, - 6, - 5, - 4, - 3, - 2, - 1, - 1, - 1, - 1, - 1 + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 39, + 38, + 38, + 38, + 38 ] } ], @@ -11270,563 +11739,204 @@ "zerolinewidth": 2 } } - }, - "title": { - "text": "IDs provided by providers" - }, - "xaxis": { - "tickangle": 45, - "tickfont": { - "size": 12 - } - } - } - }, - "text/html": [ - "\n", - " | orcid | \n", - "n_keywords | \n", - "
---|---|---|
1681310 | \n", - "0000-0002-0673-0341 | \n", - "154.0 | \n", - "
7717699 | \n", - "0000-0002-7060-4112 | \n", - "141.0 | \n", - "
4597674 | \n", - "0000-0002-6075-3501 | \n", - "140.0 | \n", - "
2066580 | \n", - "0000-0002-4071-0301 | \n", - "118.0 | \n", - "
3531030 | \n", - "0000-0002-9638-8091 | \n", - "115.0 | \n", - "
... | \n", - "... | \n", - "... | \n", - "
10747035 | \n", - "0000-0003-4998-1551 | \n", - "NaN | \n", - "
10747036 | \n", - "0000-0003-4998-4111 | \n", - "NaN | \n", - "
10747037 | \n", - "0000-0003-4998-6045 | \n", - "NaN | \n", - "
10747038 | \n", - "0000-0003-4998-8868 | \n", - "NaN | \n", - "
10747039 | \n", - "0000-0003-4999-7916 | \n", - "NaN | \n", - "
10744621 rows × 2 columns
\n", - "