From 41f2dab89d51264e07a39526013ebddbc21a918f Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Tue, 23 Mar 2021 12:13:04 +0100 Subject: [PATCH] moving on with keywords --- notebooks/01-Exploration.ipynb | 776 ++++++++++++++++++++++++++------- 1 file changed, 614 insertions(+), 162 deletions(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 7395104..0f8d755 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -94,12 +94,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "JOURNAL = '0000-0003-1815-5732'\n", - "NOINFO= '0000-0001-5009-2052'\n", + "NOINFO = '0000-0001-5009-2052'\n", + "VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n", "# todo: find group-shared ORCiD, if possible" ] }, @@ -134,12 +135,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n", - " names = ['orcid', 'claimed','verified_email', 'verified_primary_email', \n", + " names=['orcid', 'claimed','verified_email', 'verified_primary_email', \n", " 'given_names', 'family_name', 'biography', 'other_names', 'urls', \n", " 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', \n", " 'employment', 'n_works', 'works_source'])" @@ -575,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ @@ -636,6 +637,13 @@ "df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's how it looks like" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -825,9 +833,139 @@ "df.head(5)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add labels to the dataframe based on their presence in OpenAIRE" + ] + }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
orcid
00000-0002-4292-8072
10000-0003-3026-718x
20000-0002-4568-8748
30000-0002-1380-859x
40000-0001-7526-5095
......
29341110000-0002-2394-5387
29341120000-0002-6125-3170
29341130000-0002-7062-5845
29341140000-0001-8677-5162
29341150000-0002-0240-8021
\n", + "

2934116 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " orcid\n", + "0 0000-0002-4292-8072\n", + "1 0000-0003-3026-718x\n", + "2 0000-0002-4568-8748\n", + "3 0000-0002-1380-859x\n", + "4 0000-0001-7526-5095\n", + "... ...\n", + "2934111 0000-0002-2394-5387\n", + "2934112 0000-0002-6125-3170\n", + "2934113 0000-0002-7062-5845\n", + "2934114 0000-0001-8677-5162\n", + "2934115 0000-0002-0240-8021\n", + "\n", + "[2934116 rows x 1 columns]" + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "openaire_orcid = pd.read_csv('../data/raw/orcid_openaire.txt', header=None, names=['orcid'])\n", + "openaire_orcid" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [], + "source": [ + "df['label'] = df.orcid.isin(openaire_orcid['orcid'])\n", + "df[\"label\"] = df[\"label\"].astype(int)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Notable profiles inspection" + ] + }, + { + "cell_type": "code", + "execution_count": 111, "metadata": {}, "outputs": [ { @@ -861,13 +999,17 @@ " other_names\n", " urls\n", " primary_email\n", - " other_emails\n", - " keywords\n", - " external_ids\n", - " education\n", - " employment\n", + " ...\n", " n_works\n", " works_source\n", + " primary_email_domain\n", + " other_email_domains\n", + " n_emails\n", + " url_domains\n", + " n_urls\n", + " n_ids\n", + " n_keywords\n", + " label\n", " \n", " \n", " \n", @@ -883,16 +1025,21 @@ " NaN\n", " [[Personal website, https://andremann.github.i...\n", " andrea.mannocci@isti.cnr.it\n", - " NaN\n", - " [Data science , science of science, scholarly ...\n", - " [[Scopus Author ID, 55233589900]]\n", - " [[Information engineering, Ph.D., Università d...\n", - " [[Research Associate, Istituto di Scienza e Te...\n", + " ...\n", " 37\n", " [Scopus - Elsevier, Crossref Metadata Search, ...\n", + " isti.cnr.it\n", + " NaN\n", + " NaN\n", + " [github.io, twitter.com, linkedin.com]\n", + " 3.0\n", + " 1.0\n", + " 5.0\n", + " 1\n", " \n", " \n", "\n", + "

1 rows × 25 columns

\n", "" ], "text/plain": [ @@ -905,26 +1052,22 @@ " urls \\\n", "8840413 [[Personal website, https://andremann.github.i... \n", "\n", - " primary_email other_emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it NaN \n", + " primary_email ... n_works \\\n", + "8840413 andrea.mannocci@isti.cnr.it ... 37 \n", "\n", - " keywords \\\n", - "8840413 [Data science , science of science, scholarly ... \n", + " works_source \\\n", + "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... \n", "\n", - " external_ids \\\n", - "8840413 [[Scopus Author ID, 55233589900]] \n", + " primary_email_domain other_email_domains n_emails \\\n", + "8840413 isti.cnr.it NaN NaN \n", "\n", - " education \\\n", - "8840413 [[Information engineering, Ph.D., Università d... \n", + " url_domains n_urls n_ids n_keywords label \n", + "8840413 [github.io, twitter.com, linkedin.com] 3.0 1.0 5.0 1 \n", "\n", - " employment n_works \\\n", - "8840413 [[Research Associate, Istituto di Scienza e Te... 37 \n", - "\n", - " works_source \n", - "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... " + "[1 rows x 25 columns]" ] }, - "execution_count": 17, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } @@ -935,7 +1078,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 112, "metadata": {}, "outputs": [ { @@ -969,13 +1112,17 @@ " other_names\n", " urls\n", " primary_email\n", - " other_emails\n", - " keywords\n", - " external_ids\n", - " education\n", - " employment\n", + " ...\n", " n_works\n", " works_source\n", + " primary_email_domain\n", + " other_email_domains\n", + " n_emails\n", + " url_domains\n", + " n_urls\n", + " n_ids\n", + " n_keywords\n", + " label\n", " \n", " \n", " \n", @@ -991,16 +1138,21 @@ " NaN\n", " [[Otherwhatsapp, https://otherwhatsapp.com/], ...\n", " NaN\n", - " NaN\n", - " [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba...\n", - " NaN\n", - " NaN\n", - " NaN\n", + " ...\n", " 0\n", " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " [otherwhatsapp.com, im-creator.com, facebook.c...\n", + " 27.0\n", + " NaN\n", + " 4.0\n", + " 0\n", " \n", " \n", "\n", + "

1 rows × 25 columns

\n", "" ], "text/plain": [ @@ -1010,17 +1162,22 @@ " given_names family_name biography other_names \\\n", "9517099 other whatsapp NaN NaN \n", "\n", - " urls primary_email \\\n", - "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN \n", + " urls primary_email ... \\\n", + "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n", "\n", - " other_emails keywords \\\n", - "9517099 NaN [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n", + " n_works works_source primary_email_domain other_email_domains \\\n", + "9517099 0 NaN NaN NaN \n", "\n", - " external_ids education employment n_works works_source \n", - "9517099 NaN NaN NaN 0 NaN " + " n_emails url_domains n_urls \\\n", + "9517099 NaN [otherwhatsapp.com, im-creator.com, facebook.c... 27.0 \n", + "\n", + " n_ids n_keywords label \n", + "9517099 NaN 4.0 0 \n", + "\n", + "[1 rows x 25 columns]" ] }, - "execution_count": 18, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -1031,18 +1188,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "orcid 10744622\n", - "claimed 10744622\n", - "verified_email 10744622\n", - "verified_primary_email 10744622\n", - "given_names 10716789\n", - "family_name 10437094\n", + "orcid 10744621\n", + "claimed 10744621\n", + "verified_email 10744621\n", + "verified_primary_email 10744621\n", + "given_names 10716788\n", + "family_name 10437093\n", "biography 333885\n", "other_names 544550\n", "urls 688262\n", @@ -1050,14 +1207,22 @@ "other_emails 47470\n", "keywords 638634\n", "external_ids 1285292\n", - "education 2402440\n", - "employment 2626670\n", - "n_works 10744622\n", - "works_source 2671906\n", + "education 2402439\n", + "employment 2626669\n", + "n_works 10744621\n", + "works_source 2671905\n", + "primary_email_domain 121476\n", + "other_email_domains 47470\n", + "n_emails 47470\n", + "url_domains 688262\n", + "n_urls 688262\n", + "n_ids 1285292\n", + "n_keywords 638634\n", + "label 10744621\n", "dtype: int64" ] }, - "execution_count": 19, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -1068,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -1102,13 +1267,17 @@ " other_names\n", " urls\n", " primary_email\n", - " other_emails\n", - " keywords\n", - " external_ids\n", - " education\n", - " employment\n", + " ...\n", " n_works\n", " works_source\n", + " primary_email_domain\n", + " other_email_domains\n", + " n_emails\n", + " url_domains\n", + " n_urls\n", + " n_ids\n", + " n_keywords\n", + " label\n", " \n", " \n", " \n", @@ -1124,22 +1293,9 @@ " NaN\n", " NaN\n", " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", - " [[Energy Systems Engineering , Doctoral, Cypru...\n", - " [[, University of Electronic Science and Techn...\n", + " ...\n", " 3\n", " [Multidisciplinary Digital Publishing Institut...\n", - " \n", - " \n", - " 4595264\n", - " 0000-0002-5154-6404\n", - " 1\n", - " 1\n", - " 1\n", - " Olusola\n", - " Bamisile\n", " NaN\n", " NaN\n", " NaN\n", @@ -1147,42 +1303,33 @@ " NaN\n", " NaN\n", " NaN\n", - " [[Energy Systems Engineering , Doctoral, Cypru...\n", - " [[, University of Electronic Science and Techn...\n", - " 2\n", - " [Crossref]\n", + " False\n", " \n", " \n", "\n", + "

1 rows × 25 columns

\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", "4595263 0000-0002-5154-6404 1 1 1 \n", - "4595264 0000-0002-5154-6404 1 1 1 \n", "\n", - " given_names family_name biography other_names urls primary_email \\\n", - "4595263 Olusola Bamisile NaN NaN NaN NaN \n", - "4595264 Olusola Bamisile NaN NaN NaN NaN \n", + " given_names family_name biography other_names urls primary_email ... \\\n", + "4595263 Olusola Bamisile NaN NaN NaN NaN ... \n", "\n", - " other_emails keywords external_ids \\\n", - "4595263 NaN NaN NaN \n", - "4595264 NaN NaN NaN \n", + " n_works works_source \\\n", + "4595263 3 [Multidisciplinary Digital Publishing Institut... \n", "\n", - " education \\\n", - "4595263 [[Energy Systems Engineering , Doctoral, Cypru... \n", - "4595264 [[Energy Systems Engineering , Doctoral, Cypru... \n", + " primary_email_domain other_email_domains n_emails url_domains n_urls \\\n", + "4595263 NaN NaN NaN NaN NaN \n", "\n", - " employment n_works \\\n", - "4595263 [[, University of Electronic Science and Techn... 3 \n", - "4595264 [[, University of Electronic Science and Techn... 2 \n", + " n_ids n_keywords label \n", + "4595263 NaN NaN False \n", "\n", - " works_source \n", - "4595263 [Multidisciplinary Digital Publishing Institut... \n", - "4595264 [Crossref] " + "[1 rows x 25 columns]" ] }, - "execution_count": 20, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -10197,7 +10344,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 92, "metadata": {}, "outputs": [ { @@ -11125,7 +11272,7 @@ } }, "title": { - "text": "IDs provided" + "text": "IDs provided by providers" }, "xaxis": { "tickangle": 45, @@ -11136,9 +11283,9 @@ } }, "text/html": [ - "