diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 7395104..0f8d755 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -94,12 +94,13 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 105, "metadata": {}, "outputs": [], "source": [ "JOURNAL = '0000-0003-1815-5732'\n", - "NOINFO= '0000-0001-5009-2052'\n", + "NOINFO = '0000-0001-5009-2052'\n", + "VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n", "# todo: find group-shared ORCiD, if possible" ] }, @@ -134,12 +135,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv('../data/raw/initial_info_whole.tsv', sep='\\t', header=0,\n", - " names = ['orcid', 'claimed','verified_email', 'verified_primary_email', \n", + " names=['orcid', 'claimed','verified_email', 'verified_primary_email', \n", " 'given_names', 'family_name', 'biography', 'other_names', 'urls', \n", " 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', \n", " 'employment', 'n_works', 'works_source'])" @@ -575,7 +576,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 164, "metadata": {}, "outputs": [], "source": [ @@ -636,6 +637,13 @@ "df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "That's how it looks like" + ] + }, { "cell_type": "code", "execution_count": 16, @@ -825,9 +833,139 @@ "df.head(5)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Add labels to the dataframe based on their presence in OpenAIRE" + ] + }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | orcid | \n", + "
---|---|
0 | \n", + "0000-0002-4292-8072 | \n", + "
1 | \n", + "0000-0003-3026-718x | \n", + "
2 | \n", + "0000-0002-4568-8748 | \n", + "
3 | \n", + "0000-0002-1380-859x | \n", + "
4 | \n", + "0000-0001-7526-5095 | \n", + "
... | \n", + "... | \n", + "
2934111 | \n", + "0000-0002-2394-5387 | \n", + "
2934112 | \n", + "0000-0002-6125-3170 | \n", + "
2934113 | \n", + "0000-0002-7062-5845 | \n", + "
2934114 | \n", + "0000-0001-8677-5162 | \n", + "
2934115 | \n", + "0000-0002-0240-8021 | \n", + "
2934116 rows × 1 columns
\n", + "1 rows × 25 columns
\n", "" ], "text/plain": [ @@ -905,26 +1052,22 @@ " urls \\\n", "8840413 [[Personal website, https://andremann.github.i... \n", "\n", - " primary_email other_emails \\\n", - "8840413 andrea.mannocci@isti.cnr.it NaN \n", + " primary_email ... n_works \\\n", + "8840413 andrea.mannocci@isti.cnr.it ... 37 \n", "\n", - " keywords \\\n", - "8840413 [Data science , science of science, scholarly ... \n", + " works_source \\\n", + "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... \n", "\n", - " external_ids \\\n", - "8840413 [[Scopus Author ID, 55233589900]] \n", + " primary_email_domain other_email_domains n_emails \\\n", + "8840413 isti.cnr.it NaN NaN \n", "\n", - " education \\\n", - "8840413 [[Information engineering, Ph.D., Università d... \n", + " url_domains n_urls n_ids n_keywords label \n", + "8840413 [github.io, twitter.com, linkedin.com] 3.0 1.0 5.0 1 \n", "\n", - " employment n_works \\\n", - "8840413 [[Research Associate, Istituto di Scienza e Te... 37 \n", - "\n", - " works_source \n", - "8840413 [Scopus - Elsevier, Crossref Metadata Search, ... " + "[1 rows x 25 columns]" ] }, - "execution_count": 17, + "execution_count": 111, "metadata": {}, "output_type": "execute_result" } @@ -935,7 +1078,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 112, "metadata": {}, "outputs": [ { @@ -969,13 +1112,17 @@ "1 rows × 25 columns
\n", "" ], "text/plain": [ @@ -1010,17 +1162,22 @@ " given_names family_name biography other_names \\\n", "9517099 other whatsapp NaN NaN \n", "\n", - " urls primary_email \\\n", - "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN \n", + " urls primary_email ... \\\n", + "9517099 [[Otherwhatsapp, https://otherwhatsapp.com/], ... NaN ... \n", "\n", - " other_emails keywords \\\n", - "9517099 NaN [Whatsapp GB, whatsapp gb 2020, whatsapp gb ba... \n", + " n_works works_source primary_email_domain other_email_domains \\\n", + "9517099 0 NaN NaN NaN \n", "\n", - " external_ids education employment n_works works_source \n", - "9517099 NaN NaN NaN 0 NaN " + " n_emails url_domains n_urls \\\n", + "9517099 NaN [otherwhatsapp.com, im-creator.com, facebook.c... 27.0 \n", + "\n", + " n_ids n_keywords label \n", + "9517099 NaN 4.0 0 \n", + "\n", + "[1 rows x 25 columns]" ] }, - "execution_count": 18, + "execution_count": 112, "metadata": {}, "output_type": "execute_result" } @@ -1031,18 +1188,18 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 103, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "orcid 10744622\n", - "claimed 10744622\n", - "verified_email 10744622\n", - "verified_primary_email 10744622\n", - "given_names 10716789\n", - "family_name 10437094\n", + "orcid 10744621\n", + "claimed 10744621\n", + "verified_email 10744621\n", + "verified_primary_email 10744621\n", + "given_names 10716788\n", + "family_name 10437093\n", "biography 333885\n", "other_names 544550\n", "urls 688262\n", @@ -1050,14 +1207,22 @@ "other_emails 47470\n", "keywords 638634\n", "external_ids 1285292\n", - "education 2402440\n", - "employment 2626670\n", - "n_works 10744622\n", - "works_source 2671906\n", + "education 2402439\n", + "employment 2626669\n", + "n_works 10744621\n", + "works_source 2671905\n", + "primary_email_domain 121476\n", + "other_email_domains 47470\n", + "n_emails 47470\n", + "url_domains 688262\n", + "n_urls 688262\n", + "n_ids 1285292\n", + "n_keywords 638634\n", + "label 10744621\n", "dtype: int64" ] }, - "execution_count": 19, + "execution_count": 103, "metadata": {}, "output_type": "execute_result" } @@ -1068,7 +1233,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 104, "metadata": {}, "outputs": [ { @@ -1102,13 +1267,17 @@ "1 rows × 25 columns
\n", "" ], "text/plain": [ " orcid claimed verified_email verified_primary_email \\\n", "4595263 0000-0002-5154-6404 1 1 1 \n", - "4595264 0000-0002-5154-6404 1 1 1 \n", "\n", - " given_names family_name biography other_names urls primary_email \\\n", - "4595263 Olusola Bamisile NaN NaN NaN NaN \n", - "4595264 Olusola Bamisile NaN NaN NaN NaN \n", + " given_names family_name biography other_names urls primary_email ... \\\n", + "4595263 Olusola Bamisile NaN NaN NaN NaN ... \n", "\n", - " other_emails keywords external_ids \\\n", - "4595263 NaN NaN NaN \n", - "4595264 NaN NaN NaN \n", + " n_works works_source \\\n", + "4595263 3 [Multidisciplinary Digital Publishing Institut... \n", "\n", - " education \\\n", - "4595263 [[Energy Systems Engineering , Doctoral, Cypru... \n", - "4595264 [[Energy Systems Engineering , Doctoral, Cypru... \n", + " primary_email_domain other_email_domains n_emails url_domains n_urls \\\n", + "4595263 NaN NaN NaN NaN NaN \n", "\n", - " employment n_works \\\n", - "4595263 [[, University of Electronic Science and Techn... 3 \n", - "4595264 [[, University of Electronic Science and Techn... 2 \n", + " n_ids n_keywords label \n", + "4595263 NaN NaN False \n", "\n", - " works_source \n", - "4595263 [Multidisciplinary Digital Publishing Institut... \n", - "4595264 [Crossref] " + "[1 rows x 25 columns]" ] }, - "execution_count": 20, + "execution_count": 104, "metadata": {}, "output_type": "execute_result" } @@ -10197,7 +10344,7 @@ }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 92, "metadata": {}, "outputs": [ { @@ -11125,7 +11272,7 @@ } }, "title": { - "text": "IDs provided" + "text": "IDs provided by providers" }, "xaxis": { "tickangle": 45, @@ -11136,9 +11283,9 @@ } }, "text/html": [ - "