From 25d225dd5fae16c7916c29c170bf2e903ff16eca Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Mon, 29 Mar 2021 15:58:37 +0200 Subject: [PATCH] importing mirima analysis to main notebook --- notebooks/01-Exploration.ipynb | 2498 +++++++++++++++++++++++++++----- 1 file changed, 2166 insertions(+), 332 deletions(-) diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb index 6fc55a8..843eca0 100644 --- a/notebooks/01-Exploration.ipynb +++ b/notebooks/01-Exploration.ipynb @@ -20,7 +20,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 76, "metadata": {}, "outputs": [ { @@ -84,7 +84,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 77, "metadata": {}, "outputs": [], "source": [ @@ -101,13 +101,14 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 78, "metadata": {}, "outputs": [], "source": [ "JOURNAL = '0000-0003-1815-5732'\n", "NOINFO = '0000-0001-5009-2052'\n", "VALID_NO_OA = '0000-0002-5154-6404' # True profile, but not in OpenAIRE\n", + "WORK_MISUSE = '0000-0001-7870-1120'\n", "# todo: find group-shared ORCiD, if possible" ] }, @@ -120,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 79, "metadata": {}, "outputs": [], "source": [ @@ -142,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 80, "metadata": {}, "outputs": [ { @@ -202,13 +203,13 @@ " \n", " 0\n", " 0000-0001-6097-3953\n", - " 0\n", - " 0\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " False\n", + " False\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -235,13 +236,13 @@ " \n", " 1\n", " 0000-0001-6112-5550\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " [v.i. yurtaev; v. yurtaev]\n", - " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -268,13 +269,13 @@ " \n", " 2\n", " 0000-0001-6152-2695\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -301,13 +302,13 @@ " \n", " 3\n", " 0000-0001-6220-5683\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -334,13 +335,13 @@ " \n", " 4\n", " 0000-0001-7071-8294\n", - " 1\n", - " 1\n", - " NaN\n", - " NaN\n", - " NaN\n", - " NaN\n", + " True\n", + " True\n", + " <NA>\n", + " <NA>\n", + " <NA>\n", " NaN\n", + " <NA>\n", " NaN\n", " NaN\n", " NaN\n", @@ -370,18 +371,18 @@ ], "text/plain": [ " orcid verified_email verified_primary_email given_names \\\n", - "0 0000-0001-6097-3953 0 0 NaN \n", - "1 0000-0001-6112-5550 1 1 NaN \n", - "2 0000-0001-6152-2695 1 1 NaN \n", - "3 0000-0001-6220-5683 1 1 NaN \n", - "4 0000-0001-7071-8294 1 1 NaN \n", + "0 0000-0001-6097-3953 False False \n", + "1 0000-0001-6112-5550 True True \n", + "2 0000-0001-6152-2695 True True \n", + "3 0000-0001-6220-5683 True True \n", + "4 0000-0001-7071-8294 True True \n", "\n", " family_name biography other_names primary_email keywords \\\n", - "0 NaN NaN NaN NaN NaN \n", - "1 NaN NaN [v.i. yurtaev; v. yurtaev] NaN NaN \n", - "2 NaN NaN NaN NaN NaN \n", - "3 NaN NaN NaN NaN NaN \n", - "4 NaN NaN NaN NaN NaN \n", + "0 NaN NaN \n", + "1 [v.i. yurtaev; v. yurtaev] NaN \n", + "2 NaN NaN \n", + "3 NaN NaN \n", + "4 NaN NaN \n", "\n", " external_ids education employment \\\n", "0 NaN NaN NaN \n", @@ -419,7 +420,7 @@ "4 NaN 2.0 " ] }, - "execution_count": 5, + "execution_count": 80, "metadata": {}, "output_type": "execute_result" } @@ -777,7 +778,7 @@ "text/plain": [ "count 10989649\n", "unique 10989649\n", - "top 0000-0002-7963-4502\n", + "top 0000-0001-7886-4851\n", "freq 1\n", "Name: orcid, dtype: object" ] @@ -806,10 +807,10 @@ { "data": { "text/plain": [ - "count 124722\n", - "unique 124718\n", - "top opercin@erbakan.edu.tr\n", - "freq 2\n", + "count 124722\n", + "unique 124718\n", + "top maykin@owasp.org\n", + "freq 2\n", "Name: primary_email, dtype: object" ] }, @@ -2412,9 +2413,9 @@ } }, "text/html": [ - "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "data = [\n", + " go.Bar(\n", + " x=grouped_ext_sources[:30].ext_works_source,\n", + " y=grouped_ext_sources[:30].orcid\n", + " )\n", + "]\n", + "\n", + "layout = go.Layout(\n", + " title='Top 30 works_source',\n", + " xaxis=dict(tickangle=45, tickfont=dict(size=12))\n", + ")\n", + "fig = go.Figure(data=data, layout=layout)\n", + "plotly.offline.iplot(fig)" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ext_works_sourceorcid
0crossref1460841
1scopus - elsevier902231
2crossref metadata search297684
3multidisciplinary digital publishing institute281664
4europe pubmed central181605
.........
337uta - oa journal global insight3
338francis crick institute3
339anna3
340santos3
341universitäts- und stadtbibliothek köln3
\n", + "

342 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " ext_works_source orcid\n", + "0 crossref 1460841\n", + "1 scopus - elsevier 902231\n", + "2 crossref metadata search 297684\n", + "3 multidisciplinary digital publishing institute 281664\n", + "4 europe pubmed central 181605\n", + ".. ... ...\n", + "337 uta - oa journal global insight 3\n", + "338 francis crick institute 3\n", + "339 anna 3\n", + "340 santos 3\n", + "341 universitäts- und stadtbibliothek köln 3\n", + "\n", + "[342 rows x 2 columns]" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "authoritative_sources = grouped_ext_sources[grouped_ext_sources['orcid'] > 2]\n", + "authoritative_sources" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [], + "source": [ + "exploded_external_sources['authoritative'] = exploded_external_sources.ext_works_source\\\n", + " .isin(authoritative_sources['ext_works_source'])" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [], + "source": [ + "orcid_authoritative_source = exploded_external_sources\\\n", + " .groupby('orcid')['authoritative']\\\n", + " .any()\\\n", + " .reset_index()[['orcid', 'authoritative']]" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.set_index('orcid').join(orcid_authoritative_source.set_index('orcid')).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [], + "source": [ + "df.loc[df.authoritative.isna(), 'authoritative'] = False" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, "outputs": [ { "data": { @@ -10624,6 +11741,357 @@ " n_keywords\n", " n_education\n", " n_employment\n", + " ext_works_source\n", + " n_ext_work_source\n", + " authoritative\n", + " \n", + " \n", + " \n", + " \n", + " 0\n", + " 0000-0001-6097-3953\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0\n", + " NaN\n", + " 2018-03-02t09:29:16.528z\n", + " 2018-03-02t09:43:07.551z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " False\n", + " \n", + " \n", + " 1\n", + " 0000-0001-6112-5550\n", + " 1\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " [v.i. yurtaev; v. yurtaev]\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " [[professor, peoples friendship university of ...\n", + " 0\n", + " NaN\n", + " 2018-04-03t07:50:23.358z\n", + " 2020-03-18t09:42:44.753z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " False\n", + " \n", + " \n", + " 2\n", + " 0000-0001-6152-2695\n", + " 1\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 0\n", + " NaN\n", + " 2019-12-11t15:31:56.388z\n", + " 2020-01-28t15:34:17.309z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " False\n", + " \n", + " \n", + " 3\n", + " 0000-0001-6220-5683\n", + " 1\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " [[research scientist, new york university abu ...\n", + " 0\n", + " NaN\n", + " 2015-08-18t12:36:45.307z\n", + " 2020-09-23t13:37:54.180z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 1.0\n", + " NaN\n", + " NaN\n", + " False\n", + " \n", + " \n", + " 4\n", + " 0000-0001-7071-8294\n", + " 1\n", + " 1\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " [[researcher (academic), universidad de zarago...\n", + " 0\n", + " NaN\n", + " 2014-03-10t13:22:01.966z\n", + " 2016-06-14t22:17:54.470z\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " 0\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " 2.0\n", + " NaN\n", + " NaN\n", + " False\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " orcid verified_email verified_primary_email given_names \\\n", + "0 0000-0001-6097-3953 0 0 NaN \n", + "1 0000-0001-6112-5550 1 1 NaN \n", + "2 0000-0001-6152-2695 1 1 NaN \n", + "3 0000-0001-6220-5683 1 1 NaN \n", + "4 0000-0001-7071-8294 1 1 NaN \n", + "\n", + " family_name biography other_names primary_email keywords \\\n", + "0 NaN NaN NaN NaN NaN \n", + "1 NaN NaN [v.i. yurtaev; v. yurtaev] NaN NaN \n", + "2 NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN \n", + "\n", + " external_ids education employment \\\n", + "0 NaN NaN NaN \n", + "1 NaN NaN [[professor, peoples friendship university of ... \n", + "2 NaN NaN NaN \n", + "3 NaN NaN [[research scientist, new york university abu ... \n", + "4 NaN NaN [[researcher (academic), universidad de zarago... \n", + "\n", + " n_works works_source activation_date last_update_date \\\n", + "0 0 NaN 2018-03-02t09:29:16.528z 2018-03-02t09:43:07.551z \n", + "1 0 NaN 2018-04-03t07:50:23.358z 2020-03-18t09:42:44.753z \n", + "2 0 NaN 2019-12-11t15:31:56.388z 2020-01-28t15:34:17.309z \n", + "3 0 NaN 2015-08-18t12:36:45.307z 2020-09-23t13:37:54.180z \n", + "4 0 NaN 2014-03-10t13:22:01.966z 2016-06-14t22:17:54.470z \n", + "\n", + " n_doi n_arxiv n_pmc n_other_pids label primary_email_domain \\\n", + "0 0 0 0 0 0 NaN \n", + "1 0 0 0 0 0 NaN \n", + "2 0 0 0 0 0 NaN \n", + "3 0 0 0 0 0 NaN \n", + "4 0 0 0 0 0 NaN \n", + "\n", + " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n", + "0 NaN NaN NaN NaN NaN NaN \n", + "1 NaN NaN NaN NaN NaN NaN \n", + "2 NaN NaN NaN NaN NaN NaN \n", + "3 NaN NaN NaN NaN NaN NaN \n", + "4 NaN NaN NaN NaN NaN NaN \n", + "\n", + " n_education n_employment ext_works_source n_ext_work_source authoritative \n", + "0 NaN NaN NaN NaN False \n", + "1 NaN 1.0 NaN NaN False \n", + "2 NaN NaN NaN NaN False \n", + "3 NaN 1.0 NaN NaN False \n", + "4 NaN 2.0 NaN NaN False " + ] + }, + "execution_count": 44, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## External IDs" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "External IDs should come from reliable sources. ORCiD registrants cannot add them freely." + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 1.308598e+06\n", + "mean 1.359082e+00\n", + "std 6.643235e-01\n", + "min 1.000000e+00\n", + "25% 1.000000e+00\n", + "50% 1.000000e+00\n", + "75% 2.000000e+00\n", + "max 8.000000e+01\n", + "Name: n_ids, dtype: float64" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.n_ids.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", @@ -10659,6 +12127,9 @@ " \n", " \n", " \n", + " \n", + " \n", + " \n", " \n", " \n", "
orcidverified_emailverified_primary_emailgiven_namesfamily_namebiographyother_namesprimary_emailkeywordsexternal_idseducationemploymentn_worksworks_sourceactivation_datelast_update_daten_doin_arxivn_pmcn_other_pidslabelprimary_email_domainother_email_domainsurl_domainsn_emailsn_urlsn_idsn_keywordsn_educationn_employmentext_works_sourcen_ext_work_sourceauthoritative
NaNNaN1.0[aston research explorer]1.0True
\n", @@ -10686,11 +12157,14 @@ " primary_email_domain other_email_domains url_domains n_emails \\\n", "3896226 NaN NaN [aston.ac.uk] NaN \n", "\n", - " n_urls n_ids n_keywords n_education n_employment \n", - "3896226 1.0 80.0 NaN NaN 1.0 " + " n_urls n_ids n_keywords n_education n_employment \\\n", + "3896226 1.0 80.0 NaN NaN 1.0 \n", + "\n", + " ext_works_source n_ext_work_source authoritative \n", + "3896226 [aston research explorer] 1.0 True " ] }, - "execution_count": 34, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -10701,7 +12175,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": 47, "metadata": {}, "outputs": [], "source": [ @@ -10710,7 +12184,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": 48, "metadata": {}, "outputs": [], "source": [ @@ -10719,7 +12193,7 @@ }, { "cell_type": "code", - "execution_count": 37, + "execution_count": 49, "metadata": {}, "outputs": [ { @@ -10792,7 +12266,7 @@ "64 0000-0002-7397-5824 [scopus author id, 8399842800] scopus author id" ] }, - "execution_count": 37, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -10803,7 +12277,7 @@ }, { "cell_type": "code", - "execution_count": 38, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ @@ -10812,7 +12286,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -11745,9 +13219,9 @@ } }, "text/html": [ - "