fake-orcid-analysis/notebooks/01.1-MB-Exploration_WorksSo...

2968 lines
99 KiB
Plaintext
Raw Normal View History

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "subsequent-cornell",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import glob\n",
"\n",
"import pandas as pd\n",
"import ast\n",
"import tldextract\n",
"import numpy\n",
"\n",
"import plotly\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.graph_objs as go\n",
"import plotly.express as px\n",
"\n",
"init_notebook_mode(connected=True)\n",
"# Notebook-wide plot settings; set_top_n() rewrites both of them together.\n",
"TOP_N, TOP_RANGE = 0, [0, 0]\n",
"\n",
"\n",
"def set_top_n(n):\n",
"    \"\"\"Store `n` in TOP_N and the interval [-0.5, n - 1 + 0.5] in TOP_RANGE.\n",
"\n",
"    NOTE(review): the half-unit padding looks like an axis range framing `n`\n",
"    bars -- confirm against the cells that read TOP_RANGE.\n",
"    \"\"\"\n",
"    global TOP_N, TOP_RANGE\n",
"    lower_edge = -0.5\n",
"    upper_edge = n - 1 + 0.5\n",
"    TOP_N, TOP_RANGE = n, [lower_edge, upper_edge]"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "hydraulic-baker",
"metadata": {},
"outputs": [],
"source": [
"# Dataset shards live in the repository's data/processed folder. The pattern is\n",
"# relative so the notebook is not tied to one user's home directory.\n",
"# NOTE(review): assumes the kernel's working directory is notebooks/ -- confirm.\n",
"DATASET_GLOB = '../data/processed/dataset.pkl.*'\n",
"\n",
"# glob returns matches in arbitrary order; sort so the load order is reproducible.\n",
"parts = sorted(glob.glob(DATASET_GLOB))\n",
"if not parts:\n",
"    raise FileNotFoundError('no dataset shards match ' + DATASET_GLOB)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "lesbian-routine",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>given_names</th>\n",
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
" <th>other_emails</th>\n",
" <th>...</th>\n",
" <th>employment</th>\n",
" <th>n_works</th>\n",
" <th>works_source</th>\n",
" <th>activation_date</th>\n",
" <th>last_update_date</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>10000000</th>\n",
" <td>0000-0002-7790-0483</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>abel</td>\n",
" <td>elias</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-16t16:51:54.155z</td>\n",
" <td>2020-09-16t17:00:08.451z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000001</th>\n",
" <td>0000-0001-6368-0531</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>abelardo</td>\n",
" <td>ramirez</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2017-05-10t19:28:13.217z</td>\n",
" <td>2017-05-10t19:28:17.315z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000002</th>\n",
" <td>0000-0001-8149-4900</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abelardo</td>\n",
" <td>mancinas</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>[[profesor investigador, instituto tecnológico...</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2018-10-15t21:46:52.162z</td>\n",
" <td>2020-01-13t03:33:47.645z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000003</th>\n",
" <td>0000-0002-8684-2422</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>abera</td>\n",
" <td>nigussie</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2020-09-23t08:36:17.451z</td>\n",
" <td>2020-09-23t08:36:17.450z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10000004</th>\n",
" <td>0000-0003-4814-7872</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abhijeet</td>\n",
" <td>singh</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>NaN</td>\n",
" <td>0</td>\n",
" <td>NaN</td>\n",
" <td>2018-05-01t22:43:17.407z</td>\n",
" <td>2018-10-06t22:21:54.024z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 23 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"10000000 0000-0002-7790-0483 1 0 \n",
"10000001 0000-0001-6368-0531 0 0 \n",
"10000002 0000-0001-8149-4900 1 1 \n",
"10000003 0000-0002-8684-2422 0 0 \n",
"10000004 0000-0003-4814-7872 1 1 \n",
"\n",
" given_names family_name biography other_names urls primary_email \\\n",
"10000000 abel elias NaN NaN NaN NaN \n",
"10000001 abelardo ramirez NaN NaN NaN NaN \n",
"10000002 abelardo mancinas NaN NaN NaN NaN \n",
"10000003 abera nigussie NaN NaN NaN NaN \n",
"10000004 abhijeet singh NaN NaN NaN NaN \n",
"\n",
" other_emails ... employment \\\n",
"10000000 NaN ... NaN \n",
"10000001 NaN ... NaN \n",
"10000002 NaN ... [[profesor investigador, instituto tecnológico... \n",
"10000003 NaN ... NaN \n",
"10000004 NaN ... NaN \n",
"\n",
" n_works works_source activation_date \\\n",
"10000000 0 NaN 2020-09-16t16:51:54.155z \n",
"10000001 0 NaN 2017-05-10t19:28:13.217z \n",
"10000002 0 NaN 2018-10-15t21:46:52.162z \n",
"10000003 0 NaN 2020-09-23t08:36:17.451z \n",
"10000004 0 NaN 2018-05-01t22:43:17.407z \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids label \n",
"10000000 2020-09-16t17:00:08.451z 0 0 0 0 0 \n",
"10000001 2017-05-10t19:28:17.315z 0 0 0 0 0 \n",
"10000002 2020-01-13t03:33:47.645z 0 0 0 0 0 \n",
"10000003 2020-09-23t08:36:17.450z 0 0 0 0 0 \n",
"10000004 2018-10-06t22:21:54.024z 0 0 0 0 0 \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Read every pickled shard and stack them into a single DataFrame.\n",
"# NOTE(review): unpickling can execute arbitrary code -- only load shards\n",
"# produced by this project's own pipeline.\n",
"df = pd.concat([pd.read_pickle(shard_path) for shard_path in parts])\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "olympic-missile",
"metadata": {},
"outputs": [],
"source": [
"def remove_own_source(lst, given, family):\n",
"    \"\"\"Return the entries of `lst` that do not mention the owner's own name.\n",
"\n",
"    An entry is dropped when its lower-cased text contains the lower-cased\n",
"    given name or the lower-cased family name as a substring.\n",
"\n",
"    Parameters\n",
"    ----------\n",
"    lst : list of str, or any other value\n",
"        Source names to filter. A non-list value (e.g. NaN) yields `[]`.\n",
"    given, family : str, or a missing value\n",
"        Names to look for. A name that is not a string, or is blank, is\n",
"        skipped instead of being matched.\n",
"\n",
"    Returns\n",
"    -------\n",
"    list\n",
"        The surviving entries, in their original order.\n",
"    \"\"\"\n",
"    if not isinstance(lst, list):\n",
"        return []\n",
"\n",
"    # Skip unusable names: an empty needle is a substring of every source and\n",
"    # would wipe the whole list, and a missing given name must not discard\n",
"    # sources that the family-name check alone would keep.\n",
"    own_names = [\n",
"        name.lower()\n",
"        for name in (given, family)\n",
"        if isinstance(name, str) and name.strip()\n",
"    ]\n",
"    return [\n",
"        ws for ws in lst\n",
"        if not any(own_name in ws.lower() for own_name in own_names)\n",
"    ]"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "informational-carrier",
"metadata": {},
"outputs": [],
"source": [
"# For each profile, keep only the works sources that remove_own_source() lets through.\n",
"df['ext_works_source'] = [\n",
"    remove_own_source(sources, given, family)\n",
"    for sources, given, family in zip(df['works_source'], df['given_names'], df['family_name'])\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "hydraulic-pharmaceutical",
"metadata": {},
"outputs": [],
"source": [
"# Length of each profile's ext_works_source list.\n",
"df['n_ext_work_source'] = df['ext_works_source'].str.len()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "tropical-stockholm",
"metadata": {},
"outputs": [],
"source": [
"# One row per (orcid, source) pair, restricted to profiles whose source list is non-empty.\n",
"exploded_external_sources = (\n",
"    df.loc[df['ext_works_source'].str.len() > 0, ['orcid', 'ext_works_source']]\n",
"    .explode('ext_works_source')\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "enhanced-blanket",
"metadata": {},
"outputs": [],
"source": [
"# Row count per source, most frequent first; the count ends up in the 'orcid' column.\n",
"grouped_ext_sources = (\n",
"    exploded_external_sources\n",
"    .groupby('ext_works_source')\n",
"    .count()\n",
"    .sort_values('orcid', ascending=False)\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "black-congo",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"linkText": "Export to plot.ly",
"plotlyServerURL": "https://plot.ly",
"showLink": false
},
"data": [
{
"type": "bar",
"x": [
"crossref",
"scopus - elsevier",
"crossref metadata search",
"multidisciplinary digital publishing institute",
"europe pubmed central",
"researcherid",
"publons",
"ciênciavitae",
"base - bielefeld academic search engine",
"datacite",
"redalyc",
"mla international bibliography",
"deutsche nationalbibliothek (dnb)",
"nasa astrophysics data system",
"national information processing institute ",
"f1000",
"inspire-hep",
"university of helsinki",
"hal",
"igi global",
"airiti",
"university of copenhagen",
"universidade federal de uberlândia",
"aarhus university",
"universidad del país vasco",
"university of manchester - pure",
"kings college london",
"university of southern denmark",
"wellcome open research",
"macquarie university"
],
"y": [
1460841,
902231,
297684,
281664,
181605,
158148,
39786,
32315,
20699,
16107,
9640,
8059,
7855,
7403,
6509,
5221,
4872,
4152,
4136,
3833,
3725,
3127,
2718,
2311,
2271,
2227,
2199,
2185,
2113,
2053
]
}
],
"layout": {
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top 30 works_source"
},
"xaxis": {
"tickangle": 45,
"tickfont": {
"size": 12
}
}
}
},
"text/html": [
"<div> <div id=\"a5847718-ded6-45aa-a6c0-03dac82d5597\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"a5847718-ded6-45aa-a6c0-03dac82d5597\")) { Plotly.newPlot( \"a5847718-ded6-45aa-a6c0-03dac82d5597\", [{\"type\": \"bar\", \"x\": [\"crossref\", \"scopus - elsevier\", \"crossref metadata search\", \"multidisciplinary digital publishing institute\", \"europe pubmed central\", \"researcherid\", \"publons\", \"ci\\u00eanciavitae\", \"base - bielefeld academic search engine\", \"datacite\", \"redalyc\", \"mla international bibliography\", \"deutsche nationalbibliothek (dnb)\", \"nasa astrophysics data system\", \"national information processing institute \", \"f1000\", \"inspire-hep\", \"university of helsinki\", \"hal\", \"igi global\", \"airiti\", \"university of copenhagen\", \"universidade federal de uberl\\u00e2ndia\", \"aarhus university\", \"universidad del pa\\u00eds vasco\", \"university of manchester - pure\", \"kings college london\", \"university of southern denmark\", \"wellcome open research\", \"macquarie university\"], \"y\": [1460841, 902231, 297684, 281664, 181605, 158148, 39786, 32315, 20699, 16107, 9640, 8059, 7855, 7403, 6509, 5221, 4872, 4152, 4136, 3833, 3725, 3127, 2718, 2311, 2271, 2227, 2199, 2185, 2113, 2053]}], {\"template\": {\"data\": {\"bar\": [{\"error_x\": {\"color\": \"#2a3f5f\"}, \"error_y\": {\"color\": \"#2a3f5f\"}, \"marker\": {\"line\": {\"color\": \"#E5ECF6\", \"width\": 0.5}}, \"type\": \"bar\"}], \"barpolar\": [{\"marker\": {\"line\": {\"color\": \"#E5ECF6\", \"width\": 0.5}}, \"type\": \"barpolar\"}], \"carpet\": [{\"aaxis\": {\"endlinecolor\": \"#2a3f5f\", \"gridcolor\": \"white\", \"linecolor\": \"white\", \"minorgridcolor\": \"white\", \"startlinecolor\": \"#2a3f5f\"}, \"baxis\": {\"endlinecolor\": \"#2a3f5f\", \"gridcolor\": \"white\", \"linecolor\": 
\"white\", \"minorgridcolor\": \"white\", \"startlinecolor\": \"#2a3f5f\"}, \"type\": \"carpet\"}], \"choropleth\": [{\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}, \"type\": \"choropleth\"}], \"contour\": [{\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}, \"colorscale\": [[0.0, \"#0d0887\"], [0.1111111111111111, \"#46039f\"], [0.2222222222222222, \"#7201a8\"], [0.3333333333333333, \"#9c179e\"], [0.4444444444444444, \"#bd3786\"], [0.5555555555555556, \"#d8576b\"], [0.6666666666666666, \"#ed7953\"], [0.7777777777777778, \"#fb9f3a\"], [0.8888888888888888, \"#fdca26\"], [1.0, \"#f0f921\"]], \"type\": \"contour\"}], \"contourcarpet\": [{\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}, \"type\": \"contourcarpet\"}], \"heatmap\": [{\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}, \"colorscale\": [[0.0, \"#0d0887\"], [0.1111111111111111, \"#46039f\"], [0.2222222222222222, \"#7201a8\"], [0.3333333333333333, \"#9c179e\"], [0.4444444444444444, \"#bd3786\"], [0.5555555555555556, \"#d8576b\"], [0.6666666666666666, \"#ed7953\"], [0.7777777777777778, \"#fb9f3a\"], [0.8888888888888888, \"#fdca26\"], [1.0, \"#f0f921\"]], \"type\": \"heatmap\"}], \"heatmapgl\": [{\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}, \"colorscale\": [[0.0, \"#0d0887\"], [0.1111111111111111, \"#46039f\"], [0.2222222222222222, \"#7201a8\"], [0.3333333333333333, \"#9c179e\"], [0.4444444444444444, \"#bd3786\"], [0.5555555555555556, \"#d8576b\"], [0.6666666666666666, \"#ed7953\"], [0.7777777777777778, \"#fb9f3a\"], [0.8888888888888888, \"#fdca26\"], [1.0, \"#f0f921\"]], \"type\": \"heatmapgl\"}], \"histogram\": [{\"marker\": {\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}}, \"type\": \"histogram\"}], \"histogram2d\": [{\"colorbar\": {\"outlinewidth\": 0, \"ticks\": \"\"}, \"colorscale\": [[0.0, \"#0d0887\"], [0.1111111111111111, \"#46039f\"], [0.2222222222222222, \"#7201a8\"], [
" \n",
"var gd = document.getElementById('a5847718-ded6-45aa-a6c0-03dac82d5597');\n",
"var x = new MutationObserver(function (mutations, observer) {{\n",
" var display = window.getComputedStyle(gd).display;\n",
" if (!display || display === 'none') {{\n",
" console.log([gd, 'removed!']);\n",
" Plotly.purge(gd);\n",
" observer.disconnect();\n",
" }}\n",
"}});\n",
"\n",
"// Listen for the removal of the full notebook cells\n",
"var notebookContainer = gd.closest('#notebook-container');\n",
"if (notebookContainer) {{\n",
" x.observe(notebookContainer, {childList: true});\n",
"}}\n",
"\n",
"// Listen for the clearing of the current output cell\n",
"var outputEl = gd.closest('.output');\n",
"if (outputEl) {{\n",
" x.observe(outputEl, {childList: true});\n",
"}}\n",
"\n",
" }) }; }); </script> </div>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"\n",
"# Single place to change how many sources the chart shows; the slices and the\n",
"# title below all follow it.\n",
"n_top_sources = 30\n",
"top_sources = grouped_ext_sources.head(n_top_sources)\n",
"\n",
"data = [\n",
"    go.Bar(\n",
"        x=top_sources['ext_works_source'],\n",
"        y=top_sources['orcid']\n",
"    )\n",
"]\n",
"\n",
"layout = go.Layout(\n",
"    title='Top {} works_source'.format(n_top_sources),\n",
"    # Axis titles let the figure stand alone when the notebook is skimmed.\n",
"    xaxis=dict(title='works source', tickangle=45, tickfont=dict(size=12)),\n",
"    yaxis=dict(title='number of ORCID iDs')\n",
")\n",
"fig = go.Figure(data=data, layout=layout)\n",
"plotly.offline.iplot(fig)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "sophisticated-madness",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ext_works_source</th>\n",
" <th>orcid</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>crossref</td>\n",
" <td>1460841</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>scopus - elsevier</td>\n",
" <td>902231</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>crossref metadata search</td>\n",
" <td>297684</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>multidisciplinary digital publishing institute</td>\n",
" <td>281664</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>europe pubmed central</td>\n",
" <td>181605</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>337</th>\n",
" <td>uta - oa journal global insight</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>338</th>\n",
" <td>francis crick institute</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>339</th>\n",
" <td>anna</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>340</th>\n",
" <td>santos</td>\n",
" <td>3</td>\n",
" </tr>\n",
" <tr>\n",
" <th>341</th>\n",
" <td>universitäts- und stadtbibliothek köln</td>\n",
" <td>3</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>342 rows × 2 columns</p>\n",
"</div>"
],
"text/plain": [
" ext_works_source orcid\n",
"0 crossref 1460841\n",
"1 scopus - elsevier 902231\n",
"2 crossref metadata search 297684\n",
"3 multidisciplinary digital publishing institute 281664\n",
"4 europe pubmed central 181605\n",
".. ... ...\n",
"337 uta - oa journal global insight 3\n",
"338 francis crick institute 3\n",
"339 anna 3\n",
"340 santos 3\n",
"341 universitäts- und stadtbibliothek köln 3\n",
"\n",
"[342 rows x 2 columns]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep the sources whose count is greater than 2.\n",
"authoritative_sources = grouped_ext_sources.loc[grouped_ext_sources['orcid'].gt(2)]\n",
"authoritative_sources"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "fifty-translator",
"metadata": {},
"outputs": [],
"source": [
"# Flag each (orcid, source) row whose source is in authoritative_sources.\n",
"exploded_external_sources['authoritative'] = (\n",
"    exploded_external_sources['ext_works_source']\n",
"    .isin(authoritative_sources['ext_works_source'])\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 57,
"id": "sweet-silicon",
"metadata": {},
"outputs": [],
"source": [
"# One row per orcid: True when at least one of its sources is flagged authoritative.\n",
"# reset_index() already yields exactly the columns ['orcid', 'authoritative'].\n",
"orcid_authoritative_source = (\n",
"    exploded_external_sources\n",
"    .groupby('orcid')['authoritative']\n",
"    .any()\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 64,
"id": "iraqi-million",
"metadata": {},
"outputs": [],
"source": [
"# Attach the per-orcid flag to the main frame (left join on orcid).\n",
"# Dropping a previous 'authoritative' column first makes the cell safe to\n",
"# re-run: without it a second run fails because the joined column overlaps.\n",
"df = (\n",
"    df.drop(columns='authoritative', errors='ignore')\n",
"    .set_index('orcid')\n",
"    .join(orcid_authoritative_source.set_index('orcid'))\n",
"    .reset_index()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 65,
"id": "current-convergence",
"metadata": {},
"outputs": [],
"source": [
"# Rows where 'authoritative' is missing are set to False.\n",
"df.loc[df['authoritative'].isna(), 'authoritative'] = False"
]
},
{
"cell_type": "code",
"execution_count": 66,
"id": "median-smith",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>orcid</th>\n",
" <th>verified_email</th>\n",
" <th>verified_primary_email</th>\n",
" <th>given_names</th>\n",
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
" <th>other_emails</th>\n",
" <th>...</th>\n",
" <th>activation_date</th>\n",
" <th>last_update_date</th>\n",
" <th>n_doi</th>\n",
" <th>n_arxiv</th>\n",
" <th>n_pmc</th>\n",
" <th>n_other_pids</th>\n",
" <th>label</th>\n",
" <th>ext_works_source</th>\n",
" <th>n_ext_work_source</th>\n",
" <th>authoritative</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0000-0002-7790-0483</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>abel</td>\n",
" <td>elias</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2020-09-16t16:51:54.155z</td>\n",
" <td>2020-09-16t17:00:08.451z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0000-0001-6368-0531</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>abelardo</td>\n",
" <td>ramirez</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2017-05-10t19:28:13.217z</td>\n",
" <td>2017-05-10t19:28:17.315z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0000-0001-8149-4900</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abelardo</td>\n",
" <td>mancinas</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2018-10-15t21:46:52.162z</td>\n",
" <td>2020-01-13t03:33:47.645z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0000-0002-8684-2422</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>abera</td>\n",
" <td>nigussie</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2020-09-23t08:36:17.451z</td>\n",
" <td>2020-09-23t08:36:17.450z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0000-0003-4814-7872</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abhijeet</td>\n",
" <td>singh</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2018-05-01t22:43:17.407z</td>\n",
" <td>2018-10-06t22:21:54.024z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>...</th>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" <td>...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989644</th>\n",
" <td>0000-0001-7468-9881</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abeer</td>\n",
" <td>elbaroudi</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2020-02-06t15:04:42.485z</td>\n",
" <td>2020-02-06t15:16:45.537z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989645</th>\n",
" <td>0000-0003-0081-4285</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abeer</td>\n",
" <td>sohrab</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2020-05-12t22:39:26.356z</td>\n",
" <td>2020-05-12t22:41:45.239z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989646</th>\n",
" <td>0000-0003-2004-3457</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>abeer</td>\n",
" <td>abdelmaksoud</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2019-12-19t23:09:12.579z</td>\n",
" <td>2019-12-19t23:09:12.798z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989647</th>\n",
" <td>0000-0003-2841-9754</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>abeer</td>\n",
" <td>al-ghazali</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2019-06-02t18:35:32.973z</td>\n",
" <td>2019-08-05t14:54:41.796z</td>\n",
" <td>2</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>[crossref metadata search]</td>\n",
" <td>1</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10989648</th>\n",
" <td>0000-0002-3675-6876</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>abegail</td>\n",
" <td>palos-simbre</td>\n",
" <td>NaN</td>\n",
" <td>[gail]</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>...</td>\n",
" <td>2017-02-10t16:38:52.988z</td>\n",
" <td>2019-12-11t01:37:15.405z</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>[]</td>\n",
" <td>0</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>10989649 rows × 26 columns</p>\n",
"</div>"
],
"text/plain": [
" orcid verified_email verified_primary_email \\\n",
"0 0000-0002-7790-0483 1 0 \n",
"1 0000-0001-6368-0531 0 0 \n",
"2 0000-0001-8149-4900 1 1 \n",
"3 0000-0002-8684-2422 0 0 \n",
"4 0000-0003-4814-7872 1 1 \n",
"... ... ... ... \n",
"10989644 0000-0001-7468-9881 1 1 \n",
"10989645 0000-0003-0081-4285 1 1 \n",
"10989646 0000-0003-2004-3457 0 0 \n",
"10989647 0000-0003-2841-9754 1 1 \n",
"10989648 0000-0002-3675-6876 0 0 \n",
"\n",
" given_names family_name biography other_names urls primary_email \\\n",
"0 abel elias NaN NaN NaN NaN \n",
"1 abelardo ramirez NaN NaN NaN NaN \n",
"2 abelardo mancinas NaN NaN NaN NaN \n",
"3 abera nigussie NaN NaN NaN NaN \n",
"4 abhijeet singh NaN NaN NaN NaN \n",
"... ... ... ... ... ... ... \n",
"10989644 abeer elbaroudi NaN NaN NaN NaN \n",
"10989645 abeer sohrab NaN NaN NaN NaN \n",
"10989646 abeer abdelmaksoud NaN NaN NaN NaN \n",
"10989647 abeer al-ghazali NaN NaN NaN NaN \n",
"10989648 abegail palos-simbre NaN [gail] NaN NaN \n",
"\n",
" other_emails ... activation_date \\\n",
"0 NaN ... 2020-09-16t16:51:54.155z \n",
"1 NaN ... 2017-05-10t19:28:13.217z \n",
"2 NaN ... 2018-10-15t21:46:52.162z \n",
"3 NaN ... 2020-09-23t08:36:17.451z \n",
"4 NaN ... 2018-05-01t22:43:17.407z \n",
"... ... ... ... \n",
"10989644 NaN ... 2020-02-06t15:04:42.485z \n",
"10989645 NaN ... 2020-05-12t22:39:26.356z \n",
"10989646 NaN ... 2019-12-19t23:09:12.579z \n",
"10989647 NaN ... 2019-06-02t18:35:32.973z \n",
"10989648 NaN ... 2017-02-10t16:38:52.988z \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids label \\\n",
"0 2020-09-16t17:00:08.451z 0 0 0 0 0 \n",
"1 2017-05-10t19:28:17.315z 0 0 0 0 0 \n",
"2 2020-01-13t03:33:47.645z 0 0 0 0 0 \n",
"3 2020-09-23t08:36:17.450z 0 0 0 0 0 \n",
"4 2018-10-06t22:21:54.024z 0 0 0 0 0 \n",
"... ... ... ... ... ... ... \n",
"10989644 2020-02-06t15:16:45.537z 0 0 0 0 0 \n",
"10989645 2020-05-12t22:41:45.239z 0 0 0 0 0 \n",
"10989646 2019-12-19t23:09:12.798z 0 0 0 0 0 \n",
"10989647 2019-08-05t14:54:41.796z 2 0 0 2 1 \n",
"10989648 2019-12-11t01:37:15.405z 0 0 0 0 0 \n",
"\n",
" ext_works_source n_ext_work_source authoritative \n",
"0 [] 0 False \n",
"1 [] 0 False \n",
"2 [] 0 False \n",
"3 [] 0 False \n",
"4 [] 0 False \n",
"... ... ... ... \n",
"10989644 [] 0 False \n",
"10989645 [] 0 False \n",
"10989646 [] 0 False \n",
"10989647 [crossref metadata search] 1 True \n",
"10989648 [] 0 False \n",
"\n",
"[10989649 rows x 26 columns]"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 67,
"id": "veterinary-phrase",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"coloraxis": "coloraxis",
"hovertemplate": "x: %{x}<br>y: %{y}<br>color: %{z}<extra></extra>",
"name": "0",
"type": "heatmap",
"x": [
"verified_email",
"verified_primary_email",
"n_works",
"n_doi",
"n_arxiv",
"n_pmc",
"n_other_pids",
"label",
"n_ext_work_source",
"authoritative"
],
"xaxis": "x",
"y": [
"verified_email",
"verified_primary_email",
"n_works",
"n_doi",
"n_arxiv",
"n_pmc",
"n_other_pids",
"label",
"n_ext_work_source",
"authoritative"
],
"yaxis": "y",
"z": [
[
1,
0.9649829131837351,
0.07899833525810977,
0.07259719921935899,
0.006461363868256276,
0.030614701011724168,
0.06062464201233044,
0.1531839773366329,
0.1919719557229596,
0.21531668352175948
],
[
0.9649829131837351,
1,
0.08183974046701105,
0.07518160639621922,
0.006686059029180166,
0.03171235345994569,
0.06277678931007252,
0.15995695182918668,
0.1981210698185993,
0.22184413814951587
],
[
0.07899833525810977,
0.08183974046701105,
1,
0.9378726254396619,
0.31262992500470826,
0.3510856389397645,
0.8353346326814892,
0.22974076078503264,
0.42502019390055656,
0.2990392382833506
],
[
0.07259719921935899,
0.07518160639621922,
0.9378726254396619,
1,
0.35605399617713956,
0.3624050122938356,
0.801819617534692,
0.2133388352039022,
0.41375193880464456,
0.28780401348168333
],
[
0.006461363868256276,
0.006686059029180166,
0.31262992500470826,
0.35605399617713956,
1,
0.0009072282179230607,
0.2420914875525837,
0.01939797095250517,
0.021262173261030495,
0.02440100048344857
],
[
0.030614701011724168,
0.03171235345994569,
0.3510856389397645,
0.3624050122938356,
0.0009072282179230607,
1,
0.2570742999530638,
0.08736856703205036,
0.16873991088778023,
0.11447380021013033
],
[
0.06062464201233044,
0.06277678931007252,
0.8353346326814892,
0.801819617534692,
0.2420914875525837,
0.2570742999530638,
1,
0.17528852589870983,
0.3572799642364996,
0.24303586233733107
],
[
0.1531839773366329,
0.15995695182918668,
0.22974076078503264,
0.2133388352039022,
0.01939797095250517,
0.08736856703205036,
0.17528852589870983,
1,
0.49221037696497033,
0.5245689815824116
],
[
0.1919719557229596,
0.1981210698185993,
0.42502019390055656,
0.41375193880464456,
0.021262173261030495,
0.16873991088778023,
0.3572799642364996,
0.49221037696497033,
1,
0.8380242299586107
],
[
0.21531668352175948,
0.22184413814951587,
0.2990392382833506,
0.28780401348168333,
0.02440100048344857,
0.11447380021013033,
0.24303586233733107,
0.5245689815824116,
0.8380242299586107,
1
]
]
}
],
"layout": {
"coloraxis": {
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"margin": {
"t": 60
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"xaxis": {
"anchor": "y",
"constrain": "domain",
"domain": [
0,
1
],
"scaleanchor": "y"
},
"yaxis": {
"anchor": "x",
"autorange": "reversed",
"constrain": "domain",
"domain": [
0,
1
]
}
}
},
"text/html": [
"<div> <div id=\"61dd860d-85e6-4bdc-b5e0-335c1bc45301\" class=\"plotly-graph-div\" style=\"height:525px; width:100%;\"></div> <script type=\"text/javascript\"> require([\"plotly\"], function(Plotly) { window.PLOTLYENV=window.PLOTLYENV || {}; if (document.getElementById(\"61dd860d-85e6-4bdc-b5e0-335c1bc45301\")) { Plotly.newPlot( \"61dd860d-85e6-4bdc-b5e0-335c1bc45301\", [{\"coloraxis\": \"coloraxis\", \"hovertemplate\": \"x: %{x}<br>y: %{y}<br>color: %{z}<extra></extra>\", \"name\": \"0\", \"type\": \"heatmap\", \"x\": [\"verified_email\", \"verified_primary_email\", \"n_works\", \"n_doi\", \"n_arxiv\", \"n_pmc\", \"n_other_pids\", \"label\", \"n_ext_work_source\", \"authoritative\"], \"xaxis\": \"x\", \"y\": [\"verified_email\", \"verified_primary_email\", \"n_works\", \"n_doi\", \"n_arxiv\", \"n_pmc\", \"n_other_pids\", \"label\", \"n_ext_work_source\", \"authoritative\"], \"yaxis\": \"y\", \"z\": [[1.0, 0.9649829131837351, 0.07899833525810977, 0.07259719921935899, 0.006461363868256276, 0.030614701011724168, 0.06062464201233044, 0.1531839773366329, 0.1919719557229596, 0.21531668352175948], [0.9649829131837351, 1.0, 0.08183974046701105, 0.07518160639621922, 0.006686059029180166, 0.03171235345994569, 0.06277678931007252, 0.15995695182918668, 0.1981210698185993, 0.22184413814951587], [0.07899833525810977, 0.08183974046701105, 1.0, 0.9378726254396619, 0.31262992500470826, 0.3510856389397645, 0.8353346326814892, 0.22974076078503264, 0.42502019390055656, 0.2990392382833506], [0.07259719921935899, 0.07518160639621922, 0.9378726254396619, 1.0, 0.35605399617713956, 0.3624050122938356, 0.801819617534692, 0.2133388352039022, 0.41375193880464456, 0.28780401348168333], [0.006461363868256276, 0.006686059029180166, 0.31262992500470826, 0.35605399617713956, 1.0, 0.0009072282179230607, 0.2420914875525837, 0.01939797095250517, 0.021262173261030495, 0.02440100048344857], [0.030614701011724168, 0.03171235345994569, 0.3510856389397645, 0.3624050122938356, 0.0009072282179230607, 1.0, 
0.2570742999530638, 0.08736856703205036, 0.16873991088778023, 0.11447380021013033], [0.06062464201233044, 0.06277678931007252, 0.8353346326814892, 0.801819617534692, 0.2420914875525837, 0.2570742999530638, 1.0, 0.17528852589870983, 0.3572799642364996, 0.24303586233733107], [0.1531839773366329, 0.15995695182918668, 0.22974076078503264, 0.2133388352039022, 0.01939797095250517, 0.08736856703205036, 0.17528852589870983, 1.0, 0.49221037696497033, 0.5245689815824116], [0.1919719557229596, 0.1981210698185993, 0.42502019390055656, 0.41375193880464456, 0.021262173261030495, 0.16873991088778023, 0.3572799642364996, 0.49221037696497033, 1.0, 0.8380242299586107], [0.21531668352175948, 0.22184413814951587, 0.2990392382833506, 0.28780401348168333, 0.02440100048344857, 0.11447380021013033, 0.24303586233733107, 0.5245689815824116, 0.8380242299586107, 1.0]]}], {\"coloraxis\": {\"colorscale\": [[0.0, \"#0d0887\"], [0.1111111111111111, \"#46039f\"], [0.2222222222222222, \"#7201a8\"], [0.3333333333333333, \"#9c179e\"], [0.4444444444444444, \"#bd3786\"], [0.5555555555555556, \"#d8576b\"], [0.6666666666666666, \"#ed7953\"], [0.7777777777777778, \"#fb9f3a\"], [0.8888888888888888, \"#fdca26\"], [1.0, \"#f0f921\"]]}, \"margin\": {\"t\": 60}, \"template\": {\"data\": {\"bar\": [{\"error_x\": {\"color\": \"#2a3f5f\"}, \"error_y\": {\"color\": \"#2a3f5f\"}, \"marker\": {\"line\": {\"color\": \"#E5ECF6\", \"width\": 0.5}}, \"type\": \"bar\"}], \"barpolar\": [{\"marker\": {\"line\": {\"color\": \"#E5ECF6\", \"width\": 0.5}}, \"type\": \"barpolar\"}], \"carpet\": [{\"aaxis\": {\"endlinecolor\": \"#2a3f5f\", \"gridcolor\": \"white\", \"linecolor\": \"white\", \"minorgridcolor\": \"white\", \"startlinecolor\": \"#2a3f5f\"}, \"baxis\": {\"endlinecolor\": \"#2a3f5f\", \"gridcolor\": \"white\", \"linecolor\": \"white\", \"minorgridcolor\": \"white\", \"startlinecolor\": \"#2a3f5f\"}, \"type\": \"c
" \n",
"var gd = document.getElementById('61dd860d-85e6-4bdc-b5e0-335c1bc45301');\n",
"var x = new MutationObserver(function (mutations, observer) {{\n",
" var display = window.getComputedStyle(gd).display;\n",
" if (!display || display === 'none') {{\n",
" console.log([gd, 'removed!']);\n",
" Plotly.purge(gd);\n",
" observer.disconnect();\n",
" }}\n",
"}});\n",
"\n",
"// Listen for the removal of the full notebook cells\n",
"var notebookContainer = gd.closest('#notebook-container');\n",
"if (notebookContainer) {{\n",
" x.observe(notebookContainer, {childList: true});\n",
"}}\n",
"\n",
"// Listen for the clearing of the current output cell\n",
"var outputEl = gd.closest('.output');\n",
"if (outputEl) {{\n",
" x.observe(outputEl, {childList: true});\n",
"}}\n",
"\n",
" }) }; }); </script> </div>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"fig = px.imshow(df.fillna(0).corr())\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "outer-egyptian",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}