|
|
|
@ -190,7 +190,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"execution_count": 221,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
@ -221,6 +221,7 @@
|
|
|
|
|
" <th>family_name</th>\n",
|
|
|
|
|
" <th>biography</th>\n",
|
|
|
|
|
" <th>other_names</th>\n",
|
|
|
|
|
" <th>urls</th>\n",
|
|
|
|
|
" <th>primary_email</th>\n",
|
|
|
|
|
" <th>keywords</th>\n",
|
|
|
|
|
" <th>external_ids</th>\n",
|
|
|
|
@ -256,6 +257,7 @@
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
@ -289,6 +291,7 @@
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>[v.i. yurtaev; v. yurtaev]</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
@ -322,6 +325,7 @@
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
@ -355,6 +359,7 @@
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
@ -388,6 +393,7 @@
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td><NA></td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
|
" <td>NaN</td>\n",
|
|
|
|
@ -424,50 +430,57 @@
|
|
|
|
|
"3 0000-0001-6220-5683 True True <NA> \n",
|
|
|
|
|
"4 0000-0001-7071-8294 True True <NA> \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" family_name biography other_names primary_email keywords \\\n",
|
|
|
|
|
"0 <NA> <NA> NaN <NA> NaN \n",
|
|
|
|
|
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] <NA> NaN \n",
|
|
|
|
|
"2 <NA> <NA> NaN <NA> NaN \n",
|
|
|
|
|
"3 <NA> <NA> NaN <NA> NaN \n",
|
|
|
|
|
"4 <NA> <NA> NaN <NA> NaN \n",
|
|
|
|
|
" family_name biography other_names urls primary_email \\\n",
|
|
|
|
|
"0 <NA> <NA> NaN NaN <NA> \n",
|
|
|
|
|
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] NaN <NA> \n",
|
|
|
|
|
"2 <NA> <NA> NaN NaN <NA> \n",
|
|
|
|
|
"3 <NA> <NA> NaN NaN <NA> \n",
|
|
|
|
|
"4 <NA> <NA> NaN NaN <NA> \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" external_ids education employment \\\n",
|
|
|
|
|
" keywords external_ids education \\\n",
|
|
|
|
|
"0 NaN NaN NaN \n",
|
|
|
|
|
"1 NaN NaN [[professor, peoples friendship university of ... \n",
|
|
|
|
|
"1 NaN NaN NaN \n",
|
|
|
|
|
"2 NaN NaN NaN \n",
|
|
|
|
|
"3 NaN NaN [[research scientist, new york university abu ... \n",
|
|
|
|
|
"4 NaN NaN [[researcher (academic), universidad de zarago... \n",
|
|
|
|
|
"3 NaN NaN NaN \n",
|
|
|
|
|
"4 NaN NaN NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_works works_source activation_date \\\n",
|
|
|
|
|
"0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n",
|
|
|
|
|
"1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n",
|
|
|
|
|
"2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n",
|
|
|
|
|
"3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n",
|
|
|
|
|
"4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n",
|
|
|
|
|
" employment n_works works_source \\\n",
|
|
|
|
|
"0 NaN 0 NaN \n",
|
|
|
|
|
"1 [[professor, peoples friendship university of ... 0 NaN \n",
|
|
|
|
|
"2 NaN 0 NaN \n",
|
|
|
|
|
"3 [[research scientist, new york university abu ... 0 NaN \n",
|
|
|
|
|
"4 [[researcher (academic), universidad de zarago... 0 NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
|
|
|
|
|
"0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n",
|
|
|
|
|
"1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n",
|
|
|
|
|
"2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n",
|
|
|
|
|
"3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n",
|
|
|
|
|
"4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n",
|
|
|
|
|
" activation_date last_update_date n_doi \\\n",
|
|
|
|
|
"0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
|
|
|
|
|
"1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
|
|
|
|
|
"2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
|
|
|
|
|
"3 2015-08-18 12:36:45.307000+00:00 2020-09-23 13:37:54.180000+00:00 0 \n",
|
|
|
|
|
"4 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" label primary_email_domain other_email_domains url_domains n_emails \\\n",
|
|
|
|
|
"0 False NaN NaN NaN <NA> \n",
|
|
|
|
|
"1 False NaN NaN NaN <NA> \n",
|
|
|
|
|
"2 False NaN NaN NaN <NA> \n",
|
|
|
|
|
"3 False NaN NaN NaN <NA> \n",
|
|
|
|
|
"4 False NaN NaN NaN <NA> \n",
|
|
|
|
|
" n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
|
|
|
|
|
"0 0 0 0 False NaN \n",
|
|
|
|
|
"1 0 0 0 False NaN \n",
|
|
|
|
|
"2 0 0 0 False NaN \n",
|
|
|
|
|
"3 0 0 0 False NaN \n",
|
|
|
|
|
"4 0 0 0 False NaN \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_urls n_ids n_keywords n_education n_employment \n",
|
|
|
|
|
"0 <NA> <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"1 <NA> <NA> <NA> <NA> 1 \n",
|
|
|
|
|
"2 <NA> <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"3 <NA> <NA> <NA> <NA> 1 \n",
|
|
|
|
|
"4 <NA> <NA> <NA> <NA> 2 "
|
|
|
|
|
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
|
|
|
|
|
"0 NaN NaN <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"1 NaN NaN <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"2 NaN NaN <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"3 NaN NaN <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"4 NaN NaN <NA> <NA> <NA> <NA> \n",
|
|
|
|
|
"\n",
|
|
|
|
|
" n_education n_employment \n",
|
|
|
|
|
"0 <NA> <NA> \n",
|
|
|
|
|
"1 <NA> 1 \n",
|
|
|
|
|
"2 <NA> <NA> \n",
|
|
|
|
|
"3 <NA> 1 \n",
|
|
|
|
|
"4 <NA> 2 "
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
|
"execution_count": 5,
|
|
|
|
|
"execution_count": 221,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"output_type": "execute_result"
|
|
|
|
|
}
|
|
|
|
@ -10733,11 +10746,11 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 170,
|
|
|
|
|
"execution_count": 223,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [],
|
|
|
|
|
"source": [
|
|
|
|
|
"grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
|
|
|
|
|
"grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
|
|
|
|
|
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
|
|
|
|
|
]
|
|
|
|
|
},
|
|
|
|
@ -70361,7 +70374,7 @@
|
|
|
|
|
},
|
|
|
|
|
{
|
|
|
|
|
"cell_type": "code",
|
|
|
|
|
"execution_count": 124,
|
|
|
|
|
"execution_count": 222,
|
|
|
|
|
"metadata": {},
|
|
|
|
|
"outputs": [
|
|
|
|
|
{
|
|
|
|
@ -70369,53 +70382,43 @@
|
|
|
|
|
"output_type": "stream",
|
|
|
|
|
"text": [
|
|
|
|
|
"<class 'pandas.core.frame.DataFrame'>\n",
|
|
|
|
|
"Int64Index: 10989649 entries, 0 to 10989648\n",
|
|
|
|
|
"Data columns (total 41 columns):\n",
|
|
|
|
|
"RangeIndex: 10989649 entries, 0 to 10989648\n",
|
|
|
|
|
"Data columns (total 31 columns):\n",
|
|
|
|
|
" # Column Dtype \n",
|
|
|
|
|
"--- ------ ----- \n",
|
|
|
|
|
" 0 orcid object \n",
|
|
|
|
|
" 0 orcid string \n",
|
|
|
|
|
" 1 verified_email bool \n",
|
|
|
|
|
" 2 verified_primary_email bool \n",
|
|
|
|
|
" 3 given_names string \n",
|
|
|
|
|
" 4 family_name string \n",
|
|
|
|
|
" 5 biography string \n",
|
|
|
|
|
" 6 other_names object \n",
|
|
|
|
|
" 7 primary_email string \n",
|
|
|
|
|
" 8 keywords object \n",
|
|
|
|
|
" 9 external_ids object \n",
|
|
|
|
|
" 10 education object \n",
|
|
|
|
|
" 11 employment object \n",
|
|
|
|
|
" 12 n_works Int16 \n",
|
|
|
|
|
" 13 works_source object \n",
|
|
|
|
|
" 14 activation_date datetime64[ns, UTC]\n",
|
|
|
|
|
" 15 last_update_date datetime64[ns, UTC]\n",
|
|
|
|
|
" 16 n_doi Int16 \n",
|
|
|
|
|
" 17 n_arxiv Int16 \n",
|
|
|
|
|
" 18 n_pmc Int16 \n",
|
|
|
|
|
" 19 n_other_pids Int16 \n",
|
|
|
|
|
" 20 label bool \n",
|
|
|
|
|
" 21 primary_email_domain object \n",
|
|
|
|
|
" 22 other_email_domains object \n",
|
|
|
|
|
" 23 url_domains object \n",
|
|
|
|
|
" 24 n_emails Int16 \n",
|
|
|
|
|
" 25 n_urls Int16 \n",
|
|
|
|
|
" 26 n_ids Int16 \n",
|
|
|
|
|
" 27 n_keywords Int16 \n",
|
|
|
|
|
" 28 n_education Int16 \n",
|
|
|
|
|
" 29 n_employment Int16 \n",
|
|
|
|
|
" 30 ext_works_source object \n",
|
|
|
|
|
" 31 n_ext_work_source Int16 \n",
|
|
|
|
|
" 32 authoritative object \n",
|
|
|
|
|
" 33 n_valid_education float64 \n",
|
|
|
|
|
" 34 n_valid_employment float64 \n",
|
|
|
|
|
" 35 biography_length Int64 \n",
|
|
|
|
|
" 36 biography_n_sentences float64 \n",
|
|
|
|
|
" 37 biography_n_words float64 \n",
|
|
|
|
|
" 38 date_diff float64 \n",
|
|
|
|
|
" 39 ref_year int64 \n",
|
|
|
|
|
" 40 date_stale float64 \n",
|
|
|
|
|
"dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n",
|
|
|
|
|
"memory usage: 2.6+ GB\n"
|
|
|
|
|
" 7 urls object \n",
|
|
|
|
|
" 8 primary_email string \n",
|
|
|
|
|
" 9 keywords object \n",
|
|
|
|
|
" 10 external_ids object \n",
|
|
|
|
|
" 11 education object \n",
|
|
|
|
|
" 12 employment object \n",
|
|
|
|
|
" 13 n_works Int16 \n",
|
|
|
|
|
" 14 works_source object \n",
|
|
|
|
|
" 15 activation_date datetime64[ns, UTC]\n",
|
|
|
|
|
" 16 last_update_date datetime64[ns, UTC]\n",
|
|
|
|
|
" 17 n_doi Int16 \n",
|
|
|
|
|
" 18 n_arxiv Int16 \n",
|
|
|
|
|
" 19 n_pmc Int16 \n",
|
|
|
|
|
" 20 n_other_pids Int16 \n",
|
|
|
|
|
" 21 label bool \n",
|
|
|
|
|
" 22 primary_email_domain object \n",
|
|
|
|
|
" 23 other_email_domains object \n",
|
|
|
|
|
" 24 url_domains object \n",
|
|
|
|
|
" 25 n_emails Int16 \n",
|
|
|
|
|
" 26 n_urls Int16 \n",
|
|
|
|
|
" 27 n_ids Int16 \n",
|
|
|
|
|
" 28 n_keywords Int16 \n",
|
|
|
|
|
" 29 n_education Int16 \n",
|
|
|
|
|
" 30 n_employment Int16 \n",
|
|
|
|
|
"dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n",
|
|
|
|
|
"memory usage: 1.8+ GB\n"
|
|
|
|
|
]
|
|
|
|
|
}
|
|
|
|
|
],
|
|
|
|
|