Update `make data` to download and unzip the grid.ac dataset into data/external
This commit is contained in:
parent
f21a3f7d30
commit
087129a9c5
2
Makefile
2
Makefile
|
@ -27,7 +27,7 @@ requirements: test_environment
|
|||
|
||||
## Make Dataset
|
||||
data: requirements
|
||||
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
|
||||
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed data/external
|
||||
|
||||
## Delete all compiled Python files
|
||||
clean:
|
||||
|
|
|
@ -190,7 +190,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 221,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -221,6 +221,7 @@
|
|||
" <th>family_name</th>\n",
|
||||
" <th>biography</th>\n",
|
||||
" <th>other_names</th>\n",
|
||||
" <th>urls</th>\n",
|
||||
" <th>primary_email</th>\n",
|
||||
" <th>keywords</th>\n",
|
||||
" <th>external_ids</th>\n",
|
||||
|
@ -256,6 +257,7 @@
|
|||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
|
@ -289,6 +291,7 @@
|
|||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>[v.i. yurtaev; v. yurtaev]</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
|
@ -322,6 +325,7 @@
|
|||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
|
@ -355,6 +359,7 @@
|
|||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
|
@ -388,6 +393,7 @@
|
|||
" <td><NA></td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td><NA></td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
" <td>NaN</td>\n",
|
||||
|
@ -424,50 +430,57 @@
|
|||
"3 0000-0001-6220-5683 True True <NA> \n",
|
||||
"4 0000-0001-7071-8294 True True <NA> \n",
|
||||
"\n",
|
||||
" family_name biography other_names primary_email keywords \\\n",
|
||||
"0 <NA> <NA> NaN <NA> NaN \n",
|
||||
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] <NA> NaN \n",
|
||||
"2 <NA> <NA> NaN <NA> NaN \n",
|
||||
"3 <NA> <NA> NaN <NA> NaN \n",
|
||||
"4 <NA> <NA> NaN <NA> NaN \n",
|
||||
" family_name biography other_names urls primary_email \\\n",
|
||||
"0 <NA> <NA> NaN NaN <NA> \n",
|
||||
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] NaN <NA> \n",
|
||||
"2 <NA> <NA> NaN NaN <NA> \n",
|
||||
"3 <NA> <NA> NaN NaN <NA> \n",
|
||||
"4 <NA> <NA> NaN NaN <NA> \n",
|
||||
"\n",
|
||||
" external_ids education employment \\\n",
|
||||
"0 NaN NaN NaN \n",
|
||||
"1 NaN NaN [[professor, peoples friendship university of ... \n",
|
||||
"2 NaN NaN NaN \n",
|
||||
"3 NaN NaN [[research scientist, new york university abu ... \n",
|
||||
"4 NaN NaN [[researcher (academic), universidad de zarago... \n",
|
||||
" keywords external_ids education \\\n",
|
||||
"0 NaN NaN NaN \n",
|
||||
"1 NaN NaN NaN \n",
|
||||
"2 NaN NaN NaN \n",
|
||||
"3 NaN NaN NaN \n",
|
||||
"4 NaN NaN NaN \n",
|
||||
"\n",
|
||||
" n_works works_source activation_date \\\n",
|
||||
"0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n",
|
||||
"1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n",
|
||||
"2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n",
|
||||
"3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n",
|
||||
"4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n",
|
||||
" employment n_works works_source \\\n",
|
||||
"0 NaN 0 NaN \n",
|
||||
"1 [[professor, peoples friendship university of ... 0 NaN \n",
|
||||
"2 NaN 0 NaN \n",
|
||||
"3 [[research scientist, new york university abu ... 0 NaN \n",
|
||||
"4 [[researcher (academic), universidad de zarago... 0 NaN \n",
|
||||
"\n",
|
||||
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
|
||||
"0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n",
|
||||
"1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n",
|
||||
"2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n",
|
||||
"3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n",
|
||||
"4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n",
|
||||
" activation_date last_update_date n_doi \\\n",
|
||||
"0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
|
||||
"1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
|
||||
"2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
|
||||
"3 2015-08-18 12:36:45.307000+00:00 2020-09-23 13:37:54.180000+00:00 0 \n",
|
||||
"4 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
|
||||
"\n",
|
||||
" label primary_email_domain other_email_domains url_domains n_emails \\\n",
|
||||
"0 False NaN NaN NaN <NA> \n",
|
||||
"1 False NaN NaN NaN <NA> \n",
|
||||
"2 False NaN NaN NaN <NA> \n",
|
||||
"3 False NaN NaN NaN <NA> \n",
|
||||
"4 False NaN NaN NaN <NA> \n",
|
||||
" n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
|
||||
"0 0 0 0 False NaN \n",
|
||||
"1 0 0 0 False NaN \n",
|
||||
"2 0 0 0 False NaN \n",
|
||||
"3 0 0 0 False NaN \n",
|
||||
"4 0 0 0 False NaN \n",
|
||||
"\n",
|
||||
" n_urls n_ids n_keywords n_education n_employment \n",
|
||||
"0 <NA> <NA> <NA> <NA> <NA> \n",
|
||||
"1 <NA> <NA> <NA> <NA> 1 \n",
|
||||
"2 <NA> <NA> <NA> <NA> <NA> \n",
|
||||
"3 <NA> <NA> <NA> <NA> 1 \n",
|
||||
"4 <NA> <NA> <NA> <NA> 2 "
|
||||
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
|
||||
"0 NaN NaN <NA> <NA> <NA> <NA> \n",
|
||||
"1 NaN NaN <NA> <NA> <NA> <NA> \n",
|
||||
"2 NaN NaN <NA> <NA> <NA> <NA> \n",
|
||||
"3 NaN NaN <NA> <NA> <NA> <NA> \n",
|
||||
"4 NaN NaN <NA> <NA> <NA> <NA> \n",
|
||||
"\n",
|
||||
" n_education n_employment \n",
|
||||
"0 <NA> <NA> \n",
|
||||
"1 <NA> 1 \n",
|
||||
"2 <NA> <NA> \n",
|
||||
"3 <NA> 1 \n",
|
||||
"4 <NA> 2 "
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"execution_count": 221,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
|
@ -10733,11 +10746,11 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 170,
|
||||
"execution_count": 223,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
|
||||
"grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
|
||||
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
|
||||
]
|
||||
},
|
||||
|
@ -70361,7 +70374,7 @@
|
|||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 124,
|
||||
"execution_count": 222,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
|
@ -70369,53 +70382,43 @@
|
|||
"output_type": "stream",
|
||||
"text": [
|
||||
"<class 'pandas.core.frame.DataFrame'>\n",
|
||||
"Int64Index: 10989649 entries, 0 to 10989648\n",
|
||||
"Data columns (total 41 columns):\n",
|
||||
"RangeIndex: 10989649 entries, 0 to 10989648\n",
|
||||
"Data columns (total 31 columns):\n",
|
||||
" # Column Dtype \n",
|
||||
"--- ------ ----- \n",
|
||||
" 0 orcid object \n",
|
||||
" 0 orcid string \n",
|
||||
" 1 verified_email bool \n",
|
||||
" 2 verified_primary_email bool \n",
|
||||
" 3 given_names string \n",
|
||||
" 4 family_name string \n",
|
||||
" 5 biography string \n",
|
||||
" 6 other_names object \n",
|
||||
" 7 primary_email string \n",
|
||||
" 8 keywords object \n",
|
||||
" 9 external_ids object \n",
|
||||
" 10 education object \n",
|
||||
" 11 employment object \n",
|
||||
" 12 n_works Int16 \n",
|
||||
" 13 works_source object \n",
|
||||
" 14 activation_date datetime64[ns, UTC]\n",
|
||||
" 15 last_update_date datetime64[ns, UTC]\n",
|
||||
" 16 n_doi Int16 \n",
|
||||
" 17 n_arxiv Int16 \n",
|
||||
" 18 n_pmc Int16 \n",
|
||||
" 19 n_other_pids Int16 \n",
|
||||
" 20 label bool \n",
|
||||
" 21 primary_email_domain object \n",
|
||||
" 22 other_email_domains object \n",
|
||||
" 23 url_domains object \n",
|
||||
" 24 n_emails Int16 \n",
|
||||
" 25 n_urls Int16 \n",
|
||||
" 26 n_ids Int16 \n",
|
||||
" 27 n_keywords Int16 \n",
|
||||
" 28 n_education Int16 \n",
|
||||
" 29 n_employment Int16 \n",
|
||||
" 30 ext_works_source object \n",
|
||||
" 31 n_ext_work_source Int16 \n",
|
||||
" 32 authoritative object \n",
|
||||
" 33 n_valid_education float64 \n",
|
||||
" 34 n_valid_employment float64 \n",
|
||||
" 35 biography_length Int64 \n",
|
||||
" 36 biography_n_sentences float64 \n",
|
||||
" 37 biography_n_words float64 \n",
|
||||
" 38 date_diff float64 \n",
|
||||
" 39 ref_year int64 \n",
|
||||
" 40 date_stale float64 \n",
|
||||
"dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n",
|
||||
"memory usage: 2.6+ GB\n"
|
||||
" 7 urls object \n",
|
||||
" 8 primary_email string \n",
|
||||
" 9 keywords object \n",
|
||||
" 10 external_ids object \n",
|
||||
" 11 education object \n",
|
||||
" 12 employment object \n",
|
||||
" 13 n_works Int16 \n",
|
||||
" 14 works_source object \n",
|
||||
" 15 activation_date datetime64[ns, UTC]\n",
|
||||
" 16 last_update_date datetime64[ns, UTC]\n",
|
||||
" 17 n_doi Int16 \n",
|
||||
" 18 n_arxiv Int16 \n",
|
||||
" 19 n_pmc Int16 \n",
|
||||
" 20 n_other_pids Int16 \n",
|
||||
" 21 label bool \n",
|
||||
" 22 primary_email_domain object \n",
|
||||
" 23 other_email_domains object \n",
|
||||
" 24 url_domains object \n",
|
||||
" 25 n_emails Int16 \n",
|
||||
" 26 n_urls Int16 \n",
|
||||
" 27 n_ids Int16 \n",
|
||||
" 28 n_keywords Int16 \n",
|
||||
" 29 n_education Int16 \n",
|
||||
" 30 n_employment Int16 \n",
|
||||
"dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n",
|
||||
"memory usage: 1.8+ GB\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
|
|
|
@ -3,10 +3,13 @@ import click
|
|||
import logging
|
||||
from pathlib import Path
|
||||
from dotenv import find_dotenv, load_dotenv
|
||||
import pandas as pd
|
||||
import re
|
||||
import ast
|
||||
import os
|
||||
import requests
|
||||
import zipfile
|
||||
import tldextract
|
||||
import pandas as pd
|
||||
|
||||
def fix_keywords(lst):
|
||||
fixed = set()
|
||||
|
@ -39,12 +42,27 @@ def extract_url_domains(lst):
|
|||
@click.command()
|
||||
@click.argument('input_filepath', type=click.Path(exists=True))
|
||||
@click.argument('output_filepath', type=click.Path())
|
||||
def main(input_filepath, output_filepath):
|
||||
@click.argument('external_filepath', type=click.Path())
|
||||
def main(input_filepath, output_filepath, external_filepath):
|
||||
""" Runs data processing scripts to turn raw data from (./data/raw) into
|
||||
cleaned data ready to be analyzed (saved in ./data/processed).
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info('Making final data set from raw data')
|
||||
logger.info('Fetching external datasets')
|
||||
logger.info('Downloading grid.ac dataset')
|
||||
r = requests.get('https://ndownloader.figshare.com/files/27251693')
|
||||
with open(os.path.join(external_filepath, 'grid.zip'), 'wb') as f:
|
||||
f.write(r.content)
|
||||
|
||||
logger.info('... unzipping')
|
||||
try:
|
||||
with zipfile.ZipFile(os.path.join(external_filepath, 'grid.zip')) as z:
|
||||
z.extractall(os.path.join(external_filepath, 'grid'))
|
||||
logger.info('... done!')
|
||||
except:
|
||||
logger.error('Invalid file')
|
||||
|
||||
logger.info('Making pickle dataset from raw data')
|
||||
logger.info('Loading the zipped dataset')
|
||||
df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
|
||||
sep='\t', header=None,
|
||||
|
|
Loading…
Reference in New Issue