Update `make data` to download and unzip the GRID dataset

This commit is contained in:
Andrea Mannocci 2021-04-23 11:03:53 +02:00
parent f21a3f7d30
commit 087129a9c5
3 changed files with 106 additions and 85 deletions

View File

@ -27,7 +27,7 @@ requirements: test_environment
## Make Dataset
data: requirements
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed data/external
## Delete all compiled Python files
clean:

View File

@ -190,7 +190,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 221,
"metadata": {},
"outputs": [
{
@ -221,6 +221,7 @@
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
" <th>keywords</th>\n",
" <th>external_ids</th>\n",
@ -256,6 +257,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -289,6 +291,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[v.i. yurtaev; v. yurtaev]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -322,6 +325,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -355,6 +359,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -388,6 +393,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@ -424,50 +430,57 @@
"3 0000-0001-6220-5683 True True <NA> \n",
"4 0000-0001-7071-8294 True True <NA> \n",
"\n",
" family_name biography other_names primary_email keywords \\\n",
"0 <NA> <NA> NaN <NA> NaN \n",
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] <NA> NaN \n",
"2 <NA> <NA> NaN <NA> NaN \n",
"3 <NA> <NA> NaN <NA> NaN \n",
"4 <NA> <NA> NaN <NA> NaN \n",
" family_name biography other_names urls primary_email \\\n",
"0 <NA> <NA> NaN NaN <NA> \n",
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] NaN <NA> \n",
"2 <NA> <NA> NaN NaN <NA> \n",
"3 <NA> <NA> NaN NaN <NA> \n",
"4 <NA> <NA> NaN NaN <NA> \n",
"\n",
" external_ids education employment \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN [[professor, peoples friendship university of ... \n",
"2 NaN NaN NaN \n",
"3 NaN NaN [[research scientist, new york university abu ... \n",
"4 NaN NaN [[researcher (academic), universidad de zarago... \n",
" keywords external_ids education \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" n_works works_source activation_date \\\n",
"0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n",
"1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n",
"2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n",
"3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n",
"4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n",
" employment n_works works_source \\\n",
"0 NaN 0 NaN \n",
"1 [[professor, peoples friendship university of ... 0 NaN \n",
"2 NaN 0 NaN \n",
"3 [[research scientist, new york university abu ... 0 NaN \n",
"4 [[researcher (academic), universidad de zarago... 0 NaN \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
"0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n",
"1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n",
"2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n",
"3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n",
"4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n",
" activation_date last_update_date n_doi \\\n",
"0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
"1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
"2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
"3 2015-08-18 12:36:45.307000+00:00 2020-09-23 13:37:54.180000+00:00 0 \n",
"4 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
"\n",
" label primary_email_domain other_email_domains url_domains n_emails \\\n",
"0 False NaN NaN NaN <NA> \n",
"1 False NaN NaN NaN <NA> \n",
"2 False NaN NaN NaN <NA> \n",
"3 False NaN NaN NaN <NA> \n",
"4 False NaN NaN NaN <NA> \n",
" n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
"0 0 0 0 False NaN \n",
"1 0 0 0 False NaN \n",
"2 0 0 0 False NaN \n",
"3 0 0 0 False NaN \n",
"4 0 0 0 False NaN \n",
"\n",
" n_urls n_ids n_keywords n_education n_employment \n",
"0 <NA> <NA> <NA> <NA> <NA> \n",
"1 <NA> <NA> <NA> <NA> 1 \n",
"2 <NA> <NA> <NA> <NA> <NA> \n",
"3 <NA> <NA> <NA> <NA> 1 \n",
"4 <NA> <NA> <NA> <NA> 2 "
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
"0 NaN NaN <NA> <NA> <NA> <NA> \n",
"1 NaN NaN <NA> <NA> <NA> <NA> \n",
"2 NaN NaN <NA> <NA> <NA> <NA> \n",
"3 NaN NaN <NA> <NA> <NA> <NA> \n",
"4 NaN NaN <NA> <NA> <NA> <NA> \n",
"\n",
" n_education n_employment \n",
"0 <NA> <NA> \n",
"1 <NA> 1 \n",
"2 <NA> <NA> \n",
"3 <NA> 1 \n",
"4 <NA> 2 "
]
},
"execution_count": 5,
"execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
@ -10733,11 +10746,11 @@
},
{
"cell_type": "code",
"execution_count": 170,
"execution_count": 223,
"metadata": {},
"outputs": [],
"source": [
"grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
"grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
]
},
@ -70361,7 +70374,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 222,
"metadata": {},
"outputs": [
{
@ -70369,53 +70382,43 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 10989649 entries, 0 to 10989648\n",
"Data columns (total 41 columns):\n",
"RangeIndex: 10989649 entries, 0 to 10989648\n",
"Data columns (total 31 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 orcid object \n",
" 0 orcid string \n",
" 1 verified_email bool \n",
" 2 verified_primary_email bool \n",
" 3 given_names string \n",
" 4 family_name string \n",
" 5 biography string \n",
" 6 other_names object \n",
" 7 primary_email string \n",
" 8 keywords object \n",
" 9 external_ids object \n",
" 10 education object \n",
" 11 employment object \n",
" 12 n_works Int16 \n",
" 13 works_source object \n",
" 14 activation_date datetime64[ns, UTC]\n",
" 15 last_update_date datetime64[ns, UTC]\n",
" 16 n_doi Int16 \n",
" 17 n_arxiv Int16 \n",
" 18 n_pmc Int16 \n",
" 19 n_other_pids Int16 \n",
" 20 label bool \n",
" 21 primary_email_domain object \n",
" 22 other_email_domains object \n",
" 23 url_domains object \n",
" 24 n_emails Int16 \n",
" 25 n_urls Int16 \n",
" 26 n_ids Int16 \n",
" 27 n_keywords Int16 \n",
" 28 n_education Int16 \n",
" 29 n_employment Int16 \n",
" 30 ext_works_source object \n",
" 31 n_ext_work_source Int16 \n",
" 32 authoritative object \n",
" 33 n_valid_education float64 \n",
" 34 n_valid_employment float64 \n",
" 35 biography_length Int64 \n",
" 36 biography_n_sentences float64 \n",
" 37 biography_n_words float64 \n",
" 38 date_diff float64 \n",
" 39 ref_year int64 \n",
" 40 date_stale float64 \n",
"dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n",
"memory usage: 2.6+ GB\n"
" 7 urls object \n",
" 8 primary_email string \n",
" 9 keywords object \n",
" 10 external_ids object \n",
" 11 education object \n",
" 12 employment object \n",
" 13 n_works Int16 \n",
" 14 works_source object \n",
" 15 activation_date datetime64[ns, UTC]\n",
" 16 last_update_date datetime64[ns, UTC]\n",
" 17 n_doi Int16 \n",
" 18 n_arxiv Int16 \n",
" 19 n_pmc Int16 \n",
" 20 n_other_pids Int16 \n",
" 21 label bool \n",
" 22 primary_email_domain object \n",
" 23 other_email_domains object \n",
" 24 url_domains object \n",
" 25 n_emails Int16 \n",
" 26 n_urls Int16 \n",
" 27 n_ids Int16 \n",
" 28 n_keywords Int16 \n",
" 29 n_education Int16 \n",
" 30 n_employment Int16 \n",
"dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n",
"memory usage: 1.8+ GB\n"
]
}
],

View File

@ -3,10 +3,13 @@ import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import re
import ast
import os
import requests
import zipfile
import tldextract
import pandas as pd
def fix_keywords(lst):
fixed = set()
@ -39,12 +42,27 @@ def extract_url_domains(lst):
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
@click.argument('external_filepath', type=click.Path())
def main(input_filepath, output_filepath, external_filepath):
""" Runs data processing scripts to turn raw data from (./data/raw) into
cleaned data ready to be analyzed (saved in ./data/processed).
"""
logger = logging.getLogger(__name__)
logger.info('Making final data set from raw data')
logger.info('Fetching external datasets')
logger.info('Downloading grid.ac dataset')
r = requests.get('https://ndownloader.figshare.com/files/27251693')
with open(os.path.join(external_filepath, 'grid.zip'), 'wb') as f:
f.write(r.content)
logger.info('... unzipping')
try:
with zipfile.ZipFile(os.path.join(external_filepath, 'grid.zip')) as z:
z.extractall(os.path.join(external_filepath, 'grid'))
logger.info('... done!')
except:
logger.error('Invalid file')
logger.info('Making pickle dataset from raw data')
logger.info('Loading the zipped dataset')
df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
sep='\t', header=None,