diff --git a/Makefile b/Makefile
index 227fcf0..6782983 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ requirements: test_environment
## Make Dataset
data: requirements
- $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
+ $(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed data/external
## Delete all compiled Python files
clean:
diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb
index 38f1ca8..7280a4b 100644
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@@ -190,7 +190,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 221,
"metadata": {},
"outputs": [
{
@@ -221,6 +221,7 @@
"
family_name | \n",
" biography | \n",
" other_names | \n",
+ " urls | \n",
" primary_email | \n",
" keywords | \n",
" external_ids | \n",
@@ -256,6 +257,7 @@
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
+ " NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
@@ -289,6 +291,7 @@
" <NA> | \n",
" <NA> | \n",
" [v.i. yurtaev; v. yurtaev] | \n",
+ " NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
@@ -322,6 +325,7 @@
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
+ " NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
@@ -355,6 +359,7 @@
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
+ " NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
@@ -388,6 +393,7 @@
" <NA> | \n",
" <NA> | \n",
" NaN | \n",
+ " NaN | \n",
" <NA> | \n",
" NaN | \n",
" NaN | \n",
@@ -424,50 +430,57 @@
"3 0000-0001-6220-5683 True True \n",
"4 0000-0001-7071-8294 True True \n",
"\n",
- " family_name biography other_names primary_email keywords \\\n",
- "0 NaN NaN \n",
- "1 [v.i. yurtaev; v. yurtaev] NaN \n",
- "2 NaN NaN \n",
- "3 NaN NaN \n",
- "4 NaN NaN \n",
+ " family_name biography other_names urls primary_email \\\n",
+ "0 NaN NaN \n",
+ "1 [v.i. yurtaev; v. yurtaev] NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
"\n",
- " external_ids education employment \\\n",
- "0 NaN NaN NaN \n",
- "1 NaN NaN [[professor, peoples friendship university of ... \n",
- "2 NaN NaN NaN \n",
- "3 NaN NaN [[research scientist, new york university abu ... \n",
- "4 NaN NaN [[researcher (academic), universidad de zarago... \n",
+ " keywords external_ids education \\\n",
+ "0 NaN NaN NaN \n",
+ "1 NaN NaN NaN \n",
+ "2 NaN NaN NaN \n",
+ "3 NaN NaN NaN \n",
+ "4 NaN NaN NaN \n",
"\n",
- " n_works works_source activation_date \\\n",
- "0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n",
- "1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n",
- "2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n",
- "3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n",
- "4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n",
+ " employment n_works works_source \\\n",
+ "0 NaN 0 NaN \n",
+ "1 [[professor, peoples friendship university of ... 0 NaN \n",
+ "2 NaN 0 NaN \n",
+ "3 [[research scientist, new york university abu ... 0 NaN \n",
+ "4 [[researcher (academic), universidad de zarago... 0 NaN \n",
"\n",
- " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
- "0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n",
- "1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n",
- "2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n",
- "3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n",
- "4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n",
+ " activation_date last_update_date n_doi \\\n",
+ "0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
+ "1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
+ "2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
+ "3 2015-08-18 12:36:45.307000+00:00 2020-09-23 13:37:54.180000+00:00 0 \n",
+ "4 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
"\n",
- " label primary_email_domain other_email_domains url_domains n_emails \\\n",
- "0 False NaN NaN NaN \n",
- "1 False NaN NaN NaN \n",
- "2 False NaN NaN NaN \n",
- "3 False NaN NaN NaN \n",
- "4 False NaN NaN NaN \n",
+ " n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
+ "0 0 0 0 False NaN \n",
+ "1 0 0 0 False NaN \n",
+ "2 0 0 0 False NaN \n",
+ "3 0 0 0 False NaN \n",
+ "4 0 0 0 False NaN \n",
+ "\n",
+ " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
+ "0 NaN NaN \n",
+ "1 NaN NaN \n",
+ "2 NaN NaN \n",
+ "3 NaN NaN \n",
+ "4 NaN NaN \n",
"\n",
- " n_urls n_ids n_keywords n_education n_employment \n",
- "0 \n",
- "1 1 \n",
- "2 \n",
- "3 1 \n",
- "4 2 "
+ " n_education n_employment \n",
+ "0 \n",
+ "1 1 \n",
+ "2 \n",
+ "3 1 \n",
+ "4 2 "
]
},
- "execution_count": 5,
+ "execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
@@ -10733,11 +10746,11 @@
},
{
"cell_type": "code",
- "execution_count": 170,
+ "execution_count": 223,
"metadata": {},
"outputs": [],
"source": [
- "grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
+ "grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
]
},
@@ -70361,7 +70374,7 @@
},
{
"cell_type": "code",
- "execution_count": 124,
+ "execution_count": 222,
"metadata": {},
"outputs": [
{
@@ -70369,53 +70382,43 @@
"output_type": "stream",
"text": [
"\n",
- "Int64Index: 10989649 entries, 0 to 10989648\n",
- "Data columns (total 41 columns):\n",
+ "RangeIndex: 10989649 entries, 0 to 10989648\n",
+ "Data columns (total 31 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
- " 0 orcid object \n",
+ " 0 orcid string \n",
" 1 verified_email bool \n",
" 2 verified_primary_email bool \n",
" 3 given_names string \n",
" 4 family_name string \n",
" 5 biography string \n",
" 6 other_names object \n",
- " 7 primary_email string \n",
- " 8 keywords object \n",
- " 9 external_ids object \n",
- " 10 education object \n",
- " 11 employment object \n",
- " 12 n_works Int16 \n",
- " 13 works_source object \n",
- " 14 activation_date datetime64[ns, UTC]\n",
- " 15 last_update_date datetime64[ns, UTC]\n",
- " 16 n_doi Int16 \n",
- " 17 n_arxiv Int16 \n",
- " 18 n_pmc Int16 \n",
- " 19 n_other_pids Int16 \n",
- " 20 label bool \n",
- " 21 primary_email_domain object \n",
- " 22 other_email_domains object \n",
- " 23 url_domains object \n",
- " 24 n_emails Int16 \n",
- " 25 n_urls Int16 \n",
- " 26 n_ids Int16 \n",
- " 27 n_keywords Int16 \n",
- " 28 n_education Int16 \n",
- " 29 n_employment Int16 \n",
- " 30 ext_works_source object \n",
- " 31 n_ext_work_source Int16 \n",
- " 32 authoritative object \n",
- " 33 n_valid_education float64 \n",
- " 34 n_valid_employment float64 \n",
- " 35 biography_length Int64 \n",
- " 36 biography_n_sentences float64 \n",
- " 37 biography_n_words float64 \n",
- " 38 date_diff float64 \n",
- " 39 ref_year int64 \n",
- " 40 date_stale float64 \n",
- "dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n",
- "memory usage: 2.6+ GB\n"
+ " 7 urls object \n",
+ " 8 primary_email string \n",
+ " 9 keywords object \n",
+ " 10 external_ids object \n",
+ " 11 education object \n",
+ " 12 employment object \n",
+ " 13 n_works Int16 \n",
+ " 14 works_source object \n",
+ " 15 activation_date datetime64[ns, UTC]\n",
+ " 16 last_update_date datetime64[ns, UTC]\n",
+ " 17 n_doi Int16 \n",
+ " 18 n_arxiv Int16 \n",
+ " 19 n_pmc Int16 \n",
+ " 20 n_other_pids Int16 \n",
+ " 21 label bool \n",
+ " 22 primary_email_domain object \n",
+ " 23 other_email_domains object \n",
+ " 24 url_domains object \n",
+ " 25 n_emails Int16 \n",
+ " 26 n_urls Int16 \n",
+ " 27 n_ids Int16 \n",
+ " 28 n_keywords Int16 \n",
+ " 29 n_education Int16 \n",
+ " 30 n_employment Int16 \n",
+ "dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n",
+ "memory usage: 1.8+ GB\n"
]
}
],
diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
index 365c0e5..12a110b 100644
--- a/src/data/make_dataset.py
+++ b/src/data/make_dataset.py
@@ -3,10 +3,13 @@ import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
-import pandas as pd
+import re
import ast
import os
+import requests
+import zipfile
import tldextract
+import pandas as pd
def fix_keywords(lst):
fixed = set()
@@ -39,14 +42,29 @@ def extract_url_domains(lst):
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
-def main(input_filepath, output_filepath):
+@click.argument('external_filepath', type=click.Path())
+def main(input_filepath, output_filepath, external_filepath):
""" Runs data processing scripts to turn raw data from (./data/raw) into
cleaned data ready to be analyzed (saved in ./data/processed).
"""
logger = logging.getLogger(__name__)
- logger.info('Making final data set from raw data')
+ logger.info('Fetching external datasets')
+ logger.info('Downloading grid.ac dataset')
+ r = requests.get('https://ndownloader.figshare.com/files/27251693', timeout=300)
+ with open(os.path.join(external_filepath, 'grid.zip'), 'wb') as f:
+ f.write(r.content)
+
+ logger.info('... unzipping')
+ try:
+ with zipfile.ZipFile(os.path.join(external_filepath, 'grid.zip')) as z:
+ z.extractall(os.path.join(external_filepath, 'grid'))
+ logger.info('... done!')
+ except zipfile.BadZipFile:
+ logger.error('Invalid file')
+
+ logger.info('Making pickle dataset from raw data')
logger.info('Loading the zipped dataset')
- df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
+ df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
sep='\t', header=None,
names=['orcid','verified_email', 'verified_primary_email',
'given_names', 'family_name', 'biography', 'other_names', 'urls',