diff --git a/Makefile b/Makefile
index 227fcf0..6782983 100644
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ requirements: test_environment
 
 ## Make Dataset
 data: requirements
-	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
+	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed data/external
 
 ## Delete all compiled Python files
 clean:
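Note on the Makefile change: the `data` target has to stay in sync with the three positional arguments that `src/data/make_dataset.py` now declares (see the last hunk of this patch). A cheap guard against the two drifting apart is a smoke test around click's `CliRunner`. The sketch below is hypothetical (no such test file ships with this patch) and only checks that the new `external_filepath` argument is required:

```python
# tests/test_make_dataset_cli.py (hypothetical sketch, not part of this patch)
from click.testing import CliRunner

from src.data.make_dataset import main


def test_external_filepath_is_required(tmp_path):
    runner = CliRunner()
    # Invoking with only the two pre-patch arguments should fail at parse
    # time, before any downloading or processing starts.
    result = runner.invoke(main, [str(tmp_path), str(tmp_path)])
    assert result.exit_code != 0
    assert "EXTERNAL_FILEPATH" in result.output
```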
diff --git a/notebooks/01-Exploration.ipynb b/notebooks/01-Exploration.ipynb
index 38f1ca8..7280a4b 100644
--- a/notebooks/01-Exploration.ipynb
+++ b/notebooks/01-Exploration.ipynb
@@ -190,7 +190,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 221,
    "metadata": {},
    "outputs": [
     {
@@ -221,6 +221,7 @@
      " family_name\n",
      " biography\n",
      " other_names\n",
+     " urls\n",
      " primary_email\n",
      " keywords\n",
      " external_ids\n",
@@ -256,6 +257,7 @@
      " <NA>\n",
      " <NA>\n",
      " NaN\n",
+     " NaN\n",
      " <NA>\n",
      " NaN\n",
      " NaN\n",
@@ -289,6 +291,7 @@
      " <NA>\n",
      " <NA>\n",
      " [v.i. yurtaev; v. yurtaev]\n",
+     " NaN\n",
      " <NA>\n",
      " NaN\n",
      " NaN\n",
@@ -322,6 +325,7 @@
      " <NA>\n",
      " <NA>\n",
      " NaN\n",
+     " NaN\n",
      " <NA>\n",
      " NaN\n",
      " NaN\n",
@@ -355,6 +359,7 @@
      " <NA>\n",
      " <NA>\n",
      " NaN\n",
+     " NaN\n",
      " <NA>\n",
      " NaN\n",
      " NaN\n",
@@ -388,6 +393,7 @@
      " <NA>\n",
      " <NA>\n",
      " NaN\n",
+     " NaN\n",
      " <NA>\n",
      " NaN\n",
      " NaN\n",
@@ -424,50 +430,57 @@
      "3 0000-0001-6220-5683 True True \n",
      "4 0000-0001-7071-8294 True True \n",
      "\n",
-     " family_name biography other_names primary_email keywords \\\n",
-     "0 NaN NaN \n",
-     "1 [v.i. yurtaev; v. yurtaev] NaN \n",
-     "2 NaN NaN \n",
-     "3 NaN NaN \n",
-     "4 NaN NaN \n",
+     " family_name biography other_names urls primary_email \\\n",
+     "0 NaN NaN \n",
+     "1 [v.i. yurtaev; v. yurtaev] NaN \n",
+     "2 NaN NaN \n",
+     "3 NaN NaN \n",
+     "4 NaN NaN \n",
      "\n",
-     " external_ids education employment \\\n",
-     "0 NaN NaN NaN \n",
-     "1 NaN NaN [[professor, peoples friendship university of ... \n",
-     "2 NaN NaN NaN \n",
-     "3 NaN NaN [[research scientist, new york university abu ... \n",
-     "4 NaN NaN [[researcher (academic), universidad de zarago... \n",
+     " keywords external_ids education \\\n",
+     "0 NaN NaN NaN \n",
+     "1 NaN NaN NaN \n",
+     "2 NaN NaN NaN \n",
+     "3 NaN NaN NaN \n",
+     "4 NaN NaN NaN \n",
      "\n",
-     " n_works works_source activation_date \\\n",
-     "0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n",
-     "1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n",
-     "2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n",
-     "3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n",
-     "4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n",
+     " employment n_works works_source \\\n",
+     "0 NaN 0 NaN \n",
+     "1 [[professor, peoples friendship university of ... 0 NaN \n",
+     "2 NaN 0 NaN \n",
+     "3 [[research scientist, new york university abu ... 0 NaN \n",
+     "4 [[researcher (academic), universidad de zarago... 0 NaN \n",
      "\n",
-     " last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
-     "0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n",
-     "1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n",
-     "2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n",
-     "3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n",
-     "4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n",
+     " activation_date last_update_date n_doi \\\n",
+     "0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
+     "1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
+     "2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
+     "3 2015-08-18 12:36:45.307000+00:00 2020-09-23 13:37:54.180000+00:00 0 \n",
+     "4 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
      "\n",
-     " label primary_email_domain other_email_domains url_domains n_emails \\\n",
-     "0 False NaN NaN NaN \n",
-     "1 False NaN NaN NaN \n",
-     "2 False NaN NaN NaN \n",
-     "3 False NaN NaN NaN \n",
-     "4 False NaN NaN NaN \n",
+     " n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
+     "0 0 0 0 False NaN \n",
+     "1 0 0 0 False NaN \n",
+     "2 0 0 0 False NaN \n",
+     "3 0 0 0 False NaN \n",
+     "4 0 0 0 False NaN \n",
+     "\n",
+     " other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
+     "0 NaN NaN \n",
+     "1 NaN NaN \n",
+     "2 NaN NaN \n",
+     "3 NaN NaN \n",
+     "4 NaN NaN \n",
      "\n",
-     " n_urls n_ids n_keywords n_education n_employment \n",
-     "0 \n",
-     "1 1 \n",
-     "2 \n",
-     "3 1 \n",
-     "4 2 "
+     " n_education n_employment \n",
+     "0 \n",
+     "1 1 \n",
+     "2 \n",
+     "3 1 \n",
+     "4 2 "
      ]
     },
-    "execution_count": 5,
+    "execution_count": 221,
     "metadata": {},
     "output_type": "execute_result"
    }
   ]
@@ -10733,11 +10746,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
+   "execution_count": 223,
    "metadata": {},
    "outputs": [],
    "source": [
-    "grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
+    "grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
     "grid_df['domain'] = grid_df.link.apply(extract_domain)"
    ]
   },
@@ -70361,7 +70374,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 124,
+   "execution_count": 222,
    "metadata": {},
    "outputs": [
     {
@@ -70369,53 +70382,43 @@
     "output_type": "stream",
     "text": [
      "\n",
-     "Int64Index: 10989649 entries, 0 to 10989648\n",
-     "Data columns (total 41 columns):\n",
+     "RangeIndex: 10989649 entries, 0 to 10989648\n",
+     "Data columns (total 31 columns):\n",
      " # Column Dtype \n",
      "--- ------ ----- \n",
-     " 0 orcid object \n",
+     " 0 orcid string \n",
      " 1 verified_email bool \n",
      " 2 verified_primary_email bool \n",
      " 3 given_names string \n",
      " 4 family_name string \n",
      " 5 biography string \n",
      " 6 other_names object \n",
-     " 7 primary_email string \n",
-     " 8 keywords object \n",
-     " 9 external_ids object \n",
-     " 10 education object \n",
-     " 11 employment object \n",
-     " 12 n_works Int16 \n",
-     " 13 works_source object \n",
-     " 14 activation_date datetime64[ns, UTC]\n",
-     " 15 last_update_date datetime64[ns, UTC]\n",
-     " 16 n_doi Int16 \n",
-     " 17 n_arxiv Int16 \n",
-     " 18 n_pmc Int16 \n",
-     " 19 n_other_pids Int16 \n",
-     " 20 label bool \n",
-     " 21 primary_email_domain object \n",
-     " 22 other_email_domains object \n",
-     " 23 url_domains object \n",
-     " 24 n_emails Int16 \n",
-     " 25 n_urls Int16 \n",
-     " 26 n_ids Int16 \n",
-     " 27 n_keywords Int16 \n",
-     " 28 n_education Int16 \n",
-     " 29 n_employment Int16 \n",
-     " 30 ext_works_source object \n",
-     " 31 n_ext_work_source Int16 \n",
-     " 32 authoritative object \n",
-     " 33 n_valid_education float64 \n",
-     " 34 n_valid_employment float64 \n",
-     " 35 biography_length Int64 \n",
-     " 36 biography_n_sentences float64 \n",
-     " 37 biography_n_words float64 \n",
-     " 38 date_diff float64 \n",
-     " 39 ref_year int64 \n",
-     " 40 date_stale float64 \n",
-     "dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n",
-     "memory usage: 2.6+ GB\n"
+     " 7 urls object \n",
+     " 8 primary_email string \n",
+     " 9 keywords object \n",
+     " 10 external_ids object \n",
+     " 11 education object \n",
+     " 12 employment object \n",
+     " 13 n_works Int16 \n",
+     " 14 works_source object \n",
+     " 15 activation_date datetime64[ns, UTC]\n",
+     " 16 last_update_date datetime64[ns, UTC]\n",
+     " 17 n_doi Int16 \n",
+     " 18 n_arxiv Int16 \n",
+     " 19 n_pmc Int16 \n",
+     " 20 n_other_pids Int16 \n",
+     " 21 label bool \n",
+     " 22 primary_email_domain object \n",
+     " 23 other_email_domains object \n",
+     " 24 url_domains object \n",
+     " 25 n_emails Int16 \n",
+     " 26 n_urls Int16 \n",
+     " 27 n_ids Int16 \n",
+     " 28 n_keywords Int16 \n",
+     " 29 n_education Int16 \n",
+     " 30 n_employment Int16 \n",
+     "dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n",
+     "memory usage: 1.8+ GB\n"
      ]
     }
    ],
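Two notes on the notebook changes above. First, the exploration cell now reads the GRID links table from the unversioned `data/external/grid/` path that `make_dataset.py` (below) unpacks, so re-running the notebook no longer depends on a dated directory name. Second, the cell applies an `extract_domain` helper that this patch does not show, presumably so GRID links can be matched against the profile-level `url_domains` feature. A minimal sketch of what such a helper could look like, assuming the real one lives elsewhere in `src/` and is built on the `tldextract` package the script imports:

```python
import tldextract


def extract_domain(url):
    """Reduce a link like 'https://www.ox.ac.uk/about' to 'ox.ac.uk'.

    Sketch only: the repo's real helper may treat missing values and
    bare hostnames differently.
    """
    if not isinstance(url, str) or not url:
        return None
    parts = tldextract.extract(url)
    # registered_domain is '' when the URL has no recognisable suffix.
    return parts.registered_domain or None
```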
" 33 n_valid_education float64 \n", - " 34 n_valid_employment float64 \n", - " 35 biography_length Int64 \n", - " 36 biography_n_sentences float64 \n", - " 37 biography_n_words float64 \n", - " 38 date_diff float64 \n", - " 39 ref_year int64 \n", - " 40 date_stale float64 \n", - "dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n", - "memory usage: 2.6+ GB\n" + " 7 urls object \n", + " 8 primary_email string \n", + " 9 keywords object \n", + " 10 external_ids object \n", + " 11 education object \n", + " 12 employment object \n", + " 13 n_works Int16 \n", + " 14 works_source object \n", + " 15 activation_date datetime64[ns, UTC]\n", + " 16 last_update_date datetime64[ns, UTC]\n", + " 17 n_doi Int16 \n", + " 18 n_arxiv Int16 \n", + " 19 n_pmc Int16 \n", + " 20 n_other_pids Int16 \n", + " 21 label bool \n", + " 22 primary_email_domain object \n", + " 23 other_email_domains object \n", + " 24 url_domains object \n", + " 25 n_emails Int16 \n", + " 26 n_urls Int16 \n", + " 27 n_ids Int16 \n", + " 28 n_keywords Int16 \n", + " 29 n_education Int16 \n", + " 30 n_employment Int16 \n", + "dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n", + "memory usage: 1.8+ GB\n" ] } ], diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 365c0e5..12a110b 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -3,10 +3,13 @@ import click import logging from pathlib import Path from dotenv import find_dotenv, load_dotenv -import pandas as pd +import re import ast import os +import requests +import zipfile import tldextract +import pandas as pd def fix_keywords(lst): fixed = set() @@ -39,14 +42,29 @@ def extract_url_domains(lst): @click.command() @click.argument('input_filepath', type=click.Path(exists=True)) @click.argument('output_filepath', type=click.Path()) -def main(input_filepath, output_filepath): +@click.argument('external_filepath', type=click.Path()) +def main(input_filepath, output_filepath, external_filepath): """ Runs data processing scripts to turn raw data from (./data/raw) into cleaned data ready to be analyzed (saved in ./data/processed). """ logger = logging.getLogger(__name__) - logger.info('Making final data set from raw data') + logger.info('Fetching external datasets') + logger.info('Downloading grid.ac dataset') + r = requests.get('https://ndownloader.figshare.com/files/27251693') + with open(os.path.join(external_filepath, 'grid.zip'), 'wb') as f: + f.write(r.content) + + logger.info('... unzipping') + try: + with zipfile.ZipFile(os.path.join(external_filepath, 'grid.zip')) as z: + z.extractall(os.path.join(external_filepath, 'grid')) + logger.info('... done!') + except: + logger.error('Invalid file') + + logger.info('Making pickle dataset from raw data') logger.info('Loading the zipped dataset') - df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip', + df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip', sep='\t', header=None, names=['orcid','verified_email', 'verified_primary_email', 'given_names', 'family_name', 'biography', 'other_names', 'urls',