Browse Source

new make data with grid download

master
Andrea Mannocci 9 months ago
parent
commit
087129a9c5
  1. 2
      Makefile
  2. 163
      notebooks/01-Exploration.ipynb
  3. 26
      src/data/make_dataset.py

2
Makefile

@@ -27,7 +27,7 @@ requirements: test_environment
## Make Dataset
data: requirements
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed data/external
## Delete all compiled Python files
clean:

163
notebooks/01-Exploration.ipynb

@@ -190,7 +190,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 221,
"metadata": {},
"outputs": [
{
@@ -221,6 +221,7 @@
" <th>family_name</th>\n",
" <th>biography</th>\n",
" <th>other_names</th>\n",
" <th>urls</th>\n",
" <th>primary_email</th>\n",
" <th>keywords</th>\n",
" <th>external_ids</th>\n",
@@ -256,6 +257,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -289,6 +291,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>[v.i. yurtaev; v. yurtaev]</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -322,6 +325,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -355,6 +359,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -388,6 +393,7 @@
" <td>&lt;NA&gt;</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
" <td>&lt;NA&gt;</td>\n",
" <td>NaN</td>\n",
" <td>NaN</td>\n",
@@ -424,50 +430,57 @@
"3 0000-0001-6220-5683 True True <NA> \n",
"4 0000-0001-7071-8294 True True <NA> \n",
"\n",
" family_name biography other_names primary_email keywords \\\n",
"0 <NA> <NA> NaN <NA> NaN \n",
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] <NA> NaN \n",
"2 <NA> <NA> NaN <NA> NaN \n",
"3 <NA> <NA> NaN <NA> NaN \n",
"4 <NA> <NA> NaN <NA> NaN \n",
" family_name biography other_names urls primary_email \\\n",
"0 <NA> <NA> NaN NaN <NA> \n",
"1 <NA> <NA> [v.i. yurtaev; v. yurtaev] NaN <NA> \n",
"2 <NA> <NA> NaN NaN <NA> \n",
"3 <NA> <NA> NaN NaN <NA> \n",
"4 <NA> <NA> NaN NaN <NA> \n",
"\n",
" external_ids education employment \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN [[professor, peoples friendship university of ... \n",
"2 NaN NaN NaN \n",
"3 NaN NaN [[research scientist, new york university abu ... \n",
"4 NaN NaN [[researcher (academic), universidad de zarago... \n",
" keywords external_ids education \\\n",
"0 NaN NaN NaN \n",
"1 NaN NaN NaN \n",
"2 NaN NaN NaN \n",
"3 NaN NaN NaN \n",
"4 NaN NaN NaN \n",
"\n",
" n_works works_source activation_date \\\n",
"0 0 NaN 2018-03-02 09:29:16.528000+00:00 \n",
"1 0 NaN 2018-04-03 07:50:23.358000+00:00 \n",
"2 0 NaN 2019-12-11 15:31:56.388000+00:00 \n",
"3 0 NaN 2015-08-18 12:36:45.307000+00:00 \n",
"4 0 NaN 2014-03-10 13:22:01.966000+00:00 \n",
" employment n_works works_source \\\n",
"0 NaN 0 NaN \n",
"1 [[professor, peoples friendship university of ... 0 NaN \n",
"2 NaN 0 NaN \n",
"3 [[research scientist, new york university abu ... 0 NaN \n",
"4 [[researcher (academic), universidad de zarago... 0 NaN \n",
"\n",
" last_update_date n_doi n_arxiv n_pmc n_other_pids \\\n",
"0 2018-03-02 09:43:07.551000+00:00 0 0 0 0 \n",
"1 2020-03-18 09:42:44.753000+00:00 0 0 0 0 \n",
"2 2020-01-28 15:34:17.309000+00:00 0 0 0 0 \n",
"3 2020-09-23 13:37:54.180000+00:00 0 0 0 0 \n",
"4 2016-06-14 22:17:54.470000+00:00 0 0 0 0 \n",
" activation_date last_update_date n_doi \\\n",
"0 2018-03-02 09:29:16.528000+00:00 2018-03-02 09:43:07.551000+00:00 0 \n",
"1 2018-04-03 07:50:23.358000+00:00 2020-03-18 09:42:44.753000+00:00 0 \n",
"2 2019-12-11 15:31:56.388000+00:00 2020-01-28 15:34:17.309000+00:00 0 \n",
"3 2015-08-18 12:36:45.307000+00:00 2020-09-23 13:37:54.180000+00:00 0 \n",
"4 2014-03-10 13:22:01.966000+00:00 2016-06-14 22:17:54.470000+00:00 0 \n",
"\n",
" label primary_email_domain other_email_domains url_domains n_emails \\\n",
"0 False NaN NaN NaN <NA> \n",
"1 False NaN NaN NaN <NA> \n",
"2 False NaN NaN NaN <NA> \n",
"3 False NaN NaN NaN <NA> \n",
"4 False NaN NaN NaN <NA> \n",
" n_arxiv n_pmc n_other_pids label primary_email_domain \\\n",
"0 0 0 0 False NaN \n",
"1 0 0 0 False NaN \n",
"2 0 0 0 False NaN \n",
"3 0 0 0 False NaN \n",
"4 0 0 0 False NaN \n",
"\n",
" other_email_domains url_domains n_emails n_urls n_ids n_keywords \\\n",
"0 NaN NaN <NA> <NA> <NA> <NA> \n",
"1 NaN NaN <NA> <NA> <NA> <NA> \n",
"2 NaN NaN <NA> <NA> <NA> <NA> \n",
"3 NaN NaN <NA> <NA> <NA> <NA> \n",
"4 NaN NaN <NA> <NA> <NA> <NA> \n",
"\n",
" n_urls n_ids n_keywords n_education n_employment \n",
"0 <NA> <NA> <NA> <NA> <NA> \n",
"1 <NA> <NA> <NA> <NA> 1 \n",
"2 <NA> <NA> <NA> <NA> <NA> \n",
"3 <NA> <NA> <NA> <NA> 1 \n",
"4 <NA> <NA> <NA> <NA> 2 "
" n_education n_employment \n",
"0 <NA> <NA> \n",
"1 <NA> 1 \n",
"2 <NA> <NA> \n",
"3 <NA> 1 \n",
"4 <NA> 2 "
]
},
"execution_count": 5,
"execution_count": 221,
"metadata": {},
"output_type": "execute_result"
}
@@ -10733,11 +10746,11 @@
},
{
"cell_type": "code",
"execution_count": 170,
"execution_count": 223,
"metadata": {},
"outputs": [],
"source": [
"grid_df = pd.read_csv('../data/external/grid-2021-03-25/full_tables/links.csv', index_col='grid_id')\n",
"grid_df = pd.read_csv('../data/external/grid/full_tables/links.csv', index_col='grid_id')\n",
"grid_df['domain'] = grid_df.link.apply(extract_domain)"
]
},
@@ -70361,7 +70374,7 @@
},
{
"cell_type": "code",
"execution_count": 124,
"execution_count": 222,
"metadata": {},
"outputs": [
{
@@ -70369,53 +70382,43 @@
"output_type": "stream",
"text": [
"<class 'pandas.core.frame.DataFrame'>\n",
"Int64Index: 10989649 entries, 0 to 10989648\n",
"Data columns (total 41 columns):\n",
"RangeIndex: 10989649 entries, 0 to 10989648\n",
"Data columns (total 31 columns):\n",
" # Column Dtype \n",
"--- ------ ----- \n",
" 0 orcid object \n",
" 0 orcid string \n",
" 1 verified_email bool \n",
" 2 verified_primary_email bool \n",
" 3 given_names string \n",
" 4 family_name string \n",
" 5 biography string \n",
" 6 other_names object \n",
" 7 primary_email string \n",
" 8 keywords object \n",
" 9 external_ids object \n",
" 10 education object \n",
" 11 employment object \n",
" 12 n_works Int16 \n",
" 13 works_source object \n",
" 14 activation_date datetime64[ns, UTC]\n",
" 15 last_update_date datetime64[ns, UTC]\n",
" 16 n_doi Int16 \n",
" 17 n_arxiv Int16 \n",
" 18 n_pmc Int16 \n",
" 19 n_other_pids Int16 \n",
" 20 label bool \n",
" 21 primary_email_domain object \n",
" 22 other_email_domains object \n",
" 23 url_domains object \n",
" 24 n_emails Int16 \n",
" 25 n_urls Int16 \n",
" 26 n_ids Int16 \n",
" 27 n_keywords Int16 \n",
" 28 n_education Int16 \n",
" 29 n_employment Int16 \n",
" 30 ext_works_source object \n",
" 31 n_ext_work_source Int16 \n",
" 32 authoritative object \n",
" 33 n_valid_education float64 \n",
" 34 n_valid_employment float64 \n",
" 35 biography_length Int64 \n",
" 36 biography_n_sentences float64 \n",
" 37 biography_n_words float64 \n",
" 38 date_diff float64 \n",
" 39 ref_year int64 \n",
" 40 date_stale float64 \n",
"dtypes: Int16(12), Int64(1), bool(3), datetime64[ns, UTC](2), float64(6), int64(1), object(12), string(4)\n",
"memory usage: 2.6+ GB\n"
" 7 urls object \n",
" 8 primary_email string \n",
" 9 keywords object \n",
" 10 external_ids object \n",
" 11 education object \n",
" 12 employment object \n",
" 13 n_works Int16 \n",
" 14 works_source object \n",
" 15 activation_date datetime64[ns, UTC]\n",
" 16 last_update_date datetime64[ns, UTC]\n",
" 17 n_doi Int16 \n",
" 18 n_arxiv Int16 \n",
" 19 n_pmc Int16 \n",
" 20 n_other_pids Int16 \n",
" 21 label bool \n",
" 22 primary_email_domain object \n",
" 23 other_email_domains object \n",
" 24 url_domains object \n",
" 25 n_emails Int16 \n",
" 26 n_urls Int16 \n",
" 27 n_ids Int16 \n",
" 28 n_keywords Int16 \n",
" 29 n_education Int16 \n",
" 30 n_employment Int16 \n",
"dtypes: Int16(11), bool(3), datetime64[ns, UTC](2), object(10), string(5)\n",
"memory usage: 1.8+ GB\n"
]
}
],

26
src/data/make_dataset.py

@@ -3,10 +3,13 @@ import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
import pandas as pd
import re
import ast
import os
import requests
import zipfile
import tldextract
import pandas as pd
def fix_keywords(lst):
fixed = set()
@@ -39,14 +42,29 @@ def extract_url_domains(lst):
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
@click.argument('external_filepath', type=click.Path())
def main(input_filepath, output_filepath, external_filepath):
""" Runs data processing scripts to turn raw data from (./data/raw) into
cleaned data ready to be analyzed (saved in ./data/processed).
"""
logger = logging.getLogger(__name__)
logger.info('Making final data set from raw data')
logger.info('Fetching external datasets')
logger.info('Downloading grid.ac dataset')
r = requests.get('https://ndownloader.figshare.com/files/27251693')
with open(os.path.join(external_filepath, 'grid.zip'), 'wb') as f:
f.write(r.content)
logger.info('... unzipping')
try:
with zipfile.ZipFile(os.path.join(external_filepath, 'grid.zip')) as z:
z.extractall(os.path.join(external_filepath, 'grid'))
logger.info('... done!')
except:
logger.error('Invalid file')
logger.info('Making pickle dataset from raw data')
logger.info('Loading the zipped dataset')
df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
sep='\t', header=None,
names=['orcid','verified_email', 'verified_primary_email',
'given_names', 'family_name', 'biography', 'other_names', 'urls',

Loading…
Cancel
Save