optimised memory allocation for dataframe

master
Andrea Mannocci 3 years ago
parent 83e2005c0e
commit efc63f88db

@@ -36,8 +36,8 @@ def extract_url_domains(lst):
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
""" Runs data processing scripts to turn raw data from (../raw) into
cleaned data ready to be analyzed (saved in ../processed).
""" Runs data processing scripts to turn raw data from (./data/raw) into
cleaned data ready to be analyzed (saved in ./data/processed).
"""
logger = logging.getLogger(__name__)
logger.info('Making final data set from raw data')
@@ -48,7 +48,22 @@ def main(input_filepath, output_filepath):
'given_names', 'family_name', 'biography', 'other_names', 'urls',
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
- 'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8')
+ 'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
+ dtype={'orcid': 'string',
+ 'verified_email':'bool',
+ 'verified_primary_email':'bool',
+ 'given_names': 'string',
+ 'family_name': 'string',
+ 'biography': 'string',
+ 'primary_email': 'string',
+ 'activation_date': 'string',
+ 'last_update_date': 'string',
+ 'n_works': pd.UInt16Dtype(),
+ 'n_doi': pd.UInt16Dtype(),
+ 'n_arxiv': pd.UInt16Dtype(),
+ 'n_pmc': pd.UInt16Dtype(),
+ 'n_other_pids': pd.UInt16Dtype()})
+ # parse_dates=['activation_date', 'last_update_date'])
logger.info('Loading list columns')
logger.info('... other_names')
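The dtype map above is where the memory saving comes from: without it, read_csv falls back to object for the text columns and to float64 for count columns that contain missing values, while the nullable 'string' and UInt16 dtypes keep missing values representable at a fraction of the size. A minimal sketch of the effect on a hypothetical two-row CSV (not the real ORCID dump):

```python
import io

import pandas as pd

# Hypothetical miniature CSV standing in for the ORCID dump; the column names
# mirror a subset of those used above.
csv = io.StringIO(
    "orcid,n_works,n_doi\n"
    "0000-0001-0000-0001,12,3\n"
    "0000-0001-0000-0002,,\n"   # missing counts -> a nullable integer dtype is needed
)

# Default parsing: orcid -> object, counts -> float64 (because of the missing values).
baseline = pd.read_csv(csv)
print(baseline.dtypes)
print(baseline.memory_usage(deep=True).sum(), "bytes")

csv.seek(0)
# Explicit dtypes as in the commit: pandas 'string' plus nullable UInt16.
optimised = pd.read_csv(csv, dtype={'orcid': 'string',
                                    'n_works': pd.UInt16Dtype(),
                                    'n_doi': pd.UInt16Dtype()})
print(optimised.dtypes)
print(optimised.memory_usage(deep=True).sum(), "bytes")
```

With the defaults, both count columns come back as float64 (8 bytes per value) because of the missing entries; the UInt16 columns store 2-byte values plus a 1-byte validity mask. The commented-out parse_dates line would go one step further and parse the two date columns as datetime64 instead of keeping them as strings.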
@@ -78,7 +93,7 @@ def main(input_filepath, output_filepath):
logger.info('Integrating labels from ORCID found in OpenAIRE')
openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
- df['label'] = df['label'].astype(int)
+ df['label'] = df['label'].astype('bool')
logger.info('Fixing keywords')
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
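Series.isin() already returns a bool column, so the new astype('bool') is essentially a no-op; the effective change is dropping the old cast to int, which stored the label as int64. A small sketch with hypothetical iDs:

```python
import pandas as pd

# Hypothetical ORCID iDs; 'known' stands in for the OpenAIRE list.
df = pd.DataFrame({'orcid': ['0000-0001-0000-0001',
                             '0000-0001-0000-0002',
                             '0000-0001-0000-0003']})
known = pd.Series(['0000-0001-0000-0002'])

label_bool = df.orcid.isin(known)       # dtype bool: 1 byte per row
label_int = label_bool.astype(int)      # the old cast: int64, 8 bytes per row

print(label_bool.dtype, label_bool.memory_usage(index=False))
print(label_int.dtype, label_int.memory_usage(index=False))
```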
@@ -90,11 +105,22 @@ def main(input_filepath, output_filepath):
logger.info('Creating simple numeric columns')
df['n_emails'] = df.other_emails.str.len()
+ df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
df['n_urls'] = df.url_domains.str.len()
+ df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
df['n_ids'] = df.external_ids.str.len()
+ df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
df['n_keywords'] = df.keywords.str.len()
+ df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
df['n_education'] = df.education.str.len()
+ df.n_education = df.n_education.astype(pd.UInt16Dtype())
df['n_employment'] = df.employment.str.len()
+ df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
logger.info('Dropping useless columns')
df = df.drop(['urls', 'other_emails'], axis=1)
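.str.len() on these list-valued columns yields float64 (with NaN wherever the source field is missing), so each added astype(pd.UInt16Dtype()) swaps 8-byte floats for 2-byte nullable integers while preserving the missing values. A sketch with a hypothetical other_emails column:

```python
import pandas as pd

# Hypothetical parsed 'other_emails' column: a list where present, missing otherwise.
other_emails = pd.Series([['a@example.org', 'b@example.org'], None, ['c@example.org']])

n_emails = other_emails.str.len()        # float64, NaN for the missing row
print(n_emails.dtype, n_emails.memory_usage(index=False))

# Nullable UInt16: 2-byte values plus a 1-byte validity mask, missing value kept as <NA>.
n_emails = n_emails.astype(pd.UInt16Dtype())
print(n_emails.dtype, n_emails.memory_usage(index=False))
```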
@@ -106,6 +132,7 @@ def main(input_filepath, output_filepath):
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
logger.info('DONE!')
+ print(df.info())
if __name__ == '__main__':
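The added print(df.info()) reports the final dtypes and memory footprint (df.info() prints directly and returns None, so the wrapping print() only adds a trailing 'None'). For reference, a sketch of how the pickled parts written above could later be reassembled and inspected, assuming output_filepath is data/processed:

```python
import glob
import os

import pandas as pd

# Hypothetical reassembly of the pickled chunks; 'data/processed' stands in
# for output_filepath.
parts = sorted(glob.glob(os.path.join('data/processed', 'dataset.pkl.part*')))
df = pd.concat(pd.read_pickle(p) for p in parts)

# Same report as at the end of the script; memory_usage='deep' also counts the
# Python objects inside the remaining list columns.
df.info(memory_usage='deep')
```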
