From e4f8fcab0e48ca208f6781871ac84e15b16e798a Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Wed, 24 Mar 2021 13:29:06 +0100 Subject: [PATCH] pickle in chunks --- src/data/make_dataset.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index ea88b9b..297e4a1 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -54,11 +54,15 @@ def main(input_filepath, output_filepath): df['label'] = df.orcid.isin(openaire_orcid['orcid']) df['label'] = df['label'].astype(int) - logger.info('Serializing the dataset in ../processed') - df.to_pickle(os.path.join(output_filepath, 'dataset.pkl')) + logger.info('Serializing the dataset in ./data/processed') + n = 1000000 + chunks = [df[i:i+n] for i in range(0, df.shape[0], n)] + for i in range(len(chunks)): + chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i)) logger.info('DONE!') + if __name__ == '__main__': log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' logging.basicConfig(level=logging.INFO, format=log_fmt)