pickle in chunks

This commit is contained in:
Andrea Mannocci 2021-03-24 13:29:06 +01:00
parent a465bb027d
commit e4f8fcab0e
1 changed file with 6 additions and 2 deletions

View File

@@ -54,11 +54,15 @@ def main(input_filepath, output_filepath):
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
df['label'] = df['label'].astype(int)
logger.info('Serializing the dataset in ../processed')
df.to_pickle(os.path.join(output_filepath, 'dataset.pkl'))
logger.info('Serializing the dataset in ./data/processed')
n = 1000000
chunks = [df[i:i+n] for i in range(0, df.shape[0], n)]
for i in range(len(chunks)):
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
logger.info('DONE!')
if __name__ == '__main__':
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)