pickle in chunks
This commit is contained in:
parent
a465bb027d
commit
e4f8fcab0e
|
@ -54,11 +54,15 @@ def main(input_filepath, output_filepath):
|
|||
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
|
||||
df['label'] = df['label'].astype(int)
|
||||
|
||||
logger.info('Serializing the dataset in ../processed')
|
||||
df.to_pickle(os.path.join(output_filepath, 'dataset.pkl'))
|
||||
logger.info('Serializing the dataset in ./data/processed')
|
||||
n = 1000000
|
||||
chunks = [df[i:i+n] for i in range(0, df.shape[0], n)]
|
||||
for i in range(len(chunks)):
|
||||
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
|
||||
|
||||
logger.info('DONE!')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
logging.basicConfig(level=logging.INFO, format=log_fmt)
|
||||
|
|
Loading…
Reference in New Issue