|
|
|
@ -54,11 +54,15 @@ def main(input_filepath, output_filepath):
|
|
|
|
|
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
|
|
|
|
|
df['label'] = df['label'].astype(int)
|
|
|
|
|
|
|
|
|
|
logger.info('Serializing the dataset in ../processed')
|
|
|
|
|
df.to_pickle(os.path.join(output_filepath, 'dataset.pkl'))
|
|
|
|
|
logger.info('Serializing the dataset in ./data/processed')
|
|
|
|
|
n = 1000000
|
|
|
|
|
chunks = [df[i:i+n] for i in range(0, df.shape[0], n)]
|
|
|
|
|
for i in range(len(chunks)):
|
|
|
|
|
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
|
|
|
|
|
|
|
|
|
|
logger.info('DONE!')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
|
|
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
|
|
|
|
logging.basicConfig(level=logging.INFO, format=log_fmt)
|
|
|
|
|