Small changes to the make-data file

master
Andrea Mannocci 3 years ago
parent 5537705192
commit 6bf69bb2ab

@ -27,21 +27,29 @@ def main(input_filepath, output_filepath):
logger.info('Loading list columns')
logger.info('... other_names')
df['other_names'] = df[df.other_names.notna()]['other_names'].apply(lambda x: ast.literal_eval(x))
logger.info('... keywords')
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: ast.literal_eval(x))
logger.info('... urls')
df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))
logger.info('... other_emails')
df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))
logger.info('... education')
df['education'] = df[df.education.notna()]['education'].apply(lambda x: ast.literal_eval(x))
logger.info('... employment')
df['employment'] = df[df.employment.notna()]['employment'].apply(lambda x: ast.literal_eval(x))
logger.info('... external_ids')
df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(lambda x: ast.literal_eval(x))
logger.info('... works_source')
df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))
logger.info('Integrating labels from ORCID found in OpenAIRE')
openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
df['label'] = df['label'].astype(int)
@ -49,7 +57,7 @@ def main(input_filepath, output_filepath):
logger.info('Serializing the dataset in ../processed')
df.to_pickle(os.path.join(output_filepath, 'dataset.pkl'))
logger.info('DONE!')
if __name__ == '__main__':
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'

Loading…
Cancel
Save