diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index ca68273..16bafe1 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -27,21 +27,29 @@ def main(input_filepath, output_filepath): logger.info('Loading list columns') logger.info('... other_names') df['other_names'] = df[df.other_names.notna()]['other_names'].apply(lambda x: ast.literal_eval(x)) + logger.info('... keywords') df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: ast.literal_eval(x)) + logger.info('... urls') df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x)) + logger.info('... other_emails') df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x)) + logger.info('... education') df['education'] = df[df.education.notna()]['education'].apply(lambda x: ast.literal_eval(x)) + logger.info('... employment') df['employment'] = df[df.employment.notna()]['employment'].apply(lambda x: ast.literal_eval(x)) + logger.info('... external_ids') df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(lambda x: ast.literal_eval(x)) + logger.info('... works_source') df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x)) + logger.info('Integrating labels from ORCID found in OpenAIRE') openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid']) df['label'] = df.orcid.isin(openaire_orcid['orcid']) df['label'] = df['label'].astype(int) @@ -49,7 +57,7 @@ def main(input_filepath, output_filepath): logger.info('Serializing the dataset in ../processed') df.to_pickle(os.path.join(output_filepath, 'dataset.pkl')) - + logger.info('DONE!') if __name__ == '__main__': log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'