diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
index 3ba2f8e..3861211 100644
--- a/src/data/make_dataset.py
+++ b/src/data/make_dataset.py
@@ -36,8 +36,8 @@ def extract_url_domains(lst):
 @click.argument('input_filepath', type=click.Path(exists=True))
 @click.argument('output_filepath', type=click.Path())
 def main(input_filepath, output_filepath):
-    """ Runs data processing scripts to turn raw data from (../raw) into
-        cleaned data ready to be analyzed (saved in ../processed).
+    """ Runs data processing scripts to turn raw data from (./data/raw) into
+        cleaned data ready to be analyzed (saved in ./data/processed).
     """
     logger = logging.getLogger(__name__)
     logger.info('Making final data set from raw data')
@@ -48,7 +48,22 @@ def main(input_filepath, output_filepath):
                              'given_names', 'family_name', 'biography', 'other_names', 'urls',
                              'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
                              'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
-                             'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8')
+                             'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
+                     dtype={'orcid': 'string',
+                            'verified_email': 'bool',
+                            'verified_primary_email': 'bool',
+                            'given_names': 'string',
+                            'family_name': 'string',
+                            'biography': 'string',
+                            'primary_email': 'string',
+                            'activation_date': 'string',
+                            'last_update_date': 'string',
+                            'n_works': pd.UInt16Dtype(),
+                            'n_doi': pd.UInt16Dtype(),
+                            'n_arxiv': pd.UInt16Dtype(),
+                            'n_pmc': pd.UInt16Dtype(),
+                            'n_other_pids': pd.UInt16Dtype()})
+#                     parse_dates=['activation_date', 'last_update_date'])
 
     logger.info('Loading list columns')
     logger.info('... other_names')
@@ -78,7 +93,7 @@ def main(input_filepath, output_filepath):
     logger.info('Integrating labels from ORCID found in OpenAIRE')
     openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
     df['label'] = df.orcid.isin(openaire_orcid['orcid'])
-    df['label'] = df['label'].astype(int)
+    df['label'] = df['label'].astype('bool')
 
     logger.info('Fixing keywords')
     df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
@@ -90,11 +105,22 @@ def main(input_filepath, output_filepath):
 
     logger.info('Creating simple numeric columns')
     df['n_emails'] = df.other_emails.str.len()
+    df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
+
     df['n_urls'] = df.url_domains.str.len()
+    df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
+
     df['n_ids'] = df.external_ids.str.len()
+    df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
+
     df['n_keywords'] = df.keywords.str.len()
+    df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
+
     df['n_education'] = df.education.str.len()
+    df.n_education = df.n_education.astype(pd.UInt16Dtype())
+
     df['n_employment'] = df.employment.str.len()
+    df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
 
     logger.info('Dropping useless columns')
     df = df.drop(['urls', 'other_emails'], axis=1)
@@ -106,6 +132,7 @@ def main(input_filepath, output_filepath):
         chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
 
     logger.info('DONE!')
+    df.info()
 
 
 if __name__ == '__main__':
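
Note: the switch to pd.UInt16Dtype() for the count columns is what lets them stay integer-typed despite missing rows. Series.str.len() returns NaN wherever the underlying list column is absent, which silently promotes a plain int column to float64; the nullable extension dtype keeps the values integral and represents gaps as <NA>. A minimal sketch of that behavior, separate from the patch (the toy Series below is illustrative, not project data; assumes pandas >= 1.0, where nullable integer dtypes were introduced):

    import pandas as pd

    # Toy column of lists with one missing entry (illustrative only).
    s = pd.Series([['a', 'b'], None, ['c']])

    # .str.len() maps the missing list to NaN, which forces float64 ...
    counts = s.str.len()
    print(counts.dtype)           # float64

    # ... whereas the nullable extension dtype keeps the values integral,
    # stores missing as <NA>, and uses 2 bytes per value instead of 8.
    counts = counts.astype(pd.UInt16Dtype())
    print(counts.dtype)           # UInt16
    print(counts.tolist())        # [2, <NA>, 1]

The same reasoning applies to the dtype= map passed to read_csv: declaring the n_* columns as pd.UInt16Dtype() up front avoids the float64 round-trip for counts that arrive directly from the raw CSV.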