Optimised memory allocation for DataFrame

This commit is contained in:
Andrea Mannocci 2021-03-29 15:57:24 +02:00
parent 83e2005c0e
commit efc63f88db
1 changed file with 31 additions and 4 deletions

View File

@@ -36,8 +36,8 @@ def extract_url_domains(lst):
@click.argument('input_filepath', type=click.Path(exists=True)) @click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path()) @click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath): def main(input_filepath, output_filepath):
""" Runs data processing scripts to turn raw data from (../raw) into """ Runs data processing scripts to turn raw data from (./data/raw) into
cleaned data ready to be analyzed (saved in ../processed). cleaned data ready to be analyzed (saved in ./data/processed).
""" """
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
logger.info('Making final data set from raw data') logger.info('Making final data set from raw data')
@@ -48,7 +48,22 @@ def main(input_filepath, output_filepath):
'given_names', 'family_name', 'biography', 'other_names', 'urls', 'given_names', 'family_name', 'biography', 'other_names', 'urls',
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date', 'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8') 'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
dtype={'orcid': 'string',
'verified_email':'bool',
'verified_primary_email':'bool',
'given_names': 'string',
'family_name': 'string',
'biography': 'string',
'primary_email': 'string',
'activation_date': 'string',
'last_update_date': 'string',
'n_works': pd.UInt16Dtype(),
'n_doi': pd.UInt16Dtype(),
'n_arxiv': pd.UInt16Dtype(),
'n_pmc': pd.UInt16Dtype(),
'n_other_pids': pd.UInt16Dtype()})
# parse_dates=['activation_date', 'last_update_date'])
logger.info('Loading list columns') logger.info('Loading list columns')
logger.info('... other_names') logger.info('... other_names')
@@ -78,7 +93,7 @@ def main(input_filepath, output_filepath):
logger.info('Integrating labels from ORCID found in OpenAIRE') logger.info('Integrating labels from ORCID found in OpenAIRE')
openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid']) openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
df['label'] = df.orcid.isin(openaire_orcid['orcid']) df['label'] = df.orcid.isin(openaire_orcid['orcid'])
df['label'] = df['label'].astype(int) df['label'] = df['label'].astype('bool')
logger.info('Fixing keywords') logger.info('Fixing keywords')
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x)) df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
@@ -90,11 +105,22 @@ def main(input_filepath, output_filepath):
logger.info('Creating simple numeric columns') logger.info('Creating simple numeric columns')
df['n_emails'] = df.other_emails.str.len() df['n_emails'] = df.other_emails.str.len()
df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
df['n_urls'] = df.url_domains.str.len() df['n_urls'] = df.url_domains.str.len()
df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
df['n_ids'] = df.external_ids.str.len() df['n_ids'] = df.external_ids.str.len()
df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
df['n_keywords'] = df.keywords.str.len() df['n_keywords'] = df.keywords.str.len()
df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
df['n_education'] = df.education.str.len() df['n_education'] = df.education.str.len()
df.n_education = df.n_education.astype(pd.UInt16Dtype())
df['n_employment'] = df.employment.str.len() df['n_employment'] = df.employment.str.len()
df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
logger.info('Dropping useless columns') logger.info('Dropping useless columns')
df = df.drop(['urls', 'other_emails'], axis=1) df = df.drop(['urls', 'other_emails'], axis=1)
@@ -106,6 +132,7 @@ def main(input_filepath, output_filepath):
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i)) chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
logger.info('DONE!') logger.info('DONE!')
print(df.info())
if __name__ == '__main__': if __name__ == '__main__':