Optimised DataFrame memory usage by declaring explicit column dtypes
This commit is contained in:
parent
83e2005c0e
commit
efc63f88db
|
@ -36,8 +36,8 @@ def extract_url_domains(lst):
|
||||||
@click.argument('input_filepath', type=click.Path(exists=True))
|
@click.argument('input_filepath', type=click.Path(exists=True))
|
||||||
@click.argument('output_filepath', type=click.Path())
|
@click.argument('output_filepath', type=click.Path())
|
||||||
def main(input_filepath, output_filepath):
|
def main(input_filepath, output_filepath):
|
||||||
""" Runs data processing scripts to turn raw data from (../raw) into
|
""" Runs data processing scripts to turn raw data from (./data/raw) into
|
||||||
cleaned data ready to be analyzed (saved in ../processed).
|
cleaned data ready to be analyzed (saved in ./data/processed).
|
||||||
"""
|
"""
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
logger.info('Making final data set from raw data')
|
logger.info('Making final data set from raw data')
|
||||||
|
@ -48,7 +48,22 @@ def main(input_filepath, output_filepath):
|
||||||
'given_names', 'family_name', 'biography', 'other_names', 'urls',
|
'given_names', 'family_name', 'biography', 'other_names', 'urls',
|
||||||
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
|
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
|
||||||
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
|
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
|
||||||
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8')
|
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
|
||||||
|
dtype={'orcid': 'string',
|
||||||
|
'verified_email':'bool',
|
||||||
|
'verified_primary_email':'bool',
|
||||||
|
'given_names': 'string',
|
||||||
|
'family_name': 'string',
|
||||||
|
'biography': 'string',
|
||||||
|
'primary_email': 'string',
|
||||||
|
'activation_date': 'string',
|
||||||
|
'last_update_date': 'string',
|
||||||
|
'n_works': pd.UInt16Dtype(),
|
||||||
|
'n_doi': pd.UInt16Dtype(),
|
||||||
|
'n_arxiv': pd.UInt16Dtype(),
|
||||||
|
'n_pmc': pd.UInt16Dtype(),
|
||||||
|
'n_other_pids': pd.UInt16Dtype()})
|
||||||
|
# parse_dates=['activation_date', 'last_update_date'])
|
||||||
|
|
||||||
logger.info('Loading list columns')
|
logger.info('Loading list columns')
|
||||||
logger.info('... other_names')
|
logger.info('... other_names')
|
||||||
|
@ -78,7 +93,7 @@ def main(input_filepath, output_filepath):
|
||||||
logger.info('Integrating labels from ORCID found in OpenAIRE')
|
logger.info('Integrating labels from ORCID found in OpenAIRE')
|
||||||
openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
|
openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
|
||||||
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
|
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
|
||||||
df['label'] = df['label'].astype(int)
|
df['label'] = df['label'].astype('bool')
|
||||||
|
|
||||||
logger.info('Fixing keywords')
|
logger.info('Fixing keywords')
|
||||||
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
|
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
|
||||||
|
@ -90,11 +105,22 @@ def main(input_filepath, output_filepath):
|
||||||
|
|
||||||
logger.info('Creating simple numeric columns')
|
logger.info('Creating simple numeric columns')
|
||||||
df['n_emails'] = df.other_emails.str.len()
|
df['n_emails'] = df.other_emails.str.len()
|
||||||
|
df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
|
||||||
|
|
||||||
df['n_urls'] = df.url_domains.str.len()
|
df['n_urls'] = df.url_domains.str.len()
|
||||||
|
df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
|
||||||
|
|
||||||
df['n_ids'] = df.external_ids.str.len()
|
df['n_ids'] = df.external_ids.str.len()
|
||||||
|
df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
|
||||||
|
|
||||||
df['n_keywords'] = df.keywords.str.len()
|
df['n_keywords'] = df.keywords.str.len()
|
||||||
|
df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
|
||||||
|
|
||||||
df['n_education'] = df.education.str.len()
|
df['n_education'] = df.education.str.len()
|
||||||
|
df.n_education = df.n_education.astype(pd.UInt16Dtype())
|
||||||
|
|
||||||
df['n_employment'] = df.employment.str.len()
|
df['n_employment'] = df.employment.str.len()
|
||||||
|
df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
|
||||||
|
|
||||||
logger.info('Dropping useless columns')
|
logger.info('Dropping useless columns')
|
||||||
df = df.drop(['urls', 'other_emails'], axis=1)
|
df = df.drop(['urls', 'other_emails'], axis=1)
|
||||||
|
@ -106,6 +132,7 @@ def main(input_filepath, output_filepath):
|
||||||
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
|
chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
|
||||||
|
|
||||||
logger.info('DONE!')
|
logger.info('DONE!')
|
||||||
|
print(df.info())
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
Loading…
Reference in New Issue