Convert unsigned integer (UInt16) columns to signed nullable integer (Int16)

This commit is contained in:
Andrea Mannocci 2021-03-29 16:50:39 +02:00
parent 25d225dd5f
commit 0bec69ec6d
2 changed files with 1787 additions and 3216 deletions

File diff suppressed because one or more lines are too long

View File

@@ -58,11 +58,11 @@ def main(input_filepath, output_filepath):
'primary_email': 'string', 'primary_email': 'string',
'activation_date': 'string', 'activation_date': 'string',
'last_update_date': 'string', 'last_update_date': 'string',
'n_works': pd.UInt16Dtype(), 'n_works': pd.Int16Dtype(),
'n_doi': pd.UInt16Dtype(), 'n_doi': pd.Int16Dtype(),
'n_arxiv': pd.UInt16Dtype(), 'n_arxiv': pd.Int16Dtype(),
'n_pmc': pd.UInt16Dtype(), 'n_pmc': pd.Int16Dtype(),
'n_other_pids': pd.UInt16Dtype()}) 'n_other_pids': pd.Int16Dtype()})
# parse_dates=['activation_date', 'last_update_date']) # parse_dates=['activation_date', 'last_update_date'])
logger.info('Loading list columns') logger.info('Loading list columns')
@@ -105,22 +105,22 @@ def main(input_filepath, output_filepath):
logger.info('Creating simple numeric columns') logger.info('Creating simple numeric columns')
df['n_emails'] = df.other_emails.str.len() df['n_emails'] = df.other_emails.str.len()
df.n_emails = df.n_emails.astype(pd.UInt16Dtype()) df.n_emails = df.n_emails.astype(pd.Int16Dtype())
df['n_urls'] = df.url_domains.str.len() df['n_urls'] = df.url_domains.str.len()
df.n_urls = df.n_urls.astype(pd.UInt16Dtype()) df.n_urls = df.n_urls.astype(pd.Int16Dtype())
df['n_ids'] = df.external_ids.str.len() df['n_ids'] = df.external_ids.str.len()
df.n_ids = df.n_ids.astype(pd.UInt16Dtype()) df.n_ids = df.n_ids.astype(pd.Int16Dtype())
df['n_keywords'] = df.keywords.str.len() df['n_keywords'] = df.keywords.str.len()
df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype()) df.n_keywords = df.n_keywords.astype(pd.Int16Dtype())
df['n_education'] = df.education.str.len() df['n_education'] = df.education.str.len()
df.n_education = df.n_education.astype(pd.UInt16Dtype()) df.n_education = df.n_education.astype(pd.Int16Dtype())
df['n_employment'] = df.employment.str.len() df['n_employment'] = df.employment.str.len()
df.n_employment = df.n_employment.astype(pd.UInt16Dtype()) df.n_employment = df.n_employment.astype(pd.Int16Dtype())
logger.info('Dropping useless columns') logger.info('Dropping useless columns')
df = df.drop(['urls', 'other_emails'], axis=1) df = df.drop(['urls', 'other_emails'], axis=1)