Convert unsigned integer (UInt16) columns to signed Int16
This commit is contained in:
parent
25d225dd5f
commit
0bec69ec6d
File diff suppressed because one or more lines are too long
|
@ -58,11 +58,11 @@ def main(input_filepath, output_filepath):
|
||||||
'primary_email': 'string',
|
'primary_email': 'string',
|
||||||
'activation_date': 'string',
|
'activation_date': 'string',
|
||||||
'last_update_date': 'string',
|
'last_update_date': 'string',
|
||||||
'n_works': pd.UInt16Dtype(),
|
'n_works': pd.Int16Dtype(),
|
||||||
'n_doi': pd.UInt16Dtype(),
|
'n_doi': pd.Int16Dtype(),
|
||||||
'n_arxiv': pd.UInt16Dtype(),
|
'n_arxiv': pd.Int16Dtype(),
|
||||||
'n_pmc': pd.UInt16Dtype(),
|
'n_pmc': pd.Int16Dtype(),
|
||||||
'n_other_pids': pd.UInt16Dtype()})
|
'n_other_pids': pd.Int16Dtype()})
|
||||||
# parse_dates=['activation_date', 'last_update_date'])
|
# parse_dates=['activation_date', 'last_update_date'])
|
||||||
|
|
||||||
logger.info('Loading list columns')
|
logger.info('Loading list columns')
|
||||||
|
@ -105,22 +105,22 @@ def main(input_filepath, output_filepath):
|
||||||
|
|
||||||
logger.info('Creating simple numeric columns')
|
logger.info('Creating simple numeric columns')
|
||||||
df['n_emails'] = df.other_emails.str.len()
|
df['n_emails'] = df.other_emails.str.len()
|
||||||
df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
|
df.n_emails = df.n_emails.astype(pd.Int16Dtype())
|
||||||
|
|
||||||
df['n_urls'] = df.url_domains.str.len()
|
df['n_urls'] = df.url_domains.str.len()
|
||||||
df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
|
df.n_urls = df.n_urls.astype(pd.Int16Dtype())
|
||||||
|
|
||||||
df['n_ids'] = df.external_ids.str.len()
|
df['n_ids'] = df.external_ids.str.len()
|
||||||
df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
|
df.n_ids = df.n_ids.astype(pd.Int16Dtype())
|
||||||
|
|
||||||
df['n_keywords'] = df.keywords.str.len()
|
df['n_keywords'] = df.keywords.str.len()
|
||||||
df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
|
df.n_keywords = df.n_keywords.astype(pd.Int16Dtype())
|
||||||
|
|
||||||
df['n_education'] = df.education.str.len()
|
df['n_education'] = df.education.str.len()
|
||||||
df.n_education = df.n_education.astype(pd.UInt16Dtype())
|
df.n_education = df.n_education.astype(pd.Int16Dtype())
|
||||||
|
|
||||||
df['n_employment'] = df.employment.str.len()
|
df['n_employment'] = df.employment.str.len()
|
||||||
df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
|
df.n_employment = df.n_employment.astype(pd.Int16Dtype())
|
||||||
|
|
||||||
logger.info('Dropping useless columns')
|
logger.info('Dropping useless columns')
|
||||||
df = df.drop(['urls', 'other_emails'], axis=1)
|
df = df.drop(['urls', 'other_emails'], axis=1)
|
||||||
|
|
Loading…
Reference in New Issue