diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py
index 3ba2f8e..3861211 100644
--- a/src/data/make_dataset.py
+++ b/src/data/make_dataset.py
@@ -36,8 +36,8 @@ def extract_url_domains(lst):
 @click.argument('input_filepath', type=click.Path(exists=True))
 @click.argument('output_filepath', type=click.Path())
 def main(input_filepath, output_filepath):
-    """ Runs data processing scripts to turn raw data from (../raw) into
-        cleaned data ready to be analyzed (saved in ../processed).
+    """ Runs data processing scripts to turn raw data from (./data/raw) into
+        cleaned data ready to be analyzed (saved in ./data/processed).
     """
     logger = logging.getLogger(__name__)
     logger.info('Making final data set from raw data')
@@ -48,7 +48,22 @@ def main(input_filepath, output_filepath):
                              'given_names', 'family_name', 'biography', 'other_names', 'urls',
                              'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
                              'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
-                             'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8')
+                             'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
+                     dtype={'orcid': 'string',
+                            'verified_email': 'bool',
+                            'verified_primary_email': 'bool',
+                            'given_names': 'string',
+                            'family_name': 'string',
+                            'biography': 'string',
+                            'primary_email': 'string',
+                            'activation_date': 'string',
+                            'last_update_date': 'string',
+                            'n_works': pd.UInt16Dtype(),
+                            'n_doi': pd.UInt16Dtype(),
+                            'n_arxiv': pd.UInt16Dtype(),
+                            'n_pmc': pd.UInt16Dtype(),
+                            'n_other_pids': pd.UInt16Dtype()})
+#                     parse_dates=['activation_date', 'last_update_date'])
 
     logger.info('Loading list columns')
     logger.info('... other_names')
@@ -78,7 +93,7 @@ def main(input_filepath, output_filepath):
     logger.info('Integrating labels from ORCID found in OpenAIRE')
     openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
     df['label'] = df.orcid.isin(openaire_orcid['orcid'])
-    df['label'] = df['label'].astype(int)
+    df['label'] = df['label'].astype('bool')
 
     logger.info('Fixing keywords')
     df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
@@ -90,11 +105,22 @@ def main(input_filepath, output_filepath):
 
     logger.info('Creating simple numeric columns')
     df['n_emails'] = df.other_emails.str.len()
+    df.n_emails = df.n_emails.astype(pd.UInt16Dtype())
+
     df['n_urls'] = df.url_domains.str.len()
+    df.n_urls = df.n_urls.astype(pd.UInt16Dtype())
+
     df['n_ids'] = df.external_ids.str.len()
+    df.n_ids = df.n_ids.astype(pd.UInt16Dtype())
+
     df['n_keywords'] = df.keywords.str.len()
+    df.n_keywords = df.n_keywords.astype(pd.UInt16Dtype())
+
     df['n_education'] = df.education.str.len()
+    df.n_education = df.n_education.astype(pd.UInt16Dtype())
+
     df['n_employment'] = df.employment.str.len()
+    df.n_employment = df.n_employment.astype(pd.UInt16Dtype())
 
     logger.info('Dropping useless columns')
     df = df.drop(['urls', 'other_emails'], axis=1)
@@ -106,6 +132,7 @@ def main(input_filepath, output_filepath):
         chunks[i].to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))
 
     logger.info('DONE!')
+    df.info()
 
 
 if __name__ == '__main__':
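
Note: the switch to pd.UInt16Dtype() for the count columns is what lets them stay integer-typed despite missing rows. Series.str.len() returns NaN wherever the underlying list column is absent, which silently promotes a plain int column to float64; the nullable extension dtype keeps the values integral and represents gaps as <NA>. A minimal sketch of that behavior, separate from the patch (the toy Series below is illustrative, not project data; assumes pandas >= 1.0, where nullable integer dtypes were introduced):

    import pandas as pd

    # Toy column of lists with one missing entry (illustrative only).
    s = pd.Series([['a', 'b'], None, ['c']])

    # .str.len() maps the missing list to NaN, which forces float64 ...
    counts = s.str.len()
    print(counts.dtype)           # float64

    # ... whereas the nullable extension dtype keeps the values integral,
    # stores missing as <NA>, and uses 2 bytes per value instead of 8.
    counts = counts.astype(pd.UInt16Dtype())
    print(counts.dtype)           # UInt16
    print(counts.tolist())        # [2, <NA>, 1]

The same reasoning applies to the dtype= map passed to read_csv: declaring the n_* columns as pd.UInt16Dtype() up front avoids the float64 round-trip for counts that arrive directly from the raw CSV.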