Added Michele's notebook

This commit is contained in:
Andrea Mannocci 2021-07-20 12:15:17 +02:00
parent 7b948bb29e
commit 3854e03d10
5 changed files with 18557 additions and 15797 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large. (Load Diff)

File diff suppressed because it is too large. (Load Diff)

View File

@@ -125,6 +125,7 @@ def main(input_filepath, output_filepath, external_filepath):
df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])
df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: extract_email_domains(x))
df['url_domains'] = df[df.urls.notna()]['urls'].apply(lambda x: extract_url_domains(x))
df['other_url_domains'] = df[df.other_urls.notna()]['other_urls'].apply(lambda x: extract_url_domains(x))
logger.info('Creating simple numeric columns')
df['n_emails'] = df.other_emails.str.len()