From 7b948bb29ea97b4a954a10d1dd63cfe8b20af543 Mon Sep 17 00:00:00 2001 From: Andrea Mannocci Date: Wed, 12 May 2021 16:27:29 +0200 Subject: [PATCH] new column other_urls being processed --- src/data/make_dataset.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/data/make_dataset.py b/src/data/make_dataset.py index 12a110b..ada4c32 100644 --- a/src/data/make_dataset.py +++ b/src/data/make_dataset.py @@ -27,16 +27,16 @@ def extract_email_domains(lst): return res def extract_url_domains(lst): - domains = [] + domains = set() for e in lst: # e[0] is a string describing the url # e[1] is the url or IP address domain = tldextract.extract(e[1]).registered_domain if domain == '': # it's an IP address - domains.append(tldextract.extract(e[1]).domain) + domains.add(tldextract.extract(e[1]).domain) else: - domains.append(domain) + domains.add(domain) return domains @click.command() @@ -70,7 +70,7 @@ def main(input_filepath, output_filepath, external_filepath): 'given_names', 'family_name', 'biography', 'other_names', 'urls', 'primary_email', 'other_emails', 'keywords', 'external_ids', 'education', 'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date', - 'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8', + 'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids', 'other_urls'], encoding = 'utf-8', dtype={'orcid': 'string', 'verified_email':'bool', 'verified_primary_email':'bool', @@ -96,6 +96,7 @@ def main(input_filepath, output_filepath, external_filepath): logger.info('... urls') df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x)) + df['other_urls'] = df[df.other_urls.notna()]['other_urls'].apply(lambda x: ast.literal_eval(x)) logger.info('... other_emails') df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))