Process the new other_urls column

master
Andrea Mannocci 3 years ago
parent 9739054a74
commit 7b948bb29e

@ -27,16 +27,16 @@ def extract_email_domains(lst):
return res
def extract_url_domains(lst):
    """Return the set of distinct domains found in a list of labelled URLs.

    Args:
        lst: iterable of indexable entries where ``e[0]`` is a string
            describing the url and ``e[1]`` is the url or IP address.

    Returns:
        A ``set`` of strings, one per distinct domain. For each entry the
        ``registered_domain`` reported by ``tldextract`` is used; when that
        is the empty string, the ``domain`` field is used instead.
    """
    # A set (rather than a list) so each domain is reported only once.
    domains = set()
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url or IP address
        extracted = tldextract.extract(e[1])  # parse once, reuse both fields
        domain = extracted.registered_domain
        if domain == '':
            # it's an IP address
            domains.add(extracted.domain)
        else:
            domains.add(domain)
    return domains
@click.command()
@ -70,7 +70,7 @@ def main(input_filepath, output_filepath, external_filepath):
'given_names', 'family_name', 'biography', 'other_names', 'urls',
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids', 'other_urls'], encoding = 'utf-8',
dtype={'orcid': 'string',
'verified_email':'bool',
'verified_primary_email':'bool',
@ -96,6 +96,7 @@ def main(input_filepath, output_filepath, external_filepath):
logger.info('... urls')
df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))
df['other_urls'] = df[df.other_urls.notna()]['other_urls'].apply(lambda x: ast.literal_eval(x))
logger.info('... other_emails')
df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))

Loading…
Cancel
Save