Process the new other_urls column
This commit is contained in:
parent
9739054a74
commit
7b948bb29e
|
@ -27,16 +27,16 @@ def extract_email_domains(lst):
|
|||
return res
|
||||
|
||||
def extract_url_domains(lst):
|
||||
domains = []
|
||||
domains = set()
|
||||
for e in lst:
|
||||
# e[0] is a string describing the url
|
||||
# e[1] is the url or IP address
|
||||
domain = tldextract.extract(e[1]).registered_domain
|
||||
if domain == '':
|
||||
# it's an IP address
|
||||
domains.append(tldextract.extract(e[1]).domain)
|
||||
domains.add(tldextract.extract(e[1]).domain)
|
||||
else:
|
||||
domains.append(domain)
|
||||
domains.add(domain)
|
||||
return domains
|
||||
|
||||
@click.command()
|
||||
|
@ -70,7 +70,7 @@ def main(input_filepath, output_filepath, external_filepath):
|
|||
'given_names', 'family_name', 'biography', 'other_names', 'urls',
|
||||
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
|
||||
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
|
||||
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'], encoding = 'utf-8',
|
||||
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids', 'other_urls'], encoding = 'utf-8',
|
||||
dtype={'orcid': 'string',
|
||||
'verified_email':'bool',
|
||||
'verified_primary_email':'bool',
|
||||
|
@ -96,6 +96,7 @@ def main(input_filepath, output_filepath, external_filepath):
|
|||
|
||||
logger.info('... urls')
|
||||
df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))
|
||||
df['other_urls'] = df[df.other_urls.notna()]['other_urls'].apply(lambda x: ast.literal_eval(x))
|
||||
|
||||
logger.info('... other_emails')
|
||||
df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))
|
||||
|
|
Loading…
Reference in New Issue