moved lots of preprocessing under make
parent 9433fbe46d
commit 629d781645
File diff suppressed because one or more lines are too long
@@ -6,6 +6,31 @@ from dotenv import find_dotenv, load_dotenv
 import pandas as pd
 import ast
 import os
+import tldextract
+
+def fix_keywords(lst):
+    fixed = set()
+    for k in lst:
+        tokens = set(k.split(','))
+        for t in tokens:
+            fixed.add(str.strip(t))
+    fixed.discard('')
+    return list(fixed)
+
+def extract_email_domains(lst):
+    res = []
+    for email in lst:
+        res.append(email.split('@')[1])
+    return res
+
+def extract_url_domains(lst):
+    domains = []
+    for e in lst:
+        # e[0] is a string describing the url
+        # e[1] is the url
+        domain = tldextract.extract(e[1])
+        domains.append(domain.registered_domain)
+    return domains
 
 @click.command()
 @click.argument('input_filepath', type=click.Path(exists=True))
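
A quick sketch of how the new helpers behave, assuming the three functions above are in scope; the sample values below are made up for illustration:

# hypothetical inputs, purely illustrative
fix_keywords(['machine learning, data mining', ' semantics , machine learning'])
# -> ['machine learning', 'data mining', 'semantics']  (set-backed, so order may vary)

extract_email_domains(['jane.doe@example.org', 'j.doe@cs.example.edu'])
# -> ['example.org', 'cs.example.edu']

extract_url_domains([('personal homepage', 'https://www.example.ac.uk/~jdoe')])
# -> ['example.ac.uk']  (registered_domain keeps domain + public suffix, drops the 'www' subdomain)
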
@@ -17,8 +42,9 @@ def main(input_filepath, output_filepath):
     logger = logging.getLogger(__name__)
     logger.info('Making final data set from raw data')
     logger.info('Loading the zipped dataset')
-    df = pd.read_csv(os.path.join(input_filepath, 'initial_info_whole_20210322.tsv.gz'), compression='gzip', sep='\t', header=0,
-                     names=['orcid', 'claimed','verified_email', 'verified_primary_email',
+    df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
+                     sep='\t', header=None,
+                     names=['orcid','verified_email', 'verified_primary_email',
                             'given_names', 'family_name', 'biography', 'other_names', 'urls',
                             'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
                             'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
@@ -54,6 +80,25 @@ def main(input_filepath, output_filepath):
     df['label'] = df.orcid.isin(openaire_orcid['orcid'])
     df['label'] = df['label'].astype(int)
 
+    logger.info('Fixing keywords')
+    df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: fix_keywords(x))
+
+    logger.info('Extracting domains from URLs and emails')
+    df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])
+    df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: extract_email_domains(x))
+    df['url_domains'] = df[df.urls.notna()]['urls'].apply(lambda x: extract_url_domains(x))
+
+    logger.info('Creating simple numeric columns')
+    df['n_emails'] = df.other_emails.str.len()
+    df['n_urls'] = df.url_domains.str.len()
+    df['n_ids'] = df.external_ids.str.len()
+    df['n_keywords'] = df.keywords.str.len()
+    df['n_education'] = df.education.str.len()
+    df['n_employment'] = df.employment.str.len()
+
+    logger.info('Dropping useless columns')
+    df = df.drop(['urls', 'other_emails'], axis=1)
+
     logger.info('Serializing the dataset in ./data/processed')
     n = 1000000
     chunks = [df[i:i+n] for i in range(0, df.shape[0], n)]
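
The chunking line above slices the frame into consecutive 1,000,000-row pieces. A minimal, self-contained illustration of the same pattern on a toy frame (the data and the small n are made up for demonstration):

import pandas as pd

toy = pd.DataFrame({'orcid': range(8)})   # stand-in for the real dataframe
n = 3                                     # 1000000 in the actual script
chunks = [toy[i:i+n] for i in range(0, toy.shape[0], n)]
print([len(c) for c in chunks])           # [3, 3, 2] -- the last chunk holds the remainder
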