# -*- coding: utf-8 -*-
import ast
import logging
import os
from pathlib import Path

import click
import pandas as pd
import tldextract
from dotenv import find_dotenv, load_dotenv
def fix_keywords(lst):
    """Normalize a list of keyword strings.

    Each entry may itself be a comma-separated list of keywords; split on
    commas, strip surrounding whitespace, and deduplicate.

    Args:
        lst: iterable of keyword strings (possibly comma-joined).

    Returns:
        list of unique, stripped keywords; empty tokens are dropped.
        Order is unspecified (set-based, as in the original).
    """
    # Idiomatic t.strip() instead of str.strip(t); one comprehension
    # replaces the nested loop + add().
    fixed = {t.strip() for k in lst for t in k.split(',')}
    fixed.discard('')  # splitting "a,,b" or stripping " , " yields ''
    return list(fixed)
|
|
|
|
|
|
|
|
def extract_email_domains(lst):
    """Return the domain part (text after '@') of each email in *lst*.

    Args:
        lst: iterable of email address strings; each is assumed to
             contain an '@' separator.

    Returns:
        list of domain strings, in the same order as the input.
    """
    return [address.split('@')[1] for address in lst]
|
|
|
|
|
|
|
|
def extract_url_domains(lst):
    """Extract the registered domain from each (description, url) pair.

    Args:
        lst: iterable of 2-element sequences where e[0] is a string
             describing the url and e[1] is the url or IP address.

    Returns:
        list of domain strings. For proper hostnames this is the
        registered domain (e.g. 'example.co.uk'); when no public suffix
        is found (IP addresses), the raw domain part is used instead.
    """
    domains = []
    for e in lst:
        # Parse once and reuse — the original called tldextract.extract()
        # twice on the same value.
        ext = tldextract.extract(e[1])
        if ext.registered_domain == '':
            # No recognized suffix: it's an IP address, keep the raw part.
            domains.append(ext.domain)
        else:
            domains.append(ext.registered_domain)
    return domains
|
2021-03-18 17:43:00 +01:00
|
|
|
|
|
|
|
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (./data/raw) into
    cleaned data ready to be analyzed (saved in ./data/processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('Making final data set from raw data')

    logger.info('Loading the zipped dataset')
    df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
                     sep='\t', header=None,
                     names=['orcid', 'verified_email', 'verified_primary_email',
                            'given_names', 'family_name', 'biography', 'other_names', 'urls',
                            'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
                            'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
                            'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'],
                     encoding='utf-8',
                     dtype={'orcid': 'string',
                            'verified_email': 'bool',
                            'verified_primary_email': 'bool',
                            'given_names': 'string',
                            'family_name': 'string',
                            'biography': 'string',
                            'primary_email': 'string',
                            # activation_date / last_update_date handled by
                            # parse_dates below, not dtype.
                            'n_works': pd.Int16Dtype(),
                            'n_doi': pd.Int16Dtype(),
                            'n_arxiv': pd.Int16Dtype(),
                            'n_pmc': pd.Int16Dtype(),
                            'n_other_pids': pd.Int16Dtype()},
                     parse_dates=['activation_date', 'last_update_date'])

    logger.info('Loading list columns')
    # These columns hold stringified Python literals (lists/tuples); parse
    # them in place, leaving NaN rows untouched. One loop replaces eight
    # near-identical statements.
    list_cols = ['other_names', 'keywords', 'urls', 'other_emails',
                 'education', 'employment', 'external_ids', 'works_source']
    for col in list_cols:
        logger.info('... %s', col)
        df[col] = df[df[col].notna()][col].apply(ast.literal_eval)

    logger.info('Integrating labels from ORCID found in OpenAIRE')
    openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'),
                                 header=None, names=['orcid'])
    # Single assignment; the original assigned df['label'] and then
    # re-assigned it with .astype('bool').
    df['label'] = df.orcid.isin(openaire_orcid['orcid']).astype('bool')

    logger.info('Fixing keywords')
    df['keywords'] = df[df.keywords.notna()]['keywords'].apply(fix_keywords)

    logger.info('Extracting domains from URLs and emails')
    df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])
    df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(extract_email_domains)
    df['url_domains'] = df[df.urls.notna()]['urls'].apply(extract_url_domains)

    logger.info('Creating simple numeric columns')
    # n_<x> = element count of the corresponding list column, as nullable
    # Int16 (NaN rows stay <NA>). One loop replaces six duplicated pairs.
    count_cols = {'n_emails': 'other_emails',
                  'n_urls': 'url_domains',
                  'n_ids': 'external_ids',
                  'n_keywords': 'keywords',
                  'n_education': 'education',
                  'n_employment': 'employment'}
    for new_col, src_col in count_cols.items():
        df[new_col] = df[src_col].str.len().astype(pd.Int16Dtype())

    logger.info('Dropping useless columns')
    # Raw lists are no longer needed once domains/counts are derived.
    df = df.drop(['urls', 'other_emails'], axis=1)

    logger.info('Serializing the dataset in ./data/processed')
    n = 1000000  # rows per pickle part
    for i, start in enumerate(range(0, df.shape[0], n)):
        df[start:start + n].to_pickle(
            os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))

    logger.info('DONE!')
    # DataFrame.info() prints directly and returns None; the original
    # print(df.info()) emitted a spurious trailing 'None'.
    df.info()
|
2021-03-18 17:43:00 +01:00
|
|
|
|
2021-03-24 13:29:06 +01:00
|
|
|
|
2021-03-18 17:43:00 +01:00
|
|
|
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found,
    # then load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
|