# -*- coding: utf-8 -*-
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

import re
import ast
import os
import requests
import zipfile
import tldextract
import pandas as pd


def fix_keywords(lst):
    """Split comma-separated keyword strings, strip whitespace and drop empties/duplicates."""
    fixed = set()
    for k in lst:
        tokens = set(k.split(','))
        for t in tokens:
            fixed.add(t.strip())
    fixed.discard('')
    return list(fixed)


def extract_email_domains(lst):
    """Return the domain part (after the '@') of each email address in the list."""
    res = []
    for email in lst:
        res.append(email.split('@')[1])
    return res


def extract_url_domains(lst):
    """Return the set of registered domains found in a list of (description, url) pairs."""
    domains = set()
    for e in lst:
        # e[0] is a string describing the url
        # e[1] is the url or IP address
        domain = tldextract.extract(e[1]).registered_domain
        if domain == '':
            # no registered domain: it's an IP address
            domains.add(tldextract.extract(e[1]).domain)
        else:
            domains.add(domain)
    return domains


@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
@click.argument('external_filepath', type=click.Path())
def main(input_filepath, output_filepath, external_filepath):
    """Runs data processing scripts to turn raw data from ./data/raw into
    cleaned data ready to be analyzed (saved in ./data/processed).
    """
    logger = logging.getLogger(__name__)

    logger.info('Fetching external datasets')
    logger.info('Downloading grid.ac dataset')
    r = requests.get('https://ndownloader.figshare.com/files/27251693')
    with open(os.path.join(external_filepath, 'grid.zip'), 'wb') as f:
        f.write(r.content)

    logger.info('... unzipping')
    try:
        with zipfile.ZipFile(os.path.join(external_filepath, 'grid.zip')) as z:
            z.extractall(os.path.join(external_filepath, 'grid'))
        logger.info('... done!')
    except zipfile.BadZipFile:
        logger.error('Invalid file')

    logger.info('Making pickle dataset from raw data')
    logger.info('Loading the zipped dataset')
    df = pd.read_csv(os.path.join(input_filepath, 'data.gz'),
                     compression='gzip',
                     sep='\t',
                     header=None,
                     names=['orcid', 'verified_email', 'verified_primary_email',
                            'given_names', 'family_name', 'biography',
                            'other_names', 'urls', 'primary_email',
                            'other_emails', 'keywords', 'external_ids',
                            'education', 'employment', 'n_works',
                            'works_source', 'activation_date',
                            'last_update_date', 'n_doi', 'n_arxiv', 'n_pmc',
                            'n_other_pids', 'other_urls'],
                     encoding='utf-8',
                     dtype={'orcid': 'string',
                            'verified_email': 'bool',
                            'verified_primary_email': 'bool',
                            'given_names': 'string',
                            'family_name': 'string',
                            'biography': 'string',
                            'primary_email': 'string',
                            # 'activation_date': 'string',
                            # 'last_update_date': 'string',
                            'n_works': pd.Int16Dtype(),
                            'n_doi': pd.Int16Dtype(),
                            'n_arxiv': pd.Int16Dtype(),
                            'n_pmc': pd.Int16Dtype(),
                            'n_other_pids': pd.Int16Dtype()},
                     parse_dates=['activation_date', 'last_update_date'])

    # list-valued columns are stored as string literals; parse them back into Python lists
    logger.info('Loading list columns')
    logger.info('... other_names')
    df['other_names'] = df[df.other_names.notna()]['other_names'].apply(ast.literal_eval)
    logger.info('... keywords')
    df['keywords'] = df[df.keywords.notna()]['keywords'].apply(ast.literal_eval)
    logger.info('... urls')
    df['urls'] = df[df.urls.notna()]['urls'].apply(ast.literal_eval)
    df['other_urls'] = df[df.other_urls.notna()]['other_urls'].apply(ast.literal_eval)
    logger.info('... other_emails')
    df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(ast.literal_eval)
    logger.info('... education')
    df['education'] = df[df.education.notna()]['education'].apply(ast.literal_eval)
    logger.info('... employment')
    df['employment'] = df[df.employment.notna()]['employment'].apply(ast.literal_eval)
    logger.info('... external_ids')
    df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(ast.literal_eval)
    logger.info('... works_source')
    df['works_source'] = df[df.works_source.notna()]['works_source'].apply(ast.literal_eval)

    logger.info('Integrating labels from ORCID found in OpenAIRE')
    openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'),
                                 header=None,
                                 names=['orcid'])
    df['label'] = df.orcid.isin(openaire_orcid['orcid'])
    df['label'] = df['label'].astype('bool')

    logger.info('Fixing keywords')
    df['keywords'] = df[df.keywords.notna()]['keywords'].apply(fix_keywords)

    logger.info('Extracting domains from URLs and emails')
    df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])
    df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(extract_email_domains)
    df['url_domains'] = df[df.urls.notna()]['urls'].apply(extract_url_domains)
    df['other_url_domains'] = df[df.other_urls.notna()]['other_urls'].apply(extract_url_domains)

    logger.info('Creating simple numeric columns')
    df['n_emails'] = df.other_emails.str.len()
    df.n_emails = df.n_emails.astype(pd.Int16Dtype())
    df['n_urls'] = df.url_domains.str.len()
    df.n_urls = df.n_urls.astype(pd.Int16Dtype())
    df['n_ids'] = df.external_ids.str.len()
    df.n_ids = df.n_ids.astype(pd.Int16Dtype())
    df['n_keywords'] = df.keywords.str.len()
    df.n_keywords = df.n_keywords.astype(pd.Int16Dtype())
    df['n_education'] = df.education.str.len()
    df.n_education = df.n_education.astype(pd.Int16Dtype())
    df['n_employment'] = df.employment.str.len()
    df.n_employment = df.n_employment.astype(pd.Int16Dtype())

    logger.info('Dropping useless columns')
    df.drop(['other_emails'], axis=1, inplace=True)

    logger.info('Serializing the dataset in ./data/processed')
    # write the dataframe in chunks of 1M rows to keep individual pickle files small
    n = 1000000
    chunks = [df[i:i + n] for i in range(0, df.shape[0], n)]
    for i, chunk in enumerate(chunks):
        chunk.to_pickle(os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))

    logger.info('DONE!')
    df.info()  # df.info() prints its summary directly; wrapping it in print() would also print 'None'


if __name__ == '__main__':
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
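

# --- Illustration only, not part of the original pipeline --------------------
# A minimal sketch of how downstream code might reassemble the chunked pickles
# written by main(); `load_processed_dataset` and `processed_dir` are
# hypothetical names introduced here for illustration.
def load_processed_dataset(processed_dir):
    """Concatenate the 'dataset.pkl.partNN' chunks back into a single dataframe."""
    # sorted() keeps the parts in the order they were written (part00, part01, ...)
    parts = sorted(Path(processed_dir).glob('dataset.pkl.part*'))
    return pd.concat([pd.read_pickle(p) for p in parts])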