# -*- coding: utf-8 -*-
import ast
import logging
import os
from pathlib import Path

import click
import pandas as pd
import tldextract
from dotenv import find_dotenv, load_dotenv
def fix_keywords(lst):
    """Normalize a list of keyword strings.

    Each entry may itself be a comma-separated list of keywords; split on
    commas, strip surrounding whitespace, and deduplicate.

    Args:
        lst: iterable of keyword strings (possibly comma-joined).

    Returns:
        list of unique, stripped keywords; empty tokens are dropped.
        Order is unspecified (set-based, as in the original).
    """
    # Idiomatic t.strip() instead of str.strip(t); one comprehension
    # replaces the nested loop + add().
    fixed = {t.strip() for k in lst for t in k.split(',')}
    fixed.discard('')  # splitting "a,,b" or stripping " , " yields ''
    return list(fixed)
|
|
|
|
|
|
|
|
def extract_email_domains(lst):
    """Return the domain part (text after '@') of each email in *lst*.

    Args:
        lst: iterable of email address strings; each is assumed to
             contain an '@' separator.

    Returns:
        list of domain strings, in the same order as the input.
    """
    return [address.split('@')[1] for address in lst]
|
|
|
|
|
|
|
|
def extract_url_domains(lst):
    """Extract the registered domain from each (description, url) pair.

    Args:
        lst: iterable of 2-element sequences where e[0] is a string
             describing the url and e[1] is the url or IP address.

    Returns:
        list of domain strings. For proper hostnames this is the
        registered domain (e.g. 'example.co.uk'); when no public suffix
        is found (IP addresses), the raw domain part is used instead.
    """
    domains = []
    for e in lst:
        # Parse once and reuse — the original called tldextract.extract()
        # twice on the same value.
        ext = tldextract.extract(e[1])
        if ext.registered_domain == '':
            # No recognized suffix: it's an IP address, keep the raw part.
            domains.append(ext.domain)
        else:
            domains.append(ext.registered_domain)
    return domains
|
2021-03-18 17:43:00 +01:00
|
|
|
|
|
|
|
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (./data/raw) into
    cleaned data ready to be analyzed (saved in ./data/processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('Making final data set from raw data')

    logger.info('Loading the zipped dataset')
    df = pd.read_csv(os.path.join(input_filepath, 'data.gz'), compression='gzip',
                     sep='\t', header=None,
                     names=['orcid', 'verified_email', 'verified_primary_email',
                            'given_names', 'family_name', 'biography', 'other_names', 'urls',
                            'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
                            'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
                            'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'],
                     encoding='utf-8',
                     dtype={'orcid': 'string',
                            'verified_email': 'bool',
                            'verified_primary_email': 'bool',
                            'given_names': 'string',
                            'family_name': 'string',
                            'biography': 'string',
                            'primary_email': 'string',
                            # activation_date / last_update_date handled by
                            # parse_dates below, not dtype.
                            'n_works': pd.Int16Dtype(),
                            'n_doi': pd.Int16Dtype(),
                            'n_arxiv': pd.Int16Dtype(),
                            'n_pmc': pd.Int16Dtype(),
                            'n_other_pids': pd.Int16Dtype()},
                     parse_dates=['activation_date', 'last_update_date'])

    logger.info('Loading list columns')
    # These columns hold stringified Python literals (lists/tuples); parse
    # them in place, leaving NaN rows untouched. One loop replaces eight
    # near-identical statements.
    list_cols = ['other_names', 'keywords', 'urls', 'other_emails',
                 'education', 'employment', 'external_ids', 'works_source']
    for col in list_cols:
        logger.info('... %s', col)
        df[col] = df[df[col].notna()][col].apply(ast.literal_eval)

    logger.info('Integrating labels from ORCID found in OpenAIRE')
    openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'),
                                 header=None, names=['orcid'])
    # Single assignment; the original assigned df['label'] and then
    # re-assigned it with .astype('bool').
    df['label'] = df.orcid.isin(openaire_orcid['orcid']).astype('bool')

    logger.info('Fixing keywords')
    df['keywords'] = df[df.keywords.notna()]['keywords'].apply(fix_keywords)

    logger.info('Extracting domains from URLs and emails')
    df['primary_email_domain'] = df[df.primary_email.notna()]['primary_email'].apply(lambda x: x.split('@')[1])
    df['other_email_domains'] = df[df.other_emails.notna()]['other_emails'].apply(extract_email_domains)
    df['url_domains'] = df[df.urls.notna()]['urls'].apply(extract_url_domains)

    logger.info('Creating simple numeric columns')
    # n_<x> = element count of the corresponding list column, as nullable
    # Int16 (NaN rows stay <NA>). One loop replaces six duplicated pairs.
    count_cols = {'n_emails': 'other_emails',
                  'n_urls': 'url_domains',
                  'n_ids': 'external_ids',
                  'n_keywords': 'keywords',
                  'n_education': 'education',
                  'n_employment': 'employment'}
    for new_col, src_col in count_cols.items():
        df[new_col] = df[src_col].str.len().astype(pd.Int16Dtype())

    logger.info('Dropping useless columns')
    # Raw lists are no longer needed once domains/counts are derived.
    df = df.drop(['urls', 'other_emails'], axis=1)

    logger.info('Serializing the dataset in ./data/processed')
    n = 1000000  # rows per pickle part
    for i, start in enumerate(range(0, df.shape[0], n)):
        df[start:start + n].to_pickle(
            os.path.join(output_filepath, 'dataset.pkl.part%02d' % i))

    logger.info('DONE!')
    # DataFrame.info() prints directly and returns None; the original
    # print(df.info()) emitted a spurious trailing 'None'.
    df.info()
|
2021-03-18 17:43:00 +01:00
|
|
|
|
2021-03-24 13:29:06 +01:00
|
|
|
|
2021-03-18 17:43:00 +01:00
|
|
|
if __name__ == '__main__':
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    # not used in this stub but often useful for finding various files
    project_dir = Path(__file__).resolve().parents[2]

    # find .env automagically by walking up directories until it's found,
    # then load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    main()
|