Compare commits
2 Commits
41f2dab89d
...
b5e99701b1
Author | SHA1 | Date |
---|---|---|
Andrea Mannocci | b5e99701b1 | |
Andrea Mannocci | 4e30d743d8 | |
File diff suppressed because one or more lines are too long
|
@@ -8,3 +8,9 @@ coverage
|
|||
awscli
|
||||
flake8
|
||||
python-dotenv>=0.5.1
|
||||
|
||||
# analysis requirements
|
||||
pandas
|
||||
tldextract
|
||||
numpy
|
||||
plotly
|
||||
|
|
|
@@ -3,7 +3,9 @@ import click
|
|||
import logging
|
||||
from pathlib import Path
|
||||
from dotenv import find_dotenv, load_dotenv
|
||||
|
||||
import pandas as pd
|
||||
import ast
|
||||
import os
|
||||
|
||||
@click.command()
|
||||
@click.argument('input_filepath', type=click.Path(exists=True))
|
||||
|
@@ -13,8 +15,41 @@ def main(input_filepath, output_filepath):
|
|||
cleaned data ready to be analyzed (saved in ../processed).
|
||||
"""
|
||||
logger = logging.getLogger(__name__)
|
||||
logger.info('making final data set from raw data')
|
||||
|
||||
logger.info('Making final data set from raw data')
|
||||
logger.info('Loading the zipped dataset')
|
||||
df = pd.read_csv(os.path.join(input_filepath, 'initial_info_whole_20210322.tsv.gz'), compression='gzip', sep='\t', header=0,
|
||||
names=['orcid', 'claimed','verified_email', 'verified_primary_email',
|
||||
'given_names', 'family_name', 'biography', 'other_names', 'urls',
|
||||
'primary_email', 'other_emails', 'keywords', 'external_ids', 'education',
|
||||
'employment', 'n_works', 'works_source', 'activation_date', 'last_update_date',
|
||||
'n_doi', 'n_arxiv', 'n_pmc', 'n_other_pids'])
|
||||
|
||||
logger.info('Loading list columns')
|
||||
logger.info('... other_names')
|
||||
df['other_names'] = df[df.other_names.notna()]['other_names'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... keywords')
|
||||
df['keywords'] = df[df.keywords.notna()]['keywords'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... urls')
|
||||
df['urls'] = df[df.urls.notna()]['urls'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... other_emails')
|
||||
df['other_emails'] = df[df.other_emails.notna()]['other_emails'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... education')
|
||||
df['education'] = df[df.education.notna()]['education'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... employment')
|
||||
df['employment'] = df[df.employment.notna()]['employment'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... external_ids')
|
||||
df['external_ids'] = df[df.external_ids.notna()]['external_ids'].apply(lambda x: ast.literal_eval(x))
|
||||
logger.info('... works_source')
|
||||
df['works_source'] = df[df.works_source.notna()]['works_source'].apply(lambda x: ast.literal_eval(x))
|
||||
|
||||
openaire_orcid = pd.read_csv(os.path.join(input_filepath, 'orcid_openaire.txt'), header=None, names=['orcid'])
|
||||
df['label'] = df.orcid.isin(openaire_orcid['orcid'])
|
||||
df['label'] = df['label'].astype(int)
|
||||
|
||||
logger.info('Serializing the dataset in ../processed')
|
||||
df.to_pickle(os.path.join(output_filepath, 'dataset.pkl'))
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
||||
|
|
Loading…
Reference in New Issue