registries_analysis/src/data/make_dataset.py

61 lines
2.3 KiB
Python
Raw Normal View History

2021-07-02 17:49:38 +02:00
# -*- coding: utf-8 -*-
import csv
import os
import json
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
def get_value_or_none(obj, key):
    """Return ``obj[key]['value']``, or ``None`` when it is unavailable.

    Args:
        obj: Mapping holding wrapped fields of the form ``{'value': ...}``.
        key: Name of the field to look up in ``obj``.

    Returns:
        The wrapped value, or ``None`` when ``key`` is absent, when its
        entry is ``None`` (e.g. a JSON ``null``), or when the entry has no
        ``'value'`` member.
    """
    entry = obj.get(key)
    # A key that is present but null must behave like a missing key instead
    # of raising TypeError on the subscript.
    if entry is None:
        return None
    return entry.get('value')
@click.command()
@click.argument('input_filepath', type=click.Path(exists=True))
@click.argument('output_filepath', type=click.Path())
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).

        Reads OpenAIRE_DS_re3data_opendoar.json from INPUT_FILEPATH, parsing
        each non-blank line as one JSON object, and writes one row per
        object to re3data_opendoar.csv inside OUTPUT_FILEPATH.
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    source_path = os.path.join(input_filepath, 'OpenAIRE_DS_re3data_opendoar.json')
    target_path = os.path.join(output_filepath, 're3data_opendoar.csv')

    # Explicit UTF-8 keeps the result independent of the platform locale.
    # newline='' is required by the csv module so the writer controls line
    # endings itself (otherwise Windows output gets blank rows).
    with open(source_path, mode='r', encoding='utf-8') as f, \
            open(target_path, mode='w', encoding='utf-8', newline='') as csvfile:
        csv_writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        csv_writer.writerow(['id', 'url', 'official_name', 'english_name', 'description', 'latitude', 'longitude', 'subjects'])

        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file),
            # which json.loads would reject.
            if not line.strip():
                continue

            repo = json.loads(line)

            # 'id', 'officialname' and 'subjects' are accessed directly, so a
            # record lacking any of them raises KeyError.
            identifier = repo['id']
            official_name = repo['officialname']['value']

            url = get_value_or_none(repo, 'websiteurl')
            english_name = get_value_or_none(repo, 'englishname')
            description = get_value_or_none(repo, 'description')
            latitude = get_value_or_none(repo, 'latitude')
            longitude = get_value_or_none(repo, 'longitude')

            # The list is handed to the writer as-is, so the 'subjects'
            # column holds its str() representation.
            subjects = [s['value'] for s in repo['subjects']]

            csv_writer.writerow([identifier, url, official_name, english_name, description, latitude, longitude, subjects])
if __name__ == '__main__':
    # Script entry point: set up INFO-level logging with a timestamped
    # format before anything else runs.
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    )

    # Directory two levels above the one containing this file; currently
    # unused, kept as a handy anchor for locating other project files.
    project_dir = Path(__file__).resolve().parents[2]

    # Locate a .env file by searching upward through parent directories and
    # export its entries as environment variables.
    load_dotenv(find_dotenv())

    # click supplies the command-line arguments.
    main()