openaire_beginners_kit/src/data/make_dataset.py

63 lines
2.3 KiB
Python

# -*- coding: utf-8 -*-
import os
from urllib.request import urlretrieve
from urllib.parse import urlsplit
import tarfile
import click
import logging
from pathlib import Path
from dotenv import find_dotenv, load_dotenv
logger = logging.getLogger(__name__)

# Every archive of the dump lives under the same Zenodo record; only the
# entity name differs from one file to the next.
_ZENODO_FILES_URL = "https://zenodo.org/record/7490192/files"
_OPENAIRE_ENTITIES = (
    "communities_infrastructures",
    "dataset",
    "datasource",
    "organization",
    "otherresearchproduct",
    "project",
    "publication",
    "relation",
    "software",
)

# Full download URL of each tar archive, in the order they are fetched.
openaire_files = [
    _ZENODO_FILES_URL + "/" + entity + ".tar"
    for entity in _OPENAIRE_ENTITIES
]
def download_tar(url, path):
    """Download the tar archive at ``url`` and unpack it inside ``path``.

    The archive name is the last segment of the URL path, and the name of
    the extracted folder is taken to be the archive name up to its first
    dot (``publication.tar`` -> ``publication``). If that folder already
    exists in ``path`` nothing is done. If the archive file itself is
    already present in ``path`` the download is skipped and the existing
    file is unpacked. The archive is deleted after a successful extraction.

    Args:
        url: Location of the tar archive to fetch.
        path: Existing directory that receives the archive and its content.

    Raises:
        OSError: If the download fails or files cannot be written
            (``urllib.error.URLError`` is a subclass of ``OSError``).
        tarfile.TarError: If the archive cannot be read, or if a member is
            rejected by the extraction filter.
    """
    tar_name = urlsplit(url).path.split('/')[-1]  # publication.tar
    tar_path = os.path.join(path, tar_name)  # data/raw/publication.tar
    untarred_folder = tar_name.split('.')[0]  # publication
    untarred_path = os.path.join(path, untarred_folder)  # data/raw/publication

    if os.path.exists(untarred_path):
        # Already unpacked by a previous run.
        # NOTE(review): a folder left behind by an interrupted extraction
        # is also treated as complete here - confirm this is acceptable.
        return

    if not os.path.exists(tar_path):
        logger.info('downloading %s', url)
        # Download under a temporary name and rename only on success, so
        # that an interrupted transfer can never be mistaken for a complete
        # archive by the existence check above on the next run.
        partial_path = tar_path + '.part'
        try:
            urlretrieve(url, partial_path)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the truncated file.
            if os.path.exists(partial_path):
                os.remove(partial_path)
            raise
        os.replace(partial_path, tar_path)

    logger.info('untar %s', tar_name)
    with tarfile.open(tar_path, "r") as tar:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: refuse members that would
            # be written outside ``path`` (absolute names, "..", links).
            tar.extractall(path, filter='data')
        else:
            # Interpreter without extraction filters.
            tar.extractall(path)

    logger.info('cleaning')
    os.remove(tar_path)
@click.command()
@click.argument('output_filepath', type=click.Path(exists=True))
def main(output_filepath):
    """ Downloads data into /data/raw
    """
    # Fetch and unpack every listed archive, one after the other, into the
    # directory given on the command line.
    for archive_url in openaire_files:
        download_tar(archive_url, output_filepath)
if __name__ == '__main__':
    # Report INFO-level records with timestamp, logger name and level.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(format=log_fmt, level=logging.INFO)

    # Third ancestor directory of this file; nothing below reads it, it is
    # kept as a convenient anchor for locating other files.
    project_dir = Path(__file__).resolve().parents[2]

    # Search upwards through the directory tree for a .env file, then expose
    # its entries as environment variables.
    dotenv_path = find_dotenv()
    load_dotenv(dotenv_path)

    main()