63 lines
2.3 KiB
Python
63 lines
2.3 KiB
Python
# -*- coding: utf-8 -*-
|
|
import os
|
|
from urllib.request import urlretrieve
|
|
from urllib.parse import urlsplit
|
|
import tarfile
|
|
import click
|
|
import logging
|
|
from pathlib import Path
|
|
from dotenv import find_dotenv, load_dotenv
|
|
|
|
logger = logging.getLogger(__name__)

# Base URL of the files published under Zenodo record 7490192. Kept in one
# place so that pointing the script at another record is a single edit.
_ZENODO_FILES_URL = "https://zenodo.org/record/7490192/files"

# Names (without the ".tar" suffix) of the archives fetched from that record.
_ARCHIVE_NAMES = (
    "communities_infrastructures",
    "dataset",
    "datasource",
    "organization",
    "otherresearchproduct",
    "project",
    "publication",
    "relation",
    "software",
)

# Full download URL of every archive, in the order they are fetched.
openaire_files = [
    "{}/{}.tar".format(_ZENODO_FILES_URL, name) for name in _ARCHIVE_NAMES
]
|
|
|
|
|
|
def download_tar(url, path):
    """Download the tar archive at ``url`` into ``path`` and unpack it there.

    The archive name is the last segment of the URL path, and the part of
    that name before the first dot is taken as the name of the folder the
    archive unpacks to (this relies on the archive containing a top-level
    folder of that name). The call is resumable:

    * if ``<path>/<folder>`` already exists, nothing is done;
    * if ``<path>/<archive>`` already exists, the download is skipped and
      only the extraction and clean-up steps run.

    Args:
        url: Location of the ``.tar`` archive to fetch.
        path: Existing directory that receives the archive and its content.

    Raises:
        OSError: If the download or a filesystem operation fails
            (``urllib.error.URLError`` is a subclass of ``OSError``).
        tarfile.TarError: If the archive cannot be read or extracted.
    """
    tar_name = urlsplit(url).path.split('/')[-1]  # publication.tar
    tar_path = os.path.join(path, tar_name)  # data/raw/publication.tar
    untarred_folder = tar_name.split('.')[0]  # publication
    untarred_path = os.path.join(path, untarred_folder)  # data/raw/publication

    # Already unpacked by a previous run: nothing left to do.
    if os.path.exists(untarred_path):
        return

    if not os.path.exists(tar_path):
        logger.info('downloading %s', url)
        # Download under a temporary name and rename only once complete, so
        # an interrupted transfer never leaves a truncated file at tar_path
        # that a later run would mistake for a finished download.
        partial_path = tar_path + '.part'
        try:
            urlretrieve(url, partial_path)
            os.replace(partial_path, tar_path)
        finally:
            # After a successful rename the partial file no longer exists,
            # so this only cleans up after a failure or an interruption.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    logger.info('untar %s', tar_name)
    with tarfile.open(tar_path, "r") as tar:
        if hasattr(tarfile, 'data_filter'):
            # The archive comes from the network: refuse members that would
            # be written outside ``path`` (absolute names, "..", bad links).
            tar.extractall(path, filter='data')
        else:
            # Older Python without extraction filters: legacy behaviour.
            tar.extractall(path)

    logger.info('cleaning')
    os.remove(tar_path)
|
|
|
|
|
|
@click.command()
@click.argument('output_filepath',
                type=click.Path(exists=True, file_okay=False))
def main(output_filepath):
    """Download and unpack every archive into OUTPUT_FILEPATH.

    OUTPUT_FILEPATH must be an existing directory (for example data/raw).
    Each URL listed in ``openaire_files`` is handed to ``download_tar``
    in turn, with OUTPUT_FILEPATH as the destination.
    """
    # file_okay=False above: a regular file would pass an exists-only check
    # and then fail later, when an archive name is joined onto it.
    for url in openaire_files:
        download_tar(url, output_filepath)
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: configure logging, load the environment, run the CLI.
    log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    logging.basicConfig(level=logging.INFO, format=log_fmt)

    # find .env automagically by walking up directories until it's found, then
    # load up the .env entries as environment variables
    load_dotenv(find_dotenv())

    # NOTE: the former ``project_dir = Path(__file__).resolve().parents[2]``
    # was never read, and raised IndexError whenever the script sat fewer
    # than three levels below the filesystem root, so it is no longer
    # computed here.
    main()
|