import os
import tarfile
from datetime import datetime, timedelta
from io import BytesIO

from airflow.decorators import task
from airflow.models.baseoperator import chain
from airflow.models.dag import DAG
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "zenodo-bucket")
S3_BUCKET_KEY = os.getenv("S3_BUCKET_KEY", "test")
S3_BUCKET_KEY_LIST = os.getenv("S3_BUCKET_KEY_LIST", "test2")
S3_BUCKET_WILDCARD_KEY = os.getenv("S3_BUCKET_WILDCARD_KEY", "test*")
PREFIX = os.getenv("S3_PREFIX", "test")
INACTIVITY_PERIOD = float(os.getenv("INACTIVITY_PERIOD", 5))
AWS_DEFAULT_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
LOCAL_FILE_PATH = os.getenv("LOCAL_FILE_PATH", "/usr/local/airflow/dags/example_s3_test_file.txt")
AWS_CONN_ID = os.getenv("ASTRO_AWS_S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
DATA = os.environ.get(
    "DATA",
    """
apple,0.5
milk,2.5
bread,4.0
""",
)

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 2)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@task
def untar_to_s3(key: str, bucket: str):
    """Stream a tarball from S3 and upload each member file back to the bucket as its own object."""
    hook = S3Hook(AWS_CONN_ID, transfer_config_args={"use_threads": False})
    tarball_obj = hook.get_key(key, bucket_name=bucket)
    # Open the S3 object's body in streaming mode ("r|") so the archive is read
    # sequentially without requiring a seekable file or loading it all into memory.
    with tarfile.open(name=None, mode="r|", fileobj=tarball_obj.get()["Body"]) as tarball:
        for member in tarball:
            if not member.isfile():
                continue
            fd = tarball.extractfile(member)
            # Upload the extracted member under its path within the archive.
            hook.load_file_obj(BytesIO(fd.read()), member.path, S3_BUCKET_NAME)


with DAG(
    dag_id="untar_zenodo_organization",
    schedule=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    default_args=default_args,
    tags=["example", "async", "s3"],
) as dag:
    untar_task = untar_to_s3("organization.tar", S3_BUCKET_NAME)

    chain(untar_task)