lot1-kickoff/airflow/dags/untar_beginnerskit.py

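"""Stream a tar archive from S3 and re-upload each member file as its own S3 object."""
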
import os
import tarfile
from datetime import datetime, timedelta
from io import BytesIO
from airflow.decorators import task
from airflow.models.baseoperator import chain
from airflow.models.dag import DAG
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

# Configuration via environment variables, with local-development defaults.
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "zenodo-bucket")
S3_BUCKET_KEY = os.getenv("S3_BUCKET_KEY", "test")
S3_BUCKET_KEY_LIST = os.getenv("S3_BUCKET_KEY_LIST", "test2")
S3_BUCKET_WILDCARD_KEY = os.getenv("S3_BUCKET_WILDCARD_KEY", "test*")
PREFIX = os.getenv("S3_PREFIX", "test")
INACTIVITY_PERIOD = float(os.getenv("INACTIVITY_PERIOD", 5))
AWS_DEFAULT_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
LOCAL_FILE_PATH = os.getenv("LOCAL_FILE_PATH", "/usr/local/airflow/dags/example_s3_test_file.txt")
AWS_CONN_ID = os.getenv("ASTRO_AWS_S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
DATA = os.environ.get(
    "DATA",
    """
apple,0.5
milk,2.5
bread,4.0
""",
)

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 2)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}

@task
def untar_to_s3(key: str, bucket: str):
    """Stream the tarball at ``key`` and upload each member file back to ``bucket``."""
    hook = S3Hook(AWS_CONN_ID, transfer_config_args={"use_threads": False})
    tarball_obj = hook.get_key(key, bucket_name=bucket)
    # Mode "r|" reads the archive as a forward-only stream, so the S3 body never
    # needs to be seekable and the tarball is never fully buffered in memory.
    with tarfile.open(name=None, mode="r|", fileobj=tarball_obj.get()["Body"]) as tarball:
        for member in tarball:
            if not member.isfile():
                continue
            fd = tarball.extractfile(member)
            # Upload each extracted file under its path inside the archive.
            hook.load_file_obj(BytesIO(fd.read()), member.path, bucket)

with DAG(
    dag_id="untar_zenodo_organization",
    schedule=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    default_args=default_args,
    tags=["example", "async", "s3"],
) as dag:
    untar_task = untar_to_s3("organization.tar", S3_BUCKET_NAME)
    # chain() with a single task sets no dependencies; kept as a hook for
    # wiring in downstream tasks later.
    chain(untar_task)
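
# One way to exercise this DAG locally (assumes the "s3_conn" Airflow connection
# is configured and an "organization.tar" object already exists in the bucket):
#
#     airflow dags test untar_zenodo_organization 2021-01-01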