lot1-kickoff/airflow/dags/untar_beginnerskit.py

64 lines
2.0 KiB
Python
Raw Normal View History

2024-03-06 17:29:24 +01:00
import os
import tarfile
from datetime import datetime, timedelta
2024-03-06 17:47:14 +01:00
from io import BytesIO
2024-03-06 17:29:24 +01:00
from airflow.decorators import task
from airflow.models.baseoperator import chain
from airflow.models.dag import DAG
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
# ---------------------------------------------------------------------------
# Module configuration.
#
# Every setting below is read from the process environment once, at import
# time, and falls back to the literal default when the variable is unset.
# ---------------------------------------------------------------------------

# S3 locations. Only S3_BUCKET_NAME is referenced by the code visible in this
# file; the other names are kept because other modules may import them.
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME", "zenodo-bucket")
S3_BUCKET_KEY = os.environ.get("S3_BUCKET_KEY", "test")
S3_BUCKET_KEY_LIST = os.environ.get("S3_BUCKET_KEY_LIST", "test2")
S3_BUCKET_WILDCARD_KEY = os.environ.get("S3_BUCKET_WILDCARD_KEY", "test*")
PREFIX = os.environ.get("S3_PREFIX", "test")

# Numeric setting, coerced to float whether it came from the environment
# (a string) or from the default.
INACTIVITY_PERIOD = float(os.environ.get("INACTIVITY_PERIOD", 5))

AWS_DEFAULT_REGION = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")
LOCAL_FILE_PATH = os.environ.get(
    "LOCAL_FILE_PATH",
    "/usr/local/airflow/dags/example_s3_test_file.txt",
)

# Airflow connection id handed to S3Hook; note the environment variable is
# named ASTRO_AWS_S3_CONN_ID, not AWS_CONN_ID.
AWS_CONN_ID = os.environ.get("ASTRO_AWS_S3_CONN_ID", "s3_conn")

# Used below as a number of hours for the per-task execution timeout.
EXECUTION_TIMEOUT = int(os.environ.get("EXECUTION_TIMEOUT", 6))

# Small CSV-like sample payload (name,price per line).
DATA = os.getenv(
    "DATA",
    """
apple,0.5
milk,2.5
bread,4.0
""",
)

# Defaults passed to the DAG as ``default_args``.
default_args = dict(
    execution_timeout=timedelta(hours=EXECUTION_TIMEOUT),
    retries=int(os.environ.get("DEFAULT_TASK_RETRIES", 2)),
    retry_delay=timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
)
@task
def untar_to_s3(key: str, bucket: str) -> None:
    """Unpack a tar archive stored in S3 and upload its regular files to S3.

    The archive is fetched through ``S3Hook`` and read as a forward-only
    stream; each regular-file member is uploaded under the member's path
    inside the archive as its object key.

    Args:
        key: Object key of the tar archive to read.
        bucket: Bucket that holds the tar archive.

    Note:
        Extracted members are uploaded to the module-level ``S3_BUCKET_NAME``,
        not to ``bucket``.
        NOTE(review): confirm that a fixed destination bucket is intended.
    """
    hook = S3Hook(AWS_CONN_ID, transfer_config_args={"use_threads": False})
    tarball_obj = hook.get_key(key, bucket_name=bucket)
    body = tarball_obj.get()["Body"]
    try:
        # "r|" treats the body as a non-seekable stream of uncompressed tar
        # blocks, so members must be consumed in order as they are iterated.
        with tarfile.open(name=None, mode="r|", fileobj=body) as tarball:
            for member in tarball:
                # Directories, links and device nodes carry no payload.
                if not member.isfile():
                    continue
                fd = tarball.extractfile(member)
                if fd is None:
                    # extractfile() may return None for members without data.
                    continue
                # The member is buffered fully in memory before upload.
                # NOTE(review): presumably because the stream-backed member
                # object cannot be passed to the uploader directly - confirm.
                payload = BytesIO(fd.read())
                # replace=True keeps the task idempotent: with the default
                # (replace=False) a retry fails on the first key that the
                # previous, partially completed attempt already uploaded.
                hook.load_file_obj(
                    payload,
                    key=member.path,
                    bucket_name=S3_BUCKET_NAME,
                    replace=True,
                )
    finally:
        # Release the underlying HTTP response even if extraction fails.
        body.close()
2024-03-06 17:29:24 +01:00
# Manually triggered DAG (no schedule, no catch-up) holding a single task
# that unpacks "organization.tar" from S3_BUCKET_NAME.
with DAG(
    dag_id="untar_zenodo_organization",
    default_args=default_args,
    start_date=datetime(2021, 1, 1),
    schedule=None,
    catchup=False,
    tags=["example", "async", "s3"],
) as dag:
    untar_task = untar_to_s3(key="organization.tar", bucket=S3_BUCKET_NAME)

    # Only one task is passed, so chain() has no pair of tasks to link.
    chain(untar_task)