lot1-kickoff/airflow/dags/S3_unzip.py

55 lines
1.9 KiB
Python

import os
import zipfile
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.file import TemporaryDirectory
# Id of the Airflow connection used for S3 access; the S3_CONN_ID environment
# variable overrides the "s3_conn" fallback.
S3_CONN_ID = os.environ.get("S3_CONN_ID", "s3_conn")

# Task execution timeout in hours (converted to a timedelta below).
EXECUTION_TIMEOUT = int(os.environ.get("EXECUTION_TIMEOUT", 6))

# Passed as ``default_args`` to the DAG declared in this file. Every value can
# be tuned through the environment without touching the code.
default_args = dict(
    execution_timeout=timedelta(hours=EXECUTION_TIMEOUT),
    retries=int(os.environ.get("DEFAULT_TASK_RETRIES", 1)),
    retry_delay=timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
)
def s3_dowload_unzip_upload(s3conn: str, src_key: str, src_bucket: str, dest_bucket: str) -> None:
    """Download a zip archive from S3 and upload its members to another bucket.

    The archive is downloaded into a temporary directory, then every file
    member is streamed straight from the archive into ``dest_bucket`` under a
    key equal to the member's path inside the archive. Directory entries are
    skipped. The temporary directory is removed when the function returns.

    Args:
        s3conn: Airflow connection id handed to ``S3Hook``.
        src_key: Key of the zip archive in ``src_bucket``.
        src_bucket: Bucket holding the archive.
        dest_bucket: Bucket receiving the extracted files; objects that
            already exist under the same key are overwritten.

    Raises:
        zipfile.BadZipFile: If the downloaded object is not a valid zip file.
    """
    # NOTE(review): threaded transfers are disabled, presumably because the
    # zip member streams uploaded below cannot safely be read from several
    # threads at once -- confirm before changing.
    hook = S3Hook(s3conn, transfer_config_args={'use_threads': False})
    with TemporaryDirectory() as dwl_dir:
        # Use the path reported by the hook instead of rebuilding it from the
        # key: with preserve_file_name=True the file is stored under the last
        # component of the key, so "<dwl_dir>/<src_key>" does not exist as
        # soon as the key contains a "/" prefix.
        archive = hook.download_file(
            key=src_key,
            bucket_name=src_bucket,
            local_path=dwl_dir,
            preserve_file_name=True,
            use_autogenerated_subdir=False,
        )
        with zipfile.ZipFile(archive, 'r') as zip_ref:
            for info in zip_ref.infolist():
                # Directory entries carry no data; uploading them would only
                # create empty "folder marker" objects in the destination.
                if info.is_dir():
                    continue
                with zip_ref.open(info) as member:
                    hook.load_file_obj(member, info.filename, dest_bucket, replace=True)
# DAG with no schedule (schedule=None) made of a single task that unzips an
# archive stored in S3 into another bucket.
# The values in ``params`` below are descriptive placeholder defaults; real
# values are expected to be supplied when the run is triggered.
@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "zipfile": "File to unzip",
        "src_bucket": "bucket containing the zip file",
        "dst_bucket": "bucket that will contain unzipped files"
    },
    tags=["s3"],
)
def s3_unzip():
    @task
    def unzip(**ctx):
        # Read the run parameters from the task context and forward them to
        # the download / unzip / upload helper.
        params = ctx["params"]
        s3_dowload_unzip_upload(
            s3conn=S3_CONN_ID,
            src_key=params["zipfile"],
            src_bucket=params["src_bucket"],
            dest_bucket=params["dst_bucket"],
        )

    unzip()


s3_unzip()