2024-06-10 00:46:04 +02:00
|
|
|
import os
import posixpath
import tarfile
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
|
|
|
|
|
|
|
|
# Airflow connection id handed to S3Hook; overridable through the environment.
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")

# Per-task execution timeout, expressed in hours (see default_args below).
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", "6"))

# Default task arguments passed to the DAG; every value can be tuned through
# an environment variable without editing this file.
default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", "1")),
    "retry_delay": timedelta(
        seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", "60"))
    ),
}
|
|
|
|
|
|
|
|
|
|
|
|
@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "src_key": "File to untar",
        "src_bucket": "bucket containing the zip file",
        "dst_key_prefix": "",
        "dst_bucket": "bucket that will contain unzipped files"
    },
    tags=["s3"],
)
def s3_untar():
    """Manually triggered DAG that unpacks a tar archive from S3 back into S3.

    Run parameters (read from ``context["params"]``):

    - ``src_key`` / ``src_bucket``: location of the tar archive to read.
    - ``dst_key_prefix``: optional key prefix for the extracted objects.
    - ``dst_bucket``: bucket that receives the extracted objects.
    """

    @task
    def untar(**context):
        """Stream the source archive and upload each regular file it contains.

        The archive is read as a forward-only stream and every regular-file
        member is uploaded to ``dst_bucket``; directories, links, devices,
        fifos and similar members are skipped. Existing destination keys are
        overwritten.
        """
        params = context["params"]
        dst_prefix = params["dst_key_prefix"]
        dst_bucket = params["dst_bucket"]

        # NOTE(review): threads are presumably disabled because the upload
        # source is a sequential, non-seekable stream -- confirm.
        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
        s3_obj = hook.get_key(params["src_key"], bucket_name=params["src_bucket"])

        # 'r|*' reads the archive as a stream with transparent compression,
        # so members must be consumed in order, one at a time.
        with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
            for member in tar:
                # Ignore directories, links, devices, fifos, etc.
                if (not member.isfile()) or member.name.endswith('/'):
                    print(f"Skipping {member.name}")
                    continue

                # Build the destination key with POSIX rules: S3 keys always
                # use "/" regardless of the worker's platform. The separator
                # is only inserted when a prefix is set, so the default empty
                # prefix no longer yields keys that start with "/".
                member_name = member.name.lstrip("/")
                raw_key = f"{dst_prefix}/{member_name}" if dst_prefix else member_name
                dst_key = posixpath.normpath(raw_key)
                # NOTE(review): ".." segments in member names are collapsed by
                # normpath and can resolve outside dst_key_prefix -- confirm
                # whether such members should be rejected.

                print(f"Extracting {member.name} to {dst_key}")
                fileobj = tar.extractfile(member)
                # Report the member stream as non-seekable.
                # NOTE(review): presumably this keeps the uploader from trying
                # to seek/rewind a forward-only stream -- confirm.
                fileobj.seekable = lambda: False
                hook.load_file_obj(fileobj, dst_key, dst_bucket, replace=True)

    untar()
|
|
|
|
|
|
|
|
|
|
|
|
# Invoke the @dag-decorated function at module import time; with Airflow's
# @dag decorator the DAG object is only created when the function is called.
s3_untar()
|