lot1-kickoff/airflow/dags/S3_untar.py

99 lines
3.7 KiB
Python
Raw Normal View History

2024-06-10 00:46:04 +02:00
import os
import posixpath
import tarfile
import time
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from botocore.exceptions import ClientError
2024-06-10 00:46:04 +02:00
# Airflow connection id handed to S3Hook; overridable through the environment.
S3_CONN_ID = os.environ.get("S3_CONN_ID", "s3_conn")

# Per-task execution timeout, expressed in hours.
EXECUTION_TIMEOUT = int(os.environ.get("EXECUTION_TIMEOUT", 6))

# Task defaults passed to the DAG; retry count and retry delay (in seconds)
# can be tuned through environment variables as well.
default_args = dict(
    execution_timeout=timedelta(hours=EXECUTION_TIMEOUT),
    retries=int(os.environ.get("DEFAULT_TASK_RETRIES", 1)),
    retry_delay=timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
)
2024-07-03 01:18:55 +02:00
def check_for_key_with_backoff(hook: S3Hook, key: str, bucket: str) -> bool:
    """Check whether ``key`` exists in ``bucket``, retrying on S3 client errors.

    The check is attempted up to five times. After a failed attempt the
    function waits 10, 20, 30 and then 40 seconds (100 seconds in total)
    before trying again.

    Args:
        hook: S3 hook used to perform the lookup.
        key: Object key to look for.
        bucket: Name of the bucket that should contain ``key``.

    Returns:
        The result of ``hook.check_for_key`` for the first attempt that does
        not raise a ``ClientError``.

    Raises:
        ClientError: Immediately when S3 reports ``NoSuchBucket`` (retrying
            cannot help), or when the last attempt still fails. Raising here
            prevents an exhausted retry loop from being mistaken for
            "the key does not exist".
    """
    initial_delay = 10  # seconds to wait after the first failed attempt
    delay_incr = 10  # each further wait is this many seconds longer
    max_delay = 60  # exclusive upper bound for a single wait
    fatal_codes = ("NoSuchBucket",)  # errors that a retry cannot fix

    waits = range(initial_delay, max_delay, delay_incr)  # one entry per attempt
    for wait in waits:
        try:
            return hook.check_for_key(key=key, bucket_name=bucket)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in fatal_codes:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise
            if wait == waits[-1]:
                # Attempts exhausted: surface the failure instead of
                # silently returning None (which callers read as "absent").
                raise
            print(f"Error: {code}. Retrying check of s3://{bucket}/{key} in {wait}s")
        time.sleep(wait)
def _rewind_for_retry(fileobj, start) -> bool:
    """Return True when ``fileobj`` is (back) at offset ``start``.

    A failed upload may already have consumed part of the stream. Retrying
    from the current position would then upload a truncated object, so a
    retry is only allowed when nothing was read or the stream can be rewound.
    """
    if start is None:
        # Starting offset unknown: integrity of a retry cannot be verified.
        return False
    try:
        if fileobj.tell() == start:
            return True
        if fileobj.seekable():
            fileobj.seek(start)
            return True
    except (AttributeError, OSError, ValueError):
        # Object without tell/seek support, or already closed: not rewindable.
        return False
    return False


def load_file_obj_with_backoff(hook: S3Hook, fileobj, key: str, bucket: str, replace: bool) -> None:
    """Upload ``fileobj`` to ``s3://bucket/key``, retrying on S3 client errors.

    The upload is attempted up to five times. After a failed attempt the
    function waits 10, 20, 30 and then 40 seconds (100 seconds in total)
    before trying again.

    Args:
        hook: S3 hook used to perform the upload.
        fileobj: Binary file-like object whose content is uploaded.
        key: Destination object key.
        bucket: Destination bucket name.
        replace: Forwarded to ``hook.load_file_obj``; whether an existing
            object may be overwritten.

    Returns:
        Whatever ``hook.load_file_obj`` returns (``None`` according to the
        Airflow documentation).

    Raises:
        ClientError: Immediately when S3 reports ``NoSuchBucket``; when the
            failed attempt consumed data from a stream that cannot be rewound
            (a retry would upload truncated content); or when the last
            attempt still fails. Raising prevents a failed upload from being
            reported as success.
    """
    initial_delay = 10  # seconds to wait after the first failed attempt
    delay_incr = 10  # each further wait is this many seconds longer
    max_delay = 60  # exclusive upper bound for a single wait
    fatal_codes = ("NoSuchBucket",)  # errors that a retry cannot fix

    # Remember where the payload starts so a retry can verify/restore it.
    try:
        start = fileobj.tell()
    except (AttributeError, OSError, ValueError):
        start = None

    waits = range(initial_delay, max_delay, delay_incr)  # one entry per attempt
    for wait in waits:
        try:
            return hook.load_file_obj(fileobj,
                                      key,
                                      bucket,
                                      replace=replace)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in fatal_codes:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise
            if wait == waits[-1]:
                # Attempts exhausted: surface the failure to the caller.
                raise
            if not _rewind_for_retry(fileobj, start):
                print(f"Error: {code}. Cannot retry s3://{bucket}/{key}: stream already consumed")
                raise
            print(f"Error: {code}. Retrying upload of s3://{bucket}/{key} in {wait}s")
        time.sleep(wait)
2024-06-10 00:46:04 +02:00
def _build_dst_key(prefix: str, member_name: str):
    """Build the destination S3 key for a tar member.

    S3 keys always use "/" as separator, so POSIX path rules are applied
    regardless of the operating system the worker runs on.

    Args:
        prefix: Key prefix taken from the DAG params; may be empty.
        member_name: Name of the member as stored in the archive.

    Returns:
        The normalised key, or ``None`` when the member name is empty or
        would resolve outside of ``prefix`` (it contains leading ".."
        components after normalisation).
    """
    # Drop leading slashes so absolute member names stay below the prefix.
    relative = posixpath.normpath(member_name.lstrip("/"))
    if relative in (".", "..") or relative.startswith("../"):
        return None
    # posixpath.join does not insert a separator for an empty prefix, so the
    # resulting key no longer starts with "/" in that case.
    return posixpath.normpath(posixpath.join(prefix, relative))


@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "src_key": "File to untar",
        "src_bucket": "bucket containing the zip file",
        "dst_key_prefix": "",
        "dst_bucket": "bucket that will contain unzipped files"
    },
    tags=["s3"],
)
def s3_untar():
    """Extract a tar archive stored in S3 into individual S3 objects."""

    @task
    def untar(**context):
        """Stream the source archive and upload each regular file member.

        Members that are not regular files, whose name would escape the
        destination prefix, or whose destination key already exists are
        skipped.
        """
        params = context["params"]
        dst_bucket = params["dst_bucket"]
        dst_prefix = params["dst_key_prefix"]

        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
        s3_obj = hook.get_key(params["src_key"], bucket_name=params["src_bucket"])

        # 'r|*' opens the archive as a forward-only stream with transparent
        # compression detection, so the archive body is read sequentially.
        with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
            for member in tar:
                # Ignore directories, links, devices, fifos, etc.
                if (not member.isfile()) or member.name.endswith('/'):
                    print(f"Skipping {member.name}: is not a file")
                    continue

                dst_key = _build_dst_key(dst_prefix, member.name)
                if dst_key is None:
                    print(f"Skipping {member.name}: unsafe path")
                    continue

                if check_for_key_with_backoff(hook, key=dst_key, bucket=dst_bucket):
                    print(f"Skipping {member.name}: already exists")
                    continue

                print(f"Extracting {member.name} to {dst_key}")
                with tar.extractfile(member) as fileobj:
                    # NOTE(review): presumably forces the uploader to treat the
                    # member as a non-seekable stream, since the archive is
                    # opened in stream mode - confirm.
                    fileobj.seekable = lambda: False
                    load_file_obj_with_backoff(hook, fileobj,
                                               dst_key,
                                               dst_bucket,
                                               replace=True)

    untar()


s3_untar()