# lot1-kickoff/airflow/dags/S3_untar.py
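
"""Stream-untar an archive stored in S3 into another S3 bucket.

The ``s3_untar`` DAG opens ``s3://{src_bucket}/{src_key}`` as a tar stream
(no local temp file), skips non-file members and keys that already exist,
and uploads every extracted file to ``dst_bucket`` under ``dst_key_prefix``.
"""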


import os
import tarfile
import time
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from botocore.exceptions import ClientError

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))  # hours
default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
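

# The two helpers below retry transient S3 ClientErrors with a linearly growing
# sleep (10 s, 20 s, ... up to 50 s), but fail fast on NoSuchBucket, since
# retrying cannot fix a wrong path. If every retry fails they fall through and
# implicitly return None.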
def check_for_key_with_backoff(hook: S3Hook, key: str, bucket: str) -> bool:
    delay = 10       # initial delay
    delay_incr = 10  # additional delay in each loop
    max_delay = 60   # upper bound for one sleep; worst-case total sleep is 10+20+30+40+50 = 150 s
    while delay < max_delay:
        try:
            return hook.check_for_key(key=key, bucket_name=bucket)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in ['NoSuchBucket']:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise err
            time.sleep(delay)
            delay += delay_incr


def load_file_obj_with_backoff(hook: S3Hook, fileobj, key: str, bucket: str, replace: bool) -> bool:
    delay = 10       # initial delay
    delay_incr = 10  # additional delay in each loop
    max_delay = 60   # upper bound for one sleep; worst-case total sleep is 10+20+30+40+50 = 150 s
    while delay < max_delay:
        try:
            return hook.load_file_obj(fileobj,
                                      key,
                                      bucket,
                                      replace=replace)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in ['NoSuchBucket']:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise err
            time.sleep(delay)
            delay += delay_incr


@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "src_key": "File to untar",
        "src_bucket": "bucket containing the tar archive",
        "dst_key_prefix": "",
        "dst_bucket": "bucket that will contain the extracted files",
    },
    tags=["s3"],
)
def s3_untar():
    @task
    def untar(**context):
        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
        s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=context["params"]["src_bucket"])
        # 'r|*' reads the archive as a non-seekable stream with any supported compression,
        # so the tarball is never downloaded to local disk as a whole.
        with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
            for member in tar:
                dst_key = context["params"]["dst_key_prefix"] + "/" + member.name
                dst_key = os.path.normpath(dst_key)
                # Ignore directories, links, devices, fifos, etc.
                if (not member.isfile()) or member.name.endswith('/'):
                    print(f"Skipping {member.name}: is not a file")
                    continue
                if check_for_key_with_backoff(hook, key=dst_key, bucket=context["params"]["dst_bucket"]):
                    print(f"Skipping {member.name}: already exists")
                    continue
                print(f"Extracting {member.name} to {dst_key}")
                fileobj = tar.extractfile(member)
                # Mark the extracted stream as non-seekable so the upload does not try to rewind it.
                fileobj.seekable = lambda: False
                load_file_obj_with_backoff(hook, fileobj,
                                           dst_key,
                                           context["params"]["dst_bucket"],
                                           replace=True)

    untar()


s3_untar()
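

# A minimal trigger sketch (assumptions: an Airflow 2.x deployment with the CLI
# available, and dag_run conf allowed to override params, which is the default;
# the bucket and key names are illustrative only):
#
#   airflow dags trigger s3_untar --conf '{
#       "src_key": "dumps/archive.tar.gz",
#       "src_bucket": "my-source-bucket",
#       "dst_key_prefix": "extracted",
#       "dst_bucket": "my-destination-bucket"
#   }'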