Add untar task
parent df6e23666e
commit 412e008df7

@@ -0,0 +1,97 @@
import os
import tarfile
import time
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.models.param import Param
from airflow.operators.python import get_current_context
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from botocore.exceptions import ClientError

import dag_utils

EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}

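# Retry wrapper around S3Hook.load_file_obj(): transient ClientErrors are retried with a
# linearly increasing delay, while NoSuchBucket is reported and re-raised immediately.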
def load_file_obj_with_backoff(hook: S3Hook, fileobj, key: str, bucket: str, replace: bool) -> bool:
    delay = 10       # initial delay
    delay_incr = 10  # additional delay in each loop
    max_delay = 60   # max delay of one loop; total sleep is roughly max_delay**2 / (2 * delay_incr)

    while delay < max_delay:
        try:
            return hook.load_file_obj(fileobj,
                                      key,
                                      bucket,
                                      replace=replace)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in ['NoSuchBucket']:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise err
            time.sleep(delay)
            delay += delay_incr
    # Retries exhausted: make one final attempt outside the backoff loop so a persistent
    # error propagates instead of being silently dropped.
    return hook.load_file_obj(fileobj, key, bucket, replace=replace)


@dag(
    dag_id="s3_untar",
    dag_display_name="S3 streaming untar",
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"),
        "src_bucket": Param(dag_utils.get_default_bucket(), type='string',
                            description="Override S3 default bucket for source"),
        "src_key": Param("", type='string', description="File to untar"),
        "dst_bucket": Param(dag_utils.get_default_bucket(), type='string',
                            description="Override S3 default bucket for destination"),
        "dst_key_prefix": Param("", type='string', description="Key prefix for unarchived files"),
    },
    tags=["s3", "tools"],
)
def s3_untar():
    @task
    def untar():
        context = get_current_context()
        hook = S3Hook(context["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
        src_bucket = context['params']['src_bucket']
        dst_bucket = context['params']['dst_bucket']
        dst_key_prefix = os.path.normpath(context["params"]["dst_key_prefix"])

        # Keys already present under the destination prefix; used to skip re-uploads.
        existing_keys = dict.fromkeys(hook.list_keys(bucket_name=dst_bucket,
                                                     prefix=dst_key_prefix + "/"), 0)
        s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=src_bucket)

        # 'r|*' opens the archive as a non-seekable stream, so members are extracted
        # directly from the S3 response body without downloading the whole file first.
        with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
            for member in tar:
                dst_key = os.path.normpath(dst_key_prefix + "/" + member.name)
                # Ignore directories, links, devices, fifos, etc.
                if (not member.isfile()) or member.name.endswith('/'):
                    print(f"Skipping {member.name}: is not a file")
                    continue
                if dst_key in existing_keys:
                    print(f"Skipping {member.name}: already exists")
                    continue
                print(f"Extracting {member.name} to {dst_key}")
                fileobj = tar.extractfile(member)
                # The extracted member stream cannot be rewound; mark it non-seekable so
                # the S3 upload treats it as a plain stream.
                fileobj.seekable = lambda: False
                load_file_obj_with_backoff(hook, fileobj,
                                           dst_key,
                                           dst_bucket,
                                           replace=True)

    untar()


s3_untar()
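# Example trigger (assuming the Airflow CLI and an existing "s3_conn" connection;
# the key values below are placeholders):
#
#   airflow dags trigger s3_untar \
#       --conf '{"src_key": "path/to/archive.tar", "dst_key_prefix": "untarred"}'
#
# src_bucket and dst_bucket fall back to dag_utils.get_default_bucket() unless
# overridden in the same --conf payload.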