2024-03-06 17:29:24 +01:00
|
|
|
import os
|
|
|
|
import tarfile
|
|
|
|
from datetime import datetime, timedelta
|
2024-03-06 17:47:14 +01:00
|
|
|
from io import BytesIO
|
2024-03-06 17:29:24 +01:00
|
|
|
|
|
|
|
from airflow.decorators import task
|
|
|
|
from airflow.models.baseoperator import chain
|
|
|
|
from airflow.models.dag import DAG
|
|
|
|
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
|
|
|
|
|
|
|
|
# --- Configuration (all values overridable via environment variables) ---

# Target S3 bucket for both the source tarball and the extracted members.
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "zenodo-bucket")
S3_BUCKET_KEY = os.getenv("S3_BUCKET_KEY", "test")
S3_BUCKET_KEY_LIST = os.getenv("S3_BUCKET_KEY_LIST", "test2")
S3_BUCKET_WILDCARD_KEY = os.getenv("S3_BUCKET_WILDCARD_KEY", "test*")
PREFIX = os.getenv("S3_PREFIX", "test")
# NOTE: os.getenv defaults must be strings (the original passed bare ints,
# relying on float()/int() to coerce them); the numeric conversion is applied
# uniformly to whichever value comes back.
INACTIVITY_PERIOD = float(os.getenv("INACTIVITY_PERIOD", "5"))
AWS_DEFAULT_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
LOCAL_FILE_PATH = os.getenv("LOCAL_FILE_PATH", "/usr/local/airflow/dags/example_s3_test_file.txt")
# Airflow connection id used by S3Hook for AWS credentials.
AWS_CONN_ID = os.getenv("ASTRO_AWS_S3_CONN_ID", "s3_conn")
# Per-task execution timeout, in hours.
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", "6"))

# Sample CSV payload — not consumed by the untar task in this file;
# presumably kept for parity with sibling example DAGs (TODO confirm).
DATA = os.environ.get(
    "DATA",
    """
apple,0.5
milk,2.5
bread,4.0
""",
)

# Defaults applied to every task in the DAG below.
default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", "2")),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", "60"))),
}
|
|
|
|
|
|
|
|
@task
def untar_to_s3(key: str, bucket: str) -> None:
    """Stream a tarball stored in S3 and upload each regular file member back to S3.

    The archive is read as a non-seekable stream (``mode="r|"``), so it is never
    fully downloaded to local disk.

    :param key: S3 key of the source tarball.
    :param bucket: bucket holding the tarball; extracted members are uploaded
        to this same bucket, keyed by their path inside the archive.
    """
    # use_threads=False disables boto3's threaded transfer manager
    # (NOTE(review): presumably to avoid fork/thread issues inside Airflow
    # task execution — confirm against the deployment's executor).
    hook = S3Hook(AWS_CONN_ID, transfer_config_args={'use_threads': False})

    tarball_obj = hook.get_key(key, bucket_name=bucket)

    with tarfile.open(name=None, mode="r|", fileobj=tarball_obj.get()['Body']) as tarball:
        for member in tarball:
            # Skip directories, symlinks and other special entries.
            if not member.isfile():
                continue
            fd = tarball.extractfile(member)
            # Buffer the member into memory: extractfile() on a streamed ("r|")
            # archive yields a non-seekable reader, while load_file_obj needs a
            # seekable file object.
            # Fixes vs. original: upload into the caller-supplied `bucket`
            # (the parameter was silently ignored in favor of the module-level
            # S3_BUCKET_NAME), and pass replace=True so the task's configured
            # retries do not fail on keys written by a previous attempt.
            hook.load_file_obj(BytesIO(fd.read()), member.path, bucket, replace=True)
|
|
|
|
|
# DAG definition: a single-task, manually-triggered pipeline that unpacks
# "organization.tar" (already present in S3) back into the same bucket.
with DAG(
    dag_id="untar_zenodo_organization",
    schedule=None,
    start_date=datetime(2021, 1, 1),
    catchup=False,
    default_args=default_args,
    tags=["example", "async", "s3"],
) as dag:
    # Instantiate the TaskFlow task; the `with DAG` context registers it.
    extract_archive = untar_to_s3("organization.tar", S3_BUCKET_NAME)

    # With a single task, chain() only records it — no dependencies to wire.
    chain(extract_archive)
|
|
|
|
|
|
|
|
|
|
|
|
|