2024-03-05 15:51:38 +01:00
|
|
|
import os
|
|
|
|
from datetime import datetime, timedelta
|
|
|
|
|
2024-03-06 15:15:22 +01:00
|
|
|
from airflow import settings
|
|
|
|
from airflow.decorators import task
|
|
|
|
from airflow.models.baseoperator import chain
|
|
|
|
from airflow.models.connection import Connection
|
2024-03-05 15:51:38 +01:00
|
|
|
from airflow.models.dag import DAG
|
|
|
|
from airflow.providers.amazon.aws.operators.s3 import (
|
|
|
|
S3CreateBucketOperator,
|
|
|
|
)
|
2024-03-06 15:15:22 +01:00
|
|
|
from airflow.providers.amazon.aws.transfers.http_to_s3 import HttpToS3Operator
|
2024-03-05 15:51:38 +01:00
|
|
|
|
2024-03-06 15:15:22 +01:00
|
|
|
# ---------------------------------------------------------------------------
# Runtime settings.
#
# Every value is read from the process environment once, at import time, and
# falls back to the literal default shown here when the variable is unset.
# NOTE(review): several of these names are not referenced by the code visible
# in this module; they are kept because other code may import them.
# ---------------------------------------------------------------------------
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "zenodo-bucket")
S3_BUCKET_KEY = os.getenv("S3_BUCKET_KEY", "test")
S3_BUCKET_KEY_LIST = os.getenv("S3_BUCKET_KEY_LIST", "test2")
S3_BUCKET_WILDCARD_KEY = os.getenv("S3_BUCKET_WILDCARD_KEY", "test*")
PREFIX = os.getenv("S3_PREFIX", "test")
# Parsed as float; the unit is not established anywhere in this module.
INACTIVITY_PERIOD = float(os.getenv("INACTIVITY_PERIOD", 5))
AWS_DEFAULT_REGION = os.getenv("AWS_DEFAULT_REGION", "us-east-1")
LOCAL_FILE_PATH = os.getenv("LOCAL_FILE_PATH", "/usr/local/airflow/dags/example_s3_test_file.txt")
# Note: the environment variable is ASTRO_AWS_S3_CONN_ID, not AWS_CONN_ID.
AWS_CONN_ID = os.getenv("ASTRO_AWS_S3_CONN_ID", "s3_conn")
# Interpreted as a number of hours (see default_args["execution_timeout"]).
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
# Small CSV-like sample payload; the surrounding newlines are part of the
# default value and are preserved deliberately.
DATA = os.environ.get(
    "DATA",
    """
apple,0.5
milk,2.5
bread,4.0
""",
)

# Arguments passed to the DAG as ``default_args`` further down in this module.
default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 2)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
|
|
|
|
|
2024-03-06 15:15:22 +01:00
|
|
|
@task
def create_connection(conn_id_name: str) -> None:
    """Register the Zenodo HTTP connection in the Airflow metadata database.

    The task is safe to re-run: if a connection with ``conn_id_name`` is
    already stored, nothing is inserted. Without this check every DAG run
    would try to add another row with the same ``conn_id``.

    Args:
        conn_id_name: The ``conn_id`` under which the connection is stored.

    Raises:
        Exception: Any error raised while querying or committing is re-raised
            after the session has been rolled back.
    """
    session = settings.Session()
    try:
        already_registered = (
            session.query(Connection)
            .filter(Connection.conn_id == conn_id_name)
            .first()
        )
        if already_registered is not None:
            return

        # NOTE(review): conn_type "https" combined with port 80 looks
        # inconsistent; the values are kept unchanged here because altering
        # them would change the URL the HTTP hook builds - confirm intent.
        conn = Connection(
            conn_id=conn_id_name,
            conn_type="https",
            host="zenodo.org",
            port=80,
        )
        session.add(conn)
        session.commit()
    except Exception:
        # Leave the session clean for whoever handles the failure.
        session.rollback()
        raise
    finally:
        # The original never released the session.
        session.close()
|
2024-03-05 15:51:38 +01:00
|
|
|
|
|
|
|
# DAG definition: register the Zenodo HTTP connection, create the target S3
# bucket, then copy one file from Zenodo into that bucket.
with DAG(
    dag_id="zenodo_download_to_s3",
    schedule=None,  # no schedule is configured for this DAG
    start_date=datetime(2021, 1, 1),
    catchup=False,
    default_args=default_args,
    tags=["example", "async", "s3"],
) as dag:
    # conn_id shared by the connection-creating task and the HTTP transfer.
    conn_id_name = "zenodo"

    set_up_connection = create_connection(conn_id_name)

    create_bucket = S3CreateBucketOperator(
        task_id="create_bucket",
        region_name=AWS_DEFAULT_REGION,
        bucket_name=S3_BUCKET_NAME,
        aws_conn_id=AWS_CONN_ID,
    )

    http_to_s3_task = HttpToS3Operator(
        task_id="http_to_s3_task",
        http_conn_id=conn_id_name,
        endpoint="/records/8223812/files/organization.tar",
        s3_bucket=S3_BUCKET_NAME,
        s3_key="organization.tar",
        replace=True,
        aws_conn_id=AWS_CONN_ID,
    )

    # Strictly linear ordering: connection -> bucket -> transfer.
    chain(set_up_connection, create_bucket, http_to_s3_task)
|