2024-06-07 18:10:48 +02:00
|
|
|
import os
|
|
|
|
from datetime import timedelta
|
|
|
|
|
|
|
|
import pendulum
|
2024-06-07 18:22:18 +02:00
|
|
|
from airflow import DAG
|
2024-06-07 18:10:48 +02:00
|
|
|
from airflow.decorators import dag
|
|
|
|
from airflow.providers.amazon.aws.transfers.http_to_s3 import HttpToS3Operator
|
|
|
|
|
|
|
|
# Runtime configuration. Every value can be overridden through an environment
# variable of the same name; the second argument is the fallback.
S3_CONN_ID = os.environ.get("S3_CONN_ID", "s3_conn")

# Expressed in hours (it is fed to ``timedelta(hours=...)`` below).
EXECUTION_TIMEOUT = int(os.environ.get("EXECUTION_TIMEOUT", 6))

# Task-level defaults handed to the DAG through its ``default_args`` argument.
default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.environ.get("DEFAULT_TASK_RETRIES", 1)),
    # The environment value is interpreted as a number of seconds.
    "retry_delay": timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
}
|
|
|
|
|
2024-06-07 18:22:18 +02:00
|
|
|
# Manually triggered DAG (``schedule=None``, no catch-up) with a single task
# that fetches one file over HTTP and stores it in an S3 bucket.
dag = DAG(
    dag_id="openaire_to_s3",
    default_args=default_args,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    tags=["s3"],
    # NOTE(review): the values below are descriptive placeholders, not usable
    # defaults -- presumably real values are supplied when the run is
    # triggered; confirm with the DAG's operators.
    params={
        "file": "File to download",
        "dst_bucket": "bucket that will contain file",
    },
)

with dag:
    HttpToS3Operator(
        task_id="http_to_s3_task",
        # Source side: HTTP connection plus a path built from the "file" param.
        http_conn_id="openaire_default",
        endpoint="/data/graph/{{ params.file }}",
        # Destination side: S3 connection, bucket from the "dst_bucket" param.
        aws_conn_id=S3_CONN_ID,
        s3_bucket="{{ params.dst_bucket }}",
        # NOTE(review): this key starts with "/", so the stored object key
        # itself begins with a slash -- confirm that is intended.
        s3_key="/data/graph/{{ params.file }}",
        replace=True,
    )
|