import os
from datetime import timedelta

import pendulum
import requests

from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

# Airflow connection ID for the destination S3; overridable via environment.
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")

# Per-task execution timeout, in hours.
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

# Defaults applied to every task of DAGs that pass this dict as default_args.
default_args = dict(
    execution_timeout=timedelta(hours=EXECUTION_TIMEOUT),
    retries=int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    retry_delay=timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
)
@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "file": "File to download",
        "dst_bucket": "bucket that will contain file"
    },
    tags=["s3"],
)
def openaire_to_s3():
    """Stream a file from the OpenAIRE graph HTTP endpoint into an S3 bucket.

    Trigger-only DAG (``schedule=None``). Run-time params:

    * ``file`` -- file name under ``/data/graph/`` on the OpenAIRE host.
    * ``dst_bucket`` -- destination S3 bucket name.
    """

    @task
    def download(**context):
        """Download ``params.file`` from OpenAIRE and upload it to S3.

        The HTTP response is streamed (``r.raw``) so the file is never fully
        buffered in memory. Threads are disabled on the S3 transfer because
        the raw HTTP stream is not seekable.
        """
        # Credentials and host come from the 'openaire_default' Airflow connection.
        conn = BaseHook.get_connection("openaire_default")

        session = requests.Session()
        session.auth = (conn.login, conn.password)

        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})

        # Hoist the shared path: it is both the remote URL path and the S3 key.
        key = "/data/graph/" + context["params"]["file"]
        url = "https://" + conn.host + key

        # Fix: the original request had no timeout, so a stalled connection
        # would hang until the task-level execution_timeout killed it.
        # (connect timeout, per-chunk read timeout) -- does NOT cap the total
        # duration of the (potentially long) streamed download.
        with session.get(url, stream=True, timeout=(30, 300)) as r:
            r.raise_for_status()
            hook.load_file_obj(
                r.raw,
                key,
                bucket_name=context["params"]["dst_bucket"],
                replace=True,
                encrypt=False,
            )

    download()


openaire_to_s3()