lot1-kickoff/airflow/dags/antispam-batch.py

69 lines
1.8 KiB
Python
Raw Normal View History

2024-06-12 22:56:17 +02:00
import os
from datetime import timedelta, datetime
import pendulum
from airflow import DAG
from airflow.hooks.base import BaseHook
2024-06-12 23:00:21 +02:00
from airflow.models.baseoperator import chain
2024-06-12 22:56:17 +02:00
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
from airflow.providers.cncf.kubernetes.secret import Secret
# Defaults handed to the DAG as ``default_args``. The retry count and retry
# delay can be overridden through the DEFAULT_TASK_RETRIES and
# DEFAULT_RETRY_DELAY_SECONDS environment variables.
default_args = dict(
    execution_timeout=timedelta(days=6),
    retries=int(os.environ.get("DEFAULT_TASK_RETRIES", 1)),
    retry_delay=timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
)
# Look up the "opensearch_default" Airflow connection; its host and port are
# passed to the pod as command-line arguments further down.
# NOTE(review): this is a module-level call, so it runs every time the DAG
# file is imported/parsed — confirm that a connection lookup at parse time
# is acceptable for this deployment.
conn = BaseHook.get_connection("opensearch_default")
# DAG definition for the anti-spam batch check.
#
# Only ``schedule`` is passed. The previous call also supplied the deprecated
# ``schedule_interval=timedelta(days=1)``, which contradicts ``schedule=None``;
# on Airflow releases that accept ``schedule`` the explicit ``schedule`` value
# takes precedence, so the DAG was already manual-trigger only and the extra
# argument merely raised a deprecation warning.
# NOTE(review): if a daily run was actually intended, replace ``schedule=None``
# with ``schedule=timedelta(days=1)``.
dag = DAG(
    'antispam_batch_check',
    default_args=default_args,
    schedule=None,
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
)
# Secret entries handed to the pod operator. Each one is deployed as an
# environment variable (``deploy_type='env'``) named by ``deploy_target`` and
# filled from the given key of the ``opensearch-conn-secrets`` secret.
secrets = [
    Secret(
        deploy_type='env',
        deploy_target='CURATION_OPENSEARCH__USER',
        secret='opensearch-conn-secrets',
        key='username',
    ),
    Secret(
        deploy_type='env',
        deploy_target='CURATION_OPENSEARCH__PASSWORD',
        secret='opensearch-conn-secrets',
        key='password',
    ),
]
# Single task of the DAG: run /antispam-batch.py with python3 inside the
# curation image, in the kg-airflow namespace. The OpenSearch host/port come
# from the Airflow connection resolved above; the OpenSearch credentials are
# injected through ``secrets``.
# NOTE(review): ``is_delete_operator_pod`` may be deprecated in newer
# cncf.kubernetes provider releases — confirm against the installed version
# before changing it.
task = KubernetesPodOperator(
    task_id='antispam_checker',
    name='antispam_checker',
    namespace='kg-airflow',
    image='gbloisi/curation:1.0.0',
    image_pull_policy="Always",
    cmds=['python3'],
    arguments=[
        '/antispam-batch.py',
        "--opensearch.host", conn.host,
        # conn.port is converted to str so every argument is a string.
        "--opensearch.port", str(conn.port),
        "--openai.host", "local-ai.kg-airflow.svc.cluster.local",
        "--openai.port", "8000",
        "--parallelism", "36",
    ],
    secrets=secrets,
    is_delete_operator_pod=True,
    in_cluster=True,
    get_logs=True,
    dag=dag,
)
# The DAG has a single task, so this chain() call links nothing; it is kept
# as the place to append further tasks in execution order.
chain(task)