2024-06-12 22:56:17 +02:00
|
|
|
import os
|
|
|
|
from datetime import timedelta, datetime
|
|
|
|
|
|
|
|
import pendulum
|
|
|
|
from airflow import DAG
|
|
|
|
from airflow.hooks.base import BaseHook
|
2024-06-12 23:00:21 +02:00
|
|
|
from airflow.models.baseoperator import chain
|
2024-06-12 22:56:17 +02:00
|
|
|
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
|
|
|
|
from airflow.providers.cncf.kubernetes.secret import Secret
|
|
|
|
|
|
|
|
# Task-level defaults handed to the DAG below via `default_args=`.
# Retry count and retry delay are read from the environment at import time;
# the execution timeout is fixed.
default_args = dict(
    # Upper bound on how long a single task run may take.
    execution_timeout=timedelta(days=6),
    # DEFAULT_TASK_RETRIES: integer number of retries, 1 when unset.
    retries=int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    # DEFAULT_RETRY_DELAY_SECONDS: integer seconds between retries, 60 when unset.
    retry_delay=timedelta(
        seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
)
|
|
|
|
|
|
|
|
# Airflow connection "opensearch_default"; its host and port are passed to the
# antispam pod as command-line arguments further down.
# NOTE(review): this lookup is a module-level statement, so it runs whenever
# the file is imported/parsed — confirm that is acceptable for this deployment.
conn = BaseHook.get_connection(conn_id="opensearch_default")
|
|
|
|
|
|
|
|
# DAG wrapping the antispam batch check. It has no schedule, so runs are only
# created when the DAG is triggered explicitly.
dag = DAG(
    'antispam_batch_check',
    default_args=default_args,
    # Only `schedule` is given. Supplying the deprecated `schedule_interval`
    # next to it is contradictory: on Airflow 2.4+ an explicit `schedule`
    # takes precedence (so a daily interval would be ignored anyway, with a
    # deprecation warning), and Airflow 3 no longer accepts `schedule_interval`.
    schedule=None,
    # No overall time limit for a DAG run.
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    # Do not back-fill runs between start_date and now.
    catchup=False,
)
|
|
|
|
|
|
|
|
|
|
|
|
# Values exposed to the pod as environment variables, each taken from one key
# of the Kubernetes secret 'opensearch-conn-secrets'.
# Pairs are (environment variable name, key inside the secret).
secrets = [
    Secret(
        deploy_type='env',
        deploy_target=env_name,
        secret='opensearch-conn-secrets',
        key=secret_key,
    )
    for env_name, secret_key in (
        ('CURATION_OPENSEARCH__USER', 'username'),
        ('CURATION_OPENSEARCH__PASSWORD', 'password'),
    )
]
|
|
|
|
|
|
|
|
# The DAG's only task: run /antispam-batch.py with python3 in a Kubernetes pod
# created from the curation image.
task = KubernetesPodOperator(
    dag=dag,
    task_id='antispam_checker',
    # Pod identity, location and image.
    name='antispam_checker',
    namespace='kg-airflow',
    in_cluster=True,
    image='gbloisi/curation:1.0.0',
    image_pull_policy="Always",
    # Container command line; each option is written as "flag, value".
    cmds=['python3'],
    arguments=[
        '/antispam-batch.py',
        # OpenSearch endpoint comes from the `conn` connection object.
        "--opensearch.host", conn.host,
        "--opensearch.port", f"{conn.port}",
        # Endpoint passed as the "openai" host/port options.
        "--openai.host", "local-ai.kg-airflow.svc.cluster.local",
        "--openai.port", "8000",
        "--parallelism", "36",
    ],
    # Environment variables sourced from the `secrets` list.
    secrets=secrets,
    get_logs=True,
    is_delete_operator_pod=True,
)
|
|
|
|
|
|
|
|
# Set the task dependencies
|
2024-06-12 23:00:21 +02:00
|
|
|
# Only one task is passed, so no ordering between tasks is expressed here.
# NOTE(review): presumably a placeholder for chaining further tasks — confirm.
chain(task)
|