# lot1-kickoff/airflow/dags/antispam-batch.py
#
# 69 lines
# 1.8 KiB
# Python

import os
from datetime import timedelta, datetime
import pendulum
from airflow import DAG
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
from airflow.providers.cncf.kubernetes.secret import Secret
# Task-level defaults applied to every operator attached to the DAG.
# Retry count and retry delay can be overridden through the environment
# variables DEFAULT_TASK_RETRIES and DEFAULT_RETRY_DELAY_SECONDS.
default_args = dict(
    # Upper bound on a single task attempt.
    execution_timeout=timedelta(days=6),
    retries=int(os.environ.get("DEFAULT_TASK_RETRIES", "1")),
    retry_delay=timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", "60")),
    ),
)
# Resolve the OpenSearch connection; its host and port are forwarded to the
# pod's command line further down in this file.
# NOTE(review): this is module-level code, so the lookup runs every time the
# file is imported/parsed, not only when the task executes. Consider resolving
# the connection at run time (e.g. through templating) instead.
conn = BaseHook.get_connection("opensearch_default")

# DAG wrapping the antispam batch check.
#
# `schedule` is the only scheduling argument passed. Combining it with the
# deprecated `schedule_interval` is contradictory, and `schedule_interval`
# no longer exists in Airflow 3.
# NOTE(review): with `schedule=None` the DAG only runs when triggered
# manually/externally. If a daily run is intended, use
# `schedule=timedelta(days=1)` here instead - confirm with the DAG owner.
dag = DAG(
    'antispam_batch_check',
    default_args=default_args,
    schedule=None,
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
)
# Credentials exposed to the pod as environment variables. Both values are
# read from the Kubernetes secret "opensearch-conn-secrets"; each pair below
# is (environment variable name, key inside the secret).
secrets = [
    Secret(
        deploy_type="env",
        deploy_target=env_var,
        secret="opensearch-conn-secrets",
        key=secret_key,
    )
    for env_var, secret_key in (
        ("CURATION_OPENSEARCH__USER", "username"),
        ("CURATION_OPENSEARCH__PASSWORD", "password"),
    )
]
# Single task of the DAG: run /antispam-batch.py with python3 inside a
# Kubernetes pod. `dag`, `conn` and `secrets` are the module-level objects
# defined earlier in this file.
task = KubernetesPodOperator(
    dag=dag,
    task_id="antispam_checker",
    name="antispam_checker",
    namespace="kg-airflow",
    image="gbloisi/curation:1.0.0",
    image_pull_policy="Always",
    cmds=["python3"],
    # Script path followed by flag/value pairs; every element must be a
    # string, hence the explicit str() around the connection port.
    arguments=[
        "/antispam-batch.py",
        "--opensearch.host", conn.host,
        "--opensearch.port", str(conn.port),
        "--openai.host", "local-ai.kg-airflow.svc.cluster.local",
        "--openai.port", "8000",
        "--parallelism", "36",
    ],
    secrets=secrets,
    in_cluster=True,
    get_logs=True,
    # NOTE(review): newer cncf.kubernetes provider releases appear to
    # deprecate this flag in favour of `on_finish_action` - confirm against
    # the installed provider version before changing it.
    is_delete_operator_pod=True,
)

# With a single operator this call creates no dependency edges; it is kept
# so that further tasks can simply be appended to the chain.
chain(task)