2024-03-09 18:09:15 +01:00
|
|
|
#
|
|
|
|
# Licensed to the Apache Software Foundation (ASF) under one
|
|
|
|
# or more contributor license agreements. See the NOTICE file
|
|
|
|
# distributed with this work for additional information
|
|
|
|
# regarding copyright ownership. The ASF licenses this file
|
|
|
|
# to you under the Apache License, Version 2.0 (the
|
|
|
|
# "License"); you may not use this file except in compliance
|
|
|
|
# with the License. You may obtain a copy of the License at
|
|
|
|
#
|
|
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
#
|
|
|
|
# Unless required by applicable law or agreed to in writing,
|
|
|
|
# software distributed under the License is distributed on an
|
|
|
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
|
|
# KIND, either express or implied. See the License for the
|
|
|
|
# specific language governing permissions and limitations
|
|
|
|
# under the License.
|
|
|
|
"""
|
|
|
|
This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor.
|
|
|
|
In this example, we create two tasks which execute sequentially.
|
|
|
|
The first task submits a SparkApplication to the Kubernetes cluster (the example uses the spark-pi application).
|
|
|
|
The second task checks the final state of the SparkApplication that was submitted by the first task.
|
|
|
|
|
|
|
|
The Spark-on-k8s operator must already be installed on the Kubernetes cluster:
|
|
|
|
https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
|
|
|
|
"""
|
|
|
|
|
|
|
|
from os import path
|
|
|
|
from datetime import timedelta, datetime
|
|
|
|
|
|
|
|
# [START import_module]
|
|
|
|
# The DAG object; we'll need this to instantiate a DAG
|
|
|
|
from airflow import DAG
|
|
|
|
# Operators; we need this to operate!
|
|
|
|
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
|
|
|
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor
|
|
|
|
from airflow.utils.dates import days_ago
|
|
|
|
|
|
|
|
# [END import_module]
|
|
|
|
|
|
|
|
# [START default_args]
|
|
|
|
# These args will get passed on to each operator
|
|
|
|
# You can override them on a per-task basis during operator initialization
|
|
|
|
# Shared operator defaults for this DAG; any task may override them.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email': ['airflow@example.com'],
    # E-mail notifications are switched off for both failures and retries.
    'email_on_failure': False,
    'email_on_retry': False,
    # NOTE(review): max_active_runs is a DAG constructor argument; confirm
    # whether it has any effect here or should be passed to DAG(...) instead.
    'max_active_runs': 1,
    'retries': 3,
}
|
|
|
|
|
2024-03-10 12:58:45 +01:00
|
|
|
# SparkApplication manifest (sparkoperator.k8s.io/v1beta2) that is passed to
# SparkKubernetesOperator through its template_spec argument. It describes a
# single run of the SparkPi example class from the apache/spark image.
spec = {
    'apiVersion': 'sparkoperator.k8s.io/v1beta2',
    'kind': 'SparkApplication',
    'metadata': {
        # Jinja placeholders: the name varies with the logical date and the
        # task try number (presumably rendered by Airflow when the operator
        # templates template_spec -- confirm against the provider version).
        'name': 'spark-pi-{{ ds }}-{{ task_instance.try_number }}',
        'namespace': 'lot1-spark-jobs',
    },
    'spec': {
        'type': 'Scala',
        'mode': 'cluster',
        'image': 'apache/spark:v3.1.3',
        'imagePullPolicy': 'Always',
        'mainApplicationFile': 'local:///opt/spark/examples/jars/spark-examples_2.12-3.1.3.jar',
        'mainClass': 'org.apache.spark.examples.SparkPi',
        'sparkVersion': '3.1.3',
        'restartPolicy': {'type': 'Never'},
        # 'arguments': ['{{ds}}'],
        'driver': {
            'coreLimit': '1200m',
            'cores': 1,
            'labels': {'version': '3.1.3'},
            'memory': '1g',
            'serviceAccount': 'spark',
        },
        'executor': {
            'cores': 1,
            'instances': 1,
            'memory': '512m',
            'labels': {'version': '3.1.3'},
        },
    },
}
|
2024-03-09 18:09:15 +01:00
|
|
|
|
|
|
|
# The DAG itself: no schedule (schedule_interval=None), so runs are only
# created when triggered explicitly.
dag = DAG(
    'spark_pi',
    default_args=default_args,
    schedule_interval=None,
    # default_args are only used as operator constructor defaults, so a
    # max_active_runs entry there does not limit concurrent DAG runs; the
    # limit has to be set on the DAG itself.
    max_active_runs=1,
    tags=['example', 'spark'],
)
|
|
|
|
|
|
|
|
# Task that submits the SparkApplication manifest held in `spec` to the
# 'lot1-spark-jobs' namespace using the "kubernetes_default" connection.
submit = SparkKubernetesOperator(
    task_id='spark_pi_submit',
    namespace='lot1-spark-jobs',
    template_spec=spec,
    kubernetes_conn_id="kubernetes_default",
    # Options currently left disabled:
    # do_xcom_push=True,
    # delete_on_termination=True,
    # NOTE(review): presumably the name of the driver container the operator
    # attaches to -- confirm against the provider documentation.
    base_container_name="spark-kubernetes-driver",
    dag=dag,
)
|
|
|
|
|
2024-03-10 14:18:46 +01:00
|
|
|
# sensor = SparkKubernetesSensor(
|
|
|
|
# task_id='spark_pi_monitor',
|
|
|
|
# namespace='lot1-spark-jobs',
|
|
|
|
# application_name="{{ task_instance.xcom_pull(task_ids='spark_pi_submit')['metadata']['name'] }}",
|
|
|
|
# kubernetes_conn_id="kubernetes_default",
|
|
|
|
# dag=dag,
|
|
|
|
# attach_log=False
|
|
|
|
# )
|
2024-03-09 18:09:15 +01:00
|
|
|
|
2024-03-10 14:18:46 +01:00
|
|
|
# Bare reference to the only active task: the sensor above is commented out,
# so there is no downstream task to chain with `>>`.
submit
|