added workflow test
This commit is contained in:
parent
ed3422673f
commit
07ce192207
|
@ -0,0 +1,87 @@
|
|||
#
|
||||
# Licensed to the Apache Software Foundation (ASF) under one
|
||||
# or more contributor license agreements. See the NOTICE file
|
||||
# distributed with this work for additional information
|
||||
# regarding copyright ownership. The ASF licenses this file
|
||||
# to you under the Apache License, Version 2.0 (the
|
||||
# "License"); you may not use this file except in compliance
|
||||
# with the License. You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing,
|
||||
# software distributed under the License is distributed on an
|
||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
# KIND, either express or implied. See the License for the
|
||||
# specific language governing permissions and limitations
|
||||
# under the License.
|
||||
"""
|
||||
This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor.
|
||||
In this example, we create two tasks which execute sequentially.
|
||||
The first task submits a SparkApplication to the Kubernetes cluster (the example uses the spark-pi application).
|
||||
The second task checks the final state of the SparkApplication that was submitted in the first task.
|
||||
|
||||
Spark-on-k8s operator is required to be already installed on Kubernetes
|
||||
https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
|
||||
"""
|
||||
|
||||
from os import path
|
||||
from datetime import timedelta, datetime
|
||||
from spark_configurator import SparkConfigurator
|
||||
|
||||
# [START import_module]
|
||||
# The DAG object; we'll need this to instantiate a DAG
|
||||
from airflow import DAG
|
||||
# Operators; we need this to operate!
|
||||
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
||||
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor
|
||||
from airflow.utils.dates import days_ago
|
||||
|
||||
|
||||
# [END import_module]
|
||||
|
||||
# [START default_args]
|
||||
# These args will get passed on to each operator
|
||||
# You can override them on a per-task basis during operator initialization
|
||||
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    # NOTE: 'max_active_runs' used to live here, but default_args are only
    # forwarded to operators; it is a DAG-level setting and is now passed to
    # DAG(...) below so that it actually takes effect.
    'retries': 3,
}

# SparkApplication manifest (a plain dict) handed to the operator below.
# The Jinja placeholders in ``name`` are presumably rendered by the operator's
# templating of ``template_spec`` -- verify against the installed provider version.
spec = SparkConfigurator(
    name="spark-scholix-{{ ds }}-{{ task_instance.try_number }}",
    mainClass="eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump",
    jarLocation='s3a://deps/dhp-shade-package-1.2.5-SNAPSHOT.jar',
    arguments=["--sourcePath", "s3a://raw-graph/01", "--targetPath", "s3a://scholix"],
    executor_cores=10,
    executor_memory="4G",
    executor_instances=1,
    executor_memoryOverhead="3G",
).get_configuration()

# Manually triggered DAG (no schedule); at most one run may be active at a time.
dag = DAG(
    'spark_scholix',
    default_args=default_args,
    schedule_interval=None,
    max_active_runs=1,
    tags=['example', 'spark'],
)

# Single task: submit the SparkApplication built above to the cluster.
submit = SparkKubernetesOperator(
    task_id='spark-scholix',
    namespace='dnet-spark-jobs',
    template_spec=spec,
    kubernetes_conn_id="kubernetes_default",
    # do_xcom_push=True,
    # delete_on_termination=True,
    # base_container_name="spark-kubernetes-driver",
    dag=dag,
)

# Bare expression kept from the original: this DAG has a single task, so there
# is no dependency chain to declare.
submit
|
|
@ -0,0 +1,119 @@
|
|||
class SparkConfigurator:
    """Build a ``SparkApplication`` manifest as a plain dictionary.

    The constructor stores the job description (name, main class, jar,
    arguments), the resource sizing for driver and executors, and a fixed
    set of S3A-related Spark properties. ``get_configuration`` assembles
    them into the nested dictionary of a ``SparkApplication`` resource.

    Attributes:
        apiVersion: API version written to the manifest.
        namespace: Namespace written to the manifest metadata.
        name: Name written to the manifest metadata.
        image: Container image used for the application.
        mainClass: Fully qualified main class of the job.
        jarLocation: Location of the application jar.
        arguments: Arguments passed to the main class.
        s3Configuration: Spark properties emitted as ``sparkConf``.
        sparkResoruceConf: Driver/executor sizing (attribute name, including
            its spelling, is kept for backward compatibility).
    """

    _DEFAULT_API_VERSION = "sparkoperator.k8s.io/v1beta2"
    _SPARK_VERSION = "3.5.1"
    _VOLUME_NAME = "test-volume"
    _VOLUME_CLAIM = "my-spark-pvc-tmp"

    # JVM flags used for both driver and executor pods.
    _CERT_CHECK_JAVA_OPTIONS = (
        "-Dcom.amazonaws.sdk.disableCertChecking=true "
        "-Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
    )
    # Same flags, prefixed with the Ivy cache location, for the Spark conf.
    _EXTRA_JAVA_OPTIONS = "-Divy.cache.dir=/tmp " + _CERT_CHECK_JAVA_OPTIONS

    def __init__(self,
                 name,
                 mainClass,
                 jarLocation: str,
                 arguments,
                 apiVersion=None,
                 namespace="dnet-spark-jobs",
                 image="dnet-spark:1.0.0",
                 driver_cores=1,
                 driver_memory='1G',
                 executor_cores=1,
                 executor_memory="1G",
                 executor_memoryOverhead="1G",
                 executor_instances=1
                 ) -> None:
        """Store the job description and resource sizing.

        Args:
            name: Name of the SparkApplication resource.
            mainClass: Fully qualified main class to run.
            jarLocation: Location of the application jar.
            arguments: Arguments passed to the main class.
            apiVersion: Manifest API version; a falsy value selects
                ``sparkoperator.k8s.io/v1beta2``.
            namespace: Namespace of the SparkApplication resource.
            image: Container image for driver and executors.
            driver_cores: Cores requested for the driver.
            driver_memory: Memory requested for the driver.
            executor_cores: Cores requested per executor.
            executor_memory: Memory requested per executor.
            executor_memoryOverhead: Memory overhead per executor.
            executor_instances: Number of executors.
        """
        self.apiVersion = apiVersion if apiVersion else self._DEFAULT_API_VERSION
        self.namespace = namespace
        self.name = name
        self.image = image
        self.mainClass = mainClass
        self.jarLocation = jarLocation
        self.arguments = arguments
        # NOTE(review): the access/secret keys are hard-coded and certificate
        # checking is disabled. Consider sourcing credentials from a secret
        # instead of committing them to the repository.
        self.s3Configuration = {
            "spark.driver.extraJavaOptions": self._EXTRA_JAVA_OPTIONS,
            "spark.executor.extraJavaOptions": self._EXTRA_JAVA_OPTIONS,
            "spark.hadoop.fs.defaultFS": "s3a://spark",
            "spark.hadoop.fs.s3a.access.key": "minio",
            "spark.hadoop.fs.s3a.secret.key": "minio123",
            "spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local",
            "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
            "spark.hadoop.fs.s3a.path.style.access": "true",
            "spark.hadoop.fs.s3a.attempts.maximum": "1",
            "spark.hadoop.fs.s3a.connection.establish.timeout": "5000",
            "spark.hadoop.fs.s3a.connection.timeout": "10001",
            "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
            "com.amazonaws.sdk.disableCertChecking": "true",
            "com.cloudera.com.amazonaws.sdk.disableCertChecking": "true",
            "fs.s3a.connection.ssl.strictverify": "false",
            "fs.s3a.connection.ssl.enabled": "false",
            "fs.s3a.ssl.enabled": "false",
            "spark.hadoop.fs.s3a.ssl.enabled": "false",
        }
        self.sparkResoruceConf = {
            'driver_cores': driver_cores,
            'driver_memory': driver_memory,
            'executor_cores': executor_cores,
            'executor_memory': executor_memory,
            'executor_instances': executor_instances,
            'memoryOverhead': executor_memoryOverhead,
        }

    @classmethod
    def _pod_labels(cls) -> dict:
        """Return a fresh label dict shared in shape by driver and executor."""
        return {"version": cls._SPARK_VERSION}

    @classmethod
    def _volume_mounts(cls) -> list:
        """Return a fresh list mounting the shared volume at ``/tmp``."""
        return [{"name": cls._VOLUME_NAME, "mountPath": "/tmp"}]

    def get_configuration(self) -> dict:
        """Assemble and return the SparkApplication manifest.

        The returned structure is independent of the instance: ``arguments``
        and ``sparkConf`` are copies, so callers may modify the result without
        affecting this configurator or manifests produced later.

        Returns:
            A nested dict with ``apiVersion``, ``kind``, ``metadata`` and
            ``spec`` entries.
        """
        # Local import keeps the class self-contained within its module.
        import copy

        resources = self.sparkResoruceConf
        driver = {
            "javaOptions": self._CERT_CHECK_JAVA_OPTIONS,
            "cores": resources['driver_cores'],
            "coreLimit": "1200m",
            "memory": resources['driver_memory'],
            "labels": self._pod_labels(),
            "serviceAccount": "spark",
            "volumeMounts": self._volume_mounts(),
        }
        executor = {
            "javaOptions": self._CERT_CHECK_JAVA_OPTIONS,
            "cores": resources['executor_cores'],
            "memoryOverhead": resources['memoryOverhead'],
            "memory": resources['executor_memory'],
            "instances": resources['executor_instances'],
            "labels": self._pod_labels(),
            "volumeMounts": self._volume_mounts(),
        }
        return {
            "apiVersion": self.apiVersion,
            "kind": "SparkApplication",
            "metadata": {
                "name": self.name,
                "namespace": self.namespace,
            },
            "spec": {
                "type": "Scala",
                "mode": "cluster",
                "image": self.image,
                "imagePullPolicy": "IfNotPresent",
                "mainClass": self.mainClass,
                "mainApplicationFile": self.jarLocation,
                # Copies: the manifest must not alias the instance's state.
                "arguments": copy.deepcopy(self.arguments),
                "sparkVersion": self._SPARK_VERSION,
                "sparkConf": copy.deepcopy(self.s3Configuration),
                "restartPolicy": {
                    "type": "Never",
                },
                "volumes": [
                    {
                        "name": self._VOLUME_NAME,
                        "persistentVolumeClaim": {
                            "claimName": self._VOLUME_CLAIM,
                        },
                    },
                ],
                "driver": driver,
                "executor": executor,
            },
        }
|
Loading…
Reference in New Issue