oct-update #1
|
@ -0,0 +1,83 @@
|
||||||
|
#
|
||||||
|
# Licensed to the Apache Software Foundation (ASF) under one
|
||||||
|
# or more contributor license agreements. See the NOTICE file
|
||||||
|
# distributed with this work for additional information
|
||||||
|
# regarding copyright ownership. The ASF licenses this file
|
||||||
|
# to you under the Apache License, Version 2.0 (the
|
||||||
|
# "License"); you may not use this file except in compliance
|
||||||
|
# with the License. You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing,
|
||||||
|
# software distributed under the License is distributed on an
|
||||||
|
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||||
|
# KIND, either express or implied. See the License for the
|
||||||
|
# specific language governing permissions and limitations
|
||||||
|
# under the License.
|
||||||
|
"""
|
||||||
|
This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor.
|
||||||
|
In this example, we create two tasks which execute sequentially.
|
||||||
|
The first task submits a SparkApplication to the Kubernetes cluster (the example uses the spark-pi application).
|
||||||
|
The second task checks the final state of the SparkApplication that was submitted by the first task.
|
||||||
|
|
||||||
|
The Spark-on-k8s operator must already be installed on the Kubernetes cluster:
|
||||||
|
https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
|
||||||
|
"""
|
||||||
|
|
||||||
|
from spark_configurator import SparkConfigurator
|
||||||
|
|
||||||
|
# [START import_module]
|
||||||
|
# The DAG object; we'll need this to instantiate a DAG
|
||||||
|
from airflow import DAG
|
||||||
|
# Operators; we need this to operate!
|
||||||
|
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
||||||
|
from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor
|
||||||
|
from airflow.utils.dates import days_ago
|
||||||
|
|
||||||
|
# [END import_module]
|
||||||
|
|
||||||
|
# [START default_args]
|
||||||
|
# These args will get passed on to each operator
|
||||||
|
# You can override them on a per-task basis during operator initialization
|
||||||
|
# NOTE: only task-level settings belong in this dict. DAG-level options
# (such as ``max_active_runs``) are set on the DAG constructor below.
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'start_date': days_ago(1),
    'email': ['airflow@example.com'],
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
}

# ``schedule_interval=None`` means the DAG has no schedule of its own.
# ``max_active_runs`` is passed here, not through ``default_args``, because
# it is a DAG argument and operators do not accept it.
dag = DAG(
    'dedup_graph',
    default_args=default_args,
    schedule_interval=None,
    max_active_runs=1,
    tags=['example', 'spark'],
)
|
||||||
|
|
||||||
|
# Command-line arguments forwarded to the Spark main class.
_create_simrels_arguments = [
    "--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
    "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
    "--actionSetId", "dedup-result-decisiontree-v4",
    "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
    "--numPartitions", "64",
]

# Template spec built by SparkConfigurator.get_configuration(); the name
# embeds Jinja placeholders for the run date and the try number.
_create_simrels_spec = SparkConfigurator(
    name="createsimrels-{{ ds }}-{{ task_instance.try_number }}",
    mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels",
    jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
    arguments=_create_simrels_arguments,
    executor_cores=10,
    executor_memory="4G",
    executor_instances=1,
    executor_memoryOverhead="3G",
).get_configuration()

# The only task of this DAG: hands the spec to SparkKubernetesOperator.
submit = SparkKubernetesOperator(
    task_id='CreateSimRel',
    namespace='dnet-spark-jobs',
    template_spec=_create_simrels_spec,
    kubernetes_conn_id="kubernetes_default",
    dag=dag,
)

# Single task, so there is no dependency chain to declare; this bare
# reference is a no-op kept from the original script.
submit
|
|
@ -27,7 +27,7 @@ class SparkConfigurator:
|
||||||
self.s3Configuration = {
|
self.s3Configuration = {
|
||||||
"spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
"spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
||||||
"spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
"spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
||||||
"spark.hadoop.fs.defaultFS": "s3a://spark",
|
"spark.hadoop.fs.defaultFS": "s3a://graph",
|
||||||
"spark.hadoop.fs.s3a.access.key": "minio",
|
"spark.hadoop.fs.s3a.access.key": "minio",
|
||||||
"spark.hadoop.fs.s3a.secret.key": "minio123",
|
"spark.hadoop.fs.s3a.secret.key": "minio123",
|
||||||
"spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local",
|
"spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local",
|
||||||
|
@ -75,14 +75,6 @@ class SparkConfigurator:
|
||||||
"restartPolicy": {
|
"restartPolicy": {
|
||||||
"type": "Never"
|
"type": "Never"
|
||||||
},
|
},
|
||||||
"volumes": [
|
|
||||||
{
|
|
||||||
"name": "test-volume",
|
|
||||||
"persistentVolumeClaim": {
|
|
||||||
"claimName": "my-spark-pvc-tmp"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"driver": {
|
"driver": {
|
||||||
"javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
"javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
||||||
"cores": self.sparkResoruceConf['driver_cores'],
|
"cores": self.sparkResoruceConf['driver_cores'],
|
||||||
|
@ -91,13 +83,7 @@ class SparkConfigurator:
|
||||||
"labels": {
|
"labels": {
|
||||||
"version": "3.5.1"
|
"version": "3.5.1"
|
||||||
},
|
},
|
||||||
"serviceAccount": "spark",
|
"serviceAccount": "spark"
|
||||||
"volumeMounts": [
|
|
||||||
{
|
|
||||||
"name": "test-volume",
|
|
||||||
"mountPath": "/tmp"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
},
|
||||||
"executor": {
|
"executor": {
|
||||||
"javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
"javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
|
||||||
|
@ -107,13 +93,7 @@ class SparkConfigurator:
|
||||||
"instances": self.sparkResoruceConf['executor_instances'],
|
"instances": self.sparkResoruceConf['executor_instances'],
|
||||||
"labels": {
|
"labels": {
|
||||||
"version": "3.5.1"
|
"version": "3.5.1"
|
||||||
},
|
}
|
||||||
"volumeMounts": [
|
|
||||||
{
|
|
||||||
"name": "test-volume",
|
|
||||||
"mountPath": "/tmp"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
Loading…
Reference in New Issue