code-infrastructure-lab/workflow/dnet/spark_configurator.py

119 lines
5.2 KiB
Python

class SparkConfigurator:
def __init__(self,
name,
mainClass,
jarLocation:str,
arguments,
apiVersion=None,
namespace="dnet-spark-jobs",
image= "dnet-spark:1.0.0",
driver_cores=1,
driver_memory='1G',
executor_cores=1,
executor_memory="1G",
executor_memoryOverhead= "1G",
executor_instances=1
) -> None:
if apiVersion:
self.apiVersion = apiVersion
else:
self.apiVersion = "sparkoperator.k8s.io/v1beta2"
self.namespace= namespace
self.name = name
self.image= image
self.mainClass = mainClass
self.jarLocation = jarLocation
self.arguments= arguments
self.s3Configuration = {
"spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
"spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
"spark.hadoop.fs.defaultFS": "s3a://spark",
"spark.hadoop.fs.s3a.access.key": "minio",
"spark.hadoop.fs.s3a.secret.key": "minio123",
"spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local",
"spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
"spark.hadoop.fs.s3a.path.style.access": "true",
"spark.hadoop.fs.s3a.attempts.maximum": "1",
"spark.hadoop.fs.s3a.connection.establish.timeout": "5000",
"spark.hadoop.fs.s3a.connection.timeout": "10001",
"spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
"com.amazonaws.sdk.disableCertChecking": "true",
"com.cloudera.com.amazonaws.sdk.disableCertChecking": "true",
"fs.s3a.connection.ssl.strictverify": "false",
"fs.s3a.connection.ssl.enabled": "false",
"fs.s3a.ssl.enabled": "false",
"spark.hadoop.fs.s3a.ssl.enabled": "false"
}
self.sparkResoruceConf= {
'driver_cores':driver_cores,
'driver_memory':driver_memory,
'executor_cores':executor_cores,
'executor_memory':executor_memory,
'executor_instances':executor_instances,
'memoryOverhead':executor_memoryOverhead
}
def get_configuration(self) -> dict:
return {
"apiVersion": self.apiVersion,
"kind": "SparkApplication",
"metadata": {
"name": self.name,
"namespace": self.namespace
},
"spec": {
"type": "Scala",
"mode": "cluster",
"image":self.image,
"imagePullPolicy": "IfNotPresent",
"mainClass": self.mainClass,
"mainApplicationFile": self.jarLocation,
"arguments": self.arguments,
"sparkVersion": "3.5.1",
"sparkConf": self.s3Configuration,
"restartPolicy": {
"type": "Never"
},
"volumes": [
{
"name": "test-volume",
"persistentVolumeClaim": {
"claimName": "my-spark-pvc-tmp"
}
}
],
"driver": {
"javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
"cores": self.sparkResoruceConf['driver_cores'],
"coreLimit": "1200m",
"memory": self.sparkResoruceConf['driver_memory'],
"labels": {
"version": "3.5.1"
},
"serviceAccount": "spark",
"volumeMounts": [
{
"name": "test-volume",
"mountPath": "/tmp"
}
]
},
"executor": {
"javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
"cores": self.sparkResoruceConf['executor_cores'],
"memoryOverhead": self.sparkResoruceConf['memoryOverhead'],
"memory": self.sparkResoruceConf['executor_memory'],
"instances": self.sparkResoruceConf['executor_instances'],
"labels": {
"version": "3.5.1"
},
"volumeMounts": [
{
"name": "test-volume",
"mountPath": "/tmp"
}
]
}
}
}