added spark driver image

This commit is contained in:
Sandro La Bruzzo 2024-05-01 16:35:21 +02:00
parent 0863c9b2e9
commit 32e8e86aa7
4 changed files with 77 additions and 2 deletions


@@ -1,6 +1,6 @@
 kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
-name: openaire-data-platform
+name: dnet-data-platform
 nodes:
 - role: control-plane

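Renaming the cluster in the kind config only takes effect when the cluster is (re)created, so the old cluster has to be torn down and brought up again under the new name. A minimal sketch, assuming the config above is saved as kind-config.yaml (this view does not show the file's actual path):

# tear down the old cluster and recreate it under the new name
# (kind-config.yaml is an assumed filename; use the repo's actual config path)
kind delete cluster --name openaire-data-platform
kind create cluster --name dnet-data-platform --config kind-config.yaml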

@@ -1,5 +1,5 @@
 env = "local"
-kube_context= "kind-openaire-data-platform"
+kube_context= "kind-dnet-data-platform"
 domain = "local-dataplatform"
 admin_user = "admin"
 admin_password = "admin"
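The kube_context value has to name a context that actually exists after the cluster is recreated; kind registers it as kind-&lt;cluster name&gt;. A quick sanity check, assuming kubectl picked up the kind-managed kubeconfig:

# verify the context referenced by the tfvars is present
kubectl config get-contexts kind-dnet-data-platform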

spark-image/Dockerfile (new file, 8 lines)

@@ -0,0 +1,8 @@
FROM spark:3.5.1-scala2.12-java17-ubuntu
USER root
RUN curl -fSL https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -o ${SPARK_HOME}/jars/hadoop-aws-3.3.4.jar
RUN curl -fSL https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -o ${SPARK_HOME}/jars/aws-java-sdk-bundle-1.12.262.jar
USER spark

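For imagePullPolicy: IfNotPresent to find this image inside the cluster, it has to be built and side-loaded into the kind nodes, since there is no registry involved. A plausible sequence, assuming the build context is the spark-image/ directory shown above and the tag matches the manifest below:

# build the custom Spark image with the S3A jars baked in
docker build -t dnet-spark:1.0.0 spark-image/
# load it into the kind cluster's nodes so no image pull is needed
kind load docker-image dnet-spark:1.0.0 --name dnet-data-platform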
spark-run.yaml (new file, 67 lines)

@@ -0,0 +1,67 @@
apiVersion: "sparkoperator.k8s.io/v1beta2"
kind: SparkApplication
metadata:
  name: spark-scholix
  namespace: dnet-spark-jobs
spec:
  type: Scala
  mode: cluster
  image: "dnet-spark:1.0.0"
  imagePullPolicy: IfNotPresent
  mainClass: eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
  mainApplicationFile: "s3a://lib/dhp-shade-package-1.2.5-SNAPSHOT.jar"
  arguments: [
    "--sourcePath", "s3a://raw-graph",
    "--targetPath", "s3a://scholix",
    "--master", "local[*]" ]
  sparkVersion: "3.5.1"
  sparkConf:
    spark.driver.extraJavaOptions: "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
    spark.executor.extraJavaOptions: "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
    spark.hadoop.fs.defaultFS: "s3a://scholix"
    spark.hadoop.fs.s3a.access.key: "minio"
    spark.hadoop.fs.s3a.secret.key: "minio123"
    spark.hadoop.fs.s3a.endpoint: "https://minio.dnet-minio-tenant.svc.cluster.local"
    spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem"
    spark.hadoop.fs.s3a.path.style.access: "true"
    spark.hadoop.fs.s3a.attempts.maximum: "1"
    spark.hadoop.fs.s3a.connection.establish.timeout: "5000"
    spark.hadoop.fs.s3a.connection.timeout: "10001"
    spark.hadoop.fs.s3a.connection.ssl.enabled: "false"
    com.amazonaws.sdk.disableCertChecking: "true"
    com.cloudera.com.amazonaws.sdk.disableCertChecking: "true"
    fs.s3a.connection.ssl.strictverify: "false"
    fs.s3a.connection.ssl.enabled: "false"
    fs.s3a.ssl.enabled: "false"
    spark.hadoop.fs.s3a.ssl.enabled: "false"
  restartPolicy:
    type: Never
  volumes:
    - name: "test-volume"
      persistentVolumeClaim:
        claimName: my-spark-pvc-tmp
  dynamicAllocation:
    enabled: true
    initialExecutors: 2
    minExecutors: 2
    maxExecutors: 16
  driver:
    javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
    cores: 1
    coreLimit: "1200m"
    memory: "2G"
    labels:
      version: 3.5.1
    serviceAccount: spark
    volumeMounts:
      - name: "test-volume"
        mountPath: "/tmp"
  executor:
    javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
    cores: 1
    memory: "2G"
    labels:
      version: 3.5.1
    volumeMounts:
      - name: "test-volume"
        mountPath: "/tmp"
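With the image loaded and the manifest in place, the job can be submitted through the Spark operator and followed from the driver pod. A sketch, assuming the operator is installed and the spark service account exists in dnet-spark-jobs; the operator conventionally names the driver pod &lt;application name&gt;-driver:

# submit the SparkApplication to the operator
kubectl apply -f spark-run.yaml
# check the application state as reported by the operator
kubectl -n dnet-spark-jobs get sparkapplication spark-scholix
# follow the driver logs (pod name follows the operator's <app>-driver convention)
kubectl -n dnet-spark-jobs logs -f spark-scholix-driver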