From 32e8e86aa7d2696de6366414c45ffe1f7ed1032f Mon Sep 17 00:00:00 2001
From: Sandro La Bruzzo
Date: Wed, 1 May 2024 16:35:21 +0200
Subject: [PATCH] added spark driver image

---
Review notes (text between the "---" line and the first "diff --git" is
commentary and is not applied):

- The patch text had all of its line breaks and indentation collapsed; the
  one-record-per-line layout below is restored. Context-line whitespace in
  clusters/local/kind-cluster-config.yaml and local.tfvars.template is a
  best-effort reconstruction -- confirm against the tree before applying.
- spark-image/Dockerfile: both downloads now use "curl -fsSL" so an HTTP
  error aborts the image build instead of storing the error page as a .jar;
  the USER instructions are upper-cased and the file ends with a newline.
  The abbreviated blob id on that file's "index" line predates these edits.
- spark-run.yaml is reproduced unchanged (including the missing final
  newline).

 clusters/local/kind-cluster-config.yaml |  2 +-
 local.tfvars.template                   |  2 +-
 spark-image/Dockerfile                  |  8 +++
 spark-run.yaml                          | 67 +++++++++++++++++++++++++
 4 files changed, 77 insertions(+), 2 deletions(-)
 create mode 100644 spark-image/Dockerfile
 create mode 100644 spark-run.yaml

diff --git a/clusters/local/kind-cluster-config.yaml b/clusters/local/kind-cluster-config.yaml
index c2a6dbd..a0e810f 100644
--- a/clusters/local/kind-cluster-config.yaml
+++ b/clusters/local/kind-cluster-config.yaml
@@ -1,6 +1,6 @@
 kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
-name: openaire-data-platform
+name: dnet-data-platform
 nodes:
 - role: control-plane
 
diff --git a/local.tfvars.template b/local.tfvars.template
index bd2d2ce..3d3e5d3 100644
--- a/local.tfvars.template
+++ b/local.tfvars.template
@@ -1,5 +1,5 @@
 env = "local"
-kube_context= "kind-openaire-data-platform"
+kube_context= "kind-dnet-data-platform"
 domain = "local-dataplatform"
 admin_user = "admin"
 admin_password = "admin"
diff --git a/spark-image/Dockerfile b/spark-image/Dockerfile
new file mode 100644
index 0000000..0f8dd25
--- /dev/null
+++ b/spark-image/Dockerfile
@@ -0,0 +1,8 @@
+FROM spark:3.5.1-scala2.12-java17-ubuntu
+
+USER root
+RUN curl -fsSL https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -o ${SPARK_HOME}/jars/hadoop-aws-3.3.4.jar
+RUN curl -fsSL https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -o ${SPARK_HOME}/jars/aws-java-sdk-bundle-1.12.262.jar
+
+
+USER spark
diff --git a/spark-run.yaml b/spark-run.yaml
new file mode 100644
index 0000000..1f7d95f
--- /dev/null
+++ b/spark-run.yaml
@@ -0,0 +1,67 @@
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: spark-scholix
+  namespace: dnet-spark-jobs
+spec:
+  type: Scala
+  mode: cluster
+  image: "dnet-spark:1.0.0"
+  imagePullPolicy: IfNotPresent
+  mainClass: eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump
+  mainApplicationFile: "s3a://lib/dhp-shade-package-1.2.5-SNAPSHOT.jar"
+  arguments: [
+    "--sourcePath", "s3a://raw-graph",
+    "--targetPath", "s3a://scholix",
+    "--master", "local[*]" ]
+  sparkVersion: "3.5.1"
+  sparkConf:
+    spark.driver.extraJavaOptions: "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
+    spark.executor.extraJavaOptions: "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
+    spark.hadoop.fs.defaultFS: "s3a://scholix"
+    spark.hadoop.fs.s3a.access.key: "minio"
+    spark.hadoop.fs.s3a.secret.key: "minio123"
+    spark.hadoop.fs.s3a.endpoint: "https://minio.dnet-minio-tenant.svc.cluster.local"
+    spark.hadoop.fs.s3a.impl: "org.apache.hadoop.fs.s3a.S3AFileSystem"
+    spark.hadoop.fs.s3a.path.style.access: "true"
+    spark.hadoop.fs.s3a.attempts.maximum: "1"
+    spark.hadoop.fs.s3a.connection.establish.timeout : "5000"
+    spark.hadoop.fs.s3a.connection.timeout: "10001"
+    spark.hadoop.fs.s3a.connection.ssl.enabled: "false"
+    com.amazonaws.sdk.disableCertChecking: "true"
+    com.cloudera.com.amazonaws.sdk.disableCertChecking: "true"
+    fs.s3a.connection.ssl.strictverify: "false"
+    fs.s3a.connection.ssl.enabled: "false"
+    fs.s3a.ssl.enabled: "false"
+    spark.hadoop.fs.s3a.ssl.enabled: "false"
+  restartPolicy:
+    type: Never
+  volumes:
+    - name: "test-volume"
+      persistentVolumeClaim:
+        claimName: my-spark-pvc-tmp
+  dynamicAllocation:
+    enabled: true
+    initialExecutors: 2
+    minExecutors: 2
+    maxExecutors: 16
+  driver:
+    javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
+    cores: 1
+    coreLimit: "1200m"
+    memory: "2G"
+    labels:
+      version: 3.5.1
+    serviceAccount: spark
+    volumeMounts:
+      - name: "test-volume"
+        mountPath: "/tmp"
+  executor:
+    javaOptions: "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true"
+    cores: 1
+    memory: "2G"
+    labels:
+      version: 3.5.1
+    volumeMounts:
+      - name: "test-volume"
+        mountPath: "/tmp"
\ No newline at end of file