2024-11-18 10:43:07 +01:00
2 changed files with 86 additions and 23 deletions
--- a/workflow/dnet/dedup.py
+++ b/workflow/dnet/dedup.py
@ -0,0 +1,83 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Airflow DAG ``dedup_graph``: submits a single SparkApplication to a Kubernetes
+cluster through SparkKubernetesOperator.
+The only task, ``CreateSimRel``, runs eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels
+with the spec built by SparkConfigurator; no sensor task follows the submission.
+
+Spark-on-k8s operator is required to be already installed on Kubernetes
+https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
+"""
+
+from spark_configurator import SparkConfigurator
+
+# [START import_module]
+# The DAG object; we'll need this to instantiate a DAG
+from airflow import DAG
+# Operators; we need this to operate!
+from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
+from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor
+from airflow.utils.dates import days_ago
+
+# [END import_module]
+
+# [START default_args]
+# Operator-level defaults: they are passed on to each operator and can be
+# overridden on a per-task basis during operator initialization.
+default_args = {
+    'owner': 'airflow',
+    'depends_on_past': False,
+    'start_date': days_ago(1),
+    'email': ['airflow@example.com'],
+    'email_on_failure': False,
+    'email_on_retry': False,
+    'retries': 3
+}
+
+# max_active_runs is an argument of DAG, not of the operators, so it is set here.
+dag = DAG(
+    'dedup_graph',
+    default_args=default_args,
+    schedule_interval=None,
+    max_active_runs=1,
+    tags=['example', 'spark']
+)
+
+# Task CreateSimRel: submits the SparkCreateSimRels job as a SparkApplication
+# in the dnet-spark-jobs namespace, using the spec produced by SparkConfigurator.
+submit = SparkKubernetesOperator(
+    dag=dag,
+    task_id='CreateSimRel',
+    kubernetes_conn_id="kubernetes_default",
+    namespace='dnet-spark-jobs',
+    template_spec=SparkConfigurator(
+        # NOTE(review): the Jinja placeholders presumably give every run/retry
+        # its own application name - confirm template_spec is a templated field.
+        name="createsimrels-{{ ds }}-{{ task_instance.try_number }}",
+        mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels",
+        jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
+        # Command-line arguments of the Spark job, one flag/value pair per line.
+        arguments=[
+            "--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
+            "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
+            "--actionSetId", "dedup-result-decisiontree-v4",
+            "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
+            "--numPartitions", "64",
+        ],
+        executor_cores=10,
+        executor_memory="4G",
+        executor_instances=1,
+        executor_memoryOverhead="3G",
+    ).get_configuration(),
+)
+
+# The DAG has a single task, so there is no dependency chain to declare;
+# the bare reference below is a no-op expression.
+submit
--- a/workflow/dnet/spark_configurator.py
+++ b/workflow/dnet/spark_configurator.py
@ -27,7 +27,7 @@ class SparkConfigurator:
        self.s3Configuration =  {
                    "spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
                    "spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
-                    "spark.hadoop.fs.defaultFS": "s3a://spark",
+                    "spark.hadoop.fs.defaultFS": "s3a://graph",
                    "spark.hadoop.fs.s3a.access.key": "minio",
                    "spark.hadoop.fs.s3a.secret.key": "minio123",
                    "spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local",
@ -75,14 +75,6 @@ class SparkConfigurator:
                "restartPolicy": {
                    "type": "Never"
                },
-                "volumes": [
-                    {
-                        "name": "test-volume",
-                        "persistentVolumeClaim": {
-                            "claimName": "my-spark-pvc-tmp"
-                        }
-                    }
-                ],
                "driver": {
                    "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
                    "cores": self.sparkResoruceConf['driver_cores'],
@ -91,13 +83,7 @@ class SparkConfigurator:
                    "labels": {
                        "version": "3.5.1"
                    },
-                    "serviceAccount": "spark",
-                    "volumeMounts": [
-                        {
-                            "name": "test-volume",
-                            "mountPath": "/tmp"
-                        }
-                    ]
+                    "serviceAccount": "spark"
                },
                "executor": {
                    "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true  -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true",
@ -107,13 +93,7 @@ class SparkConfigurator:
                    "instances": self.sparkResoruceConf['executor_instances'],
                    "labels": {
                        "version": "3.5.1"
-                    },
-                    "volumeMounts": [
-                        {
-                            "name": "test-volume",
-                            "mountPath": "/tmp"
-                        }
-                    ]
+                    }
                }
            }
        }