From 75286755904a3da5530297c6c07084c9e1fcc491 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 16 Oct 2024 09:08:14 +0200 Subject: [PATCH 01/42] update version of minio --- envs/local/airflow.yaml | 8 +-- envs/local/minio-tenant.yaml | 120 +++++++++++++++++++------------- modules/minio/minio-operator.tf | 2 +- modules/minio/minio-tenant.tf | 40 +---------- 4 files changed, 79 insertions(+), 91 deletions(-) diff --git a/envs/local/airflow.yaml b/envs/local/airflow.yaml index 2eb8c99..6943482 100644 --- a/envs/local/airflow.yaml +++ b/envs/local/airflow.yaml @@ -31,9 +31,9 @@ dags: enabled: true gitSync: enabled: true - repo: "https://code-repo.d4science.org/D-Net/code-infrasturcutre-lab.git" - branch: "airflow" - subPath: "airflow/dags" + repo: "https://code-repo.d4science.org/D-Net/code-infrastructure-lab.git" + branch: "master" + subPath: "workflow/dnet" config: webserver: @@ -42,7 +42,7 @@ config: logging: remote_logging: "True" logging_level: "INFO" - remote_base_log_folder: "s3://dnet-airflow/logs" + remote_base_log_folder: "s3://workflow-logs/logs" remote_log_conn_id: "s3_conn" encrypt_s3_logs: "False" diff --git a/envs/local/minio-tenant.yaml b/envs/local/minio-tenant.yaml index a646013..0ec38fa 100644 --- a/envs/local/minio-tenant.yaml +++ b/envs/local/minio-tenant.yaml @@ -1,42 +1,3 @@ - - -### -# Root key for dynamically creating a secret for use with configuring root MinIO User -# Specify the ``name`` and then a list of environment variables. -# -# .. important:: -# -# Do not use this in production environments. -# This field is intended for use with rapid development or testing only. -# -# For example: -# -# .. code-block:: yaml -# -# name: myminio-env-configuration -# accessKey: minio -# secretKey: minio123 -# -secrets: - name: myminio-env-configuration - accessKey: minio - secretKey: minio123 - ### - # The name of an existing Kubernetes secret to import to the MinIO Tenant - # The secret must contain a key ``config.env``. - # The values should be a series of export statements to set environment variables for the Tenant. - # For example: - # - # .. code-block:: shell - # - # stringData: - # config.env: | - - # export MINIO_ROOT_USER=ROOTUSERNAME - # export MINIO_ROOT_PASSWORD=ROOTUSERPASSWORD - # - #existingSecret: - # name: myminio-env-configuration -### # Root key for MinIO Tenant Chart tenant: ### @@ -47,14 +8,14 @@ tenant: ### # Specify the Operator container image to use for the deployment. # ``image.tag`` - # For example, the following sets the image to the ``quay.io/minio/operator`` repo and the v5.0.12 tag. + # For example, the following sets the image to the ``quay.io/minio/operator`` repo and the v6.0.4 tag. # The container pulls the image if not already present: # # .. code-block:: yaml # # image: # repository: quay.io/minio/minio - # tag: RELEASE.2024-02-09T21-25-16Z + # tag: RELEASE.2024-10-02T17-50-41Z # pullPolicy: IfNotPresent # # The chart also supports specifying an image based on digest value: @@ -69,7 +30,7 @@ tenant: # image: repository: quay.io/minio/minio - tag: RELEASE.2024-02-09T21-25-16Z + tag: RELEASE.2024-10-02T17-50-41Z pullPolicy: IfNotPresent ### # @@ -87,6 +48,44 @@ tenant: configuration: name: myminio-env-configuration ### + # Root key for dynamically creating a secret for use with configuring root MinIO User + # Specify the ``name`` and then a list of environment variables. + # + # .. important:: + # + # Do not use this in production environments. + # This field is intended for use with rapid development or testing only. 
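  # (For anything beyond local testing, point ``tenant.configuration.name`` at a
  # pre-created Kubernetes secret instead, as described under ``existingSecret`` below.)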
+ # + # For example: + # + # .. code-block:: yaml + # + # name: myminio-env-configuration + # accessKey: minio + # secretKey: minio123 + # + configSecret: + name: myminio-env-configuration + accessKey: minio + secretKey: minio123 + #existingSecret: true + + ### + # If this variable is set to true, then enable the usage of an existing Kubernetes secret to set environment variables for the Tenant. + # The existing Kubernetes secret name must be placed under .tenant.configuration.name e.g. existing-minio-env-configuration + # The secret must contain a key ``config.env``. + # The values should be a series of export statements to set environment variables for the Tenant. + # For example: + # + # .. code-block:: shell + # + # stringData: + # config.env: |- + # export MINIO_ROOT_USER=ROOTUSERNAME + # export MINIO_ROOT_PASSWORD=ROOTUSERPASSWORD + # + # existingSecret: false + ### # Top level key for configuring MinIO Pool(s) in this Tenant. # # See `Operator CRD: Pools `__ for more information on all subfields. @@ -104,7 +103,7 @@ tenant: volumesPerServer: 4 ### # The capacity per volume requested per MinIO Tenant Pod. - size: 1Gi + size: 50Gi ### # The `storageClass `__ to associate with volumes generated for this pool. # @@ -166,6 +165,12 @@ tenant: runAsUser: 1000 runAsGroup: 1000 runAsNonRoot: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + seccompProfile: + type: RuntimeDefault ### # # An array of `Topology Spread Constraints `__ to associate to Operator Console pods. @@ -225,6 +230,10 @@ tenant: # Enable automatic Kubernetes based `certificate generation and signing `__ requestAutoCert: true ### + # The minimum number of days to expiry before an alert for an expiring certificate is fired. + # In the below example, if a given certificate will expire in 7 days then expiration events will only be triggered 1 day before expiry + # certExpiryAlertThreshold: 1 + ### # This field is used only when ``requestAutoCert: true``. # Use this field to set CommonName for the auto-generated certificate. # MinIO defaults to using the internal Kubernetes DNS name for the pod @@ -248,7 +257,7 @@ tenant: # - name: my-minio-bucket # objectLock: false # optional # region: us-east-1 # optional - buckets: [ ] + buckets: [ "workflow-logs", "binaries", "graph"] ### # Array of Kubernetes secrets from which the Operator generates MinIO users during tenant provisioning. # @@ -271,6 +280,9 @@ tenant: # Refer startup: { } ### + # The `Lifecycle hooks `__ for container. + lifecycle: { } + ### # Directs the Operator to deploy the MinIO S3 API and Console services as LoadBalancer objects. # # If the Kubernetes cluster has a configured LoadBalancer, it can attempt to route traffic to those services automatically. 
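  # In this local setup the S3 API and Console are instead exposed through the
  # nginx ingress definitions at the bottom of this file.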
@@ -337,14 +349,14 @@ tenant: # # Image from tag (original behavior), for example: # # image: # # repository: quay.io/minio/kes - # # tag: 2024-01-11T13-09-29Z + # # tag: 2024-09-11T07-22-50Z # # Image from digest (added after original behavior), for example: # # image: # # repository: quay.io/minio/kes@sha256 # # digest: fb15af611149892f357a8a99d1bcd8bf5dae713bd64c15e6eb27fbdb88fc208b # image: # repository: quay.io/minio/kes - # tag: 2024-01-11T13-09-29Z + # tag: 2024-09-11T07-22-50Z # pullPolicy: IfNotPresent # env: [ ] # replicas: 2 @@ -417,6 +429,17 @@ tenant: # runAsGroup: 1000 # runAsNonRoot: true # fsGroup: 1000 + # containerSecurityContext: + # runAsUser: 1000 + # runAsGroup: 1000 + # runAsNonRoot: true + # allowPrivilegeEscalation: false + # capabilities: + # drop: + # - ALL + # seccompProfile: + # type: RuntimeDefault + ### # Configures `Ingress `__ for the Tenant S3 API and Console. # @@ -428,7 +451,7 @@ ingress: labels: { } annotations: nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" - nginx.ingress.kubernetes.io/proxy-body-size: 100m + nginx.ingress.kubernetes.io/proxy-body-size: 10000m tls: [ ] host: minio.local path: / @@ -439,6 +462,7 @@ ingress: labels: { } annotations: nginx.ingress.kubernetes.io/backend-protocol: "HTTPS" + nginx.ingress.kubernetes.io/proxy-body-size: 10000m tls: [ ] host: minio-console.local path: / @@ -451,7 +475,7 @@ ingress: # kind: Secret # type: Opaque # metadata: -# name: {{ dig "secrets" "existingSecret" "" (.Values | merge (dict)) }} +# name: {{ dig "tenant" "configSecret" "name" "" (.Values | merge (dict)) }} # stringData: # config.env: |- # export MINIO_ROOT_USER='minio' diff --git a/modules/minio/minio-operator.tf b/modules/minio/minio-operator.tf index 16ebae0..ba5a720 100644 --- a/modules/minio/minio-operator.tf +++ b/modules/minio/minio-operator.tf @@ -5,5 +5,5 @@ resource "helm_release" "minio_operator" { create_namespace = "true" namespace = "minio-operator" dependency_update = "true" - version = "5.0.12" + version = "6.0.4" } \ No newline at end of file diff --git a/modules/minio/minio-tenant.tf b/modules/minio/minio-tenant.tf index ff70913..9625d2e 100644 --- a/modules/minio/minio-tenant.tf +++ b/modules/minio/minio-tenant.tf @@ -6,7 +6,7 @@ resource "helm_release" "minio_tenant" { create_namespace = "true" namespace = "${var.namespace_prefix}minio-tenant" dependency_update = "true" - version = "5.0.12" + version = "6.0.4" values = [ file("./envs/${var.env}/minio-tenant.yaml") @@ -21,40 +21,4 @@ resource "helm_release" "minio_tenant" { name = "ingress.console.host" value = "console-minio.${var.domain}" } -} - -/* -resource "kubernetes_manifest" "minio_ingress" { - manifest = yamldecode(< Date: Wed, 16 Oct 2024 10:45:48 +0200 Subject: [PATCH 02/42] fixed bucket creation updated tenant yaml --- envs/local/minio-tenant.yaml | 8 +++++++- modules/minio/minio-bucket.yaml | 34 --------------------------------- 2 files changed, 7 insertions(+), 35 deletions(-) delete mode 100644 modules/minio/minio-bucket.yaml diff --git a/envs/local/minio-tenant.yaml b/envs/local/minio-tenant.yaml index 0ec38fa..ed643b5 100644 --- a/envs/local/minio-tenant.yaml +++ b/envs/local/minio-tenant.yaml @@ -257,7 +257,13 @@ tenant: # - name: my-minio-bucket # objectLock: false # optional # region: us-east-1 # optional - buckets: [ "workflow-logs", "binaries", "graph"] + buckets: + - name: workflow-logs + - name: binaries + - name: graph + + + # [ "workflow-logs", "binaries", "graph"] ### # Array of Kubernetes secrets from which the Operator generates MinIO users 
during tenant provisioning. # diff --git a/modules/minio/minio-bucket.yaml b/modules/minio/minio-bucket.yaml deleted file mode 100644 index e476cba..0000000 --- a/modules/minio/minio-bucket.yaml +++ /dev/null @@ -1,34 +0,0 @@ -apiVersion: batch/v1 -kind: Job -metadata: - name: create-bucket - namespace: block-storage -spec: - template: - spec: - containers: - - name: createbucket - image: amazon/aws-cli - command: ["aws"] - args: - - s3api - - create-bucket - - --bucket - - postgres - - --endpoint-url - - http://minio:80 - env: - - name: AWS_ACCESS_KEY_ID - valueFrom: - secretKeyRef: - name: minio-secret - key: accesskey - - - name: AWS_SECRET_ACCESS_KEY - valueFrom: - secretKeyRef: - name: minio-secret - key: secretkey - - restartPolicy: Never - backoffLimit: 1 -- 2.17.1 From 35c44845d2317a87c2e775beae5b7e2557a435e5 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 12:07:14 +0200 Subject: [PATCH 03/42] added creation of bucket using variables --- envs/local/minio-tenant.yaml | 8 ++------ main.tf | 27 ++++++++++++++------------- modules/minio/minio-tenant.tf | 22 ++++++++++++++++++++++ modules/minio/variables.tf | 6 ++++++ variables.tf | 10 ++++++++++ 5 files changed, 54 insertions(+), 19 deletions(-) diff --git a/envs/local/minio-tenant.yaml b/envs/local/minio-tenant.yaml index ed643b5..fd64ea7 100644 --- a/envs/local/minio-tenant.yaml +++ b/envs/local/minio-tenant.yaml @@ -257,13 +257,9 @@ tenant: # - name: my-minio-bucket # objectLock: false # optional # region: us-east-1 # optional - buckets: - - name: workflow-logs - - name: binaries - - name: graph + buckets: [ ] + - - # [ "workflow-logs", "binaries", "graph"] ### # Array of Kubernetes secrets from which the Operator generates MinIO users during tenant provisioning. # diff --git a/main.tf b/main.tf index 6842df4..4c63dac 100644 --- a/main.tf +++ b/main.tf @@ -2,20 +2,21 @@ module "minio" { source = "./modules/minio" kube_context = var.kube_context namespace_prefix=var.namespace_prefix + buckets = var.minio_buckets } -module "airflow" { - source = "./modules/airflow" - kube_context = var.kube_context - admin_user = var.admin_user - admin_password = var.admin_password - namespace_prefix= var.namespace_prefix - admin_hash = var.admin_hash - env = var.env - domain = var.domain - s3_endpoint = var.s3_endpoint - s3_key = var.s3_key - s3_secret = var.s3_secret +# module "airflow" { +# source = "./modules/airflow" +# kube_context = var.kube_context +# admin_user = var.admin_user +# admin_password = var.admin_password +# namespace_prefix= var.namespace_prefix +# admin_hash = var.admin_hash +# env = var.env +# domain = var.domain +# s3_endpoint = var.s3_endpoint +# s3_key = var.s3_key +# s3_secret = var.s3_secret -} +# } diff --git a/modules/minio/minio-tenant.tf b/modules/minio/minio-tenant.tf index 9625d2e..e1b2211 100644 --- a/modules/minio/minio-tenant.tf +++ b/modules/minio/minio-tenant.tf @@ -21,4 +21,26 @@ resource "helm_release" "minio_tenant" { name = "ingress.console.host" value = "console-minio.${var.domain}" } + + dynamic "set" { + for_each = var.buckets + content { + name = "tenant.buckets[${set.key}].name" + value = set.value.name + } + } + + + # set { + # name = "tenant.buckets[0].name" + # value = "workflow-logs" + # } + + # set { + # name = "tenant.buckets[1].name" + # value = "binaries" + # } + + + # ,"binaries","graph","pippo"] } \ No newline at end of file diff --git a/modules/minio/variables.tf b/modules/minio/variables.tf index 100ac16..947eba5 100644 --- a/modules/minio/variables.tf +++ 
b/modules/minio/variables.tf @@ -22,3 +22,9 @@ variable "domain" { type = string default = "local-dataplatform" } + +variable "buckets" { + type = list(map(string)) + default = [ ] +} + diff --git a/variables.tf b/variables.tf index 2d8e276..207e4df 100644 --- a/variables.tf +++ b/variables.tf @@ -44,3 +44,13 @@ variable "s3_key" { variable "s3_secret" { default = "minio123" } + +variable "minio_buckets" { + type = list(map(string)) + default = [ + { name = "workflow-logs" }, + { name = "binaries" }, + { name = "graph" }, + ] +} + -- 2.17.1 From ed3422673fce936589aa673631857534c4ea39ce Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 13:35:49 +0200 Subject: [PATCH 04/42] added git variable for airflow module --- envs/local/airbyte.yaml | 13 ----------- main.tf | 27 +++++++++++----------- modules/airflow/airflow.tf | 43 ++++++++++++++++++++++++------------ modules/airflow/variables.tf | 15 +++++++++++++ variables.tf | 4 ++++ 5 files changed, 62 insertions(+), 40 deletions(-) delete mode 100644 envs/local/airbyte.yaml diff --git a/envs/local/airbyte.yaml b/envs/local/airbyte.yaml deleted file mode 100644 index aa79aa4..0000000 --- a/envs/local/airbyte.yaml +++ /dev/null @@ -1,13 +0,0 @@ - -webapp: - ingress: - enabled: true - className: "nginx" - annotations: - kubernetes.io/ingress.class: nginx - hosts: - - host: localhost - paths: - - path: / - pathType: ImplementationSpecific - tls: [] \ No newline at end of file diff --git a/main.tf b/main.tf index 4c63dac..20a2ffd 100644 --- a/main.tf +++ b/main.tf @@ -6,17 +6,18 @@ module "minio" { } -# module "airflow" { -# source = "./modules/airflow" -# kube_context = var.kube_context -# admin_user = var.admin_user -# admin_password = var.admin_password -# namespace_prefix= var.namespace_prefix -# admin_hash = var.admin_hash -# env = var.env -# domain = var.domain -# s3_endpoint = var.s3_endpoint -# s3_key = var.s3_key -# s3_secret = var.s3_secret +module "airflow" { + source = "./modules/airflow" + kube_context = var.kube_context + admin_user = var.admin_user + admin_password = var.admin_password + namespace_prefix= var.namespace_prefix + admin_hash = var.admin_hash + env = var.env + domain = var.domain + s3_endpoint = var.s3_endpoint + s3_key = var.s3_key + s3_secret = var.s3_secret + branch_name= var.dag_branch_name -# } +} diff --git a/modules/airflow/airflow.tf b/modules/airflow/airflow.tf index fada7c8..b5000d4 100644 --- a/modules/airflow/airflow.tf +++ b/modules/airflow/airflow.tf @@ -104,17 +104,17 @@ resource "helm_release" "gcp_spark_operator" { create_namespace = "true" namespace = "${var.namespace_prefix}gcp-spark-operator" dependency_update = "true" - version = "1.2.7" + version = "2.0.2" - set { - name = "image.repository" - value = "kubeflow/spark-operator" - } + # set { + # name = "image.repository" + # value = "kubeflow/spark-operator" + # } - set { - name = "image.tag" - value = "v1beta2-1.4.5-3.5.0" - } + # set { + # name = "image.tag" + # value = "v1beta2-1.4.5-3.5.0" + # } set { name = "sparkJobNamespaces" @@ -180,7 +180,7 @@ resource "helm_release" "airflow" { repository = "https://airflow.apache.org" namespace = "${var.namespace_prefix}airflow" dependency_update = "true" - version = "1.13.0" + version = "1.15.0" values = [ file("./envs/${var.env}/airflow.yaml") @@ -211,15 +211,30 @@ resource "helm_release" "airflow" { } set { - name = "images.airflow.repository" - value = "gbloisi/airflow" + name ="dags.gitSync.repo" + value = var.repo_url } set { - name = "images.airflow.tag" - value = 
"2.8.3rc1-python3.11" + name ="dags.gitSync.branch" + value = var.branch_name } + set { + name ="dags.gitSync.subPath" + value = var.dag_path + } + + # set { + # name = "images.airflow.repository" + # value = "gbloisi/airflow" + # } + + # set { + # name = "images.airflow.tag" + # value = "2.8.3rc1-python3.11" + # } + set { name = "ingress.web.host" value = "airflow.${var.domain}" diff --git a/modules/airflow/variables.tf b/modules/airflow/variables.tf index 8c93472..6b4b7ae 100644 --- a/modules/airflow/variables.tf +++ b/modules/airflow/variables.tf @@ -49,3 +49,18 @@ variable "admin_password" { variable "admin_hash" { type = string } + +variable "repo_url" { + type = string + default = "https://code-repo.d4science.org/D-Net/code-infrastructure-lab.git" +} + +variable "branch_name" { + type = string + default = "master" +} + +variable "dag_path" { + type = string + default = "workflow/dnet" +} diff --git a/variables.tf b/variables.tf index 207e4df..f624ad3 100644 --- a/variables.tf +++ b/variables.tf @@ -54,3 +54,7 @@ variable "minio_buckets" { ] } +variable "dag_branch_name" { + default = "master" +} + -- 2.17.1 From 07ce192207888f38b63f168bd8e32f6512c00a97 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 13:38:26 +0200 Subject: [PATCH 05/42] added workflow test --- workflow/dnet/run_spark.py | 87 ++++++++++++++++++++ workflow/dnet/spark_configurator.py | 119 ++++++++++++++++++++++++++++ 2 files changed, 206 insertions(+) create mode 100644 workflow/dnet/run_spark.py create mode 100644 workflow/dnet/spark_configurator.py diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py new file mode 100644 index 0000000..970fdac --- /dev/null +++ b/workflow/dnet/run_spark.py @@ -0,0 +1,87 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. +In this example, we create two tasks which execute sequentially. +The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). +and the second task is to check the final state of the sparkApplication that submitted in the first state. + +Spark-on-k8s operator is required to be already installed on Kubernetes +https://github.com/GoogleCloudPlatform/spark-on-k8s-operator +""" + +from os import path +from datetime import timedelta, datetime +from spark_configurator import SparkConfigurator + +# [START import_module] +# The DAG object; we'll need this to instantiate a DAG +from airflow import DAG +# Operators; we need this to operate! 
+from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator +from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor +from airflow.utils.dates import days_ago + + +# [END import_module] + +# [START default_args] +# These args will get passed on to each operator +# You can override them on a per-task basis during operator initialization +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': days_ago(1), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'max_active_runs': 1, + 'retries': 3 +} + +spec =SparkConfigurator( + name="spark-scholix-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump", + jarLocation = 's3a://deps/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments =[ "--sourcePath", "s3a://raw-graph/01", "--targetPath", "s3a://scholix"],\ + executor_cores=10, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration() + +dag = DAG( + 'spark_scholix', + default_args=default_args, + schedule_interval=None, + tags=['example', 'spark'] +) + +submit = SparkKubernetesOperator( + task_id='spark-scholix', + namespace='dnet-spark-jobs', + template_spec=spec, + kubernetes_conn_id="kubernetes_default", + # do_xcom_push=True, + # delete_on_termination=True, + # base_container_name="spark-kubernetes-driver", + dag=dag +) + + + +submit \ No newline at end of file diff --git a/workflow/dnet/spark_configurator.py b/workflow/dnet/spark_configurator.py new file mode 100644 index 0000000..b943ca1 --- /dev/null +++ b/workflow/dnet/spark_configurator.py @@ -0,0 +1,119 @@ +class SparkConfigurator: + def __init__(self, + name, + mainClass, + jarLocation:str, + arguments, + apiVersion=None, + namespace="dnet-spark-jobs", + image= "dnet-spark:1.0.0", + driver_cores=1, + driver_memory='1G', + executor_cores=1, + executor_memory="1G", + executor_memoryOverhead= "1G", + executor_instances=1 + ) -> None: + if apiVersion: + self.apiVersion = apiVersion + else: + self.apiVersion = "sparkoperator.k8s.io/v1beta2" + self.namespace= namespace + self.name = name + self.image= image + self.mainClass = mainClass + self.jarLocation = jarLocation + self.arguments= arguments + self.s3Configuration = { + "spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "spark.hadoop.fs.defaultFS": "s3a://spark", + "spark.hadoop.fs.s3a.access.key": "minio", + "spark.hadoop.fs.s3a.secret.key": "minio123", + "spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local", + "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem", + "spark.hadoop.fs.s3a.path.style.access": "true", + "spark.hadoop.fs.s3a.attempts.maximum": "1", + "spark.hadoop.fs.s3a.connection.establish.timeout": "5000", + "spark.hadoop.fs.s3a.connection.timeout": "10001", + "spark.hadoop.fs.s3a.connection.ssl.enabled": "false", + "com.amazonaws.sdk.disableCertChecking": "true", + "com.cloudera.com.amazonaws.sdk.disableCertChecking": "true", + "fs.s3a.connection.ssl.strictverify": "false", + "fs.s3a.connection.ssl.enabled": "false", + "fs.s3a.ssl.enabled": "false", + "spark.hadoop.fs.s3a.ssl.enabled": "false" + } + 
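        # The fs.s3a.* settings above point Spark at the in-cluster MinIO tenant with
        # path-style access and TLS verification disabled; the hard-coded minio/minio123
        # credentials match the local tenant secret and would have to come from a
        # Kubernetes secret in any non-local deployment.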
self.sparkResoruceConf= { + 'driver_cores':driver_cores, + 'driver_memory':driver_memory, + 'executor_cores':executor_cores, + 'executor_memory':executor_memory, + 'executor_instances':executor_instances, + 'memoryOverhead':executor_memoryOverhead + + } + + def get_configuration(self) -> dict: + return { + "apiVersion": self.apiVersion, + "kind": "SparkApplication", + "metadata": { + "name": self.name, + "namespace": self.namespace + }, + "spec": { + "type": "Scala", + "mode": "cluster", + "image":self.image, + "imagePullPolicy": "IfNotPresent", + "mainClass": self.mainClass, + "mainApplicationFile": self.jarLocation, + "arguments": self.arguments, + "sparkVersion": "3.5.1", + "sparkConf": self.s3Configuration, + "restartPolicy": { + "type": "Never" + }, + "volumes": [ + { + "name": "test-volume", + "persistentVolumeClaim": { + "claimName": "my-spark-pvc-tmp" + } + } + ], + "driver": { + "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "cores": self.sparkResoruceConf['driver_cores'], + "coreLimit": "1200m", + "memory": self.sparkResoruceConf['driver_memory'], + "labels": { + "version": "3.5.1" + }, + "serviceAccount": "spark", + "volumeMounts": [ + { + "name": "test-volume", + "mountPath": "/tmp" + } + ] + }, + "executor": { + "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "cores": self.sparkResoruceConf['executor_cores'], + "memoryOverhead": self.sparkResoruceConf['memoryOverhead'], + "memory": self.sparkResoruceConf['executor_memory'], + "instances": self.sparkResoruceConf['executor_instances'], + "labels": { + "version": "3.5.1" + }, + "volumeMounts": [ + { + "name": "test-volume", + "mountPath": "/tmp" + } + ] + } + } + } \ No newline at end of file -- 2.17.1 From b8bf21f8e52a0cdb88f1d037958b44bd095f3b52 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 13:51:49 +0200 Subject: [PATCH 06/42] fixed import --- workflow/dnet/run_spark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py index 970fdac..482df67 100644 --- a/workflow/dnet/run_spark.py +++ b/workflow/dnet/run_spark.py @@ -27,7 +27,7 @@ https://github.com/GoogleCloudPlatform/spark-on-k8s-operator from os import path from datetime import timedelta, datetime -from spark_configurator import SparkConfigurator +from .spark_configurator import SparkConfigurator # [START import_module] # The DAG object; we'll need this to instantiate a DAG -- 2.17.1 From 6b555b8f6eae49d46ca358e196673f3340a03333 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 13:56:36 +0200 Subject: [PATCH 07/42] added workflow test --- workflow/dnet/run_spark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py index 482df67..87c54bd 100644 --- a/workflow/dnet/run_spark.py +++ b/workflow/dnet/run_spark.py @@ -27,7 +27,7 @@ https://github.com/GoogleCloudPlatform/spark-on-k8s-operator from os import path from datetime import timedelta, datetime -from .spark_configurator import SparkConfigurator +from workflow.dnet.spark_configurator import SparkConfigurator # [START import_module] # The DAG object; we'll need this to instantiate a DAG -- 2.17.1 From dcd2efd3b460258cbca088bd3f9c174021db9f30 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 13:56:50 +0200 Subject: [PATCH 08/42] added workflow test --- 
workflow/__init__.py | 0 workflow/dnet/__init__.py | 0 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 workflow/__init__.py create mode 100644 workflow/dnet/__init__.py diff --git a/workflow/__init__.py b/workflow/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/workflow/dnet/__init__.py b/workflow/dnet/__init__.py new file mode 100644 index 0000000..e69de29 -- 2.17.1 From d1afcd4395efa05b771d05225f13dc0f01929c84 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 14:08:00 +0200 Subject: [PATCH 09/42] fixed import --- main.tf | 3 +- variables.tf | 4 ++ workflow/dnet/run_spark.py | 124 ++++++++++++++++++++++++++++++++++++- 3 files changed, 129 insertions(+), 2 deletions(-) diff --git a/main.tf b/main.tf index 20a2ffd..4456f38 100644 --- a/main.tf +++ b/main.tf @@ -18,6 +18,7 @@ module "airflow" { s3_endpoint = var.s3_endpoint s3_key = var.s3_key s3_secret = var.s3_secret - branch_name= var.dag_branch_name + branch_name = var.dag_branch_name + dag_path= var.dag_path_name } diff --git a/variables.tf b/variables.tf index f624ad3..8390108 100644 --- a/variables.tf +++ b/variables.tf @@ -58,3 +58,7 @@ variable "dag_branch_name" { default = "master" } +variable "dag_path_name" { + default = "workflow/dnet" +} + diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py index 87c54bd..3a2530a 100644 --- a/workflow/dnet/run_spark.py +++ b/workflow/dnet/run_spark.py @@ -27,7 +27,7 @@ https://github.com/GoogleCloudPlatform/spark-on-k8s-operator from os import path from datetime import timedelta, datetime -from workflow.dnet.spark_configurator import SparkConfigurator +# from workflow.dnet.spark_configurator import SparkConfigurator # [START import_module] # The DAG object; we'll need this to instantiate a DAG @@ -40,6 +40,128 @@ from airflow.utils.dates import days_ago # [END import_module] + +class SparkConfigurator: + def __init__(self, + name, + mainClass, + jarLocation:str, + arguments, + apiVersion=None, + namespace="dnet-spark-jobs", + image= "dnet-spark:1.0.0", + driver_cores=1, + driver_memory='1G', + executor_cores=1, + executor_memory="1G", + executor_memoryOverhead= "1G", + executor_instances=1 + ) -> None: + if apiVersion: + self.apiVersion = apiVersion + else: + self.apiVersion = "sparkoperator.k8s.io/v1beta2" + self.namespace= namespace + self.name = name + self.image= image + self.mainClass = mainClass + self.jarLocation = jarLocation + self.arguments= arguments + self.s3Configuration = { + "spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "spark.hadoop.fs.defaultFS": "s3a://spark", + "spark.hadoop.fs.s3a.access.key": "minio", + "spark.hadoop.fs.s3a.secret.key": "minio123", + "spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local", + "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem", + "spark.hadoop.fs.s3a.path.style.access": "true", + "spark.hadoop.fs.s3a.attempts.maximum": "1", + "spark.hadoop.fs.s3a.connection.establish.timeout": "5000", + "spark.hadoop.fs.s3a.connection.timeout": "10001", + "spark.hadoop.fs.s3a.connection.ssl.enabled": "false", + "com.amazonaws.sdk.disableCertChecking": "true", + "com.cloudera.com.amazonaws.sdk.disableCertChecking": "true", + "fs.s3a.connection.ssl.strictverify": 
"false", + "fs.s3a.connection.ssl.enabled": "false", + "fs.s3a.ssl.enabled": "false", + "spark.hadoop.fs.s3a.ssl.enabled": "false" + } + self.sparkResoruceConf= { + 'driver_cores':driver_cores, + 'driver_memory':driver_memory, + 'executor_cores':executor_cores, + 'executor_memory':executor_memory, + 'executor_instances':executor_instances, + 'memoryOverhead':executor_memoryOverhead + + } + + def get_configuration(self) -> dict: + return { + "apiVersion": self.apiVersion, + "kind": "SparkApplication", + "metadata": { + "name": self.name, + "namespace": self.namespace + }, + "spec": { + "type": "Scala", + "mode": "cluster", + "image":self.image, + "imagePullPolicy": "IfNotPresent", + "mainClass": self.mainClass, + "mainApplicationFile": self.jarLocation, + "arguments": self.arguments, + "sparkVersion": "3.5.1", + "sparkConf": self.s3Configuration, + "restartPolicy": { + "type": "Never" + }, + "volumes": [ + { + "name": "test-volume", + "persistentVolumeClaim": { + "claimName": "my-spark-pvc-tmp" + } + } + ], + "driver": { + "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "cores": self.sparkResoruceConf['driver_cores'], + "coreLimit": "1200m", + "memory": self.sparkResoruceConf['driver_memory'], + "labels": { + "version": "3.5.1" + }, + "serviceAccount": "spark", + "volumeMounts": [ + { + "name": "test-volume", + "mountPath": "/tmp" + } + ] + }, + "executor": { + "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", + "cores": self.sparkResoruceConf['executor_cores'], + "memoryOverhead": self.sparkResoruceConf['memoryOverhead'], + "memory": self.sparkResoruceConf['executor_memory'], + "instances": self.sparkResoruceConf['executor_instances'], + "labels": { + "version": "3.5.1" + }, + "volumeMounts": [ + { + "name": "test-volume", + "mountPath": "/tmp" + } + ] + } + } + } + + # [START default_args] # These args will get passed on to each operator # You can override them on a per-task basis during operator initialization -- 2.17.1 From df6e23666e3d09e60e83d82d49b2d4cf03101e4e Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 16 Oct 2024 16:35:01 +0200 Subject: [PATCH 10/42] fix --- workflow/dags/hello.py | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 workflow/dags/hello.py diff --git a/workflow/dags/hello.py b/workflow/dags/hello.py new file mode 100644 index 0000000..a6ef2ae --- /dev/null +++ b/workflow/dags/hello.py @@ -0,0 +1,11 @@ +import datetime + +from airflow import DAG +from airflow.operators.empty import EmptyOperator + +with DAG( + dag_id="my_dag_name", + start_date=datetime.datetime(2021, 1, 1), + schedule="@daily", + ): + EmptyOperator(task_id="task" ) \ No newline at end of file -- 2.17.1 From 412e008df7655d84b00753c09dede29e1ae36f3f Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 18 Oct 2024 16:42:54 +0200 Subject: [PATCH 11/42] Add untar task --- workflow/dnet/S3_untar.py | 97 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 workflow/dnet/S3_untar.py diff --git a/workflow/dnet/S3_untar.py b/workflow/dnet/S3_untar.py new file mode 100644 index 0000000..863b3aa --- /dev/null +++ b/workflow/dnet/S3_untar.py @@ -0,0 +1,97 @@ +import os +import tarfile +import time +from datetime import timedelta + +import pendulum +from airflow.decorators import dag +from airflow.decorators import task +from airflow.models.param import Param +from airflow.operators.python 
import get_current_context +from airflow.providers.amazon.aws.hooks.s3 import S3Hook +from botocore.exceptions import ClientError + +import dag_utils + +EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) + +default_args = { + "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), + "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), + "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))), +} + + +def load_file_obj_with_backoff(hook: S3Hook, fileobj, key: str, bucket: str, replace: bool) -> bool: + delay = 10 # initial delay + delay_incr = 10 # additional delay in each loop + max_delay = 60 # max delay of one loop. Total delay is (max_delay**2)/2 + + while delay < max_delay: + try: + return hook.load_file_obj(fileobj, + key, + bucket, + replace=replace) + except ClientError as err: + code = err.response.get('Error', {}).get('Code', '') + if code in ['NoSuchBucket']: + print(f"Error: {code}. Check s3path: s3://{bucket}/{key}") + raise err + time.sleep(delay) + delay += delay_incr + + +@dag( + dag_id="s3_untar", + dag_display_name="S3 streaming untar", + start_date=pendulum.datetime(2021, 1, 1, tz="UTC"), + schedule=None, + catchup=False, + default_args=default_args, + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), + "src_bucket": Param(dag_utils.get_default_bucket(), type='string', + description="Override S3 default bucket for source"), + "src_key": Param("", type='string', description="File to untar"), + "dst_bucket": Param(dag_utils.get_default_bucket(), type='string', + description="Override S3 default bucket for destination"), + "dst_key_prefix": Param("", type='string', description="Key prefix for unarchived files"), + }, + tags=["s3", "tools"], +) +def s3_untar(): + @task + def untar(): + context = get_current_context() + hook = S3Hook(context["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False}) + src_bucket = context['params']['src_bucket'] + dst_bucket = context['params']['dst_bucket'] + dst_key_prefix = os.path.normpath(context["params"]["dst_key_prefix"]) + + existing_keys = dict.fromkeys(hook.list_keys(bucket_name=dst_bucket, + prefix=dst_key_prefix + "/"), 0) + s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=src_bucket) + + with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar: + for member in tar: + dst_key = os.path.normpath(dst_key_prefix + "/" + member.name) + # Ignore directories, links, devices, fifos, etc. 
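                # Because the tar is opened in streaming mode ('r|*'), members are read
                # strictly in archive order; keys already present under dst_key_prefix are
                # skipped below, so a retried run does not re-upload files it already extracted.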
+ if (not member.isfile()) or member.name.endswith('/'): + print(f"Skipping {member.name}: is not a file") + continue + if dst_key in existing_keys: + print(f"Skipping {member.name}: already exists") + continue + print(f"Extracting {member.name} to {dst_key}") + fileobj = tar.extractfile(member) + fileobj.seekable = lambda: False + load_file_obj_with_backoff(hook, fileobj, + dst_key, + dst_bucket, + replace=True) + + untar() + + +s3_untar() -- 2.17.1 From c3ba29e4c54426c7400f1e9ca5323dd5f2c48cae Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 18 Oct 2024 16:53:14 +0200 Subject: [PATCH 12/42] Add dagutils --- workflow/dnet/dag_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 workflow/dnet/dag_utils.py diff --git a/workflow/dnet/dag_utils.py b/workflow/dnet/dag_utils.py new file mode 100644 index 0000000..53a75aa --- /dev/null +++ b/workflow/dnet/dag_utils.py @@ -0,0 +1,16 @@ +from airflow.hooks.base import BaseHook +from airflow.providers.amazon.aws.hooks.s3 import S3Hook + +def get_bucket_name(context: dict, hook: S3Hook, param_name: str): + bucket_name = context["params"][param_name] + if not bucket_name: + bucket_name = hook.extra_args['bucket_name'] + return bucket_name + + +def get_default_bucket(): + hook = S3Hook("s3_conn", transfer_config_args={'use_threads': False}) + try: + return hook.service_config['bucket_name'] + except KeyError: + return '' -- 2.17.1 From 0fcabed2ae9a5878ee5ab4584c7729ab5d0bc44d Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 18 Oct 2024 16:58:42 +0200 Subject: [PATCH 13/42] change dag name --- workflow/dnet/run_spark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py index 3a2530a..0f18bf4 100644 --- a/workflow/dnet/run_spark.py +++ b/workflow/dnet/run_spark.py @@ -187,7 +187,7 @@ spec =SparkConfigurator( executor_memoryOverhead="3G").get_configuration() dag = DAG( - 'spark_scholix', + 'spark_run_test', default_args=default_args, schedule_interval=None, tags=['example', 'spark'] -- 2.17.1 From 8da265f0188af5e779d771e4c210a3c14a77ff24 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 18 Oct 2024 17:00:51 +0200 Subject: [PATCH 14/42] add utils in the parent folder --- workflow/dag_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 workflow/dag_utils.py diff --git a/workflow/dag_utils.py b/workflow/dag_utils.py new file mode 100644 index 0000000..53a75aa --- /dev/null +++ b/workflow/dag_utils.py @@ -0,0 +1,16 @@ +from airflow.hooks.base import BaseHook +from airflow.providers.amazon.aws.hooks.s3 import S3Hook + +def get_bucket_name(context: dict, hook: S3Hook, param_name: str): + bucket_name = context["params"][param_name] + if not bucket_name: + bucket_name = hook.extra_args['bucket_name'] + return bucket_name + + +def get_default_bucket(): + hook = S3Hook("s3_conn", transfer_config_args={'use_threads': False}) + try: + return hook.service_config['bucket_name'] + except KeyError: + return '' -- 2.17.1 From bf7c9e2dce7653b1c4ea0935b451e71025790410 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 18 Oct 2024 17:16:37 +0200 Subject: [PATCH 15/42] revert some changes --- workflow/dnet/run_spark.py | 124 +------------------------------------ 1 file changed, 1 insertion(+), 123 deletions(-) diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py index 0f18bf4..09a431b 100644 --- a/workflow/dnet/run_spark.py +++ b/workflow/dnet/run_spark.py @@ -27,7 +27,7 @@ 
https://github.com/GoogleCloudPlatform/spark-on-k8s-operator from os import path from datetime import timedelta, datetime -# from workflow.dnet.spark_configurator import SparkConfigurator +from spark_configurator import SparkConfigurator # [START import_module] # The DAG object; we'll need this to instantiate a DAG @@ -40,128 +40,6 @@ from airflow.utils.dates import days_ago # [END import_module] - -class SparkConfigurator: - def __init__(self, - name, - mainClass, - jarLocation:str, - arguments, - apiVersion=None, - namespace="dnet-spark-jobs", - image= "dnet-spark:1.0.0", - driver_cores=1, - driver_memory='1G', - executor_cores=1, - executor_memory="1G", - executor_memoryOverhead= "1G", - executor_instances=1 - ) -> None: - if apiVersion: - self.apiVersion = apiVersion - else: - self.apiVersion = "sparkoperator.k8s.io/v1beta2" - self.namespace= namespace - self.name = name - self.image= image - self.mainClass = mainClass - self.jarLocation = jarLocation - self.arguments= arguments - self.s3Configuration = { - "spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", - "spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", - "spark.hadoop.fs.defaultFS": "s3a://spark", - "spark.hadoop.fs.s3a.access.key": "minio", - "spark.hadoop.fs.s3a.secret.key": "minio123", - "spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local", - "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem", - "spark.hadoop.fs.s3a.path.style.access": "true", - "spark.hadoop.fs.s3a.attempts.maximum": "1", - "spark.hadoop.fs.s3a.connection.establish.timeout": "5000", - "spark.hadoop.fs.s3a.connection.timeout": "10001", - "spark.hadoop.fs.s3a.connection.ssl.enabled": "false", - "com.amazonaws.sdk.disableCertChecking": "true", - "com.cloudera.com.amazonaws.sdk.disableCertChecking": "true", - "fs.s3a.connection.ssl.strictverify": "false", - "fs.s3a.connection.ssl.enabled": "false", - "fs.s3a.ssl.enabled": "false", - "spark.hadoop.fs.s3a.ssl.enabled": "false" - } - self.sparkResoruceConf= { - 'driver_cores':driver_cores, - 'driver_memory':driver_memory, - 'executor_cores':executor_cores, - 'executor_memory':executor_memory, - 'executor_instances':executor_instances, - 'memoryOverhead':executor_memoryOverhead - - } - - def get_configuration(self) -> dict: - return { - "apiVersion": self.apiVersion, - "kind": "SparkApplication", - "metadata": { - "name": self.name, - "namespace": self.namespace - }, - "spec": { - "type": "Scala", - "mode": "cluster", - "image":self.image, - "imagePullPolicy": "IfNotPresent", - "mainClass": self.mainClass, - "mainApplicationFile": self.jarLocation, - "arguments": self.arguments, - "sparkVersion": "3.5.1", - "sparkConf": self.s3Configuration, - "restartPolicy": { - "type": "Never" - }, - "volumes": [ - { - "name": "test-volume", - "persistentVolumeClaim": { - "claimName": "my-spark-pvc-tmp" - } - } - ], - "driver": { - "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", - "cores": self.sparkResoruceConf['driver_cores'], - "coreLimit": "1200m", - "memory": self.sparkResoruceConf['driver_memory'], - "labels": { - "version": "3.5.1" - }, - "serviceAccount": "spark", - "volumeMounts": [ - { - "name": "test-volume", - "mountPath": "/tmp" - } - ] - }, - "executor": { - "javaOptions": 
"-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", - "cores": self.sparkResoruceConf['executor_cores'], - "memoryOverhead": self.sparkResoruceConf['memoryOverhead'], - "memory": self.sparkResoruceConf['executor_memory'], - "instances": self.sparkResoruceConf['executor_instances'], - "labels": { - "version": "3.5.1" - }, - "volumeMounts": [ - { - "name": "test-volume", - "mountPath": "/tmp" - } - ] - } - } - } - - # [START default_args] # These args will get passed on to each operator # You can override them on a per-task basis during operator initialization -- 2.17.1 From 448bb924ab2314113283f082651cf872c857fa85 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Fri, 18 Oct 2024 22:51:17 +0200 Subject: [PATCH 16/42] add test dedup task --- workflow/dnet/dedup.py | 83 +++++++++++++++++++++++++++++ workflow/dnet/spark_configurator.py | 26 ++------- 2 files changed, 86 insertions(+), 23 deletions(-) create mode 100644 workflow/dnet/dedup.py diff --git a/workflow/dnet/dedup.py b/workflow/dnet/dedup.py new file mode 100644 index 0000000..c51b842 --- /dev/null +++ b/workflow/dnet/dedup.py @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. +In this example, we create two tasks which execute sequentially. +The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). +and the second task is to check the final state of the sparkApplication that submitted in the first state. + +Spark-on-k8s operator is required to be already installed on Kubernetes +https://github.com/GoogleCloudPlatform/spark-on-k8s-operator +""" + +from spark_configurator import SparkConfigurator + +# [START import_module] +# The DAG object; we'll need this to instantiate a DAG +from airflow import DAG +# Operators; we need this to operate! 
+from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator +from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor +from airflow.utils.dates import days_ago + +# [END import_module] + +# [START default_args] +# These args will get passed on to each operator +# You can override them on a per-task basis during operator initialization +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': days_ago(1), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'max_active_runs': 1, + 'retries': 3 +} + + +dag = DAG( + 'dedup_graph', + default_args=default_args, + schedule_interval=None, + tags=['example', 'spark'] +) + +submit = SparkKubernetesOperator( + task_id='CreateSimRel', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="createsimrels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--actionSetId", "dedup-result-decisiontree-v4", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--numPartitions", "64" + ], + executor_cores=10, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +submit diff --git a/workflow/dnet/spark_configurator.py b/workflow/dnet/spark_configurator.py index b943ca1..e77e9df 100644 --- a/workflow/dnet/spark_configurator.py +++ b/workflow/dnet/spark_configurator.py @@ -27,7 +27,7 @@ class SparkConfigurator: self.s3Configuration = { "spark.driver.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", "spark.executor.extraJavaOptions": "-Divy.cache.dir=/tmp -Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", - "spark.hadoop.fs.defaultFS": "s3a://spark", + "spark.hadoop.fs.defaultFS": "s3a://graph", "spark.hadoop.fs.s3a.access.key": "minio", "spark.hadoop.fs.s3a.secret.key": "minio123", "spark.hadoop.fs.s3a.endpoint": "https://minio.dnet-minio-tenant.svc.cluster.local", @@ -75,14 +75,6 @@ class SparkConfigurator: "restartPolicy": { "type": "Never" }, - "volumes": [ - { - "name": "test-volume", - "persistentVolumeClaim": { - "claimName": "my-spark-pvc-tmp" - } - } - ], "driver": { "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", "cores": self.sparkResoruceConf['driver_cores'], @@ -91,13 +83,7 @@ class SparkConfigurator: "labels": { "version": "3.5.1" }, - "serviceAccount": "spark", - "volumeMounts": [ - { - "name": "test-volume", - "mountPath": "/tmp" - } - ] + "serviceAccount": "spark" }, "executor": { "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", @@ -107,13 +93,7 @@ class SparkConfigurator: "instances": self.sparkResoruceConf['executor_instances'], "labels": { "version": "3.5.1" - }, - "volumeMounts": [ - { - "name": "test-volume", - "mountPath": "/tmp" - } - ] + } } } } \ No newline at end of file -- 2.17.1 From ba3f3517366e719925c5d1b69c8dfb7f2efe5d1f Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi 
Date: Sat, 19 Oct 2024 10:15:30 +0200 Subject: [PATCH 17/42] print existing files --- workflow/dnet/S3_untar.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflow/dnet/S3_untar.py b/workflow/dnet/S3_untar.py index 863b3aa..5c30569 100644 --- a/workflow/dnet/S3_untar.py +++ b/workflow/dnet/S3_untar.py @@ -69,8 +69,11 @@ def s3_untar(): dst_bucket = context['params']['dst_bucket'] dst_key_prefix = os.path.normpath(context["params"]["dst_key_prefix"]) + print(f"Existing keys with prefix: {dst_key_prefix}/") existing_keys = dict.fromkeys(hook.list_keys(bucket_name=dst_bucket, prefix=dst_key_prefix + "/"), 0) + for k in existing_keys.keys(): + print(f"{k}") s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=src_bucket) with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar: @@ -81,7 +84,7 @@ def s3_untar(): print(f"Skipping {member.name}: is not a file") continue if dst_key in existing_keys: - print(f"Skipping {member.name}: already exists") + print(f"Skipping {member.name}: already exists as {dst_key}") continue print(f"Extracting {member.name} to {dst_key}") fileobj = tar.extractfile(member) -- 2.17.1 From c5f42630612d51f486a5aaf287ab92ef9378b54d Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Sat, 19 Oct 2024 11:09:24 +0200 Subject: [PATCH 18/42] update spark-version --- workflow/dnet/spark_configurator.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/dnet/spark_configurator.py b/workflow/dnet/spark_configurator.py index e77e9df..e5b084a 100644 --- a/workflow/dnet/spark_configurator.py +++ b/workflow/dnet/spark_configurator.py @@ -70,7 +70,7 @@ class SparkConfigurator: "mainClass": self.mainClass, "mainApplicationFile": self.jarLocation, "arguments": self.arguments, - "sparkVersion": "3.5.1", + "sparkVersion": "3.5.3", "sparkConf": self.s3Configuration, "restartPolicy": { "type": "Never" @@ -81,7 +81,7 @@ class SparkConfigurator: "coreLimit": "1200m", "memory": self.sparkResoruceConf['driver_memory'], "labels": { - "version": "3.5.1" + "version": "3.5.3" }, "serviceAccount": "spark" }, @@ -92,7 +92,7 @@ class SparkConfigurator: "memory": self.sparkResoruceConf['executor_memory'], "instances": self.sparkResoruceConf['executor_instances'], "labels": { - "version": "3.5.1" + "version": "3.5.3" } } } -- 2.17.1 From 48f688cda9b01c477324d3f572dc1dd725647a05 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Sat, 19 Oct 2024 11:13:21 +0200 Subject: [PATCH 19/42] add deps jar --- workflow/dnet/spark_configurator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/dnet/spark_configurator.py b/workflow/dnet/spark_configurator.py index e5b084a..04fbd89 100644 --- a/workflow/dnet/spark_configurator.py +++ b/workflow/dnet/spark_configurator.py @@ -69,6 +69,9 @@ class SparkConfigurator: "imagePullPolicy": "IfNotPresent", "mainClass": self.mainClass, "mainApplicationFile": self.jarLocation, + "deps": { + "jar": [self.jarLocation] + }, "arguments": self.arguments, "sparkVersion": "3.5.3", "sparkConf": self.s3Configuration, -- 2.17.1 From 0a2956d81fa6abd7e183f52a0bdcc5fdd04a2471 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Sat, 19 Oct 2024 17:55:53 +0200 Subject: [PATCH 20/42] reduce executor cores --- workflow/dnet/dedup.py | 2 +- workflow/dnet/run_spark.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/dnet/dedup.py b/workflow/dnet/dedup.py index c51b842..d62a58d 100644 --- a/workflow/dnet/dedup.py +++ b/workflow/dnet/dedup.py @@ -72,7 +72,7 @@ 
submit = SparkKubernetesOperator( "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", "--numPartitions", "64" ], - executor_cores=10, + executor_cores=4, executor_memory="4G", executor_instances=1, executor_memoryOverhead="3G").get_configuration(), diff --git a/workflow/dnet/run_spark.py b/workflow/dnet/run_spark.py index 09a431b..61fef77 100644 --- a/workflow/dnet/run_spark.py +++ b/workflow/dnet/run_spark.py @@ -59,7 +59,7 @@ spec =SparkConfigurator( mainClass="eu.dnetlib.dhp.sx.graph.SparkCreateScholexplorerDump", jarLocation = 's3a://deps/dhp-shade-package-1.2.5-SNAPSHOT.jar', arguments =[ "--sourcePath", "s3a://raw-graph/01", "--targetPath", "s3a://scholix"],\ - executor_cores=10, + executor_cores=4, executor_memory="4G", executor_instances=1, executor_memoryOverhead="3G").get_configuration() -- 2.17.1 From fa90a9dbe0d96bc0b0ca632b477227d39030c544 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 09:36:55 +0200 Subject: [PATCH 21/42] Update spark operator version --- docker-images/spark-operator/Dockerfile | 7 ++ modules/airflow/airflow.tf | 126 +++++++++++------------- modules/airflow/providers.tf | 17 ++-- modules/minio/providers.tf | 17 ++-- providers.tf | 12 +++ spark-image/Dockerfile | 5 +- 6 files changed, 96 insertions(+), 88 deletions(-) create mode 100644 docker-images/spark-operator/Dockerfile create mode 100644 providers.tf diff --git a/docker-images/spark-operator/Dockerfile b/docker-images/spark-operator/Dockerfile new file mode 100644 index 0000000..4e58fa3 --- /dev/null +++ b/docker-images/spark-operator/Dockerfile @@ -0,0 +1,7 @@ +# docker build -t spark-operator:2.0.2 . && kind load docker-image -n dnet-data-platform spark-operator:2.0.2 +FROM kubeflow/spark-operator:2.0.2 + +ENV SPARK_HOME /opt/spark +USER root +RUN curl https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar -o ${SPARK_HOME}/jars/hadoop-aws-3.3.4.jar +RUN curl https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar -o ${SPARK_HOME}/jars/aws-java-sdk-bundle-1.12.262.jar diff --git a/modules/airflow/airflow.tf b/modules/airflow/airflow.tf index b5000d4..6b0c267 100644 --- a/modules/airflow/airflow.tf +++ b/modules/airflow/airflow.tf @@ -22,15 +22,17 @@ resource "kubernetes_role" "airflow_spark_role" { rule { api_groups = ["sparkoperator.k8s.io"] - resources = ["sparkapplications", "sparkapplications/status", - "scheduledsparkapplications", "scheduledsparkapplications/status"] - verbs = ["*"] + resources = [ + "sparkapplications", "sparkapplications/status", + "scheduledsparkapplications", "scheduledsparkapplications/status" + ] + verbs = ["*"] } rule { api_groups = [""] - resources = ["pods/log"] - verbs = ["*"] + resources = ["pods", "pods/log"] + verbs = ["*"] } } @@ -55,49 +57,27 @@ resource "kubernetes_role_binding_v1" "airflow_spark_role_binding" { } resource "kubernetes_role_binding_v1" "airflow_spark_role_binding2" { - depends_on = [kubernetes_namespace.spark_jobs_namespace] - metadata { - name = "airflow-spark-role-binding2" - namespace = "${var.namespace_prefix}spark-jobs" - } + depends_on = [kubernetes_namespace.spark_jobs_namespace] + metadata { + name = "airflow-spark-role-binding2" + namespace = "${var.namespace_prefix}spark-jobs" + } - subject { - kind = "ServiceAccount" - name = "airflow-worker" - namespace = "${var.namespace_prefix}airflow" - } + subject { + kind = "ServiceAccount" + name = "airflow-worker" + namespace = "${var.namespace_prefix}airflow" + } - 
role_ref { - api_group = "rbac.authorization.k8s.io" - kind = "Role" - name = "spark-role" - } + role_ref { + api_group = "rbac.authorization.k8s.io" + kind = "Role" + name = "spark-role" + } } -# -# -# resource "kubernetes_role_binding_v1" "spark_role_binding" { -# depends_on = [kubernetes_namespace.spark_jobs_namespace] -# metadata { -# name = "spark-role-binding" -# namespace = "${var.namespace_prefix}spark-jobs" -# } -# -# subject { -# kind = "ServiceAccount" -# name = "spark" -# namespace = "${var.namespace_prefix}spark-jobs" -# } -# -# role_ref { -# api_group = "rbac.authorization.k8s.io" -# kind = "Role" -# name = "spark-role" -# } -# } -# resource "helm_release" "gcp_spark_operator" { - depends_on = [kubernetes_namespace.spark_jobs_namespace] + depends_on = [kubernetes_namespace.spark_jobs_namespace] name = "gcp-spark-operator" chart = "spark-operator" repository = "https://kubeflow.github.io/spark-operator" @@ -106,23 +86,38 @@ resource "helm_release" "gcp_spark_operator" { dependency_update = "true" version = "2.0.2" - # set { - # name = "image.repository" - # value = "kubeflow/spark-operator" - # } - - # set { - # name = "image.tag" - # value = "v1beta2-1.4.5-3.5.0" - # } + set { + name = "image.repository" + value = "spark-operator" + } set { - name = "sparkJobNamespaces" + name = "image.tag" + value = "2.0.2" + } + + set { + name = "spark.jobNamespaces" value = "{${var.namespace_prefix}spark-jobs}" } set { - name = "serviceAccounts.spark.name" + name = "spark.serviceAccount.create" + value = "true" + } + + set { + name = "spark.serviceAccount.name" + value = "spark" + } + + set { + name = "controller.serviceAccount.create" + value = "true" + } + + set { + name = "controller.serviceAccount.name" value = "spark" } @@ -147,13 +142,13 @@ resource "kubernetes_namespace" "airflow" { resource "kubernetes_secret" "s3_conn_secrets" { depends_on = [kubernetes_namespace.airflow] metadata { - name = "s3-conn-secrets" + name = "s3-conn-secrets" namespace = "${var.namespace_prefix}airflow" } data = { - username = var.s3_key - password = var.s3_secret + username = var.s3_key + password = var.s3_secret AIRFLOW_CONN_S3_CONN = < Date: Mon, 21 Oct 2024 14:10:28 +0200 Subject: [PATCH 22/42] implemente whole scan pipeline --- workflow/dnet/dedup.py | 164 +++++++++++++++++++++++++++++++++++++++-- 1 file changed, 157 insertions(+), 7 deletions(-) diff --git a/workflow/dnet/dedup.py b/workflow/dnet/dedup.py index d62a58d..db8317f 100644 --- a/workflow/dnet/dedup.py +++ b/workflow/dnet/dedup.py @@ -25,16 +25,15 @@ Spark-on-k8s operator is required to be already installed on Kubernetes https://github.com/GoogleCloudPlatform/spark-on-k8s-operator """ -from spark_configurator import SparkConfigurator - # [START import_module] # The DAG object; we'll need this to instantiate a DAG from airflow import DAG # Operators; we need this to operate! 
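A note on the s3-conn-secrets resource above: the heredoc value of AIRFLOW_CONN_S3_CONN is elided in the patch, so the snippet below is only a hypothetical illustration of the convention it relies on, namely that Airflow resolves a connection id of s3_conn from an environment variable named AIRFLOW_CONN_S3_CONN (presumably surfaced to the Airflow pods from this secret) carrying a URI or JSON payload. The conn_type, placeholder credentials and endpoint URL here are assumptions, not the real values:

    import json, os

    # Hypothetical payload for the "s3_conn" connection id; the real value comes from the
    # Terraform heredoc and is not part of this patch.
    os.environ["AIRFLOW_CONN_S3_CONN"] = json.dumps({
        "conn_type": "aws",                                  # Amazon provider connection type
        "login": "<s3 access key>",                          # assumption: var.s3_key
        "password": "<s3 secret key>",                       # assumption: var.s3_secret
        "extra": {"endpoint_url": "<MinIO endpoint URL>"},   # assumption: local MinIO tenant
    })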
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator -from airflow.providers.cncf.kubernetes.sensors.spark_kubernetes import SparkKubernetesSensor from airflow.utils.dates import days_ago +from spark_configurator import SparkConfigurator + # [END import_module] # [START default_args] @@ -51,7 +50,6 @@ default_args = { 'retries': 3 } - dag = DAG( 'dedup_graph', default_args=default_args, @@ -59,7 +57,7 @@ dag = DAG( tags=['example', 'spark'] ) -submit = SparkKubernetesOperator( +simrel = SparkKubernetesOperator( task_id='CreateSimRel', namespace='dnet-spark-jobs', template_spec=SparkConfigurator( @@ -72,7 +70,7 @@ submit = SparkKubernetesOperator( "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", "--numPartitions", "64" ], - executor_cores=4, + executor_cores=8, executor_memory="4G", executor_instances=1, executor_memoryOverhead="3G").get_configuration(), @@ -80,4 +78,156 @@ submit = SparkKubernetesOperator( dag=dag ) -submit +whitelist = SparkKubernetesOperator( + task_id='WhitelistSimRels', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="whitelistsimrels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--actionSetId", "dedup-result-decisiontree-v4", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--whiteListPath", "s3a://graph/data/dedup/whitelist_prod", # TODO: copy! + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +createmergerel = SparkKubernetesOperator( + task_id='CreateMergeRels', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="createmergerels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--actionSetId", "dedup-result-decisiontree-v4", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--cutConnectedComponent", "200", + "--hiveMetastoreUris", "", + "--pivotHistoryDatabase", "", + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +creatededuprecord = SparkKubernetesOperator( + task_id='CreateDedupRecord', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="creatededuprecord-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--actionSetId", "dedup-result-decisiontree-v4", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", 
+ executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +copyopenorgsmergerel = SparkKubernetesOperator( + task_id='CopyOpenorgsMergeRels', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copyopenorgsmergerels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--actionSetId", "dedup-result-decisiontree-v4", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +createorgsdeduprecord = SparkKubernetesOperator( + task_id='CreateOrgsDedupRecord', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="createorgsdeduprecord-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--actionSetId", "dedup-result-decisiontree-v4", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +updateentity = SparkKubernetesOperator( + task_id='UpdateEntity', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="updateentity-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +copyrelations = SparkKubernetesOperator( + task_id='copyRelations', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copyrelations-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", + "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", + "--numPartitions", "64" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +simrel >> whitelist >> createmergerel \ + >> creatededuprecord >> copyopenorgsmergerel \ + >> createorgsdeduprecord \ + >> updateentity >> 
copyrelations -- 2.17.1 From c6fbfd3f0ada28764ff4645a17642f1da5cc6316 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 14:30:40 +0200 Subject: [PATCH 23/42] Remove numpartitions argument where not needed --- workflow/dnet/dedup.py | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/workflow/dnet/dedup.py b/workflow/dnet/dedup.py index db8317f..8c7bac3 100644 --- a/workflow/dnet/dedup.py +++ b/workflow/dnet/dedup.py @@ -113,8 +113,7 @@ createmergerel = SparkKubernetesOperator( "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", "--cutConnectedComponent", "200", "--hiveMetastoreUris", "", - "--pivotHistoryDatabase", "", - "--numPartitions", "64" + "--pivotHistoryDatabase", "" ], executor_cores=8, executor_memory="4G", @@ -134,8 +133,7 @@ creatededuprecord = SparkKubernetesOperator( arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--numPartitions", "64" + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup" ], executor_cores=8, executor_memory="4G", @@ -176,8 +174,7 @@ createorgsdeduprecord = SparkKubernetesOperator( arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--numPartitions", "64" + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup" ], executor_cores=8, executor_memory="4G", @@ -196,8 +193,7 @@ updateentity = SparkKubernetesOperator( jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", - "--numPartitions", "64" + "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup" ], executor_cores=8, executor_memory="4G", @@ -216,8 +212,7 @@ copyrelations = SparkKubernetesOperator( jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", - "--numPartitions", "64" + "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup" ], executor_cores=8, executor_memory="4G", -- 2.17.1 From 034a01542af3b599dfd7f078c478844ed7890819 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 15:25:48 +0200 Subject: [PATCH 24/42] Implement consistency workflow --- workflow/dnet/consistency.py | 101 +++++++++++++++++++++++++++++++++++ 1 file changed, 101 insertions(+) create mode 100644 workflow/dnet/consistency.py diff --git a/workflow/dnet/consistency.py b/workflow/dnet/consistency.py new file mode 100644 index 0000000..190a754 --- /dev/null +++ b/workflow/dnet/consistency.py @@ -0,0 +1,101 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. +In this example, we create two tasks which execute sequentially. +The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). +and the second task is to check the final state of the sparkApplication that submitted in the first state. + +Spark-on-k8s operator is required to be already installed on Kubernetes +https://github.com/GoogleCloudPlatform/spark-on-k8s-operator +""" + +# [START import_module] +# The DAG object; we'll need this to instantiate a DAG +from airflow import DAG +# Operators; we need this to operate! +from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator +from airflow.utils.dates import days_ago + +from spark_configurator import SparkConfigurator + +# [END import_module] + +# [START default_args] +# These args will get passed on to each operator +# You can override them on a per-task basis during operator initialization +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': days_ago(1), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'max_active_runs': 1, + 'retries': 3 +} + +dag = DAG( + 'consistency_graph', + default_args=default_args, + schedule_interval=None, + tags=['example', 'spark'] +) + +propagaterel = SparkKubernetesOperator( + task_id='PropagateRelation', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="propagaterels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", + "--graphOutputPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent", + "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +group_entities = SparkKubernetesOperator( + task_id='GroupEntities', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="groupentities-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphInputPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", + "--checkpointPath", "s3a://graph/tmp/prod_provision/working_dir/dedup/grouped_entities", + "--outputPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent", + "--isLookupUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", + "--filterInvisible", "true" + ], +# + 
executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +propagaterel >> group_entities \ No newline at end of file -- 2.17.1 From df46c8c65ff208f4d445b787c860d609cc56cd5e Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 18:49:21 +0200 Subject: [PATCH 25/42] Added ORCID enrichment workflows --- workflow/dnet/orcid_enrich.py | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 workflow/dnet/orcid_enrich.py diff --git a/workflow/dnet/orcid_enrich.py b/workflow/dnet/orcid_enrich.py new file mode 100644 index 0000000..2e93996 --- /dev/null +++ b/workflow/dnet/orcid_enrich.py @@ -0,0 +1,82 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. +In this example, we create two tasks which execute sequentially. +The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). +and the second task is to check the final state of the sparkApplication that submitted in the first state. + +Spark-on-k8s operator is required to be already installed on Kubernetes +https://github.com/GoogleCloudPlatform/spark-on-k8s-operator +""" + +# [START import_module] +# The DAG object; we'll need this to instantiate a DAG +from airflow import DAG +# Operators; we need this to operate! 
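One detail worth spelling out about the SparkConfigurator names used in these DAG tasks (for example propagaterels-{{ ds }}-{{ task_instance.try_number }} above): the Jinja fields render to the run's logical date and the attempt number, which keeps the generated SparkApplication resource names unique across daily runs and retries. A rough illustration of the rendering, assuming a first attempt on 2024-10-21:

    # Illustrative rendering only; actual values depend on the run date and retry count.
    template = "propagaterels-{{ ds }}-{{ task_instance.try_number }}"
    rendered = "propagaterels-2024-10-21-1"  # {{ ds }} -> logical date, try_number -> 1 on the first attempt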
+from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator +from airflow.utils.dates import days_ago + +from spark_configurator import SparkConfigurator + +# [END import_module] + + +# [START default_args] +# These args will get passed on to each operator +# You can override them on a per-task basis during operator initialization +default_args = { + 'owner': 'airflow', + 'depends_on_past': False, + 'start_date': days_ago(1), + 'email': ['airflow@example.com'], + 'email_on_failure': False, + 'email_on_retry': False, + 'max_active_runs': 1, + 'retries': 3 +} + +dag = DAG( + 'orcid_enrichment_graph', + default_args=default_args, + schedule_interval=None, + tags=['example', 'spark'] +) + +orcid_enrich = SparkKubernetesOperator( + task_id='EnrichGraphWithOrcidAuthors', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="orcidenrich-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.enrich.orcid.SparkEnrichGraphWithOrcidAuthors", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--orcidPath", "s3a://graph/data/orcid_2023/tables", + "--graphPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent", + "--targetPath", "s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", + "--workingDir", "s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", + "--master", "" + ], + executor_cores=8, + executor_memory="4G", + executor_instances=1, + executor_memoryOverhead="3G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag +) + +orcid_enrich -- 2.17.1 From 131f6e559249282264dca36e28a6e932560d98f7 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 20:23:31 +0200 Subject: [PATCH 26/42] enable dynamic allocation --- workflow/dnet/spark_configurator.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/workflow/dnet/spark_configurator.py b/workflow/dnet/spark_configurator.py index 04fbd89..b007a65 100644 --- a/workflow/dnet/spark_configurator.py +++ b/workflow/dnet/spark_configurator.py @@ -78,6 +78,9 @@ class SparkConfigurator: "restartPolicy": { "type": "Never" }, + "dynamicAllocation": { + "enables": True + }, "driver": { "javaOptions": "-Dcom.amazonaws.sdk.disableCertChecking=true -Dcom.cloudera.com.amazonaws.sdk.disableCertChecking=true", "cores": self.sparkResoruceConf['driver_cores'], -- 2.17.1 From aae37058f719645b568c39fd0484535336c12200 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 20:30:31 +0200 Subject: [PATCH 27/42] Increase memory --- workflow/dnet/orcid_enrich.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflow/dnet/orcid_enrich.py b/workflow/dnet/orcid_enrich.py index 2e93996..7d5904c 100644 --- a/workflow/dnet/orcid_enrich.py +++ b/workflow/dnet/orcid_enrich.py @@ -72,9 +72,9 @@ orcid_enrich = SparkKubernetesOperator( "--master", "" ], executor_cores=8, - executor_memory="4G", + executor_memory="16G", executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), + executor_memoryOverhead="8G").get_configuration(), kubernetes_conn_id="kubernetes_default", dag=dag ) -- 2.17.1 From 73e78d687780b872c96d18e8664dc1be09621a6b Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 21 Oct 2024 21:31:43 +0200 Subject: [PATCH 28/42] Add workflow with all graph construction steps --- workflow/dnet/build_openaire_graph.py | 49 +++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) create mode 100644 workflow/dnet/build_openaire_graph.py diff 
--git a/workflow/dnet/build_openaire_graph.py b/workflow/dnet/build_openaire_graph.py new file mode 100644 index 0000000..72d5aff --- /dev/null +++ b/workflow/dnet/build_openaire_graph.py @@ -0,0 +1,49 @@ +from __future__ import annotations + +import pendulum +from airflow.decorators import dag +from airflow.models.baseoperator import chain +from airflow.models.param import Param +from airflow.operators.trigger_dagrun import TriggerDagRunOperator + +import dag_utils + +@dag( + dag_id="build_openaire_graph", + dag_display_name="Build the OpenAIRE graph", + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint") + }, + tags=["openaire"] +) +def build_new_graph(): + chain(TriggerDagRunOperator( + task_id="dedup", + trigger_dag_id="dedup_graph", + wait_for_completion=True), + + TriggerDagRunOperator( + task_id="consistency", + trigger_dag_id="consistency_graph", + wait_for_completion=True + + # conf={ + # "file": "{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='file_path') }}", + # "dst_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}", + # } + ), + TriggerDagRunOperator( + task_id="orcid_enrichment", + trigger_dag_id="orcid_enrichment_graph", + wait_for_completion=True + + # conf={ + # "src_key": "/data/graph/{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='file_path') }}", + # "src_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}", + # "dst_key_prefix": "/data/graph/{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='timestamp') }}", + # "dst_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}" + # } + ) + ) + +build_new_graph() -- 2.17.1 From 15ba3cf2028abd8bbe7ae0039afed1a4680b0490 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 22 Oct 2024 10:16:32 +0200 Subject: [PATCH 29/42] Provide paths as dag configuration parameters --- workflow/dnet/orcid_enrich.py | 116 ++++++++++++++-------------------- 1 file changed, 49 insertions(+), 67 deletions(-) diff --git a/workflow/dnet/orcid_enrich.py b/workflow/dnet/orcid_enrich.py index 7d5904c..230f48f 100644 --- a/workflow/dnet/orcid_enrich.py +++ b/workflow/dnet/orcid_enrich.py @@ -1,82 +1,64 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. -In this example, we create two tasks which execute sequentially. -The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). -and the second task is to check the final state of the sparkApplication that submitted in the first state. 
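A short clarification of the chain() helper used in build_openaire_graph.py above: it declares the same sequential dependencies as the bitshift form used in the earlier DAG files, just as a single call. A minimal, self-contained sketch with illustrative task ids:

    import datetime
    from airflow import DAG
    from airflow.models.baseoperator import chain
    from airflow.operators.empty import EmptyOperator

    with DAG(dag_id="chain_example", start_date=datetime.datetime(2024, 1, 1), schedule=None):
        a, b, c = (EmptyOperator(task_id=t) for t in ("dedup", "consistency", "orcid_enrichment"))
        chain(a, b, c)  # equivalent to: a >> b >> c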
+import os +import tarfile +import time +from datetime import timedelta -Spark-on-k8s operator is required to be already installed on Kubernetes -https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -""" +import pendulum +from airflow.decorators import dag +from airflow.decorators import task +from airflow.models.param import Param +from airflow.operators.python import get_current_context -# [START import_module] -# The DAG object; we'll need this to instantiate a DAG -from airflow import DAG -# Operators; we need this to operate! from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator from airflow.utils.dates import days_ago from spark_configurator import SparkConfigurator -# [END import_module] +EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) - -# [START default_args] -# These args will get passed on to each operator -# You can override them on a per-task basis during operator initialization default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'start_date': days_ago(1), - 'email': ['airflow@example.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'max_active_runs': 1, - 'retries': 3 + "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), + "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), + "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))), } -dag = DAG( - 'orcid_enrichment_graph', + +@dag( + dag_id="orcid_enrichment_graph", + dag_display_name="Enrich Graph with ORCID data", default_args=default_args, - schedule_interval=None, - tags=['example', 'spark'] + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), + "ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description=""), + "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/07_graph_consistent", type='string', description=""), + "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string', description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", type='string', description=""), + }, + tags=["openaire"], ) +def orcid_enrichment_dag(): + orcid_enrich = SparkKubernetesOperator( + task_id='EnrichGraphWithOrcidAuthors', + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="orcidenrich-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.enrich.orcid.SparkEnrichGraphWithOrcidAuthors", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--orcidPath", "{{ dag_run.conf.get('ORCID_PATH') }}", + "--graphPath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--targetPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}", + "--workingDir", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--master", "" + ], + executor_cores=8, + executor_memory="16G", + executor_instances=1, + executor_memoryOverhead="8G").get_configuration(), + kubernetes_conn_id="kubernetes_default", + dag=dag + ) -orcid_enrich = SparkKubernetesOperator( - task_id='EnrichGraphWithOrcidAuthors', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="orcidenrich-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.enrich.orcid.SparkEnrichGraphWithOrcidAuthors", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--orcidPath", "s3a://graph/data/orcid_2023/tables", - "--graphPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent", - "--targetPath", 
"s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", - "--workingDir", "s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", - "--master", "" - ], - executor_cores=8, - executor_memory="16G", - executor_instances=1, - executor_memoryOverhead="8G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + orcid_enrich() -orcid_enrich + +orcid_enrichment_dag() -- 2.17.1 From 1bd836b88a7bc5fd0bf35a63f0ef8b05b128707c Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 22 Oct 2024 10:19:40 +0200 Subject: [PATCH 30/42] Provide paths as dag configuration parameters --- workflow/dnet/build_openaire_graph.py | 62 +++-- workflow/dnet/consistency.py | 144 ++++------ workflow/dnet/dag_utils.py | 8 + workflow/dnet/dedup.py | 361 +++++++++++--------------- workflow/dnet/orcid_enrich.py | 29 +-- workflow/dnet/spark_configurator.py | 7 +- 6 files changed, 276 insertions(+), 335 deletions(-) diff --git a/workflow/dnet/build_openaire_graph.py b/workflow/dnet/build_openaire_graph.py index 72d5aff..ac4ab6a 100644 --- a/workflow/dnet/build_openaire_graph.py +++ b/workflow/dnet/build_openaire_graph.py @@ -1,6 +1,5 @@ from __future__ import annotations -import pendulum from airflow.decorators import dag from airflow.models.baseoperator import chain from airflow.models.param import Param @@ -8,42 +7,69 @@ from airflow.operators.trigger_dagrun import TriggerDagRunOperator import dag_utils + @dag( dag_id="build_openaire_graph", dag_display_name="Build the OpenAIRE graph", params={ - "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint") + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint"), + "GRAPH_PATH": Param("s3a://graph/tmp/prod_provision/graph", type='string', description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir", type='string', description=""), + "IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string', + description=""), + "DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description=""), + "ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description="") }, tags=["openaire"] ) def build_new_graph(): - chain(TriggerDagRunOperator( - task_id="dedup", - trigger_dag_id="dedup_graph", - wait_for_completion=True), + chain( + TriggerDagRunOperator( + task_id="dedup", + task_display_name="Deduplicate Research Results", + trigger_dag_id="results_deduplication", + wait_for_completion=True, + conf={ + "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["inference"], + "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"], + "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup", + "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "DEDUP_CONFIG_ID": "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}" + } + ), TriggerDagRunOperator( task_id="consistency", + task_display_name="Enforce Consistency of Graph", trigger_dag_id="consistency_graph", - wait_for_completion=True + wait_for_completion=True, - # conf={ - # "file": "{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='file_path') }}", - # "dst_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}", - # } + conf={ + "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + + "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"], + "OUTPUT_PATH": "{{ 
dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"], + "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup", + "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}" + } ), TriggerDagRunOperator( task_id="orcid_enrichment", + task_display_name="Enrich Graph with ORCID data", trigger_dag_id="orcid_enrichment_graph", - wait_for_completion=True + wait_for_completion=True, - # conf={ - # "src_key": "/data/graph/{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='file_path') }}", - # "src_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}", - # "dst_key_prefix": "/data/graph/{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='timestamp') }}", - # "dst_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}" - # } + conf={ + "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + + "ORCID_PATH": "{{ dag_run.conf.get('ORCID_PATH') }}", + "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"], + "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["orcid_enhancement"], + "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/orcid_enrichment" + } ) ) + build_new_graph() diff --git a/workflow/dnet/consistency.py b/workflow/dnet/consistency.py index 190a754..ad5a938 100644 --- a/workflow/dnet/consistency.py +++ b/workflow/dnet/consistency.py @@ -1,101 +1,71 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. -In this example, we create two tasks which execute sequentially. -The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). -and the second task is to check the final state of the sparkApplication that submitted in the first state. +import os +from datetime import timedelta -Spark-on-k8s operator is required to be already installed on Kubernetes -https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -""" - -# [START import_module] -# The DAG object; we'll need this to instantiate a DAG -from airflow import DAG -# Operators; we need this to operate! 
+from airflow.decorators import dag +from airflow.models.baseoperator import chain +from airflow.models.param import Param from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator -from airflow.utils.dates import days_ago from spark_configurator import SparkConfigurator -# [END import_module] +EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) -# [START default_args] -# These args will get passed on to each operator -# You can override them on a per-task basis during operator initialization default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'start_date': days_ago(1), - 'email': ['airflow@example.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'max_active_runs': 1, - 'retries': 3 + "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), + "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), + "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))) } -dag = DAG( - 'consistency_graph', + +@dag( + dag_id="consistency_graph", + dag_display_name="Enforce Consistency of Graph", default_args=default_args, - schedule_interval=None, - tags=['example', 'spark'] -) + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), -propagaterel = SparkKubernetesOperator( - task_id='PropagateRelation', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="propagaterels-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", - "--graphOutputPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag + "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/06_graph_dedup", type='string', description=""), + "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/07_graph_consistent", type='string', description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/dedup", type='string', description=""), + "IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string', + description="") + }, + tags=["openaire"] ) +def consistency_graph_dag(): + propagate_rel = SparkKubernetesOperator( + task_id='PropagateRelation', + task_display_name="Propagate Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="propagaterels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--graphOutputPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -group_entities = SparkKubernetesOperator( - task_id='GroupEntities', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="groupentities-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - 
arguments=["--graphInputPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup", - "--checkpointPath", "s3a://graph/tmp/prod_provision/working_dir/dedup/grouped_entities", - "--outputPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent", - "--isLookupUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--filterInvisible", "true" - ], -# - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + group_entities = SparkKubernetesOperator( + task_id='GroupEntities', + task_display_name="Group results by id", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="groupentities-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphInputPath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--checkpointPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}/grouped_entities", + "--outputPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}", + "--isLookupUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--filterInvisible", "true" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -propagaterel >> group_entities \ No newline at end of file + chain(propagate_rel, group_entities) + + +consistency_graph_dag() diff --git a/workflow/dnet/dag_utils.py b/workflow/dnet/dag_utils.py index 53a75aa..88b611e 100644 --- a/workflow/dnet/dag_utils.py +++ b/workflow/dnet/dag_utils.py @@ -1,6 +1,14 @@ from airflow.hooks.base import BaseHook from airflow.providers.amazon.aws.hooks.s3 import S3Hook +BUILD_PHASES = { + "inference": "05_graph_inferred", + "dedup": "06_graph_dedup", + "consistency": "07_graph_consistent", + "enrichment": "08_graph_dedup_enriched", # actionset + "orcid_enhancement": "09_graph_orcid_enriched" +} + def get_bucket_name(context: dict, hook: S3Hook, param_name: str): bucket_name = context["params"][param_name] if not bucket_name: diff --git a/workflow/dnet/dedup.py b/workflow/dnet/dedup.py index 8c7bac3..e2b8050 100644 --- a/workflow/dnet/dedup.py +++ b/workflow/dnet/dedup.py @@ -1,228 +1,173 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -""" -This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor. -In this example, we create two tasks which execute sequentially. -The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application). -and the second task is to check the final state of the sparkApplication that submitted in the first state. 
+import os +from datetime import timedelta -Spark-on-k8s operator is required to be already installed on Kubernetes -https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -""" - -# [START import_module] -# The DAG object; we'll need this to instantiate a DAG -from airflow import DAG -# Operators; we need this to operate! +from airflow.decorators import dag +from airflow.models.baseoperator import chain +from airflow.models.param import Param from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator -from airflow.utils.dates import days_ago from spark_configurator import SparkConfigurator -# [END import_module] +EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) -# [START default_args] -# These args will get passed on to each operator -# You can override them on a per-task basis during operator initialization default_args = { - 'owner': 'airflow', - 'depends_on_past': False, - 'start_date': days_ago(1), - 'email': ['airflow@example.com'], - 'email_on_failure': False, - 'email_on_retry': False, - 'max_active_runs': 1, - 'retries': 3 + "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), + "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), + "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))) } -dag = DAG( - 'dedup_graph', + +@dag( + dag_id="results_deduplication", + dag_display_name="Deduplicate Research Results", default_args=default_args, - schedule_interval=None, - tags=['example', 'spark'] -) + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), -simrel = SparkKubernetesOperator( - task_id='CreateSimRel', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="createsimrels-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--numPartitions", "64" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag + "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/05_graph_inferred", type='string', description=""), + "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/06_graph_dedup", type='string', description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/dedup", type='string', description=""), + "IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string', + description=""), + "DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description="") + }, + tags=["openaire"] ) +def results_deduplication_dag(): + simrel = SparkKubernetesOperator( + task_id='CreateSimRel', + task_display_name="Create Similarity Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="createsimrels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + 
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--numPartitions", "64" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -whitelist = SparkKubernetesOperator( - task_id='WhitelistSimRels', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="whitelistsimrels-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--whiteListPath", "s3a://graph/data/dedup/whitelist_prod", # TODO: copy! - "--numPartitions", "64" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + whitelist = SparkKubernetesOperator( + task_id='WhitelistSimRels', + task_display_name="Add Whitelist Similarity Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="whitelistsimrels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--whiteListPath", "s3a://graph/data/dedup/whitelist_prod", # TODO: copy! 
+ "--numPartitions", "64" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -createmergerel = SparkKubernetesOperator( - task_id='CreateMergeRels', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="createmergerels-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--cutConnectedComponent", "200", - "--hiveMetastoreUris", "", - "--pivotHistoryDatabase", "" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + createmergerel = SparkKubernetesOperator( + task_id='CreateMergeRels', + task_display_name="Create Merge Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="createmergerels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--cutConnectedComponent", "200", + "--hiveMetastoreUris", "", + "--pivotHistoryDatabase", "" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -creatededuprecord = SparkKubernetesOperator( - task_id='CreateDedupRecord', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="creatededuprecord-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + creatededuprecord = SparkKubernetesOperator( + task_id='CreateDedupRecord', + task_display_name="Create Dedup Record", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="creatededuprecord-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -copyopenorgsmergerel = SparkKubernetesOperator( - task_id='CopyOpenorgsMergeRels', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="copyopenorgsmergerels-{{ ds }}-{{ 
task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--numPartitions", "64" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + copyopenorgsmergerel = SparkKubernetesOperator( + task_id='CopyOpenorgsMergeRels', + task_display_name="Copy Openorgs Merge Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copyopenorgsmergerels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--numPartitions", "64" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -createorgsdeduprecord = SparkKubernetesOperator( - task_id='CreateOrgsDedupRecord', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="createorgsdeduprecord-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl", - "--actionSetId", "dedup-result-decisiontree-v4", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + createorgsdeduprecord = SparkKubernetesOperator( + task_id='CreateOrgsDedupRecord', + task_display_name="Create Organizations Dedup Records", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="createorgsdeduprecord-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -updateentity = SparkKubernetesOperator( - task_id='UpdateEntity', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="updateentity-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--dedupGraphPath", 
"s3a://graph/tmp/prod_provision/graph/06_graph_dedup" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + updateentity = SparkKubernetesOperator( + task_id='UpdateEntity', + task_display_name="Update Entity", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="updateentity-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--dedupGraphPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -copyrelations = SparkKubernetesOperator( - task_id='copyRelations', - namespace='dnet-spark-jobs', - template_spec=SparkConfigurator( - name="copyrelations-{{ ds }}-{{ task_instance.try_number }}", - mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs", - jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred", - "--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup", - "--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup" - ], - executor_cores=8, - executor_memory="4G", - executor_instances=1, - executor_memoryOverhead="3G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag -) + copyrelations = SparkKubernetesOperator( + task_id='copyRelations', + task_display_name="Copy Non-Openorgs Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copyrelations-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--dedupGraphPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) -simrel >> whitelist >> createmergerel \ - >> creatededuprecord >> copyopenorgsmergerel \ - >> createorgsdeduprecord \ - >> updateentity >> copyrelations + chain(simrel, whitelist, createmergerel, creatededuprecord, copyopenorgsmergerel, createorgsdeduprecord, updateentity, copyrelations) + + +results_deduplication_dag() diff --git a/workflow/dnet/orcid_enrich.py b/workflow/dnet/orcid_enrich.py index 230f48f..c1e3bcd 100644 --- a/workflow/dnet/orcid_enrich.py +++ b/workflow/dnet/orcid_enrich.py @@ -1,16 +1,9 @@ import os -import tarfile -import time from datetime import timedelta -import pendulum from airflow.decorators import dag -from airflow.decorators import task from airflow.models.param import Param -from airflow.operators.python import get_current_context - from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator -from airflow.utils.dates import days_ago from spark_configurator import SparkConfigurator @@ -19,7 +12,7 @@ EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) default_args = { "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), - "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))), + "retry_delay": 
timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))) } @@ -31,14 +24,17 @@ default_args = { "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), "ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description=""), "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/07_graph_consistent", type='string', description=""), - "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string', description=""), - "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", type='string', description=""), + "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string', + description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", type='string', + description="") }, - tags=["openaire"], + tags=["openaire"] ) def orcid_enrichment_dag(): orcid_enrich = SparkKubernetesOperator( task_id='EnrichGraphWithOrcidAuthors', + task_display_name='Enrich Authors with ORCID', namespace='dnet-spark-jobs', template_spec=SparkConfigurator( name="orcidenrich-{{ ds }}-{{ task_instance.try_number }}", @@ -49,16 +45,11 @@ def orcid_enrichment_dag(): "--targetPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}", "--workingDir", "{{ dag_run.conf.get('WRKDIR_PATH') }}", "--master", "" - ], - executor_cores=8, - executor_memory="16G", - executor_instances=1, - executor_memoryOverhead="8G").get_configuration(), - kubernetes_conn_id="kubernetes_default", - dag=dag + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" ) - orcid_enrich() + orcid_enrich orcid_enrichment_dag() diff --git a/workflow/dnet/spark_configurator.py b/workflow/dnet/spark_configurator.py index b007a65..5ee534a 100644 --- a/workflow/dnet/spark_configurator.py +++ b/workflow/dnet/spark_configurator.py @@ -1,3 +1,4 @@ + class SparkConfigurator: def __init__(self, name, @@ -9,9 +10,9 @@ class SparkConfigurator: image= "dnet-spark:1.0.0", driver_cores=1, driver_memory='1G', - executor_cores=1, - executor_memory="1G", - executor_memoryOverhead= "1G", + executor_cores=8, + executor_memory="16G", + executor_memoryOverhead="8G", executor_instances=1 ) -> None: if apiVersion: -- 2.17.1 From e92ef9fbcb28b6c3bedec1252a8a1d0b52a76264 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 22 Oct 2024 21:31:48 +0200 Subject: [PATCH 31/42] enable ingressUrlFormat --- modules/airflow/airflow.tf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modules/airflow/airflow.tf b/modules/airflow/airflow.tf index 6b0c267..f1f0d15 100644 --- a/modules/airflow/airflow.tf +++ b/modules/airflow/airflow.tf @@ -127,7 +127,7 @@ resource "helm_release" "gcp_spark_operator" { } set { - name = "ingressUrlFormat" + name = "driver.ingressUrlFormat" value = "\\{\\{$appName\\}\\}.\\{\\{$appNamespace\\}\\}.${var.domain}" type = "string" } -- 2.17.1 From e64b3ec6b19ca14a27ab4c3022720b20575862e6 Mon Sep 17 00:00:00 2001 From: Sandro La Bruzzo Date: Wed, 23 Oct 2024 08:37:01 +0200 Subject: [PATCH 32/42] removed test dir --- workflow/dags/hello.py | 11 ----------- 1 file changed, 11 deletions(-) delete mode 100644 workflow/dags/hello.py diff --git a/workflow/dags/hello.py b/workflow/dags/hello.py deleted file mode 100644 index a6ef2ae..0000000 --- a/workflow/dags/hello.py +++ /dev/null @@ -1,11 +0,0 @@ -import datetime - -from airflow import DAG -from airflow.operators.empty import EmptyOperator - -with DAG( - dag_id="my_dag_name", - 
start_date=datetime.datetime(2021, 1, 1), - schedule="@daily", - ): - EmptyOperator(task_id="task" ) \ No newline at end of file -- 2.17.1 From b3d7dda0c1d106dbf3fff420ec64e377c9e9abf1 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Oct 2024 17:04:44 +0100 Subject: [PATCH 33/42] DAG to build the graph from a delta --- .../dnet/build_openaire_graph_incremental.py | 125 ++++++++++++++++++ workflow/dnet/dag_utils.py | 4 + 2 files changed, 129 insertions(+) create mode 100644 workflow/dnet/build_openaire_graph_incremental.py diff --git a/workflow/dnet/build_openaire_graph_incremental.py b/workflow/dnet/build_openaire_graph_incremental.py new file mode 100644 index 0000000..b6d5f80 --- /dev/null +++ b/workflow/dnet/build_openaire_graph_incremental.py @@ -0,0 +1,125 @@ +from __future__ import annotations + +from airflow.decorators import dag +from airflow.models.baseoperator import chain +from airflow.models.param import Param +from airflow.operators.trigger_dagrun import TriggerDagRunOperator +from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator + +from spark_configurator import SparkConfigurator +import dag_utils + + +@dag( + dag_id="build_openaire_graph_incremental", + dag_display_name="Build the OpenAIRE graph incrementally", + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint"), + "GRAPH_PATH": Param("s3a://graph/tmp/prod_provision/graph", type='string', description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir", type='string', description=""), + "IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string', + description=""), + "DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description=""), + "ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description=""), + "DELTA_PATH": Param("s3a://graph/data/delta", type='string', description=""), + }, + tags=["openaire"] +) +def build_new_graph(): + chain( + SparkKubernetesOperator( + task_id='raw_graph', + task_display_name="Generate Raw Graph", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="rawgraph-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.graph.raw.CopyIncrementalOafSparkApplication", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--inputPath", "{{ dag_run.conf.get('DELTA_PATH') }}", + "--graphOutputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["raw"] + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ), + + SparkKubernetesOperator( + task_id='grouped_graph', + task_display_name="Generate Grouped-by-id Graph", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="groupedgraph-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphInputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["raw"], + "--outputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["grouped"], + "--checkpointPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}/grouped_entities", + "--isLookupUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--filterInvisible", "false" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ), + + SparkKubernetesOperator( + task_id='copygroupedrels', + task_display_name="Copy 
relations to Grouped-by-id Graph", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copygroupedrels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.merge.CopyEntitiesSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphInputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["raw"], + "--outputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["grouped"], + "--entities", "relation", + "--format", "text" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) + + # , TriggerDagRunOperator( + # task_id="dedup", + # task_display_name="Deduplicate Research Results", + # trigger_dag_id="results_deduplication", + # wait_for_completion=True, + # conf={ + # "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + # + # "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["inference"], + # "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"], + # "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup", + # "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + # "DEDUP_CONFIG_ID": "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}" + # } + # ), + # TriggerDagRunOperator( + # task_id="consistency", + # task_display_name="Enforce Consistency of Graph", + # trigger_dag_id="consistency_graph", + # wait_for_completion=True, + # + # conf={ + # "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + # + # "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"], + # "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"], + # "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup", + # "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}" + # } + # ), + # TriggerDagRunOperator( + # task_id="orcid_enrichment", + # task_display_name="Enrich Graph with ORCID data", + # trigger_dag_id="orcid_enrichment_graph", + # wait_for_completion=True, + # + # conf={ + # "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + # + # "ORCID_PATH": "{{ dag_run.conf.get('ORCID_PATH') }}", + # "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"], + # "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["orcid_enhancement"], + # "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/orcid_enrichment" + # } + # ) + ) + + +build_new_graph() diff --git a/workflow/dnet/dag_utils.py b/workflow/dnet/dag_utils.py index 88b611e..4c59f3b 100644 --- a/workflow/dnet/dag_utils.py +++ b/workflow/dnet/dag_utils.py @@ -2,6 +2,10 @@ from airflow.hooks.base import BaseHook from airflow.providers.amazon.aws.hooks.s3 import S3Hook BUILD_PHASES = { + "raw": "01_graph_raw", + "grouped": "02_graph_grouped", + "clean": "03_graph_cleaned", + "inference": "05_graph_inferred", "dedup": "06_graph_dedup", "consistency": "07_graph_consistent", -- 2.17.1 From 6c25db9ac25e45c688001d191b1c7897afdf8b5d Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Oct 2024 21:04:17 +0100 Subject: [PATCH 34/42] Clean DAG --- workflow/dnet/clean.py | 107 +++++++++++++++++++++++++++++++++++++ workflow/dnet/dag_utils.py | 15 ++++++ 2 files changed, 122 insertions(+) create mode 100644 workflow/dnet/clean.py diff --git a/workflow/dnet/clean.py b/workflow/dnet/clean.py new file mode 100644 index 0000000..93ca057 --- /dev/null +++ b/workflow/dnet/clean.py @@ -0,0 +1,107 @@ +import os +from datetime import 
timedelta + +from airflow.decorators import dag +from airflow.models.baseoperator import chain +from airflow.models.param import Param +from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator + +import dag_utils +from spark_configurator import SparkConfigurator + +EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) + +default_args = { + "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), + "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), + "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))) +} + + +@dag( + dag_id="clean_graph", + dag_display_name="Cleaning of Graph", + default_args=default_args, + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), + "POSTGRES_CONN_ID": Param("postgres_conn", type='string', description="Airflow connection of S3 endpoint"), + + "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/02_graph_grouped", type='string', description=""), + "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/03_graph_cleaned", type='string', description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/clean", type='string', description=""), + "IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string', + description=""), + "COUNTRY": Param("NL", type='string', description=""), + "SHOULD_CLEAN": Param("false", type='string', description=""), + "CONTEXT_ID": Param("sobigdata", type='string', description=""), + "VERIFY_PARAM": Param("gcube", type='string', description=""), + "VERIFY_COUNTRY_PARAM": Param("10.17632;10.5061", type='string', description=""), + "COLLECTED_FROM": Param("NARCIS", type='string', description="") + }, + tags=["openaire"] +) +def clean_graph_dag(): + getdatasourcefromcountry = SparkKubernetesOperator( + task_id='getdatasourcefromcountry', + task_display_name="Propagate Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="getdatasourcefromcountry-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.graph.clean.GetDatasourceFromCountry", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--inputPath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--country", "{{ dag_run.conf.get('COUNTRY') }}", + "--workingDir", "{{ dag_run.conf.get('WRKDIR_PATH') }}/working/hostedby" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) + + masterduplicateaction = SparkKubernetesOperator( + task_id='masterduplicateaction', + task_display_name="MasterDuplicateAction", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="masterduplicateaction-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.graph.clean.MasterDuplicateAction", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--hdfsNameNode", "s3a://graph/", + "--hdfsPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}/masterduplicate", + "--postgresUrl", "jdbc:postgresql://{{ conn.get(dag_run.conf.get('POSTGRES_CONN_ID')).host }}:{{ conn.get(dag_run.conf.get('POSTGRES_CONN_ID')).port }}/dnet_openaireplus", + "--postgresUser", "{{ conn.get(dag_run.conf.get('POSTGRES_CONN_ID')).login }}", + "--postgresPassword", "{{ conn.get(dag_run.conf.get('POSTGRES_CONN_ID')).password }}" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) + + clean_tasks = [] + for entity in dag_utils.GRAPH_ENTITIES: + clean_tasks.append(SparkKubernetesOperator( 
+ task_id='masterduplicateaction', + task_display_name="MasterDuplicateAction", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="cleansparkjob-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.graph.clean.CleanGraphSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=[ + "--inputPath", "{{ dag_run.conf.get('INPUT_PATH') }}/" + entity, + "--outputPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}/" + entity, + "--graphTableClassName", dag_utils.GRAPH_ENTITIES_CLASS_NAMES[entity], + "--isLookupUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}", + "--contextId", "{{ dag_run.conf.get('CONTEXT_ID') }}", + "--verifyParam", "{{ dag_run.conf.get('VERIFY_PARAM') }}", + "--country", "{{ dag_run.conf.get('COUNTRY') }}", + "--verifyCountryParam", "{{ dag_run.conf.get('VERIFY_COUNTRY_PARAM') }}", + "--hostedBy", "{{ dag_run.conf.get('WRKDIR_PATH') }}/working/hostedby", + "--collectedfrom", "{{ dag_run.conf.get('COLLECTED_FROM') }}", + "--masterDuplicatePath", "{{ dag_run.conf.get('WRKDIR_PATH') }}/masterduplicate", + "--deepClean", "{{ dag_run.conf.get('SHOULD_CLEAN') }}" + + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + )) + + chain([getdatasourcefromcountry, masterduplicateaction], clean_tasks) + + +clean_graph_dag() diff --git a/workflow/dnet/dag_utils.py b/workflow/dnet/dag_utils.py index 4c59f3b..13b0fe5 100644 --- a/workflow/dnet/dag_utils.py +++ b/workflow/dnet/dag_utils.py @@ -26,3 +26,18 @@ def get_default_bucket(): return hook.service_config['bucket_name'] except KeyError: return '' + + +GRAPH_ENTITIES = ["publication", "dataset", "otherresearchproduct", "software", "datasource", "organization", "project", "relation"] + + +GRAPH_ENTITIES_CLASS_NAMES = { + "publication": "eu.dnetlib.dhp.schema.oaf.Publication", + "dataset": "eu.dnetlib.dhp.schema.oaf.Dataset", + "otherresearchproduct": "eu.dnetlib.dhp.schema.oaf.OtherResearchProduct", + "software": "eu.dnetlib.dhp.schema.oaf.Software", + "datasource": "eu.dnetlib.dhp.schema.oaf.Datasource", + "organization": "eu.dnetlib.dhp.schema.oaf.Organization", + "project": "eu.dnetlib.dhp.schema.oaf.Project", + "relation": "eu.dnetlib.dhp.schema.oaf.Relation" +} \ No newline at end of file -- 2.17.1 From 7af06fbda5bfaff63c3a05ac3b2f2b89982d3d37 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Oct 2024 21:10:38 +0100 Subject: [PATCH 35/42] Clean DAG --- workflow/dnet/clean.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/workflow/dnet/clean.py b/workflow/dnet/clean.py index 93ca057..d6b0e48 100644 --- a/workflow/dnet/clean.py +++ b/workflow/dnet/clean.py @@ -24,7 +24,7 @@ default_args = { default_args=default_args, params={ "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), - "POSTGRES_CONN_ID": Param("postgres_conn", type='string', description="Airflow connection of S3 endpoint"), + "POSTGRES_CONN_ID": Param("postgres_conn", type='string', description=""), "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/02_graph_grouped", type='string', description=""), "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/03_graph_cleaned", type='string', description=""), @@ -43,7 +43,7 @@ default_args = { def clean_graph_dag(): getdatasourcefromcountry = SparkKubernetesOperator( task_id='getdatasourcefromcountry', - task_display_name="Propagate Relations", + task_display_name="Get datasource from Country", namespace='dnet-spark-jobs', template_spec=SparkConfigurator( 
name="getdatasourcefromcountry-{{ ds }}-{{ task_instance.try_number }}", @@ -101,7 +101,7 @@ def clean_graph_dag(): kubernetes_conn_id="kubernetes_default" )) - chain([getdatasourcefromcountry, masterduplicateaction], clean_tasks) + chain(getdatasourcefromcountry, masterduplicateaction, clean_tasks) clean_graph_dag() -- 2.17.1 From b01331d4d0504175e251802255f4e254291bae8b Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Oct 2024 21:11:33 +0100 Subject: [PATCH 36/42] Clean DAG --- workflow/dnet/clean.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/dnet/clean.py b/workflow/dnet/clean.py index d6b0e48..828ff07 100644 --- a/workflow/dnet/clean.py +++ b/workflow/dnet/clean.py @@ -76,7 +76,7 @@ def clean_graph_dag(): clean_tasks = [] for entity in dag_utils.GRAPH_ENTITIES: clean_tasks.append(SparkKubernetesOperator( - task_id='masterduplicateaction', + task_id='cleansparkjob_' + entity, task_display_name="MasterDuplicateAction", namespace='dnet-spark-jobs', template_spec=SparkConfigurator( -- 2.17.1 From 9058dbb9575089d55d562c8ecf07d6687224a591 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Mon, 28 Oct 2024 21:12:23 +0100 Subject: [PATCH 37/42] Clean DAG --- workflow/dnet/clean.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/workflow/dnet/clean.py b/workflow/dnet/clean.py index 828ff07..c7d69fb 100644 --- a/workflow/dnet/clean.py +++ b/workflow/dnet/clean.py @@ -77,7 +77,7 @@ def clean_graph_dag(): for entity in dag_utils.GRAPH_ENTITIES: clean_tasks.append(SparkKubernetesOperator( task_id='cleansparkjob_' + entity, - task_display_name="MasterDuplicateAction", + task_display_name="Clean " + entity, namespace='dnet-spark-jobs', template_spec=SparkConfigurator( name="cleansparkjob-{{ ds }}-{{ task_instance.try_number }}", @@ -101,7 +101,9 @@ def clean_graph_dag(): kubernetes_conn_id="kubernetes_default" )) - chain(getdatasourcefromcountry, masterduplicateaction, clean_tasks) + chain(getdatasourcefromcountry, + #masterduplicateaction, + clean_tasks) clean_graph_dag() -- 2.17.1 From 004be2e97fd8f668d25a245768c3a763f9c554de Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 29 Oct 2024 14:39:01 +0100 Subject: [PATCH 38/42] Add resolvereletion step --- .../dnet/build_openaire_graph_incremental.py | 46 +++++++++++++++++++ workflow/dnet/dag_utils.py | 1 + 2 files changed, 47 insertions(+) diff --git a/workflow/dnet/build_openaire_graph_incremental.py b/workflow/dnet/build_openaire_graph_incremental.py index b6d5f80..3b05d79 100644 --- a/workflow/dnet/build_openaire_graph_incremental.py +++ b/workflow/dnet/build_openaire_graph_incremental.py @@ -72,6 +72,52 @@ def build_new_graph(): "--format", "text" ]).get_configuration(), kubernetes_conn_id="kubernetes_default" + ), + + TriggerDagRunOperator( + task_id="clean_graph", + task_display_name="Clean Results", + trigger_dag_id="clean_graph", + wait_for_completion=True, + conf={ + "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}", + + "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["grouped"], + "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["clean"], + "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup", + "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}" + } + ), + + SparkKubernetesOperator( + task_id='resolverels', + task_display_name="Resolve Relations", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="resolverels-{{ ds }}-{{ task_instance.try_number }}", 
+ mainClass="eu.dnetlib.dhp.oa.graph.resolution.SparkResolveRelationById", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphBasePath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["clean"], + "--targetPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["resolve"], + "--relationPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"] + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ), + + SparkKubernetesOperator( + task_id='copyresolveents', + task_display_name="Copy entities to Resolved Graph", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copyresolveents-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.merge.CopyEntitiesSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphInputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["clean"], + "--outputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["resolve"], + "--entities", ",".join([item for item in dag_utils.GRAPH_ENTITIES if item != "relation"]), + "--format", "text" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" ) # , TriggerDagRunOperator( diff --git a/workflow/dnet/dag_utils.py b/workflow/dnet/dag_utils.py index 13b0fe5..18d212d 100644 --- a/workflow/dnet/dag_utils.py +++ b/workflow/dnet/dag_utils.py @@ -5,6 +5,7 @@ BUILD_PHASES = { "raw": "01_graph_raw", "grouped": "02_graph_grouped", "clean": "03_graph_cleaned", + "resolve": "04_graph_resolved", "inference": "05_graph_inferred", "dedup": "06_graph_dedup", -- 2.17.1 From c0788fcd10e8ad183f0c5466a8a88512ce5b3146 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Tue, 29 Oct 2024 14:49:26 +0100 Subject: [PATCH 39/42] Add resolvereletion step --- workflow/dnet/build_openaire_graph_incremental.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/workflow/dnet/build_openaire_graph_incremental.py b/workflow/dnet/build_openaire_graph_incremental.py index 3b05d79..39e2223 100644 --- a/workflow/dnet/build_openaire_graph_incremental.py +++ b/workflow/dnet/build_openaire_graph_incremental.py @@ -6,8 +6,8 @@ from airflow.models.param import Param from airflow.operators.trigger_dagrun import TriggerDagRunOperator from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator -from spark_configurator import SparkConfigurator import dag_utils +from spark_configurator import SparkConfigurator @dag( @@ -99,7 +99,8 @@ def build_new_graph(): jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', arguments=["--graphBasePath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["clean"], "--targetPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["resolve"], - "--relationPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"] + "--relationPath", + "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"] + "/relation" ]).get_configuration(), kubernetes_conn_id="kubernetes_default" ), @@ -112,7 +113,8 @@ def build_new_graph(): name="copyresolveents-{{ ds }}-{{ task_instance.try_number }}", mainClass="eu.dnetlib.dhp.oa.merge.CopyEntitiesSparkJob", jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', - arguments=["--graphInputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["clean"], + arguments=["--graphInputPath", + 
"{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["clean"], "--outputPath", "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["resolve"], "--entities", ",".join([item for item in dag_utils.GRAPH_ENTITIES if item != "relation"]), "--format", "text" -- 2.17.1 From ac2fbbb9f90fb20f7d03397cba0852f4db536450 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 6 Nov 2024 14:04:33 +0100 Subject: [PATCH 40/42] Add jupyterhub --- main.tf | 41 +++++++++++++++++------------- modules/jupyterhub/main.tf | 44 +++++++++++++++++++++++++++++++++ modules/jupyterhub/providers.tf | 9 +++++++ modules/jupyterhub/variables.tf | 25 +++++++++++++++++++ 4 files changed, 102 insertions(+), 17 deletions(-) create mode 100644 modules/jupyterhub/main.tf create mode 100644 modules/jupyterhub/providers.tf create mode 100644 modules/jupyterhub/variables.tf diff --git a/main.tf b/main.tf index 4456f38..75b185e 100644 --- a/main.tf +++ b/main.tf @@ -1,24 +1,31 @@ module "minio" { - source = "./modules/minio" - kube_context = var.kube_context - namespace_prefix=var.namespace_prefix - buckets = var.minio_buckets + source = "./modules/minio" + kube_context = var.kube_context + namespace_prefix = var.namespace_prefix + buckets = var.minio_buckets } module "airflow" { - source = "./modules/airflow" - kube_context = var.kube_context - admin_user = var.admin_user - admin_password = var.admin_password - namespace_prefix= var.namespace_prefix - admin_hash = var.admin_hash - env = var.env - domain = var.domain - s3_endpoint = var.s3_endpoint - s3_key = var.s3_key - s3_secret = var.s3_secret - branch_name = var.dag_branch_name - dag_path= var.dag_path_name + source = "./modules/airflow" + kube_context = var.kube_context + admin_user = var.admin_user + admin_password = var.admin_password + namespace_prefix = var.namespace_prefix + admin_hash = var.admin_hash + env = var.env + domain = var.domain + s3_endpoint = var.s3_endpoint + s3_key = var.s3_key + s3_secret = var.s3_secret + branch_name = var.dag_branch_name + dag_path = var.dag_path_name } + +module "jupyterhub" { + source = "./modules/jupyterhub" + kube_context = var.kube_context + namespace_prefix = var.namespace_prefix + domain = var.domain +} diff --git a/modules/jupyterhub/main.tf b/modules/jupyterhub/main.tf new file mode 100644 index 0000000..0ab4763 --- /dev/null +++ b/modules/jupyterhub/main.tf @@ -0,0 +1,44 @@ +resource "helm_release" "jupyterhub" { + name = "jupyterhub" + chart = "jupyterhub" + repository = "https://hub.jupyter.org/helm-chart/" + create_namespace = "true" + namespace = "${var.namespace_prefix}spark-jobs" + dependency_update = "true" + version = "3.3.8" + + set { + name = "ingress.enabled" + value = "true" + } + + set { + name = "ingress.ingressClassName" + value = "nginx" + } + + set { + name = "ingress.hosts[0]" + value = "jupyter.${var.domain}" + } + + set { + name = "singleuser.image.name" + value = "jupyter/all-spark-notebook" + } + + set { + name = "singleuser.image.tag" + value = "spark-3.5.0" + } + + set { + name = "singleuser.cmd" + value = "start-notebook.py" + } + + set { + name = "singleuser.serviceAccountName" + value = "spark" + } +} \ No newline at end of file diff --git a/modules/jupyterhub/providers.tf b/modules/jupyterhub/providers.tf new file mode 100644 index 0000000..d88c184 --- /dev/null +++ b/modules/jupyterhub/providers.tf @@ -0,0 +1,9 @@ +terraform { + required_providers { + helm = { + } + + kubernetes = { + } + } +} diff --git a/modules/jupyterhub/variables.tf 
b/modules/jupyterhub/variables.tf new file mode 100644 index 0000000..1f07583 --- /dev/null +++ b/modules/jupyterhub/variables.tf @@ -0,0 +1,25 @@ +variable "env" { + type = string + default = "local" +} + +variable "kube_config" { + type = string + default = "~/.kube/config" +} + +variable "kube_context" { + type = string + default = "default" +} + +variable "namespace_prefix" { + type = string + default = "lot1-" +} + +variable "domain" { + type = string + default = "local-dataplatform" +} + -- 2.17.1 From 13ac9767c6534cbaf2b205c95f617c166fa1a722 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 6 Nov 2024 15:18:17 +0100 Subject: [PATCH 41/42] Orcid propagation step --- workflow/dnet/orcid_propagate.py | 55 ++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 workflow/dnet/orcid_propagate.py diff --git a/workflow/dnet/orcid_propagate.py b/workflow/dnet/orcid_propagate.py new file mode 100644 index 0000000..f8b73dd --- /dev/null +++ b/workflow/dnet/orcid_propagate.py @@ -0,0 +1,55 @@ +import os +from datetime import timedelta + +from airflow.decorators import dag +from airflow.models.param import Param +from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator + +from spark_configurator import SparkConfigurator + +EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6)) + +default_args = { + "execution_timeout": timedelta(days=EXECUTION_TIMEOUT), + "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)), + "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))) +} + + +@dag( + dag_id="orcid_propagation_graph", + dag_display_name="Propagate ORCID data in graph", + default_args=default_args, + params={ + "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"), + "ORCID_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string', description=""), + "INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string', description=""), + "OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/10_graph_propagated", type='string', + description=""), + "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/orcid_propagation", type='string', + description="") + }, + tags=["openaire"] +) +def orcid_propagation_dag(): + orcid_propagate = SparkKubernetesOperator( + task_id='PropagateGraphWithOrcid', + task_display_name="Propagate ORCID data", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="orcidpropagate-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.orcidtoresultfromsemrel.SparkPropagateOrcidAuthor", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--orcidPath", "{{ dag_run.conf.get('ORCID_PATH') }}", + "--graphPath", "{{ dag_run.conf.get('INPUT_PATH') }}", + "--targetPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}", + "--workingDir", "{{ dag_run.conf.get('WRKDIR_PATH') }}", + "--matchingSource", "graph" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) + + orcid_propagate + + +orcid_propagation_dag() -- 2.17.1 From 9e69ded5ef18bd1ef42f55f289c23d05dce82677 Mon Sep 17 00:00:00 2001 From: Giambattista Bloisi Date: Wed, 6 Nov 2024 16:07:34 +0100 Subject: [PATCH 42/42] Copy relations in ORCID enrichment DAG --- workflow/dnet/orcid_enrich.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/workflow/dnet/orcid_enrich.py 
b/workflow/dnet/orcid_enrich.py index c1e3bcd..b18f8bb 100644 --- a/workflow/dnet/orcid_enrich.py +++ b/workflow/dnet/orcid_enrich.py @@ -2,6 +2,7 @@ import os from datetime import timedelta from airflow.decorators import dag +from airflow.models.baseoperator import chain from airflow.models.param import Param from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator @@ -32,7 +33,7 @@ default_args = { tags=["openaire"] ) def orcid_enrichment_dag(): - orcid_enrich = SparkKubernetesOperator( + chain(SparkKubernetesOperator( task_id='EnrichGraphWithOrcidAuthors', task_display_name='Enrich Authors with ORCID', namespace='dnet-spark-jobs', @@ -47,9 +48,23 @@ def orcid_enrichment_dag(): "--master", "" ]).get_configuration(), kubernetes_conn_id="kubernetes_default" + ), + SparkKubernetesOperator( + task_id='copyorcidenrichrels', + task_display_name="Copy relations to ORCID Enriched graph", + namespace='dnet-spark-jobs', + template_spec=SparkConfigurator( + name="copygroupedrels-{{ ds }}-{{ task_instance.try_number }}", + mainClass="eu.dnetlib.dhp.oa.merge.CopyEntitiesSparkJob", + jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar', + arguments=["--graphInputPath", "{{ dag_run.conf.get('INPUT_PATH') }}/", + "--outputPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}/", + "--entities", "relation", + "--format", "text" + ]).get_configuration(), + kubernetes_conn_id="kubernetes_default" + ) ) - orcid_enrich - orcid_enrichment_dag() -- 2.17.1
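
The patches above move the executor sizing out of the individual DAG definitions and into SparkConfigurator, whose defaults become executor_cores=8, executor_memory="16G" and executor_memoryOverhead="8G". Those defaults can still be overridden per task at the call site. The following is an illustrative sketch only, not part of the patch series: the DAG id, task id and Spark main class are hypothetical placeholders; only the SparkConfigurator keywords shown in spark_configurator.py are assumed.

# Illustrative sketch (not part of the patches): overriding the new
# SparkConfigurator defaults for a lighter job. DAG id, task id and
# mainClass below are hypothetical placeholders.
from airflow.decorators import dag
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator

from spark_configurator import SparkConfigurator


@dag(dag_id="example_small_spark_job", tags=["openaire"])
def example_small_spark_job():
    SparkKubernetesOperator(
        task_id="SmallJob",
        namespace="dnet-spark-jobs",
        template_spec=SparkConfigurator(
            name="smalljob-{{ ds }}-{{ task_instance.try_number }}",
            mainClass="eu.dnetlib.dhp.example.ExampleSparkJob",  # hypothetical class
            jarLocation="s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar",
            arguments=["--inputPath", "{{ dag_run.conf.get('INPUT_PATH') }}"],
            executor_cores=2,             # override the new default of 8
            executor_memory="4G",         # override the new default of 16G
            executor_memoryOverhead="2G", # override the new default of 8G
            executor_instances=1,
        ).get_configuration(),
        kubernetes_conn_id="kubernetes_default",
    )


example_small_spark_job()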
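
clean.py fans the per-entity CleanGraphSparkJob tasks out in parallel by passing a plain list to chain(). As a standalone illustration of that pattern (the task ids here are placeholders, not taken from the patches): chain() accepts a mix of single operators and lists, a single task adjacent to a list depends on, or is depended on by, every element of that list.

# Illustrative sketch of the chain() fan-out pattern used in clean.py:
# one upstream task, a list of parallel tasks, one downstream task.
from airflow.decorators import dag
from airflow.models.baseoperator import chain
from airflow.operators.empty import EmptyOperator


@dag(dag_id="example_chain_fanout", tags=["openaire"])
def example_chain_fanout():
    prepare = EmptyOperator(task_id="prepare")
    per_entity = [
        EmptyOperator(task_id=f"clean_{entity}")
        for entity in ("publication", "dataset", "software")
    ]
    finish = EmptyOperator(task_id="finish")

    # prepare runs first, the clean_* tasks run in parallel, finish runs last
    chain(prepare, per_entity, finish)


example_chain_fanout()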
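
Because the hard-coded s3a:// paths are replaced with templated lookups such as "{{ dag_run.conf.get('INPUT_PATH') }}", each DAG in this series is driven by its run configuration, falling back to the declared Params when a key is absent. A minimal sketch of wiring one of them into a wrapper DAG, mirroring the commented-out triggers in build_openaire_graph_incremental.py; the wrapper DAG id is a placeholder and the literal conf values simply restate the defaults that the patches removed from the dedup DAG.

# Minimal sketch (placeholder wrapper DAG): triggering the results_deduplication
# DAG with an explicit run configuration, as the commented-out triggers in
# build_openaire_graph_incremental.py do.
from airflow.decorators import dag
from airflow.operators.trigger_dagrun import TriggerDagRunOperator


@dag(dag_id="example_trigger_dedup", tags=["openaire"])
def example_trigger_dedup():
    TriggerDagRunOperator(
        task_id="dedup",
        trigger_dag_id="results_deduplication",
        wait_for_completion=True,
        conf={
            "S3_CONN_ID": "s3_conn",
            "INPUT_PATH": "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
            "OUTPUT_PATH": "s3a://graph/tmp/prod_provision/graph/06_graph_dedup",
            "WRKDIR_PATH": "s3a://graph/tmp/prod_provision/working_dir/dedup",
            "IS_LOOKUP_URL": "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
            "DEDUP_CONFIG_ID": "dedup-result-decisiontree-v4",
        },
    )


example_trigger_dedup()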