# lot1-kickoff/airflow/dags/import_EOSC_catalog.py

from __future__ import annotations

import gzip
import io
import json
import os
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook

from opensearchpy import OpenSearch, helpers
from EOSC_indexes import mappings

# Per-task execution timeout in hours; overridable through the environment.
EXECUTION_TIMEOUT = int(os.environ.get("EXECUTION_TIMEOUT", 6))

# Catalogue entity types. Each name is used both as the S3 key prefix that is
# listed for input files and as the name of the OpenSearch index to (re)build.
ENTITIES = ["interoperability", "services", "training"]

# Defaults applied to every task of the DAG; retry count and retry delay can
# be tuned through environment variables.
default_args = dict(
    execution_timeout=timedelta(hours=EXECUTION_TIMEOUT),
    retries=int(os.environ.get("DEFAULT_TASK_RETRIES", 1)),
    retry_delay=timedelta(
        seconds=int(os.environ.get("DEFAULT_RETRY_DELAY_SECONDS", 60)),
    ),
)

def _opensearch_client(conn_id):
    """Build an OpenSearch client from the Airflow connection ``conn_id``.

    Host, port, login and password are read from the Airflow connection.
    """
    conn = BaseHook.get_connection(conn_id)
    return OpenSearch(
        hosts=[{"host": conn.host, "port": conn.port}],
        http_auth=(conn.login, conn.password),
        use_ssl=True,
        # NOTE(review): TLS certificate verification is disabled (and the
        # warning silenced). Confirm this is acceptable for the target cluster.
        verify_certs=False,
        ssl_show_warn=False,
        pool_maxsize=20,
    )


def _split_into_batches(pieces, loaders_num):
    """Split ``pieces`` into at most ``loaders_num`` contiguous batches.

    Args:
        pieces: list of ``(entity, key)`` pairs to distribute.
        loaders_num: desired number of parallel loaders; values below 1 are
            treated as 1.

    Returns:
        A list of ``{"files": [...]}`` dicts, one per batch, in the shape
        expected by ``bulk_load.expand_kwargs``. Empty when ``pieces`` is
        empty.
    """
    if not pieces:
        return []
    loaders_num = max(1, int(loaders_num))
    # Ceiling division: never produce more batches than requested loaders,
    # and always at least one item per batch.
    chunk_size = (len(pieces) + loaders_num - 1) // loaders_num
    return [
        {"files": pieces[start:start + chunk_size]}
        for start in range(0, len(pieces), chunk_size)
    ]


@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "EOSC_CATALOG_BUCKET": "eosc-portal-import",
        "BATCH_LOADERS_NUM": 2
    },
    tags=["lot1"],
)
def import_EOSC_catalog():
    """Rebuild the catalogue indexes in OpenSearch from gzipped dumps on S3.

    Steps: drop and recreate one index per entity, list the input files and
    group them into batches, bulk-load every batch in a mapped task, then
    refresh the indexes.
    """

    @task
    def create_indexes(**kwargs):
        """Drop (if present) and recreate one index per entity in ENTITIES."""
        client = _opensearch_client(kwargs["params"]["OPENSEARCH_CONN_ID"])

        for entity in ENTITIES:
            if client.indices.exists(index=entity):
                client.indices.delete(index=entity)
            client.indices.create(index=entity, body={
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 1,
                        # -1: no periodic refresh while loading; the indexes
                        # are refreshed explicitly by close_indexes.
                        "refresh_interval": -1,

                        "codec": "zstd_no_dict",
                        "replication.type": "SEGMENT",

                        "translog.flush_threshold_size": "2048MB",

                        "mapping.ignore_malformed": "true"
                    }

                },
                "mappings": mappings[entity]
            })

    def compute_batches(ds=None, **kwargs):
        """List the input files of every entity and group them into batches.

        Args:
            ds: Airflow logical date string; accepted but unused.
            **kwargs: Airflow context; ``params`` supplies ``S3_CONN_ID``,
                ``EOSC_CATALOG_BUCKET`` and ``BATCH_LOADERS_NUM``.

        Returns:
            A list of ``{"files": [(entity, key), ...]}`` dicts, at most
            ``BATCH_LOADERS_NUM`` of them, suitable for ``expand_kwargs``.
        """
        params = kwargs["params"]
        # One hook is enough for all entities.
        hook = S3Hook(params["S3_CONN_ID"], transfer_config_args={'use_threads': False})

        pieces = []
        for entity in ENTITIES:
            keys = hook.list_keys(bucket_name=params["EOSC_CATALOG_BUCKET"], prefix=f'{entity}/')
            pieces.extend((entity, key) for key in keys)

        # Always return kwargs dicts: a bare list of tuples cannot be fed to
        # expand_kwargs, which is what happened with fewer files than loaders.
        return _split_into_batches(pieces, params["BATCH_LOADERS_NUM"])

    @task
    def bulk_load(files: list[tuple[str, str]], **kwargs):
        """Stream the given gzipped NDJSON files from S3 into OpenSearch.

        Args:
            files: ``(entity, key)`` pairs; ``entity`` is the target index,
                ``key`` the S3 object to read.
            **kwargs: Airflow context; ``params`` supplies the connection ids
                and the bucket name.

        Raises:
            KeyError: if a document has no ``id`` field.
        """
        params = kwargs["params"]
        client = _opensearch_client(params["OPENSEARCH_CONN_ID"])
        hook = S3Hook(params["S3_CONN_ID"], transfer_config_args={'use_threads': False})
        bucket = params["EOSC_CATALOG_BUCKET"]

        def _generate_data():
            """Yield one bulk action per JSON line of every input file."""
            for (entity, key) in files:
                print(f'{entity}: {key}')
                s3_obj = hook.get_key(key, bucket_name=bucket)
                with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
                    for line in io.BufferedReader(gzipfile):
                        if not line.strip():
                            # Tolerate blank lines instead of failing in json.loads.
                            continue
                        data = json.loads(line)
                        data['_index'] = entity
                        data['_id'] = data['id']
                        yield data

        # Only count successes: keeping every response item in memory grows
        # without bound on large catalogues.
        succeeded = 0
        failed = []
        # NOTE(review): with the helper's default raise_on_error=True a failing
        # chunk raises instead of yielding (False, item), so `failed` is only
        # populated if that default is overridden.
        for success, item in helpers.parallel_bulk(client, actions=_generate_data()):
            if success:
                succeeded += 1
            else:
                failed.append(item)

        if failed:
            print(f"There were {len(failed)} errors:")
            for item in failed:
                print(item["index"]["error"])

        if succeeded:
            print(f"Bulk-inserted {succeeded} items (parallel_bulk).")

    @task
    def close_indexes(**kwargs):
        """Refresh every entity index so the loaded documents are searchable.

        NOTE(review): ``refresh_interval`` is left at -1 as set on creation;
        confirm that no periodic refresh is wanted after the import.
        """
        client = _opensearch_client(kwargs["params"]["OPENSEARCH_CONN_ID"])
        for entity in ENTITIES:
            client.indices.refresh(index=entity)

    parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)

    # create indexes -> compute batches -> one bulk_load per batch -> refresh
    chain(
        create_indexes.override(task_id="create_indexes")(),
        parallel_batches,
        bulk_load.expand_kwargs(parallel_batches.output),
        close_indexes.override(task_id="close_indexes")()
    )


import_EOSC_catalog()