simple test DAG

2024-03-18 10:11:51 +01:00 · 2024-03-18 10:11:51 +01:00 · fa3214dc2c
parent cb4f9c838a
commit fa3214dc2c
1 changed files with 190 additions and 0 deletions
--- a/airflow/dags/import_eosc_catalog.py
+++ b/airflow/dags/import_eosc_catalog.py
@ -0,0 +1,190 @@
+from __future__ import annotations
+
+import gzip
+import io
+import json
+import os
+import zipfile
+from datetime import timedelta
+
+import pendulum
+from airflow.decorators import dag
+from airflow.decorators import task
+from airflow.operators.python import PythonOperator
+from airflow.providers.amazon.aws.hooks.s3 import S3Hook
+from airflow.utils.file import TemporaryDirectory
+from airflow.utils.helpers import chain
+from airflow.models import Variable
+
+from opensearchpy import OpenSearch, helpers
+from opensearch_indexes import mappings
+
+S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME", "eosc-catalog")
+AWS_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
+EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
+OPENSEARCH_HOST = Variable.get("OPENSEARCH_URL", "opensearch-cluster.lot1-opensearch-cluster.svc.cluster.local")
+OPENSEARCH_URL = Variable.get("OPENSEARCH_URL", "https://opensearch-cluster.lot1-opensearch-cluster.svc.cluster.local:9200")
+OPENSEARCH_USER = Variable.get("OPENSEARCH_USER", "admin")
+OPENSEARCH_PASSWD = Variable.get("OPENSEARCH_PASSWORD", "admin")
+
+ENTITIES = [ "interoperability", "services", "training"]
+
+BULK_PARALLELISM = 2
+
+#
+
+default_args = {
+    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
+    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
+    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
+}
+
+
+def strip_prefix(s, p):
+    if s.startswith(p):
+        return s[len(p):]
+    else:
+        return s
+
+
+@dag(
+    schedule=None,
+    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
+    catchup=False,
+    default_args=default_args,
+    tags=["example", "async", "s3"],
+)
+def eosc_catalog_import():
+    @task
+    def unzip_to_s3(key: str, bucket: str):
+        hook = S3Hook(AWS_CONN_ID, transfer_config_args={'use_threads': False})
+
+        with TemporaryDirectory() as dwl_dir:
+            with TemporaryDirectory() as tmp_dir:
+                archive = f'{dwl_dir}/{key}'
+                hook.download_file(key=key, bucket_name=bucket, local_path=dwl_dir, preserve_file_name=True,
+                                   use_autogenerated_subdir=False)
+                with zipfile.ZipFile(archive, 'r') as zip_ref:
+                    zip_ref.extractall(tmp_dir)
+
+                for root, _, files in os.walk(tmp_dir):
+                    for file in files:
+                        if file == key:
+                            continue
+                        local_file_path = os.path.join(root, file)
+                        hook.load_file(local_file_path, strip_prefix(local_file_path, tmp_dir), S3_BUCKET_NAME,
+                                       replace=True)
+        return ""
+
+    @task
+    def create_indexes():
+        client = OpenSearch(
+            hosts=[{'host': OPENSEARCH_HOST, 'port': 9200}],
+            http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWD),
+            use_ssl=True,
+            verify_certs=False,
+            ssl_show_warn=False,
+            pool_maxsize=20
+        )
+
+        for entity in ENTITIES:
+            client.indices.delete(entity)
+            client.indices.create(entity, {
+                "settings": {
+                    "index": {
+                        "number_of_shards": 3,
+                        "number_of_replicas": 1,
+                        "refresh_interval": -1,
+
+                        "codec": "zstd_no_dict",
+                        "replication.type": "SEGMENT",
+
+                        "translog.flush_threshold_size": "2048MB",
+
+                        "mapping.ignore_malformed": "true"
+                    }
+
+                },
+                #"mappings": mappings[entity]
+            })
+
+    def compute_batches(ds=None, **kwargs):
+        pieces = []
+        for entity in ENTITIES:
+            hook = S3Hook(AWS_CONN_ID, transfer_config_args={'use_threads': False})
+            keys = hook.list_keys(bucket_name=S3_BUCKET_NAME, prefix=f'{entity}/')
+            for key in keys:
+                pieces.append((entity, key))
+
+        def split_list(list_a, chunk_size):
+            for i in range(0, len(list_a), chunk_size):
+                yield {"files": list_a[i:i + chunk_size]}
+
+        return list(split_list(pieces, len(pieces)//BULK_PARALLELISM))
+
+    @task
+    def bulk_load(files: list[(str, str)]):
+        client = OpenSearch(
+            hosts=[{'host': OPENSEARCH_HOST, 'port': 9200}],
+            http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWD),
+            use_ssl=True,
+            verify_certs=False,
+            ssl_show_warn=False,
+            pool_maxsize=20
+        )
+        hook = S3Hook(AWS_CONN_ID, transfer_config_args={'use_threads': False})
+
+        def _generate_data():
+            for (entity, key) in files:
+                print(f'{entity}: {key}')
+                s3_obj = hook.get_key(key, bucket_name=S3_BUCKET_NAME)
+                with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
+                    buff = io.BufferedReader(gzipfile)
+                    for line in buff:
+                        data = json.loads(line)
+                        data['_index'] = entity
+                        data['_id'] = data['id']
+                        yield data
+
+        succeeded = []
+        failed = []
+        for success, item in helpers.parallel_bulk(client, actions=_generate_data()):
+            if success:
+                succeeded.append(item)
+            else:
+                failed.append(item)
+
+        if len(failed) > 0:
+            print(f"There were {len(failed)} errors:")
+            for item in failed:
+                print(item["index"]["error"])
+
+        if len(succeeded) > 0:
+            print(f"Bulk-inserted {len(succeeded)} items (streaming_bulk).")
+
+
+
+    @task
+    def close_indexes():
+        client = OpenSearch(
+            hosts=[{'host': OPENSEARCH_HOST, 'port': 9200}],
+            http_auth=(OPENSEARCH_USER, OPENSEARCH_PASSWD),
+            use_ssl=True,
+            verify_certs=False,
+            ssl_show_warn=False,
+            pool_maxsize=20
+        )
+        for entity in ENTITIES:
+            client.indices.refresh(entity)
+
+    parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)
+
+    chain(
+        unzip_to_s3.override(task_id="unzip_to_s3")("dump.zip", S3_BUCKET_NAME),
+        create_indexes.override(task_id="create_indexes")(),
+        parallel_batches,
+        bulk_load.expand_kwargs(parallel_batches.output),
+        close_indexes.override(task_id="close_indexes")()
+    )
+
+eosc_catalog_import()