initial stage

Giambattista Bloisi 2024-03-27 22:48:13 +01:00
parent 33cb4ce636
commit 7c919f5278
2 changed files with 28 additions and 165 deletions


@@ -1,159 +0,0 @@
from __future__ import annotations

import gzip
import io
import json
import os
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers

from EOSC_indexes import mappings

EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

ENTITIES = ["interoperability", "services", "training"]

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "EOSC_CATALOG_BUCKET": "eosc-portal-import",
        "BATCH_LOADERS_NUM": 2
    },
    tags=["lot1"],
)
def import_EOSC_catalog():
    @task
    def create_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        for entity in ENTITIES:
            if client.indices.exists(entity):
                client.indices.delete(entity)
            client.indices.create(entity, {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 1,
                        "refresh_interval": -1,
                        "codec": "zstd_no_dict",
                        "replication.type": "SEGMENT",
                        "translog.flush_threshold_size": "2048MB",
                        "mapping.ignore_malformed": "true"
                    }
                },
                "mappings": mappings[entity]
            })

    def compute_batches(ds=None, **kwargs):
        pieces = []
        for entity in ENTITIES:
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
            keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=f'{entity}/')
            for key in keys:
                pieces.append((entity, key))

        def split_list(list_a, chunk_size):
            for i in range(0, len(list_a), chunk_size):
                yield {"files": list_a[i:i + chunk_size]}

        num_batches = len(pieces) // kwargs["params"]["BATCH_LOADERS_NUM"]
        if num_batches > 0:
            return [*split_list(pieces, num_batches)]
        return [*split_list(pieces, len(pieces))]

    @task
    def bulk_load(files: list[(str, str)], **kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})

        def _generate_data():
            for (entity, key) in files:
                print(f'{entity}: {key}')
                s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
                with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
                    buff = io.BufferedReader(gzipfile)
                    for line in buff:
                        data = json.loads(line)
                        data['_index'] = entity
                        data['_id'] = data['id']
                        yield data

        succeeded = []
        failed = []
        for success, item in helpers.parallel_bulk(client, actions=_generate_data()):
            if success:
                succeeded.append(item)
            else:
                failed.append(item)

        if len(failed) > 0:
            print(f"There were {len(failed)} errors:")
            for item in failed:
                print(item["index"]["error"])
        if len(succeeded) > 0:
            print(f"Bulk-inserted {len(succeeded)} items (streaming_bulk).")

    @task
    def close_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        for entity in ENTITIES:
            client.indices.refresh(entity)

    parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)

    chain(
        create_indexes.override(task_id="create_indexes")(),
        parallel_batches,
        bulk_load.expand_kwargs(parallel_batches.output),
        close_indexes.override(task_id="close_indexes")()
    )


import_EOSC_catalog()


@@ -75,10 +75,11 @@ for config_name, config in configs.items():
            })
            for entity in kwargs["params"]["ENTITIES"]:
-                if client.indices.exists(entity):
-                    client.indices.delete(entity)
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+                if client.indices.exists(indexname):
+                    client.indices.delete(indexname)
-                client.indices.create(entity, {
+                client.indices.create(indexname, {
                    "settings": {
                        "index": {
                            "number_of_shards": 40,
@@ -146,10 +147,11 @@ for config_name, config in configs.items():
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
            for (entity, key) in files:
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                if hook.check_for_key(key=f"{key}.PROCESSED", bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"]):
                    print(f'Skipping {entity}: {key}')
                    continue
-                print(f'Processing {entity}: {key}')
+                print(f'Processing {indexname}: {key}')
                s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
                with s3_obj.get()["Body"] as body:
                    with gzip.GzipFile(fileobj=body) as gzipfile:
@@ -157,7 +159,7 @@ for config_name, config in configs.items():
                        buff = io.BufferedReader(gzipfile)
                        for line in buff:
                            data = json.loads(line)
-                            data['_index'] = entity
+                            data['_index'] = indexname
                            if entity in transform_entities:
                                data = transform_entities[entity](data)
                            yield data
@@ -204,7 +206,27 @@ for config_name, config in configs.items():
                timeout=180
            )
            for entity in kwargs["params"]["ENTITIES"]:
-                client.indices.refresh(entity)
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+                client.indices.refresh(indexname)
+            # update aliases
+            for entity in kwargs["params"]["ENTITIES"]:
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+                client.indices.update_aliases(
+                    actions=[
+                        {"remove": {"index": f"{entity}_*", "alias": entity}},
+                        {"add": {"index": indexname, "alias": entity}},
+                    ]
+                )
+            # update "allresources" alias
+            actions = []
+            for entity in kwargs["params"]["ENTITIES"]:
+                if entity in ['products', 'services', 'training', 'interoperability']:
+                    actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
actions.append({"add": {"index": "allresources", "alias": entity}})
+            if len(actions) > 0:
+                client.indices.update_aliases(
+                    actions=actions
+                )
        parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)