initial stage

Giambattista Bloisi 2024-03-27 22:48:13 +01:00
parent 33cb4ce636
commit 7c919f5278
2 changed files with 28 additions and 165 deletions


@@ -1,159 +0,0 @@
from __future__ import annotations

import gzip
import io
import json
import os
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers

from EOSC_indexes import mappings

EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

ENTITIES = ["interoperability", "services", "training"]

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "EOSC_CATALOG_BUCKET": "eosc-portal-import",
        "BATCH_LOADERS_NUM": 2
    },
    tags=["lot1"],
)
def import_EOSC_catalog():
    @task
    def create_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        for entity in ENTITIES:
            if client.indices.exists(entity):
                client.indices.delete(entity)
            client.indices.create(entity, {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 1,
                        "refresh_interval": -1,
                        "codec": "zstd_no_dict",
                        "replication.type": "SEGMENT",
                        "translog.flush_threshold_size": "2048MB",
                        "mapping.ignore_malformed": "true"
                    }
                },
                "mappings": mappings[entity]
            })

    def compute_batches(ds=None, **kwargs):
        pieces = []
        for entity in ENTITIES:
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
            keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=f'{entity}/')
            for key in keys:
                pieces.append((entity, key))

        def split_list(list_a, chunk_size):
            for i in range(0, len(list_a), chunk_size):
                yield {"files": list_a[i:i + chunk_size]}

        num_batches = len(pieces) // kwargs["params"]["BATCH_LOADERS_NUM"]
        if num_batches > 0:
            return [*split_list(pieces, num_batches)]
        return [*split_list(pieces, len(pieces))]

    @task
    def bulk_load(files: list[(str, str)], **kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})

        def _generate_data():
            for (entity, key) in files:
                print(f'{entity}: {key}')
                s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
                with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
                    buff = io.BufferedReader(gzipfile)
                    for line in buff:
                        data = json.loads(line)
                        data['_index'] = entity
                        data['_id'] = data['id']
                        yield data

        succeeded = []
        failed = []
        for success, item in helpers.parallel_bulk(client, actions=_generate_data()):
            if success:
                succeeded.append(item)
            else:
                failed.append(item)

        if len(failed) > 0:
            print(f"There were {len(failed)} errors:")
            for item in failed:
                print(item["index"]["error"])
        if len(succeeded) > 0:
            print(f"Bulk-inserted {len(succeeded)} items (streaming_bulk).")

    @task
    def close_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        for entity in ENTITIES:
            client.indices.refresh(entity)

    parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)

    chain(
        create_indexes.override(task_id="create_indexes")(),
        parallel_batches,
        bulk_load.expand_kwargs(parallel_batches.output),
        close_indexes.override(task_id="close_indexes")()
    )


import_EOSC_catalog()


@@ -75,10 +75,11 @@ for config_name, config in configs.items():
            })
            for entity in kwargs["params"]["ENTITIES"]:
-                if client.indices.exists(entity):
-                    client.indices.delete(entity)
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+                if client.indices.exists(indexname):
+                    client.indices.delete(indexname)
-                client.indices.create(entity, {
+                client.indices.create(indexname, {
                    "settings": {
                        "index": {
                            "number_of_shards": 40,
@@ -146,10 +147,11 @@ for config_name, config in configs.items():
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
            for (entity, key) in files:
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                if hook.check_for_key(key=f"{key}.PROCESSED", bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"]):
                    print(f'Skipping {entity}: {key}')
                    continue
-                print(f'Processing {entity}: {key}')
+                print(f'Processing {indexname}: {key}')
                s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
                with s3_obj.get()["Body"] as body:
                    with gzip.GzipFile(fileobj=body) as gzipfile:
@@ -157,7 +159,7 @@ for config_name, config in configs.items():
                        buff = io.BufferedReader(gzipfile)
                        for line in buff:
                            data = json.loads(line)
-                            data['_index'] = entity
+                            data['_index'] = indexname
                            if entity in transform_entities:
                                data = transform_entities[entity](data)
                            yield data
@@ -204,7 +206,27 @@ for config_name, config in configs.items():
                timeout=180
            )
            for entity in kwargs["params"]["ENTITIES"]:
-                client.indices.refresh(entity)
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+                client.indices.refresh(indexname)
+            # update aliases
+            for entity in kwargs["params"]["ENTITIES"]:
+                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+                client.indices.update_aliases(
+                    actions=[
+                        {"remove": {"index": f"{entity}_*", "alias": entity}},
+                        {"add": {"index": indexname, "alias": entity}},
+                    ]
+                )
+            # update "allresources" alias
+            actions = []
+            for entity in kwargs["params"]["ENTITIES"]:
+                if entity in ['products', 'services', 'training', 'interoperability']:
+                    actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
actions.append({"add": {"index": "allresources", "alias": entity}})
+            if len(actions) > 0:
+                client.indices.update_aliases(
+                    actions=actions
+                )
        parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)