initial stage

This commit is contained in:
parent 33cb4ce636
commit 7c919f5278

@@ -1,159 +0,0 @@
from __future__ import annotations

import gzip
import io
import json
import os
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook

from opensearchpy import OpenSearch, helpers
from EOSC_indexes import mappings
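
# Runtime knobs: task timeout and retry behaviour come from environment variables;
# ENTITIES lists the EOSC record types this DAG imports.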
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
ENTITIES = ["interoperability", "services", "training"]

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
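
# The DAG is triggered manually (schedule=None); connection ids, the source bucket
# and the number of parallel loader batches are exposed as run-time params.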
@dag(
    schedule=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "EOSC_CATALOG_BUCKET": "eosc-portal-import",
        "BATCH_LOADERS_NUM": 2
    },
    tags=["lot1"],
)
def import_EOSC_catalog():
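    # Drop and recreate one OpenSearch index per entity before any data is loaded.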
    @task
    def create_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
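
        # Bulk-load friendly index settings: periodic refresh disabled, zstd
        # compression, segment replication, malformed field values ignored;
        # field mappings come from the EOSC_indexes module.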
        for entity in ENTITIES:
            if client.indices.exists(entity):
                client.indices.delete(entity)
            client.indices.create(entity, {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 1,
                        "refresh_interval": -1,

                        "codec": "zstd_no_dict",
                        "replication.type": "SEGMENT",

                        "translog.flush_threshold_size": "2048MB",

                        "mapping.ignore_malformed": "true"
                    }
                },
                "mappings": mappings[entity]
            })
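
    # Plain callable run through a PythonOperator below: collects every (entity, key)
    # pair under the entity prefixes of the catalog bucket and splits the list into
    # roughly BATCH_LOADERS_NUM chunks, one per mapped bulk_load task.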
    def compute_batches(ds=None, **kwargs):
        pieces = []
        for entity in ENTITIES:
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
            keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=f'{entity}/')
            for key in keys:
                pieces.append((entity, key))

        def split_list(list_a, chunk_size):
            for i in range(0, len(list_a), chunk_size):
                yield {"files": list_a[i:i + chunk_size]}

        num_batches = len(pieces) // kwargs["params"]["BATCH_LOADERS_NUM"]
        if num_batches > 0:
            return [*split_list(pieces, num_batches)]
        return [*split_list(pieces, len(pieces))]
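
    # Dynamically mapped task: each instance receives one batch of (entity, key)
    # pairs and streams the gzipped JSON-lines objects from S3 into OpenSearch.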
    @task
    def bulk_load(files: list[tuple[str, str]], **kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
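
        # Yield one bulk action per JSON line: the document goes to the index named
        # after its entity and reuses its own 'id' as the OpenSearch document id.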
        def _generate_data():
            for (entity, key) in files:
                print(f'{entity}: {key}')
                s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
                with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
                    buff = io.BufferedReader(gzipfile)
                    for line in buff:
                        data = json.loads(line)
                        data['_index'] = entity
                        data['_id'] = data['id']
                        yield data

        succeeded = []
        failed = []
        for success, item in helpers.parallel_bulk(client, actions=_generate_data()):
            if success:
                succeeded.append(item)
            else:
                failed.append(item)

        if len(failed) > 0:
            print(f"There were {len(failed)} errors:")
            for item in failed:
                print(item["index"]["error"])

        if len(succeeded) > 0:
            print(f"Bulk-inserted {len(succeeded)} items (parallel_bulk).")
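
    # Indexes are created with refresh disabled, so refresh them explicitly once
    # loading has finished to make the documents searchable.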
    @task
    def close_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )
        for entity in ENTITIES:
            client.indices.refresh(entity)
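
    # Wire the pipeline: create_indexes >> compute_parallel_batches >> mapped
    # bulk_load tasks (one per batch) >> close_indexes.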
    parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)

    chain(
        create_indexes.override(task_id="create_indexes")(),
        parallel_batches,
        bulk_load.expand_kwargs(parallel_batches.output),
        close_indexes.override(task_id="close_indexes")()
    )


import_EOSC_catalog()
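
The remaining hunks touch a second, config-driven variant of this loader (the module that builds its DAGs in a loop over `configs`): writes now go to a suffixed `{entity}_{SUFFIX}` index instead of the bare entity index, and once the new indexes are refreshed the per-entity and `allresources` aliases are updated.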
@ -75,10 +75,11 @@ for config_name, config in configs.items():
|
|||
})
|
||||
|
||||
for entity in kwargs["params"]["ENTITIES"]:
|
||||
if client.indices.exists(entity):
|
||||
client.indices.delete(entity)
|
||||
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
|
||||
if client.indices.exists(indexname):
|
||||
client.indices.delete(indexname)
|
||||
|
||||
client.indices.create(entity, {
|
||||
client.indices.create(indexname, {
|
||||
"settings": {
|
||||
"index": {
|
||||
"number_of_shards": 40,
|
||||
|

@@ -146,10 +147,11 @@ for config_name, config in configs.items():
         hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})

         for (entity, key) in files:
+            indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
             if hook.check_for_key(key=f"{key}.PROCESSED", bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"]):
                 print(f'Skipping {entity}: {key}')
                 continue
-            print(f'Processing {entity}: {key}')
+            print(f'Processing {indexname}: {key}')
             s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
             with s3_obj.get()["Body"] as body:
                 with gzip.GzipFile(fileobj=body) as gzipfile:

@@ -157,7 +159,7 @@ for config_name, config in configs.items():
                     buff = io.BufferedReader(gzipfile)
                     for line in buff:
                         data = json.loads(line)
-                        data['_index'] = entity
+                        data['_index'] = indexname
                         if entity in transform_entities:
                             data = transform_entities[entity](data)
                         yield data

@@ -204,7 +206,27 @@ for config_name, config in configs.items():
             timeout=180
         )
         for entity in kwargs["params"]["ENTITIES"]:
-            client.indices.refresh(entity)
+            indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+            client.indices.refresh(indexname)
+        # update aliases
+        for entity in kwargs["params"]["ENTITIES"]:
+            indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
+            client.indices.update_aliases(
+                actions=[
+                    {"remove": {"index": f"{entity}_*", "alias": entity}},
+                    {"add": {"index": indexname, "alias": entity}},
+                ]
+            )
+        # update "allresources" alias
+        actions = []
+        for entity in kwargs["params"]["ENTITIES"]:
+            if entity in ['products', 'services', 'training', 'interoperability']:
+                actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
+                actions.append({"add": {"index": "allresources", "alias": entity}})
+        if len(actions) > 0:
+            client.indices.update_aliases(
+                actions=actions
+            )

     parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)