initial stage

This commit is contained in:
Giambattista Bloisi 2024-07-26 19:09:45 +02:00
parent 48d2f69bc6
commit 43743187ba
2 changed files with 92 additions and 72 deletions

View File

@ -7,12 +7,43 @@ from catalogue.vocabulary import CATALOG_VOCABULARY
class RawCatalogOpensearch: class RawCatalogOpensearch:
# Entity types harvested from the raw catalogue service. Each entity gets its
# own raw index named "catalog_<entity>_<suffix>" (see get_index) and a
# "catalog_<entity>" alias.
entities = ["datasources",
"interoperability-records",
"providers",
"resource-interoperability-records",
"services",
"training-resources"]
# Subset of `entities` that are additionally transformed into mapped indices
# ("interoperability_<suffix>", "training_<suffix>", "services_<suffix>" —
# see get_mapped_index / get_mapped_alias).
mapped_entities = ["interoperability-records", "training-resources", "services"]
def __init__(self, os_client: OpenSearch, suffix: str | None): def __init__(self, os_client: OpenSearch, suffix: str | None):
self.os_client = os_client self.os_client = os_client
self.suffix = suffix self.suffix = suffix
def get_index(self, name: str): def get_index(self, name: str):
return "catalog_" + name + ("" if self.suffix is None else f"_{self.suffix}") return f"catalog_{name}_{self.suffix}"
def get_alias(self, name):
    """Return the stable (un-suffixed) alias name for a raw catalog entity index."""
    alias_name = "catalog_" + name
    return alias_name
def get_mapped_index(self, name):
    """Return the suffixed name of the mapped index for *name*.

    Only the three mapped entity types have a mapped index; any other
    entity name yields None.
    """
    prefix_by_entity = {
        "interoperability-records": "interoperability",
        "training-resources": "training",
        "services": "services",
    }
    prefix = prefix_by_entity.get(name)
    if prefix is None:
        return None
    return f"{prefix}_{self.suffix}"
def get_mapped_alias(self, name: str):
match name:
case "interoperability-records":
return f"interoperability"
case "training-resources":
return f"training"
case "services":
return f"services"
return None
def get_resource_interoperability_records(self, resource_id): def get_resource_interoperability_records(self, resource_id):
response = self.os_client.search( response = self.os_client.search(

View File

@ -1,6 +1,5 @@
from __future__ import annotations from __future__ import annotations
import json
import os import os
from datetime import timedelta from datetime import timedelta
@ -33,12 +32,6 @@ default_args = {
default_args=default_args, default_args=default_args,
params={ params={
"OPENSEARCH_CONN_ID": "opensearch_default", "OPENSEARCH_CONN_ID": "opensearch_default",
"ENTITIES": ["datasources",
"interoperability-records",
"providers",
"resource-interoperability-records",
"services",
"training-resources"],
"SHARDS": 3, "SHARDS": 3,
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss') "SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
}, },
@ -58,28 +51,11 @@ def import_catalogue_entities():
timeout=180 timeout=180
) )
for entity in kwargs["params"]["ENTITIES"]: for entity in RawCatalogOpensearch.entities:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}' indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if client.indices.exists(indexname): if client.indices.exists(indexname):
client.indices.delete(indexname) client.indices.delete(indexname)
client.indices.create(indexname, {
"settings": {
"index": {
"number_of_shards": kwargs["params"]["SHARDS"],
"number_of_replicas": 0,
"refresh_interval": -1,
"translog.flush_threshold_size": "2048MB",
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
}
}
# "mappings": mappings[entity]
})
@task @task
def harvest_indexes(**kwargs): def harvest_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"]) conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
@ -95,7 +71,7 @@ def import_catalogue_entities():
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"]) catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
session = requests.session() session = requests.session()
for entity in kwargs["params"]["ENTITIES"]: for entity in RawCatalogOpensearch.entities:
indexname = catalog.get_index(entity) indexname = catalog.get_index(entity)
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry" baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
callurl = f"{baseurl}/{entity}" callurl = f"{baseurl}/{entity}"
@ -148,29 +124,33 @@ def import_catalogue_entities():
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"]) catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
for entity in {"interoperability-records", "training-resources", "services"}.intersection( for entity in ["interoperability-records", "training-resources", "services"]:
kwargs["params"]["ENTITIES"]): mapped_index = catalog.get_mapped_index(entity)
def streamed_results():
nonlocal mapped_index
for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity), for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity),
query={"query": {"match_all": {}}}): query={"query": {"match_all": {}}}):
s = hit['_source'] r = hit['_source']
doc = None doc = None
match entity: match entity:
case "interoperability-records": case "interoperability-records":
doc = catalog.map_interoperability(s) doc = catalog.map_interoperability(r)
case "training-resources": case "training-resources":
doc = catalog.map_training(s) doc = catalog.map_training(r)
case "services": case "services":
doc = catalog.map_service(s) doc = catalog.map_service(r)
if doc is not None: yield {"_index": mapped_index, "_id": doc['id'], "_source": doc}
client.update(
index=f'{entity}_{kwargs["params"]["SUFFIX"]}', succeeded = 0
body={"doc": doc, "doc_as_upsert": True}, failed = 0
id=doc['id'], for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
refresh=True if success:
) succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
@task @task
def close_indexes(**kwargs): def close_indexes(**kwargs):
@ -184,32 +164,41 @@ def import_catalogue_entities():
pool_maxsize=20, pool_maxsize=20,
timeout=180 timeout=180
) )
for entity in kwargs["params"]["ENTITIES"]: catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.refresh(indexname) def refresh_index(index_name):
client.indices.put_settings(index=indexname, body={ if index_name is not None:
client.indices.refresh(index_name)
client.indices.put_settings(index=index_name, body={
"index": { "index": {
"number_of_replicas": 1, "number_of_replicas": 1,
"refresh_interval": "60s", "refresh_interval": "60s",
} }
}) })
# update aliases def update_aliases(index_name, alias_name):
for entity in kwargs["params"]["ENTITIES"]: if index_name is not None and alias_name is not None:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.update_aliases( client.indices.update_aliases(
body={"actions": [ body={"actions": [
{"remove": {"index": f"{entity}_*", "alias": entity}}, {"remove": {"index": f"{alias_name}_*", "alias": alias_name}},
{"add": {"index": indexname, "alias": entity}}, {"add": {"index": index_name, "alias": alias_name}},
]} ]}
) )
# update "allresources" alias
for entity in RawCatalogOpensearch.entities:
refresh_index(catalog.get_index(entity))
refresh_index(catalog.get_mapped_index(entity))
update_aliases(catalog.get_index(entity), catalog.get_alias(entity))
update_aliases(catalog.get_mapped_index(entity), catalog.get_mapped_alias(entity))
# update "allresources" alias with mapped indices
actions = [] actions = []
for entity in kwargs["params"]["ENTITIES"]: for entity in RawCatalogOpensearch.mapped_entities:
if entity in ['products', 'services', 'training', 'interoperability']: index_name = catalog.get_mapped_index(entity)
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}' entity_alias = catalog.get_mapped_alias(entity)
actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}}) actions.append({"remove": {"index": f"{entity_alias}_*", "alias": "allresources"}})
actions.append({"add": {"index": indexname, "alias": "allresources"}}) actions.append({"add": {"index": index_name, "alias": "allresources"}})
if len(actions) > 0: if len(actions) > 0:
client.indices.update_aliases( client.indices.update_aliases(
body={"actions": actions} body={"actions": actions}