initial stage

This commit is contained in:
Giambattista Bloisi 2024-07-26 19:09:45 +02:00
parent 48d2f69bc6
commit 43743187ba
2 changed files with 92 additions and 72 deletions

View File

@ -7,12 +7,43 @@ from catalogue.vocabulary import CATALOG_VOCABULARY
class RawCatalogOpensearch:
# Names of the catalogue entity types handled by this class. Callers iterate
# over this list and pass each name to get_index() / get_alias().
entities = ["datasources",
"interoperability-records",
"providers",
"resource-interoperability-records",
"services",
"training-resources"]
# Subset of `entities` for which get_mapped_index() / get_mapped_alias()
# return a name; those methods return None for every other entity.
mapped_entities = ["interoperability-records", "training-resources", "services"]
def __init__(self, os_client: OpenSearch, suffix: str | None):
    """Keep the OpenSearch client and the index-name suffix on the instance.

    :param os_client: client object used for the search calls made by this class
    :param suffix: suffix appended to generated index names, or ``None``
    """
    self.os_client, self.suffix = os_client, suffix
def get_index(self, name: str):
    """Return the name of the raw catalogue index for entity *name*.

    The result is ``catalog_<name>_<suffix>``. When no suffix is configured
    (``self.suffix is None``) the suffix part is omitted, so the literal
    text ``None`` never ends up in an index name.

    :param name: entity name, e.g. ``"services"``
    :return: the index name as a string
    """
    # The block previously carried two consecutive return statements; only
    # the first (None-aware) one was reachable. Keep that behaviour in a
    # single, explicit form.
    if self.suffix is None:
        return f"catalog_{name}"
    return f"catalog_{name}_{self.suffix}"
def get_alias(self, name: str):
    """Return the alias name for the raw index of entity *name*.

    :param name: entity name, e.g. ``"services"``
    :return: ``catalog_<name>``
    """
    return "catalog_{}".format(name)
def get_mapped_index(self, name: str):
    """Return the name of the mapped index for entity *name*.

    :param name: entity name
    :return: ``<prefix>_<suffix>`` for the three mapped entities,
        ``None`` for any other name
    """
    # Entity name -> prefix of its mapped index.
    prefix_by_entity = (
        ("interoperability-records", "interoperability"),
        ("training-resources", "training"),
        ("services", "services"),
    )
    for entity, prefix in prefix_by_entity:
        if name == entity:
            return f"{prefix}_{self.suffix}"
    return None
def get_mapped_alias(self, name: str):
    """Return the alias name of the mapped index for entity *name*.

    :param name: entity name
    :return: ``"interoperability"``, ``"training"`` or ``"services"`` for the
        corresponding mapped entity, ``None`` for any other name
    """
    # Plain string constants: the previous f-strings had no placeholders,
    # and a lookup table reads better than a constant-only match statement.
    alias_by_entity = (
        ("interoperability-records", "interoperability"),
        ("training-resources", "training"),
        ("services", "services"),
    )
    for entity, alias in alias_by_entity:
        if name == entity:
            return alias
    return None
def get_resource_interoperability_records(self, resource_id):
response = self.os_client.search(

View File

@ -1,6 +1,5 @@
from __future__ import annotations
import json
import os
from datetime import timedelta
@ -33,12 +32,6 @@ default_args = {
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"ENTITIES": ["datasources",
"interoperability-records",
"providers",
"resource-interoperability-records",
"services",
"training-resources"],
"SHARDS": 3,
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
@ -58,28 +51,11 @@ def import_catalogue_entities():
timeout=180
)
for entity in kwargs["params"]["ENTITIES"]:
for entity in RawCatalogOpensearch.entities:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if client.indices.exists(indexname):
client.indices.delete(indexname)
client.indices.create(indexname, {
"settings": {
"index": {
"number_of_shards": kwargs["params"]["SHARDS"],
"number_of_replicas": 0,
"refresh_interval": -1,
"translog.flush_threshold_size": "2048MB",
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
}
}
# "mappings": mappings[entity]
})
@task
def harvest_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
@ -95,7 +71,7 @@ def import_catalogue_entities():
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
session = requests.session()
for entity in kwargs["params"]["ENTITIES"]:
for entity in RawCatalogOpensearch.entities:
indexname = catalog.get_index(entity)
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
callurl = f"{baseurl}/{entity}"
@ -120,7 +96,7 @@ def import_catalogue_entities():
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5*60):
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
if success:
succeeded = succeeded + 1
else:
@ -148,29 +124,33 @@ def import_catalogue_entities():
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
for entity in {"interoperability-records", "training-resources", "services"}.intersection(
kwargs["params"]["ENTITIES"]):
for entity in ["interoperability-records", "training-resources", "services"]:
mapped_index = catalog.get_mapped_index(entity)
for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity),
query={"query": {"match_all": {}}}):
s = hit['_source']
def streamed_results():
nonlocal mapped_index
for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity),
query={"query": {"match_all": {}}}):
r = hit['_source']
doc = None
match entity:
case "interoperability-records":
doc = catalog.map_interoperability(r)
case "training-resources":
doc = catalog.map_training(r)
case "services":
doc = catalog.map_service(r)
doc = None
match entity:
case "interoperability-records":
doc = catalog.map_interoperability(s)
case "training-resources":
doc = catalog.map_training(s)
case "services":
doc = catalog.map_service(s)
yield {"_index": mapped_index, "_id": doc['id'], "_source": doc}
if doc is not None:
client.update(
index=f'{entity}_{kwargs["params"]["SUFFIX"]}',
body={"doc": doc, "doc_as_upsert": True},
id=doc['id'],
refresh=True
)
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
@task
def close_indexes(**kwargs):
@ -184,32 +164,41 @@ def import_catalogue_entities():
pool_maxsize=20,
timeout=180
)
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.refresh(indexname)
client.indices.put_settings(index=indexname, body={
"index": {
"number_of_replicas": 1,
"refresh_interval": "60s",
}
})
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
# update aliases
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.update_aliases(
body={"actions": [
{"remove": {"index": f"{entity}_*", "alias": entity}},
{"add": {"index": indexname, "alias": entity}},
]}
)
# update "allresources" alias
def refresh_index(index_name):
    """Refresh *index_name*, then set its replica count and refresh interval.

    Does nothing when *index_name* is ``None``.
    """
    if index_name is None:
        return
    settings = {
        "index": {
            "number_of_replicas": 1,
            "refresh_interval": "60s",
        }
    }
    client.indices.refresh(index_name)
    client.indices.put_settings(index=index_name, body=settings)
def update_aliases(index_name, alias_name):
    """Point *alias_name* at *index_name* in a single update_aliases request.

    The request removes the alias from indices matching ``<alias_name>_*``
    and adds it to *index_name*. Does nothing when either argument is
    ``None``.
    """
    if index_name is None or alias_name is None:
        return
    alias_actions = [
        {"remove": {"index": f"{alias_name}_*", "alias": alias_name}},
        {"add": {"index": index_name, "alias": alias_name}},
    ]
    client.indices.update_aliases(body={"actions": alias_actions})
for entity in RawCatalogOpensearch.entities:
refresh_index(catalog.get_index(entity))
refresh_index(catalog.get_mapped_index(entity))
update_aliases(catalog.get_index(entity), catalog.get_alias(entity))
update_aliases(catalog.get_mapped_index(entity), catalog.get_mapped_alias(entity))
# update "allresources" alias with mapped indices
actions = []
for entity in kwargs["params"]["ENTITIES"]:
if entity in ['products', 'services', 'training', 'interoperability']:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
actions.append({"add": {"index": indexname, "alias": "allresources"}})
for entity in RawCatalogOpensearch.mapped_entities:
index_name = catalog.get_mapped_index(entity)
entity_alias = catalog.get_mapped_alias(entity)
actions.append({"remove": {"index": f"{entity_alias}_*", "alias": "allresources"}})
actions.append({"add": {"index": index_name, "alias": "allresources"}})
if len(actions) > 0:
client.indices.update_aliases(
body={"actions": actions}