Compare commits

...

310 Commits

Author SHA1 Message Date
Giambattista Bloisi dac3849ced initial stage 2024-08-06 12:09:57 +02:00
Giambattista Bloisi e072311240 initial stage 2024-08-06 11:37:23 +02:00
Giambattista Bloisi 2f8c04a6a2 initial stage 2024-08-06 11:21:52 +02:00
Giambattista Bloisi 72ddac35cb initial stage 2024-08-06 11:20:52 +02:00
Giambattista Bloisi a2e7c4beb6 initial stage 2024-08-06 11:19:01 +02:00
Giambattista Bloisi 118e29f462 initial stage 2024-08-01 11:10:44 +02:00
Giambattista Bloisi b23ddd3002 initial stage 2024-08-01 11:04:06 +02:00
Giambattista Bloisi 9581a86313 initial stage 2024-08-01 10:57:08 +02:00
Giambattista Bloisi f22f89b54c initial stage 2024-08-01 10:55:36 +02:00
Giambattista Bloisi 9435d23083 initial stage 2024-08-01 10:45:27 +02:00
Giambattista Bloisi fa24f85997 initial stage 2024-08-01 10:43:14 +02:00
Giambattista Bloisi 88076612b7 initial stage 2024-07-27 14:46:40 +02:00
Giambattista Bloisi 10c27e578d initial stage 2024-07-27 14:44:38 +02:00
Giambattista Bloisi 792d069234 initial stage 2024-07-27 14:43:00 +02:00
Giambattista Bloisi 47665d151e initial stage 2024-07-27 14:40:52 +02:00
Giambattista Bloisi afc31fd17c initial stage 2024-07-27 14:34:37 +02:00
Giambattista Bloisi 4f41c48e0d initial stage 2024-07-27 14:33:21 +02:00
Giambattista Bloisi 97fa9a986b initial stage 2024-07-27 14:28:42 +02:00
Giambattista Bloisi 2ede28e998 initial stage 2024-07-27 14:27:08 +02:00
Giambattista Bloisi c9365d18e1 initial stage 2024-07-27 14:23:54 +02:00
Giambattista Bloisi fcc5344cc8 initial stage 2024-07-27 14:20:18 +02:00
Giambattista Bloisi 4063a33550 initial stage 2024-07-27 14:15:45 +02:00
Giambattista Bloisi 169505c75e initial stage 2024-07-27 12:10:00 +02:00
Giambattista Bloisi 625eaaf1af initial stage 2024-07-27 12:09:14 +02:00
Giambattista Bloisi 828ddb2f82 initial stage 2024-07-27 12:04:33 +02:00
Giambattista Bloisi d15bc299c0 initial stage 2024-07-27 12:03:10 +02:00
Giambattista Bloisi c6a12baeba initial stage 2024-07-26 22:23:33 +02:00
Giambattista Bloisi 19bde3bcef initial stage 2024-07-26 22:22:36 +02:00
Giambattista Bloisi 77e4ddbe79 initial stage 2024-07-26 19:15:32 +02:00
Giambattista Bloisi 43743187ba initial stage 2024-07-26 19:09:45 +02:00
Giambattista Bloisi 48d2f69bc6 initial stage 2024-07-26 18:05:18 +02:00
Giambattista Bloisi 1ad367efcc initial stage 2024-07-26 13:21:25 +02:00
Giambattista Bloisi 418ad5e430 initial stage 2024-07-11 00:13:51 +02:00
Giambattista Bloisi bfb1ebd349 initial stage 2024-07-11 00:07:08 +02:00
Giambattista Bloisi 36e995f66c initial stage 2024-07-11 00:01:45 +02:00
Giambattista Bloisi ed8bb5bc25 initial stage 2024-07-10 23:57:31 +02:00
Giambattista Bloisi 95cd1d7573 initial stage 2024-07-10 22:07:43 +02:00
Giambattista Bloisi eccf5d396c initial stage 2024-07-10 22:06:27 +02:00
Giambattista Bloisi 227ec44a21 initial stage 2024-07-10 22:00:58 +02:00
Giambattista Bloisi 8018975863 initial stage 2024-07-03 01:21:43 +02:00
Giambattista Bloisi c7729c44e1 initial stage 2024-07-03 01:18:55 +02:00
Giambattista Bloisi c7703bb4df initial stage 2024-07-03 01:01:37 +02:00
Giambattista Bloisi 52dd788d15 initial stage 2024-07-03 00:59:53 +02:00
Giambattista Bloisi 0339a92de5 initial stage 2024-07-02 11:45:36 +02:00
Giambattista Bloisi 833ea1538a initial stage 2024-07-02 11:40:52 +02:00
Giambattista Bloisi a07bc0da2b initial stage 2024-07-01 19:50:41 +02:00
Giambattista Bloisi 0aba5ef69f initial stage 2024-07-01 18:44:11 +02:00
Giambattista Bloisi 2a54a3e325 initial stage 2024-07-01 10:36:51 +02:00
Giambattista Bloisi 0fa9e585ac initial stage 2024-07-01 08:04:27 +02:00
Giambattista Bloisi aa38362f26 initial stage 2024-06-30 19:40:21 +02:00
Giambattista Bloisi 1e3d7595ea initial stage 2024-06-30 18:24:32 +02:00
Giambattista Bloisi 6f405f0cbb initial stage 2024-06-30 18:23:46 +02:00
Giambattista Bloisi 5600a23f06 initial stage 2024-06-29 20:41:31 +02:00
Giambattista Bloisi e7f84f9df0 initial stage 2024-06-29 18:50:21 +02:00
Giambattista Bloisi 1db5cb5cbd initial stage 2024-06-29 11:24:33 +02:00
Giambattista Bloisi ece4184d8a initial stage 2024-06-29 10:58:55 +02:00
Giambattista Bloisi 69b3688ba4 initial stage 2024-06-29 10:51:03 +02:00
Giambattista Bloisi 387ddce398 initial stage 2024-06-29 10:13:01 +02:00
Giambattista Bloisi c683be854a initial stage 2024-06-27 19:45:25 +02:00
Giambattista Bloisi 7c892c5d62 initial stage 2024-06-27 18:54:23 +02:00
Giambattista Bloisi b0f8161e80 initial stage 2024-06-27 18:24:24 +02:00
Giambattista Bloisi a4c75d072b initial stage 2024-06-27 15:08:52 +02:00
Giambattista Bloisi 71289af27f initial stage 2024-06-27 14:56:11 +02:00
Giambattista Bloisi 0ed1c3f762 initial stage 2024-06-27 14:45:36 +02:00
Giambattista Bloisi ae327daa61 initial stage 2024-06-27 14:31:34 +02:00
Giambattista Bloisi c987cdea54 initial stage 2024-06-27 14:29:31 +02:00
Giambattista Bloisi 19509f7f60 initial stage 2024-06-27 14:22:50 +02:00
Giambattista Bloisi 52be021867 initial stage 2024-06-27 14:07:59 +02:00
Giambattista Bloisi df9cab15c3 initial stage 2024-06-19 09:45:15 +02:00
Giambattista Bloisi 9378856e9a initial stage 2024-06-18 21:35:08 +02:00
Giambattista Bloisi 430b0ac41a initial stage 2024-06-15 14:23:50 +02:00
Giambattista Bloisi 05592343e0 initial stage 2024-06-13 12:50:29 +02:00
Giambattista Bloisi 8307ebd406 initial stage 2024-06-13 01:11:13 +02:00
Giambattista Bloisi a50db121d1 initial stage 2024-06-13 00:49:59 +02:00
Giambattista Bloisi 2563f70bfe initial stage 2024-06-13 00:19:52 +02:00
Giambattista Bloisi 47bcc93c7d initial stage 2024-06-13 00:11:39 +02:00
Giambattista Bloisi f807ce4911 initial stage 2024-06-12 23:55:19 +02:00
Giambattista Bloisi eca293419d initial stage 2024-06-12 23:49:26 +02:00
Giambattista Bloisi beef14d721 initial stage 2024-06-12 23:41:03 +02:00
Giambattista Bloisi ee97fe9659 initial stage 2024-06-12 23:23:44 +02:00
Giambattista Bloisi 398794a080 initial stage 2024-06-12 23:21:50 +02:00
Giambattista Bloisi 64e83a0cdd initial stage 2024-06-12 23:16:06 +02:00
Giambattista Bloisi f048d7df03 initial stage 2024-06-12 23:14:19 +02:00
Giambattista Bloisi 354ae6ad41 initial stage 2024-06-12 23:04:19 +02:00
Giambattista Bloisi fe9030cdba initial stage 2024-06-12 23:02:18 +02:00
Giambattista Bloisi 3bc48791ce initial stage 2024-06-12 23:00:21 +02:00
Giambattista Bloisi f155b5e8d1 initial stage 2024-06-12 22:58:19 +02:00
Giambattista Bloisi 6d3af5e50d initial stage 2024-06-12 22:56:17 +02:00
Giambattista Bloisi 565763faac initial stage 2024-06-12 01:16:02 +02:00
Giambattista Bloisi 56b27c940d initial stage 2024-06-11 22:36:36 +02:00
Giambattista Bloisi 577e0fcb4d initial stage 2024-06-11 22:03:20 +02:00
Giambattista Bloisi 23e91ec335 initial stage 2024-06-11 21:58:07 +02:00
Giambattista Bloisi 66d09d37aa initial stage 2024-06-10 14:31:26 +02:00
Giambattista Bloisi 8e7613625e initial stage 2024-06-10 13:58:32 +02:00
Giambattista Bloisi 2e72b11447 initial stage 2024-06-10 13:43:52 +02:00
Giambattista Bloisi 26e8789d30 initial stage 2024-06-10 13:41:03 +02:00
Giambattista Bloisi 132d3a45b1 initial stage 2024-06-10 10:57:35 +02:00
Giambattista Bloisi e0e04ac22e initial stage 2024-06-10 10:55:32 +02:00
Giambattista Bloisi fe50bf1475 initial stage 2024-06-10 09:28:36 +02:00
Giambattista Bloisi d7e3e7a1b7 initial stage 2024-06-10 09:23:18 +02:00
Giambattista Bloisi 5fceeb8b61 initial stage 2024-06-10 09:06:41 +02:00
Giambattista Bloisi 5318979b01 initial stage 2024-06-10 08:57:21 +02:00
Giambattista Bloisi bc42ccb8ba initial stage 2024-06-10 00:48:45 +02:00
Giambattista Bloisi 6aab7198f7 initial stage 2024-06-10 00:46:04 +02:00
Giambattista Bloisi e3d2c52092 initial stage 2024-06-09 10:30:31 +02:00
Giambattista Bloisi d1c08458bb initial stage 2024-06-07 19:31:47 +02:00
Giambattista Bloisi a4d8a48c87 initial stage 2024-06-07 19:17:13 +02:00
Giambattista Bloisi 7e12b9e3dc initial stage 2024-06-07 18:40:59 +02:00
Giambattista Bloisi 7a08db26cd initial stage 2024-06-07 18:40:21 +02:00
Giambattista Bloisi c1833f6d75 initial stage 2024-06-07 18:36:19 +02:00
Giambattista Bloisi b3b0472400 initial stage 2024-06-07 18:24:13 +02:00
Giambattista Bloisi 4068e9d702 initial stage 2024-06-07 18:22:18 +02:00
Giambattista Bloisi 4bb806d008 initial stage 2024-06-07 18:12:47 +02:00
Giambattista Bloisi b3a9ad8342 initial stage 2024-06-07 18:10:48 +02:00
Giambattista Bloisi 541581c8b2 initial stage 2024-06-07 09:10:57 +02:00
Giambattista Bloisi 21f89da1ed initial stage 2024-06-07 00:19:37 +02:00
Giambattista Bloisi 4c7faec554 initial stage 2024-06-06 22:10:03 +02:00
Giambattista Bloisi 6754f7bbec initial stage 2024-06-06 22:08:12 +02:00
Giambattista Bloisi 336026b6d8 initial stage 2024-06-06 19:52:42 +02:00
Giambattista Bloisi f77274ce4f initial stage 2024-06-06 19:51:24 +02:00
Giambattista Bloisi 151d305417 initial stage 2024-06-06 19:50:06 +02:00
Giambattista Bloisi 94b4add8cd initial stage 2024-06-06 19:48:13 +02:00
Giambattista Bloisi 1bc94cd835 initial stage 2024-06-03 22:03:06 +02:00
Giambattista Bloisi d9e7528927 initial stage 2024-06-03 15:29:06 +02:00
Giambattista Bloisi 09b603925d initial stage 2024-04-18 13:13:15 +02:00
Giambattista Bloisi f89898e99b initial stage 2024-04-18 12:24:59 +02:00
Giambattista Bloisi 26b0d7219d initial stage 2024-04-11 16:48:28 +02:00
Giambattista Bloisi 5486d48817 initial stage 2024-04-11 16:43:56 +02:00
Giambattista Bloisi 4f4c236b19 initial stage 2024-04-11 16:41:32 +02:00
Giambattista Bloisi e293990c27 initial stage 2024-04-09 12:16:32 +02:00
Giambattista Bloisi bf6a9e3d61 initial stage 2024-04-09 11:08:10 +02:00
Giambattista Bloisi 735f08aee8 initial stage 2024-04-09 11:04:06 +02:00
Giambattista Bloisi b2329a7b63 initial stage 2024-04-09 09:40:44 +02:00
Giambattista Bloisi 28d2e96842 initial stage 2024-04-08 14:25:11 +02:00
Giambattista Bloisi ba37ed66eb initial stage 2024-04-08 14:22:56 +02:00
Giambattista Bloisi 51b695c1b7 initial stage 2024-04-08 14:15:02 +02:00
Giambattista Bloisi b89d7f2646 initial stage 2024-04-08 14:11:50 +02:00
Giambattista Bloisi 684230b314 initial stage 2024-04-06 14:55:37 +02:00
Giambattista Bloisi c798eb0aff initial stage 2024-04-06 11:05:37 +02:00
Giambattista Bloisi 8461dc62cc initial stage 2024-04-05 19:09:36 +02:00
Giambattista Bloisi 3aab558117 initial stage 2024-04-05 18:04:42 +02:00
Giambattista Bloisi 2fe306fdae initial stage 2024-04-05 17:59:20 +02:00
Giambattista Bloisi 3b27f4ea1c initial stage 2024-04-05 17:55:30 +02:00
Giambattista Bloisi 801516be67 initial stage 2024-04-05 17:50:45 +02:00
Giambattista Bloisi 2eb2a94da5 initial stage 2024-04-05 17:41:22 +02:00
Giambattista Bloisi 32e76e9f2d initial stage 2024-03-27 23:01:07 +01:00
Giambattista Bloisi 5fd2558a3a initial stage 2024-03-27 22:58:50 +01:00
Giambattista Bloisi 7c919f5278 initial stage 2024-03-27 22:48:13 +01:00
Giambattista Bloisi 33cb4ce636 initial stage 2024-03-27 13:00:23 +01:00
Giambattista Bloisi f6fbce36e1 initial stage 2024-03-27 12:57:02 +01:00
Giambattista Bloisi 6aa4108b2d initial stage 2024-03-27 12:54:10 +01:00
Giambattista Bloisi e684e4cae5 initial stage 2024-03-27 12:49:33 +01:00
Giambattista Bloisi 6c76a3e0b8 initial stage 2024-03-27 12:47:34 +01:00
Giambattista Bloisi ab5c8a4b7f initial stage 2024-03-27 12:37:08 +01:00
Giambattista Bloisi 08ed592711 initial stage 2024-03-27 12:33:00 +01:00
Giambattista Bloisi 43eb5cb43d initial stage 2024-03-27 12:19:01 +01:00
Giambattista Bloisi 1a91dcf3d6 initial stage 2024-03-27 00:15:26 +01:00
Giambattista Bloisi f04459666a initial stage 2024-03-26 22:25:02 +01:00
Giambattista Bloisi fc5f884f4d initial stage 2024-03-26 14:20:45 +01:00
Giambattista Bloisi 75221b489d initial stage 2024-03-26 12:31:04 +01:00
Giambattista Bloisi 185ca78f71 initial stage 2024-03-26 11:12:43 +01:00
Giambattista Bloisi 26c2e3eaad initial stage 2024-03-26 11:03:05 +01:00
Giambattista Bloisi 7e41f71d32 initial stage 2024-03-26 10:54:46 +01:00
Giambattista Bloisi 10c29f86c2 initial stage 2024-03-26 10:52:07 +01:00
Giambattista Bloisi 4398546095 initial stage 2024-03-25 22:09:41 +01:00
Giambattista Bloisi c9f23d2796 initial stage 2024-03-25 22:08:17 +01:00
Giambattista Bloisi 8594587ee5 initial stage 2024-03-25 21:33:32 +01:00
Giambattista Bloisi b86cf359f5 initial stage 2024-03-25 21:19:50 +01:00
Giambattista Bloisi 00514edfbd initial stage 2024-03-25 18:22:10 +01:00
Giambattista Bloisi f79eb140eb initial stage 2024-03-25 17:54:23 +01:00
Giambattista Bloisi 4e1955b673 initial stage 2024-03-25 17:52:56 +01:00
Giambattista Bloisi c07ddc03d9 initial stage 2024-03-25 16:05:34 +01:00
Giambattista Bloisi 0c27895e13 initial stage 2024-03-25 15:54:49 +01:00
Giambattista Bloisi 349db6f602 initial stage 2024-03-25 15:45:43 +01:00
Giambattista Bloisi 072fb76a26 initial stage 2024-03-24 19:12:26 +01:00
Giambattista Bloisi 172703df7c initial stage 2024-03-24 19:11:29 +01:00
Giambattista Bloisi f1e619c7fb initial stage 2024-03-24 19:11:16 +01:00
Giambattista Bloisi 6b2ef00c25 initial stage 2024-03-24 19:07:12 +01:00
Giambattista Bloisi 921ce0bf48 initial stage 2024-03-24 19:04:57 +01:00
Giambattista Bloisi 99ef9b3980 initial stage 2024-03-24 19:01:00 +01:00
Giambattista Bloisi 8bea0251f1 initial stage 2024-03-24 18:56:43 +01:00
Giambattista Bloisi d97972b85e initial stage 2024-03-22 14:09:36 +01:00
Giambattista Bloisi 2f5430d9c8 initial stage 2024-03-22 14:06:07 +01:00
Giambattista Bloisi 0738f8bebc initial stage 2024-03-22 00:39:43 +01:00
Giambattista Bloisi 83b86b50ab simple test DAG 2024-03-21 10:49:44 +01:00
Giambattista Bloisi d660233e8e simple test DAG 2024-03-21 10:45:58 +01:00
Giambattista Bloisi 10fedb06f1 simple test DAG 2024-03-20 17:33:12 +01:00
Giambattista Bloisi a7e485a8c6 simple test DAG 2024-03-20 17:06:39 +01:00
Giambattista Bloisi 587c43872b simple test DAG 2024-03-20 17:04:25 +01:00
Giambattista Bloisi 0ca0da3cc9 simple test DAG 2024-03-20 17:02:14 +01:00
Giambattista Bloisi dead48e9b2 simple test DAG 2024-03-20 15:57:36 +01:00
Giambattista Bloisi 620c6fadea simple test DAG 2024-03-20 15:31:32 +01:00
Giambattista Bloisi b71bcfabf8 simple test DAG 2024-03-19 15:46:25 +01:00
Giambattista Bloisi 65daefb971 simple test DAG 2024-03-19 09:58:52 +01:00
Giambattista Bloisi 1152e14920 simple test DAG 2024-03-18 13:48:13 +01:00
Giambattista Bloisi 65cba81f20 simple test DAG 2024-03-18 13:16:39 +01:00
Giambattista Bloisi 5502f449a5 simple test DAG 2024-03-18 12:15:28 +01:00
Giambattista Bloisi cbdb6f3640 simple test DAG 2024-03-18 12:13:36 +01:00
Giambattista Bloisi 68a16e6c5a simple test DAG 2024-03-18 11:02:07 +01:00
Giambattista Bloisi ef67d70961 simple test DAG 2024-03-18 10:15:29 +01:00
Giambattista Bloisi bf939c0254 simple test DAG 2024-03-18 10:14:59 +01:00
Giambattista Bloisi fa3214dc2c simple test DAG 2024-03-18 10:11:51 +01:00
Giambattista Bloisi cb4f9c838a simple test DAG 2024-03-18 01:11:10 +01:00
Giambattista Bloisi 47505e885f simple test DAG 2024-03-18 01:07:23 +01:00
Giambattista Bloisi 78e2aaf404 simple test DAG 2024-03-18 01:00:54 +01:00
Giambattista Bloisi f4fa06a634 simple test DAG 2024-03-18 00:54:50 +01:00
Giambattista Bloisi ec8e00d7a4 simple test DAG 2024-03-18 00:37:22 +01:00
Giambattista Bloisi b8aa473fff simple test DAG 2024-03-17 21:33:42 +01:00
Giambattista Bloisi fd53c5af5b simple test DAG 2024-03-17 21:32:40 +01:00
Giambattista Bloisi fd25f9bf59 simple test DAG 2024-03-17 19:56:26 +01:00
Giambattista Bloisi 0c272f7ff2 simple test DAG 2024-03-17 18:27:42 +01:00
Giambattista Bloisi a7a6f8e95f simple test DAG 2024-03-17 18:10:49 +01:00
Giambattista Bloisi df6cd00621 simple test DAG 2024-03-17 18:06:08 +01:00
Giambattista Bloisi c221f80d1b simple test DAG 2024-03-17 15:51:07 +01:00
Giambattista Bloisi d9170a0d1a simple test DAG 2024-03-17 15:50:05 +01:00
Giambattista Bloisi 3406662572 simple test DAG 2024-03-17 15:49:09 +01:00
Giambattista Bloisi 5cc3b050ce simple test DAG 2024-03-15 16:34:17 +01:00
Giambattista Bloisi c0bfa81d97 simple test DAG 2024-03-15 15:56:22 +01:00
Giambattista Bloisi 8262871be8 simple test DAG 2024-03-15 14:14:57 +01:00
Giambattista Bloisi ab172a39ff simple test DAG 2024-03-15 13:12:17 +01:00
Giambattista Bloisi 4c7d80a0a8 simple test DAG 2024-03-15 12:57:53 +01:00
Giambattista Bloisi f1cec0cfeb simple test DAG 2024-03-15 12:44:19 +01:00
Giambattista Bloisi 636a4e38e9 simple test DAG 2024-03-15 12:31:14 +01:00
Giambattista Bloisi 55f3a06e1d simple test DAG 2024-03-15 12:28:10 +01:00
Giambattista Bloisi 599625c472 simple test DAG 2024-03-15 12:26:44 +01:00
Giambattista Bloisi c87b207ef2 simple test DAG 2024-03-15 12:25:15 +01:00
Giambattista Bloisi 95cc6095c3 simple test DAG 2024-03-15 12:21:33 +01:00
Giambattista Bloisi f01ba4efb2 simple test DAG 2024-03-15 12:17:13 +01:00
Giambattista Bloisi 679797cfe5 simple test DAG 2024-03-14 22:33:58 +01:00
Giambattista Bloisi 602fedc6cb simple test DAG 2024-03-14 22:27:51 +01:00
Giambattista Bloisi c513072be9 simple test DAG 2024-03-14 22:26:06 +01:00
Giambattista Bloisi 5a5aaccbeb simple test DAG 2024-03-14 21:46:33 +01:00
Giambattista Bloisi 7959c1bc08 simple test DAG 2024-03-14 21:14:23 +01:00
Giambattista Bloisi 0d4ef9cb1f simple test DAG 2024-03-14 21:13:39 +01:00
Giambattista Bloisi d2bbaaece3 simple test DAG 2024-03-14 21:12:48 +01:00
Giambattista Bloisi dd6a192da2 simple test DAG 2024-03-14 21:04:35 +01:00
Giambattista Bloisi d19198c2ba simple test DAG 2024-03-12 15:59:46 +01:00
Giambattista Bloisi 815ce27e34 simple test DAG 2024-03-12 15:59:20 +01:00
Giambattista Bloisi f5ef2d3754 simple test DAG 2024-03-12 15:58:23 +01:00
Giambattista Bloisi e2a5f3e90e simple test DAG 2024-03-12 15:57:14 +01:00
Giambattista Bloisi 2cb40d2276 simple test DAG 2024-03-12 12:56:24 +01:00
Giambattista Bloisi 32992c79e8 simple test DAG 2024-03-11 19:16:27 +01:00
Giambattista Bloisi 8ee696c145 simple test DAG 2024-03-11 19:04:39 +01:00
Giambattista Bloisi b4f8ba1bd0 simple test DAG 2024-03-11 18:44:45 +01:00
Giambattista Bloisi dd07466aae simple test DAG 2024-03-10 14:18:46 +01:00
Giambattista Bloisi 5f07513b35 simple test DAG 2024-03-10 14:07:59 +01:00
Giambattista Bloisi d1a944b8f5 simple test DAG 2024-03-10 13:42:37 +01:00
Giambattista Bloisi f8f0141d50 simple test DAG 2024-03-10 13:29:51 +01:00
Giambattista Bloisi 5a181be26a simple test DAG 2024-03-10 13:23:35 +01:00
Giambattista Bloisi edc6976a47 simple test DAG 2024-03-10 13:23:08 +01:00
Giambattista Bloisi 4c2062e3b9 simple test DAG 2024-03-10 13:18:02 +01:00
Giambattista Bloisi ddbf71cca4 simple test DAG 2024-03-10 13:14:27 +01:00
Giambattista Bloisi e81e28f5f9 simple test DAG 2024-03-10 12:58:45 +01:00
Giambattista Bloisi 7cfae9f1bc simple test DAG 2024-03-10 12:53:05 +01:00
Giambattista Bloisi 546cc75763 simple test DAG 2024-03-09 23:59:12 +01:00
Giambattista Bloisi c8ffe36fbc simple test DAG 2024-03-09 23:48:10 +01:00
Giambattista Bloisi 222b5e66c6 simple test DAG 2024-03-09 23:34:51 +01:00
Giambattista Bloisi 07f8645a60 simple test DAG 2024-03-09 23:00:36 +01:00
Giambattista Bloisi fcbc01fed4 simple test DAG 2024-03-09 22:54:31 +01:00
Giambattista Bloisi b19e4f8ae8 simple test DAG 2024-03-09 19:42:58 +01:00
Giambattista Bloisi 8840091813 simple test DAG 2024-03-09 19:41:04 +01:00
Giambattista Bloisi 38bbf4f449 simple test DAG 2024-03-09 19:36:02 +01:00
Giambattista Bloisi 30181573cf simple test DAG 2024-03-09 19:26:37 +01:00
Giambattista Bloisi 908644d005 simple test DAG 2024-03-09 19:25:51 +01:00
Giambattista Bloisi a7b1d25fdb simple test DAG 2024-03-09 19:23:45 +01:00
Giambattista Bloisi 027996069c simple test DAG 2024-03-09 18:47:08 +01:00
Giambattista Bloisi ba99672349 simple test DAG 2024-03-09 18:15:21 +01:00
Giambattista Bloisi 0a62276c42 simple test DAG 2024-03-09 18:09:15 +01:00
Giambattista Bloisi ec02290442 simple test DAG 2024-03-08 17:42:28 +01:00
Giambattista Bloisi d505df8d36 simple test DAG 2024-03-08 17:26:29 +01:00
Giambattista Bloisi 031b11a3db simple test DAG 2024-03-08 17:20:37 +01:00
Giambattista Bloisi c259c529bc simple test DAG 2024-03-08 16:51:39 +01:00
Giambattista Bloisi deb6567a73 simple test DAG 2024-03-08 16:42:51 +01:00
Giambattista Bloisi 6e8f2c3664 simple test DAG 2024-03-08 16:22:00 +01:00
Giambattista Bloisi d281fb043a simple test DAG 2024-03-08 16:19:35 +01:00
Giambattista Bloisi 3342e20571 simple test DAG 2024-03-08 16:15:07 +01:00
Giambattista Bloisi a7c82b0d61 simple test DAG 2024-03-08 16:11:31 +01:00
Giambattista Bloisi 5a30741e29 simple test DAG 2024-03-08 16:10:14 +01:00
Giambattista Bloisi 4128d1c863 simple test DAG 2024-03-08 16:06:05 +01:00
Giambattista Bloisi 7edb0c5a7e simple test DAG 2024-03-08 16:01:11 +01:00
Giambattista Bloisi 1ad289e948 simple test DAG 2024-03-08 15:35:45 +01:00
Giambattista Bloisi 9682e09eb4 simple test DAG 2024-03-08 15:34:01 +01:00
Giambattista Bloisi 31b05ff2fb simple test DAG 2024-03-07 11:21:59 +01:00
Giambattista Bloisi d4f33496aa simple test DAG 2024-03-07 11:01:55 +01:00
Giambattista Bloisi 7d2da06118 simple test DAG 2024-03-07 10:50:49 +01:00
Giambattista Bloisi 7fcc6a9bd0 simple test DAG 2024-03-07 09:21:30 +01:00
Giambattista Bloisi 550da2c190 simple test DAG 2024-03-06 23:54:10 +01:00
Giambattista Bloisi e99002329e simple test DAG 2024-03-06 23:51:53 +01:00
Giambattista Bloisi 3e6c175901 simple test DAG 2024-03-06 23:46:50 +01:00
Giambattista Bloisi bc50df0413 simple test DAG 2024-03-06 23:43:31 +01:00
Giambattista Bloisi 2c81ded53c simple test DAG 2024-03-06 23:42:25 +01:00
Giambattista Bloisi d6bfc955a3 simple test DAG 2024-03-06 23:38:08 +01:00
Giambattista Bloisi 379920e21b simple test DAG 2024-03-06 23:31:24 +01:00
Giambattista Bloisi 5d073deaa7 simple test DAG 2024-03-06 23:29:08 +01:00
Giambattista Bloisi 2937d77cba simple test DAG 2024-03-06 17:58:28 +01:00
Giambattista Bloisi 91739b26b8 simple test DAG 2024-03-06 17:52:09 +01:00
Giambattista Bloisi 52179da636 simple test DAG 2024-03-06 17:49:38 +01:00
Giambattista Bloisi f0169ca158 simple test DAG 2024-03-06 17:47:14 +01:00
Giambattista Bloisi c3761a161e simple test DAG 2024-03-06 17:33:17 +01:00
Giambattista Bloisi c80a5e6eb8 simple test DAG 2024-03-06 17:29:24 +01:00
Giambattista Bloisi 76981a01ba simple test DAG 2024-03-06 15:17:38 +01:00
Giambattista Bloisi e343e95a9b simple test DAG 2024-03-06 15:15:22 +01:00
Giambattista Bloisi 080d30cc33 simple test DAG 2024-03-05 16:30:37 +01:00
Giambattista Bloisi 991930f934 simple test DAG 2024-03-05 16:13:18 +01:00
Giambattista Bloisi 4a6f8568eb simple test DAG 2024-03-05 16:11:00 +01:00
Giambattista Bloisi 7b0bc4e5b4 simple test DAG 2024-03-05 15:51:38 +01:00
Giambattista Bloisi 6998573b79 simple test DAG 2024-03-04 19:07:10 +01:00
Giambattista Bloisi 4e6f4ee2fb simple test DAG 2024-03-04 19:04:22 +01:00
Giambattista Bloisi de9796a376 simple test DAG 2024-03-04 19:02:28 +01:00
Giambattista Bloisi cf1e7914ca simple test DAG 2024-03-04 17:45:33 +01:00
28 changed files with 62027 additions and 0 deletions


@@ -0,0 +1,140 @@
from typing import Dict, Any, List
def map_access_right(ar: str) -> str:
match ar:
case 'open':
return 'Open Access'
case 'closed':
return 'Closed'
case 'embargo':
return 'Embargo'
case 'restricted':
return 'Restricted'
case _:
return ''
def trasform_graph_entity(p: dict) -> dict:
p['_id'] = p['local_identifier']
return p
def trasform_catalog_entity(p: dict) -> dict:
p['_id'] = p['id']
return p
def map_fos_topic_to_domain(fos: str):
if fos.startswith('01'):
return 'Natural Sciences'
elif fos.startswith('02'):
return 'Engineering & Technology'
elif fos.startswith('03'):
return 'Medical & Health Sciences'
elif fos.startswith('04'):
return 'Agricultural Sciences'
elif fos.startswith('05'):
return 'Social Sciences'
elif fos.startswith('06'):
return 'Humanities'
return None
def trasform_interoperability(p: dict) -> dict:
p = trasform_catalog_entity(p)
if 'domain' in p:
p['domain'] = {"domain": p['domain']}
p['licenseDetails'] = p['license']
p['license'] = p['license']['identifier'] if 'identifier' in p['license'] else ''
return p
def trasform_product(p: dict) -> dict:
p = trasform_graph_entity(p)
    p['accessRights'] = list(set(
        filter(lambda ar: ar != '', map(lambda m: map_access_right(m.get('access_right')), p.get('manifestations') or []))))
    p['keyword'] = list(set(
        map(lambda topic: topic.get('topic').get('value'),
            filter(lambda topic: topic.get('topic').get('scheme') == 'keyword', p.get('topics') or []))))
    p['domain'] = list(
        map(lambda fos: {"domain": fos},
            set(filter(lambda fos: fos is not None,
                       map(lambda topic: map_fos_topic_to_domain(topic.get('topic').get('value')),
                           filter(lambda topic: topic.get('topic').get('scheme') == 'FOS', p.get('topics') or []))))))
p['firstPublishDate'] = next(
iter(
sorted(
map(lambda date: date.get('value'),
filter(lambda date: date.get('type') == 'publishing',
[date for m in (p.get('manifestations') or []) for date in (m.get('dates') or [])])))),
None)
return p
transform_entities = {
# SKG-IF graph entities
"datasource": trasform_graph_entity,
"grants": trasform_graph_entity,
"organizations": trasform_graph_entity,
"persons": trasform_graph_entity,
"products": trasform_product,
"topics": trasform_graph_entity,
"venues": trasform_graph_entity,
# EOSC catalog entities
"interoperability": trasform_interoperability,
"services": trasform_catalog_entity,
"training": trasform_catalog_entity,
}
def isEmpty(current_value: Dict[str, Any], labels: List[str]) -> bool:
if len(labels) <= 0:
return True
for label in labels:
if isinstance(current_value, list) and len(current_value) > 0:
current_value = current_value[0]
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return True
if current_value is None:
return True
if isinstance(current_value, list):
if len(current_value) > 0:
return current_value[0] == ""
else:
return True
return str(current_value) == ""
#
# Filter products that do not meet inclusion policy
#
def filter_product(p: dict) -> bool:
if isEmpty(p, ["titles", "none"]):
return True
if isEmpty(p, ["firstPublishDate"]):
return True
if p['product_type'] == "literature":
if isEmpty(p, ["abstracts", "none"]):
return True
if isEmpty(p, ["contributions", "person", "local_identifier"]):
return True
elif p['product_type'] in ["research data", "other"]:
if isEmpty(p, ["contributions", "person", "local_identifier"]):
return True
return False
filter_entities = {
"products": filter_product
}
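The dispatch tables above pair each entity type with its transform function, and products additionally with an exclusion filter. A minimal driver sketch (hypothetical, not part of this diff) of how they might be applied to a batch of records:

def process_entities(entity_type: str, records: list) -> list:
    # Look up the per-entity transform; fall back to the identity function.
    transform = transform_entities.get(entity_type, lambda r: r)
    # filter_* callables return True for records that must be *excluded*.
    exclude = filter_entities.get(entity_type, lambda r: False)
    return [r for r in (transform(rec) for rec in records) if not exclude(r)]

# e.g. process_entities("products", raw_products) keeps only products that
# pass the inclusion policy, with '_id' and the derived fields populated.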

airflow/dags/EOSC_indexes.py Normal file (1107 additions)

File diff suppressed because it is too large


@@ -0,0 +1,53 @@
from __future__ import annotations
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from opensearchpy import OpenSearch
import init_ams_topics
import init_opensearch_templates
@dag(
dag_id="mkg_prepare_environment",
#dag_display_name="Prepare MKG Environment",
schedule=None,
dagrun_timeout=None,
start_date=None,
catchup=False,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"ARGO_CONN_ID": "ams_default",
"RESET_AMS": False
},
tags=["MKG", "opensearch", "argo"]
)
def prepare_environment():
@task
def prepare_opensearch(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180)
init_opensearch_templates.init_opensearch(client)
@task
def prepare_ams(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["ARGO_CONN_ID"])
extra = conn.extra_dejson
init_ams_topics.init_ams(conn.host, extra['project'], extra['token'], kwargs["params"]["RESET_AMS"])
chain(
prepare_opensearch.override(task_id="prepare_opensearch")(),
# prepare_ams.override(task_id="prepare_ams")(),
)
prepare_environment()


@@ -0,0 +1,112 @@
import os
from datetime import timedelta
import time
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
dag_id="open_data_portal_harvest",
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
dagrun_timeout=None,
catchup=False,
default_args=default_args,
params={
"S3_CONN_ID": "s3_conn",
"OPENSEARCH_CONN_ID": "opensearch_default",
"OS_INDEX_NAME": "euodp_raw"
},
tags=["aggregation"]
)
def harvest():
@task
def everything(**context):
index_name = context["params"]["OS_INDEX_NAME"]
conn = BaseHook.get_connection(context["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20
)
if not client.indices.exists(index_name):
client.indices.create(index_name, {
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 0,
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
},
},
"mappings": {
"dynamic": False
}
})
def store_results(hits):
def _generate_data():
for r in hits:
r['_index'] = index_name
r['_id'] = r['id']
yield r
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
raise_on_exception=False,
raise_on_error=False,
chunk_size=5000,
max_chunk_bytes=50 * 1024 * 1024,
timeout=180):
if success:
succeeded = succeeded + 1
else:
print(item["index"]["error"])
failed = failed + 1
headers = {'Accept': 'application/json'}
r = requests.get('https://data.europa.eu/api/hub/search/search?filter=dataset&aggregation=false&limit=300&showScore=true&scroll=true', headers=headers).json()
scroll_id = r['result']['scrollId']
results = r['result']['results']
store_results(results)
max_retries = 10
while scroll_id:
try:
r = requests.get('https://data.europa.eu/api/hub/search/scroll?scrollId=' + scroll_id, headers=headers)
r.raise_for_status()
except Exception as e:
print(f"Error:" + str(e))
time.sleep(0.1)
max_retries = max_retries - 1
if max_retries == 0:
raise Exception("Cannot fetch data")
continue
max_retries = 10
r = r.json()
scroll_id = r['result']['scrollId']
results = r['result']['results']
if len(results) <= 0:
return
store_results(results)
everything()
harvest()

airflow/dags/S3_delete.py Normal file (42 additions)

@@ -0,0 +1,42 @@
import os
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"prefix": "Key prefix of files to delete",
"bucket": "bucket containing files to delete",
},
tags=["s3"],
)
def s3_delete():
@task
def delete(**context):
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
keys = hook.list_keys(bucket_name=context["params"]["bucket"], prefix=context["params"]["prefix"])
hook.delete_objects(bucket=context["params"]["bucket"], keys=keys)
for key in keys:
print(f"{key} deleted!")
delete()
s3_delete()

airflow/dags/S3_untar.py Normal file (98 additions)

@@ -0,0 +1,98 @@
import os
import tarfile
import time
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from botocore.exceptions import ClientError
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
def check_for_key_with_backoff(hook: S3Hook, key:str, bucket:str) -> bool:
delay = 10 # initial delay
delay_incr = 10 # additional delay in each loop
    max_delay = 60  # max delay of one loop; total sleep is roughly max_delay**2 / (2 * delay_incr)
while delay < max_delay:
try:
return hook.check_for_key(key=key, bucket_name=bucket)
except ClientError as err:
code = err.response.get('Error',{}).get('Code', '')
if code in ['NoSuchBucket']:
print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
raise err
time.sleep(delay)
delay += delay_incr
def load_file_obj_with_backoff(hook: S3Hook, fileobj, key:str, bucket:str, replace:bool) -> bool:
delay = 10 # initial delay
delay_incr = 10 # additional delay in each loop
    max_delay = 60  # max delay of one loop; total sleep is roughly max_delay**2 / (2 * delay_incr)
while delay < max_delay:
try:
return hook.load_file_obj(fileobj,
key,
bucket,
replace=replace)
except ClientError as err:
code = err.response.get('Error',{}).get('Code', '')
if code in ['NoSuchBucket']:
print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
raise err
time.sleep(delay)
delay += delay_incr
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"src_key": "File to untar",
"src_bucket": "bucket containing the zip file",
"dst_key_prefix": "",
"dst_bucket": "bucket that will contain unzipped files"
},
tags=["s3"],
)
def s3_untar():
@task
def untar(**context):
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=context["params"]["src_bucket"])
with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
for member in tar:
dst_key = context["params"]["dst_key_prefix"] + "/" + member.name
dst_key = os.path.normpath(dst_key)
# Ignore directories, links, devices, fifos, etc.
if (not member.isfile()) or member.name.endswith('/'):
print(f"Skipping {member.name}: is not a file")
continue
if check_for_key_with_backoff(hook, key=dst_key, bucket=context["params"]["dst_bucket"]):
print(f"Skipping {member.name}: already exists")
continue
print(f"Extracting {member.name} to {dst_key}")
fileobj = tar.extractfile(member)
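                # A streamed tar member cannot be rewound: reporting the file object as
                # non-seekable makes boto3 upload it in a single streaming pass instead
                # of attempting to seek (assumption about why this workaround is needed).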
fileobj.seekable = lambda: False
load_file_obj_with_backoff(hook, fileobj,
dst_key,
context["params"]["dst_bucket"],
replace=True)
untar()
s3_untar()

airflow/dags/S3_unzip.py Normal file (55 additions)

@@ -0,0 +1,55 @@
import os
import zipfile
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.file import TemporaryDirectory
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
def s3_download_unzip_upload(s3conn: str, src_key: str, src_bucket: str, dest_bucket: str):
hook = S3Hook(s3conn, transfer_config_args={'use_threads': False})
with TemporaryDirectory() as dwl_dir:
with TemporaryDirectory() as tmp_dir:
            archive = f'{dwl_dir}/{os.path.basename(src_key)}'  # download_file with preserve_file_name stores the object under its base name
hook.download_file(key=src_key, bucket_name=src_bucket, local_path=dwl_dir, preserve_file_name=True,
use_autogenerated_subdir=False)
with zipfile.ZipFile(archive, 'r') as zip_ref:
for info in zip_ref.infolist():
with zip_ref.open(info.filename) as file:
hook.load_file_obj(file, info.filename, dest_bucket, replace=True)
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"zipfile": "File to unzip",
"src_bucket": "bucket containing the zip file",
"dst_bucket": "bucket that will contain unzipped files"
},
tags=["s3"],
)
def s3_unzip():
@task
def unzip(**context):
        s3_download_unzip_upload(S3_CONN_ID,
context["params"]["zipfile"],
context["params"]["src_bucket"],
context["params"]["dst_bucket"])
unzip()
s3_unzip()


@ -0,0 +1,68 @@
import os
from datetime import timedelta, datetime
import pendulum
from airflow import DAG
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
from airflow.providers.cncf.kubernetes.secret import Secret
default_args = {
"execution_timeout": timedelta(days=6),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
conn = BaseHook.get_connection("opensearch_default")
dag = DAG(
    'antispam_batch_check',
    default_args=default_args,
    schedule=timedelta(days=1),
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False
)
secrets = [
Secret(
deploy_type='env',
deploy_target='CURATION_OPENSEARCH__USER',
secret='opensearch-conn-secrets',
key='username',
),
Secret(
deploy_type='env',
deploy_target='CURATION_OPENSEARCH__PASSWORD',
secret='opensearch-conn-secrets',
key='password',
),
]
# Define the KubernetesPodOperator
task = KubernetesPodOperator(
task_id='antispam_checker',
name='antispam_checker',
namespace='kg-airflow',
image='gbloisi/curation:1.0.0',
image_pull_policy="Always",
cmds=['python3'],
arguments=['/antispam-batch.py',
"--opensearch.host", conn.host,
"--opensearch.port", str(conn.port),
"--openai.host", "local-ai.kg-airflow.svc.cluster.local",
"--openai.port", "8000",
"--parallelism", "36"
],
secrets=secrets,
is_delete_operator_pod=True,
in_cluster=True,
get_logs=True,
dag=dag
)
# Set the task dependencies
chain(task)


@@ -0,0 +1,314 @@
from datetime import datetime
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY
class RawCatalogOpensearch:
entities = ["datasources",
"interoperability-records",
"providers",
"resource-interoperability-records",
"services",
"training-resources"]
mapped_entities = ["interoperability-records", "training-resources", "services"]
def __init__(self, os_client: OpenSearch, suffix: str | None):
self.os_client = os_client
self.suffix = suffix
def get_index(self, name: str):
return f"catalog_{name}_{self.suffix}"
def get_alias(self, name: str):
return f"catalog_{name}"
def get_mapped_index(self, name: str):
match name:
case "interoperability-records":
return f"interoperability_{self.suffix}"
case "training-resources":
return f"training_{self.suffix}"
case "services":
return f"services_{self.suffix}"
return None
    def get_mapped_alias(self, name: str):
        match name:
            case "interoperability-records":
                return "interoperability"
            case "training-resources":
                return "training"
            case "services":
                return "services"
        return None
def get_resource_interoperability_records(self, resource_id):
response = self.os_client.search(
body={
'query': {
'term': {
'resourceInteroperabilityRecord.resourceId.keyword': resource_id,
}
},
"fields": [
"resourceInteroperabilityRecord.interoperabilityRecordIds"
],
"_source": False
},
index=self.get_index('resource-interoperability-records')
)
interoperability_ids = []
interoperability_records = []
for hit in response['hits']['hits']:
interoperability_ids.extend(
extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
if len(interoperability_ids) > 0:
response = self.os_client.search(
body={
"query": {
"ids": {
"values": interoperability_ids,
}
},
},
index=self.get_index('interoperability-records')
)
for hit in response['hits']['hits']:
interoperability_records.append(extract_nested(hit, ['_source']))
return interoperability_records
def get_providers(self, provider_ids: list[str]) -> list:
provider_records = []
if provider_ids is not None and len(provider_ids) > 0:
response = self.os_client.search(
body={
"query": {
"ids": {
"values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
}
},
},
index=self.get_index('providers')
)
for hit in response['hits']['hits']:
provider_records.append(extract_nested(hit, ['_source']))
return provider_records
def get_provider(self, provider_id: str):
if provider_id is not None:
providers = self.get_providers([provider_id])
if providers is not None and len(providers) > 0:
return providers[0]
return {}
def get_services(self, service_ids: list[str]) -> list:
service_records = []
if service_ids is not None and len(service_ids) > 0:
response = self.os_client.search(
body={
"query": {
"ids": {
"values": service_ids if isinstance(service_ids, list) else [
service_ids],
}
},
},
index=self.get_index('services')
)
for hit in response['hits']['hits']:
service_records.append(extract_nested(hit, ['_source']))
return service_records
def get_datasource_of_service(self, service_id: str):
response = self.os_client.search(
body={
'query': {
'term': {
'datasource.serviceId.keyword': service_id,
}
}
},
index=self.get_index('datasources')
)
for hit in response['hits']['hits']:
return extract_nested(hit, ['_source'])
return {}
def get_services_of_interoperability(self, interoperability_id: str):
svc_ids = []
response = self.os_client.search(
body={
'query': {
'term': {
'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': interoperability_id,
}
},
"fields": [
"resourceInteroperabilityRecord.resourceId"
],
"_source": False
},
index=self.get_index('resource-interoperability-records')
)
for hit in response['hits']['hits']:
svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
return svc_ids
def map_service(self, raw_svc: dict) -> dict:
interoperability_records = self.get_resource_interoperability_records(raw_svc['id'])
organization = self.get_provider(extract_nested(raw_svc, ['service', 'resourceOrganisation']))
provider_records = self.get_providers(list(
filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'resourceProviders']) or [])))
related_resources_records = self.get_services(list(
filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'relatedResources']) or [])))
datasource = self.get_datasource_of_service(raw_svc['id'])
res = {
"accessRestriction": extract_nested(raw_svc,
"service.geographicalAvailabilities".split(".")),
"accessTypes": extract_map_nested(raw_svc, 'access_type', "service.accessTypes".split(".")),
"access_modes": extract_map_nested(raw_svc, 'access_mode', "service.accessModes".split(".")),
"category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
"subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
extract_nested(raw_svc, "service.categories".split(".")))),
"description": extract_nested(raw_svc, "service.description".split(".")),
"domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
"subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
extract_nested(raw_svc, "service.scientificDomains".split(".")))),
"grantProjectNames": extract_nested(raw_svc, "service.grantProjectNames".split(".")),
"helpdeskPage": extract_nested(raw_svc, "service.helpdeskPage".split(".")),
"horizontal": extract_nested(raw_svc, "service.horizontalService".split(".")) or False,
"id": extract_nested(raw_svc, "service.id".split(".")),
"interoperabilityGuidelines": list(
map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
"language": extract_map_nested(raw_svc, 'languages', "service.languageAvailabilities".split(".")),
"name": extract_nested(raw_svc, "service.name".split(".")),
"orderType": extract_map_nested(raw_svc, 'order_type', "service.orderType".split(".")),
"organization": extract_nested(organization, "provider.name".split(".")),
"pricing": extract_nested(raw_svc, "service.pricing".split(".")),
"privacyPolicy": extract_nested(raw_svc, "service.privacyPolicy".split(".")),
"providers": list(map(lambda p: p['provider']['name'], provider_records)),
"relatedPlatforms": extract_map_nested(raw_svc, 'related_platform', "service.relatedPlatforms".split(".")),
"relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
"tags": extract_nested(raw_svc, "service.tags".split(".")),
"targetUsers": extract_map_nested(raw_svc, 'target_user', "service.targetUsers".split(".")),
"termsOfUse": extract_nested(raw_svc, "service.termsOfUse".split(".")),
"thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
"trl": extract_map_nested(raw_svc, 'trl', "service.trl".split(".")),
"type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
"useCases": extract_nested(raw_svc, "service.useCases".split(".")),
"userManual": extract_nested(raw_svc, "service.userManual".split(".")),
"webpage": extract_nested(raw_svc, "service.webpage".split(".")),
"year": datetime.fromtimestamp(
int(extract_nested(raw_svc, "metadata.registeredAt".split("."))) / 1000).year,
}
return delete_none(res)
def map_training(self, raw_trn: dict) -> dict:
organization = self.get_provider(extract_nested(raw_trn, ['trainingResource', 'resourceOrganisation']))
res = {
"accessRights": extract_map_nested(raw_trn, 'tr_access', "trainingResource.accessRights".split(".")),
"alternativeIdentifiers": extract_nested(raw_trn,
"trainingResource.alternativeIdentifiers".split(".")),
"authors": extract_nested(raw_trn,
"trainingResource.authors".split(".")),
"contentResourceType": extract_map_nested(raw_trn, 'tr_content',
"trainingResource.contentResourceTypes".split(".")),
"description": extract_nested(raw_trn,
"trainingResource.description".split(".")),
"domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
"subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
extract_nested(raw_trn, "trainingResource.scientificDomains".split(".")))),
"duration": extract_nested(raw_trn,
"trainingResource.duration".split(".")),
"expertiseLevel": extract_map_nested(raw_trn, 'expertise_level',
"trainingResource.expertiseLevel".split(".")),
"id": extract_nested(raw_trn,
"trainingResource.id".split(".")),
"keyword": extract_nested(raw_trn,
"trainingResource.keywords".split(".")),
"language": extract_map_nested(raw_trn, 'languages', "trainingResource.languages".split(".")),
"learningOutcomes": extract_nested(raw_trn,
"trainingResource.learningOutcomes".split(".")),
"learningResourceType": extract_map_nested(raw_trn, 'tr_dcmi',
"trainingResource.learningResourceTypes".split(".")),
"license": extract_nested(raw_trn,
"trainingResource.license".split(".")),
"organization": extract_nested(organization, "provider.name".split(".")),
"qualifications": extract_map_nested(raw_trn, 'qualification',
"trainingResource.qualifications".split(".")),
"targetGroup": extract_map_nested(raw_trn, 'target_user', "trainingResource.targetGroups".split(".")),
"title": extract_nested(raw_trn,
"trainingResource.title".split(".")),
"type": 'trainingResource',
"url": extract_nested(raw_trn,
"trainingResource.url".split(".")),
"year": datetime.fromtimestamp(
int(extract_nested(raw_trn, "metadata.registeredAt".split("."))) / 1000).year,
}
return delete_none(res)
def map_interoperability(self, raw_itr: dict) -> dict:
organization = self.get_provider(extract_nested(raw_itr, ['interoperabilityRecord', 'providerId']))
service_records = self.get_services(self.get_services_of_interoperability(raw_itr['id']))
res = {
"alternativeIdentifiers": extract_nested(raw_itr,
"interoperabilityRecord.alternativeIdentifiers".split(".")),
"creators": list(map(lambda c: {
"affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
"givenName": extract_nested(c, ['givenName']),
"familyName": extract_nested(c, ['familyName']),
"fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
"type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
}, extract_nested(raw_itr, "interoperabilityRecord.creators".split(".")))),
"description": extract_nested(raw_itr,
"interoperabilityRecord.description".split(".")),
"doi": extract_nested(raw_itr, ['identifierInfo', 'identifier']) if
extract_nested(raw_itr, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
"domain": {'domain': extract_map_nested(raw_itr, 'domains',
"interoperabilityRecord.domain".split("."))},
"guidelineType": extract_map_nested(raw_itr, 'guideline_type',
"interoperabilityRecord.eoscGuidelineType".split(".")),
"id": extract_nested(raw_itr,
"interoperabilityRecord.id".split(".")),
"license": extract_nested(raw_itr, "interoperabilityRecord.rights.rightIdentifier".split(".")),
"licenseDetails": list(map(lambda c: {
"identifier": extract_nested(c, ['rightIdentifier']),
"title": extract_nested(c, ['rightTitle']),
"uri": extract_nested(c, ['rightURI'])
}, extract_nested(raw_itr, "interoperabilityRecord.rights".split(".")))),
"organization": extract_nested(organization, "provider.name".split(".")),
"provider": extract_nested(organization, "provider.name".split(".")),
"publicationYear": extract_nested(raw_itr, "interoperabilityRecord.publicationYear".split(".")),
"services": list(map(lambda s: {
"name": extract_nested(organization, "service.name".split(".")),
"organization": extract_nested(organization, "service.organization".split(".")),
# s.organization on already mapped services
}, service_records)),
"status": extract_nested(raw_itr, "interoperabilityRecord.status".split(".")),
"title": extract_nested(raw_itr, "interoperabilityRecord.title".split(".")),
"type": 'interoperabilityRecord',
# "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
}
return delete_none(res)
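A sketch (hypothetical connection settings and index suffix, mirroring the clients built in the DAGs above) of how this mapper might be driven against the raw catalog indexes:

from opensearchpy import OpenSearch, helpers

client = OpenSearch(hosts=[{'host': 'localhost', 'port': 9200}],
                    http_auth=('admin', 'admin'),  # placeholder credentials
                    use_ssl=True, verify_certs=False, ssl_show_warn=False)
catalog = RawCatalogOpensearch(client, suffix="20240806")
# Re-map every raw service document into the mapped 'services' index.
for hit in helpers.scan(client, index=catalog.get_index("services")):
    mapped = catalog.map_service(hit["_source"])
    client.index(index=catalog.get_mapped_index("services"),
                 id=mapped["id"], body=mapped)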


@@ -0,0 +1,41 @@
from typing import Dict, Any, List
from catalogue.vocabulary import CATALOG_VOCABULARY
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
if len(labels) <= 0:
return current_value
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
return current_value
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
value = extract_nested(current_value, labels)
if value is None:
return None
if isinstance(value, list):
return list(map(lambda d: CATALOG_VOCABULARY[dictionary][d] if d else None, value))
if isinstance(value, str) and value != '':
return CATALOG_VOCABULARY[dictionary][value]
return None
def delete_none(_dict):
"""Delete None values recursively from all of the dictionaries, tuples, lists, sets"""
if isinstance(_dict, dict):
for key, value in list(_dict.items()):
if isinstance(value, (list, dict, tuple, set)):
_dict[key] = delete_none(value)
elif value is None or key is None:
del _dict[key]
elif isinstance(_dict, (list, set, tuple)):
_dict = type(_dict)(delete_none(item) for item in _dict if item is not None)
return _dict
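For illustration (hypothetical data, not taken from this diff), the helpers behave as follows:

# extract_nested walks a key path and returns None at the first miss:
svc = {"service": {"name": "My Service"}}
assert extract_nested(svc, ["service", "name"]) == "My Service"
assert extract_nested(svc, ["service", "webpage"]) is None
# delete_none prunes None values recursively from nested containers:
assert delete_none({"a": 1, "b": None, "c": {"d": None}}) == {"a": 1, "c": {}}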


@@ -0,0 +1,13 @@
from datetime import datetime
from typing import Dict, Any, List
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY


@@ -0,0 +1,878 @@
CATALOG_VOCABULARY = {
'categories': {'category-access_physical_and_eInfrastructures-compute': 'Compute',
'category-access_physical_and_eInfrastructures-data_storage': 'Data Storage',
'category-access_physical_and_eInfrastructures-instrument_and_equipment': 'Instrument & Equipment',
'category-access_physical_and_eInfrastructures-material_storage': 'Material Storage',
'category-access_physical_and_eInfrastructures-network': 'Network',
'category-aggregators_and_integrators-aggregators_and_integrators': 'Aggregators & Integrators',
'category-other-other': 'Other', 'category-processing_and_analysis-data_analysis': 'Data Analysis',
'category-processing_and_analysis-data_management': 'Data Management',
'category-processing_and_analysis-measurement_and_materials_analysis': 'Measurement & Materials Analysis',
'category-security_and_operations-operations_and_infrastructure_management_services': 'Operations & Infrastructure Management Services',
'category-security_and_operations-security_and_identity': 'Security & Identity',
'category-sharing_and_discovery-applications': 'Applications',
'category-sharing_and_discovery-data': 'Data',
'category-sharing_and_discovery-development_resources': 'Development Resources',
'category-sharing_and_discovery-samples': 'Samples',
'category-sharing_and_discovery-scholarly_communication': 'Scholarly Communication',
'category-sharing_and_discovery-software': 'Software',
'category-training_and_support-consultancy_and_support': 'Consultancy & Support',
'category-training_and_support-education_and_training': 'Education & Training'},
'trl': {'trl-1': '1 - basic principles observed', 'trl-2': '2 - technology concept formulated',
'trl-3': '3 - experimental proof of concept', 'trl-4': '4 - technology validated in lab',
'trl-5': '5 - technology validated in relevant environment',
'trl-6': '6 - technology demonstrated in relevant environment',
'trl-7': '7 - system prototype demonstration in operational environment',
'trl-8': '8 - system complete and qualified',
'trl-9': '9 - actual system proven in operational environment'},
'target_users': {'target_user-businesses': 'Businesses', 'target_user-funders': 'Funders',
'target_user-innovators': 'Innovators', 'target_user-other': 'Other',
'target_user-policy_makers': 'Policy Makers', 'target_user-providers': 'Providers',
'target_user-research_communities': 'Research Communities',
'target_user-research_groups': 'Research Groups',
'target_user-research_infrastructure_managers': 'Research Infrastructure Managers',
'target_user-research_managers': 'Research Managers',
'target_user-research_networks': 'Research Networks',
'target_user-research_organisations': 'Research Organisations',
'target_user-research_projects': 'Research Projects', 'target_user-researchers': 'Researchers',
'target_user-resource_managers': 'Resource Managers',
'target_user-resource_provider_managers': 'Provider Managers',
'target_user-publishers': 'Publishers',
'target_user-students': 'Students'},
'access_mode': {'access_mode-free': 'Free', 'access_mode-free_conditionally': 'Free Conditionally',
'access_mode-other': 'Other', 'access_mode-paid': 'Paid',
'access_mode-peer_reviewed': 'Peer Reviewed'},
'funding_body': {'funding_body-ademe': 'Agency for Environment and Energy Management (ADEME)',
'funding_body-ahrc': 'Arts and Humanities Research Council (AHRC)',
'funding_body-aka': 'Academy of Finland (AKA)',
'funding_body-ancs': 'National Authority for Scientific Research (ANCS)',
'funding_body-anr': 'French National Research Agency (ANR)',
'funding_body-apvv': 'Research and Development Agency (APVV)',
'funding_body-arc': 'Australian Research Council (ARC)',
'funding_body-arrs': 'Slovenian Research Agency (ARRS)',
'funding_body-awi': 'Alfred Wegener Institute for Polar and Marine Research (AWI)',
'funding_body-bbsrc': 'Biotechnology and Biological Sciences Research Council (BBSRC)',
'funding_body-bf': 'Belmont Forum (BF)',
'funding_body-bmbf': 'Federal Ministry of Education and Research (BMBF)',
'funding_body-caixa': 'La Caixa Foundation (CAIXA)',
'funding_body-cdti': 'Center for Industrial Technological Development (CDTI)',
'funding_body-cea': 'Alternative Energies and Atomic Energy Commission (CEA)',
'funding_body-cihr': 'Canadian Institutes of Health Research (CIHR)',
'funding_body-cncsis': 'National University Research Council (CNCSIS) - Romania',
'funding_body-cnes': 'National Centre for Space Studies (CNES)',
'funding_body-cnpq': 'National Council for Scientific and Technological Development (CNPq)',
'funding_body-cnr': 'National Research Council (CNR)',
'funding_body-cnrs': 'National Centre for Scientific Research (CNRS)',
'funding_body-csf': 'Croatian Science Foundation (CSF)',
'funding_body-csic': 'Spanish National Research Council (CSIC)',
'funding_body-dashe': 'Danish Agency for Science and Higher Education (DASHE)',
'funding_body-dasti': 'Danish Agency for Science, Technology and Innovation (DASTI)',
'funding_body-ddf': 'The Danish Council for Independent Research (DDF)',
'funding_body-dff': 'Danish Council for Independent Research (DFF)',
'funding_body-dfg': 'German Research Foundation (DFG)',
'funding_body-dgo6': 'General Operational Directorate for Economy, Employment and Research (DGO6)',
'funding_body-dlr': 'German Aerospace Center (DLR)',
'funding_body-dnrf': 'Danish National Research Foundation (DNRF)',
'funding_body-eaer': 'Federal Department of Economic Affairs, Education and Research (EAER)',
'funding_body-ec': 'European Commission (EC)',
'funding_body-epsrc': 'Engineering and Physical Sciences Research Council (EPSRC)',
'funding_body-esa': 'European Space Agency (ESA)',
'funding_body-esrc': 'Economic and Social Research Council (ESRC)',
'funding_body-etag': 'Estonian Research Council (ETAG)',
'funding_body-fapesp': 'São Paulo Research Foundation (FAPESP)',
'funding_body-fct': 'Foundation for Science and Technology (FCT)',
'funding_body-ffg': 'Austrian Research Promotion Agency (FFG)',
'funding_body-fnp': 'Foundation for Polish Science (FNP)',
'funding_body-fnr': 'National Research Fund (FNR)',
'funding_body-fnrs': 'Fonds National de la Recherche Scientifique (FNRS)',
'funding_body-fom': 'Foundation for Fundamental Research on Matter (FOM)',
'funding_body-forte': 'Swedish Research Council for Health, Working Life and Welfare (FORTE)',
'funding_body-fts': 'Fritz Thyssen Foundation (FTS)',
'funding_body-fwf': 'Austrian Science Fund (FWF)',
'funding_body-fwo': 'Research Foundation Flanders (FWO)',
'funding_body-gacr': 'Czech Science Foundation (GACR)',
'funding_body-gsrt': 'General Secretariat for Research and Technology (GSRT)',
'funding_body-ifd': 'Innovation Fund Denmark (IFD)',
'funding_body-ifremer': 'French Research Institute for Exploitation of the Sea (IFREMER)',
'funding_body-imsr': 'Innovation Fund of the Ministry of Economy of the Slovak Republic (IMSR)',
'funding_body-innoviris': 'Brussels Institute for Research and Innovation (INNOVIRIS)',
'funding_body-inra': 'National Institute of Agricultural Research (INRA)',
'funding_body-inserm': 'National Institute of Health and Medical Research (INSERM)',
'funding_body-ipev': 'French Polar Institute (IPEV)',
'funding_body-irc': 'Irish Research Council (IRC)',
'funding_body-isc': 'International Science Council (ISC)',
'funding_body-isciii': 'Carlos III Health Institute (ISCIII)',
'funding_body-isf': 'Israel Science Foundation (ISF)',
'funding_body-iwt': 'Agency for Innovation by Science and Technology (IWT)',
'funding_body-jsps': 'Japan Society for the Promotion of Science (JSPS)',
'funding_body-jst': 'Japan Science and Technology Agency (JST)',
'funding_body-kaws': 'Knut and Alice Wallenberg Foundation (KAWS)',
'funding_body-kks': 'Knowledge Foundation (KKS)',
'funding_body-lmt': 'Research Council of Lithuania (LMT)',
'funding_body-mcst': 'Malta Council for Science and Technology (MCST)',
'funding_body-mecr': 'Ministry for Education and Scientific Research (MECR)',
'funding_body-mesr': 'Ministry of Higher Education and Research (MESR)',
'funding_body-mestd': 'Ministry of Education, Science and Technological Development of the Republic of Serbia (MESTD)',
'funding_body-mgrt': 'Ministry for Economic Development and Technology (MGRT)',
'funding_body-mineco': 'Ministry for Economy and Competitiveness (MINECO)',
'funding_body-mistra': 'Swedish Foundation for Strategic Environmental Research (MISTRA)',
'funding_body-mita': 'Agency for Science, Innovation and Technology (MITA)',
'funding_body-miur': 'Ministry for Education, University and Research (MIUR)',
'funding_body-most': "Ministry of Science and Technology of the People's Republic of China (MOST)",
'funding_body-mpg': 'Max Planck Society for the Advancement of Science (MPG)',
'funding_body-mrc': 'Medical Research Council (MRC)',
'funding_body-mse': 'Ministry of Science and Education of the Republic of Croatia (MSE)',
'funding_body-msvvas_sr': 'The Ministry of Education, Science, Research and Sports of the Slovak Republic (MSVVaS SR)',
'funding_body-nasa': 'National Aeronautics and Space Administration (NASA)',
'funding_body-ncbir': 'National Centre for Research and Development (NCBiR)',
'funding_body-ncn': 'National Science Center (NCN)',
'funding_body-nerc': 'Natural Environment Research Council (NERC)',
'funding_body-nhmrc': 'National Health and Medical Research Council (NHMRC)',
'funding_body-nig': 'National Institutes of Health (NIG)',
'funding_body-nkfia': 'National Research, Development and Innovation Fund (NKFIA)',
'funding_body-nrf': 'National Research Foundation (NRF)',
'funding_body-nserc': 'Natural Sciences and Engineering Research Council of Canada (NSERC)',
'funding_body-nsf': 'National Science Foundation (NSF)',
'funding_body-nwo': 'Netherlands Organisation for Scientific Research (NWO)',
'funding_body-oeaw': 'Austrian Academy of Sciences (OeAW)',
'funding_body-oenfte': 'National Foundation for Research, Technology and Development (OeNFTE)',
'funding_body-onera': 'French National Aerospace Research Center (ONERA)',
'funding_body-other': 'Other', 'funding_body-rannis': 'Icelandic Centre for Research (RANNIS)',
'funding_body-rcn': 'Research Council of Norway (RCN)',
'funding_body-rcuk': 'Research Councils UK (RCUK)',
'funding_body-rj': 'The Swedish Foundation for Humanities and Social Sciences (RJ)',
'funding_body-rpf': 'Research Promotion Foundation (RPF)',
'funding_body-sea': 'Swedish Energy Agency (SEA)',
'funding_body-sepa': 'Swedish Environmental Protection Agency (SEPA)',
'funding_body-sfi': 'Science Foundation Ireland (SFI)',
'funding_body-sgpi': 'Secretariat-General for Investment (SGPI)',
'funding_body-snf': 'Swiss National Science Foundation (SNF)',
'funding_body-snsb': 'Swedish National Space Board (SNSB)',
'funding_body-srcf': 'Swedish Research Council Formas (SRCF)',
'funding_body-srsa': 'Swedish Radiation Safety Authority (SRSA)',
'funding_body-ssf': 'Swedish Foundation for Strategic Research (SSF)',
'funding_body-sshrc': 'Social Sciences and Humanities Research Council (SSHRC)',
'funding_body-stfc': 'Science and Technology Facilities Council (STFC)',
'funding_body-stw': 'Technology Foundation (STW)',
'funding_body-tacr': 'Technology Agency of the Czech Republic (TACR)',
'funding_body-tara': 'Tara Expeditions Foundation (TARA)',
'funding_body-tekes': 'Finnish Funding Agency for Technology and Innovation (TEKES)',
'funding_body-tubitak': 'Scientific and Technological Research Council of Turkey (TUBITAK)',
'funding_body-uefiscdi_cncs': 'Executive Agency for Higher Education, Research, Development and Innovation Funding (UEFISCDI - CNCS)',
'funding_body-ukri': 'UK Research and Innovation (UKRI)',
'funding_body-vega': 'Scientific Grant Agency (VEGA)',
'funding_body-viaa': 'State Education Development Agency (VIAA)',
'funding_body-vinnova': 'Swedish Governmental Agency for Innovation Systems (VINNOVA)',
'funding_body-vlaio': 'Flanders Innovation & Entrepreneurship (VLAIO)',
'funding_body-vr': 'Swedish Research Council (VR)',
'funding_body-vs': 'Volkswagen Foundation (VS)',
'funding_body-wt': 'Wellcome Trust (WT)',
'funding_body-wwtf': 'Vienna Science and Technology Fund (WWTF)',
'funding_body-meys': 'Ministry of Education, Youth and Sports of the Czech Republic (MEYS)',
'funding_body-af': 'Arcadia Fund'},
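# NOTE: identical to the 'target_users' map above; the same label set is duplicated under both keys.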
'target_user': {'target_user-businesses': 'Businesses', 'target_user-funders': 'Funders',
'target_user-innovators': 'Innovators', 'target_user-other': 'Other',
'target_user-policy_makers': 'Policy Makers', 'target_user-providers': 'Providers',
'target_user-research_communities': 'Research Communities',
'target_user-research_groups': 'Research Groups',
'target_user-research_infrastructure_managers': 'Research Infrastructure Managers',
'target_user-research_managers': 'Research Managers',
'target_user-research_networks': 'Research Networks',
'target_user-research_organisations': 'Research Organisations',
'target_user-research_projects': 'Research Projects', 'target_user-researchers': 'Researchers',
'target_user-resource_managers': 'Resource Managers',
'target_user-resource_provider_managers': 'Provider Managers',
'target_user-publishers': 'Publishers',
'target_user-students': 'Students'},
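# Related platforms: vocabulary id -> platform display name.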
'related_platform': {'related_platform-ands': 'ANDS', 'related_platform-artportalen': 'ArtPortalen',
'related_platform-arxiv': 'arXiv', 'related_platform-ala': 'Atlas of Living Australia',
'related_platform-avp': 'AV-Portal', 'related_platform-aws': 'AWS',
'related_platform-bluecloud': 'Blue-Cloud',
'related_platform-cdl': 'California Digital Library',
'related_platform-ccdc': 'CCDC', 'related_platform-cessda': 'CESSDA',
'related_platform-collabwith': 'COLLABWITH',
'related_platform-cccs': 'Copernicus Climate Change Service',
'related_platform-crossref': 'Crossref', 'related_platform-dariahteach': 'dariahTeach',
'related_platform-dice': 'Data Infrastructure Capacity for EOSC (DICE)',
'related_platform-datacite': 'DataCite', 'related_platform-ds': 'Digital Science',
'related_platform-doab': 'DOAB', 'related_platform-einfracz': 'e-INFRA CZ',
'related_platform-eirgspp': 'e-IRGSP projects', 'related_platform-edugain': 'eduGAIN',
'related_platform-eduteams': 'eduTEAMS', 'related_platform-egi': 'EGI',
'related_platform-egifc': 'EGI Federated Cloud', 'related_platform-egiace': 'EGI-ACE',
'related_platform-elixir': 'ELIXIR', 'related_platform-emodnetc': 'EMODnet Chemistry',
'related_platform-eol': 'Encyclopedia of Life',
'related_platform-enc': 'Endemia New Caledonia',
'related_platform-envri': 'ENVRI Hub', 'related_platform-eoscl': 'EOSC-Life',
'related_platform-eoscn': 'EOSC-Nordic', 'related_platform-eoscp': 'EOSC-Pillar',
'related_platform-eudatcdi': 'EUDAT CDI', 'related_platform-elg': 'European Language Grid',
'related_platform-evs': 'European Values Study (EVS)',
'related_platform-garrcp': 'GARR Container Platform',
'related_platform-gatep': 'GATE platform',
'related_platform-gbif': 'GBIF', 'related_platform-geonames': 'GeoNames',
'related_platform-grin': 'Germplasm Resources Information Network (GRIN)',
'related_platform-geoss': 'Global Earth Observation System of Systems (GEOSS)',
'related_platform-hal': 'HAL', 'related_platform-hamelin': 'Hamelin',
'related_platform-infnc': 'INFN-Cloud', 'related_platform-ispot': 'iSpot',
'related_platform-jisc': 'JISC', 'related_platform-metacentrum': 'MetaCentrum',
'related_platform-natusfera': 'Natusfera', 'related_platform-openairee': 'OpenAIRE EXPLORE',
'related_platform-openairem': 'OpenAIRE MONITOR',
'related_platform-openairerg': 'OpenAIRE Research Graph',
'related_platform-oc': 'OpenCitations',
'related_platform-pogo': 'Partnership for Observation of the Global Oceans (POGO)',
'related_platform-pnp': 'Pl@ntNet platform', 'related_platform-pc': 'PolicyCloud',
'related_platform-rjb': 'Real Jardín Botánico', 'related_platform-scopus': 'Scopus',
'related_platform-seadatanet': 'SeaDataNet',
'related_platform-tsd': 'Service for Sensitive Data (TSD)',
'related_platform-sshom': 'SSH Open Marketplace', 'related_platform-surf': 'SURF',
'related_platform-share': 'Survey of Health, Ageing and Retirement in Europe (SHARE)',
'related_platform-tf': 'Taylor & Francis', 'related_platform-tb': 'Tela Botanica',
'related_platform-tdp': 'The Dataverse Project',
'related_platform-tnomadl': 'The NOMAD Laboratory', 'related_platform-tpg': 'The Plant Game',
'related_platform-tibp': 'TIB Portal', 'related_platform-tripleh': 'TRIPLE H2020 project',
'related_platform-tubitakcc': 'TÜBITAK cloud compute',
'related_platform-vlab': 'Virtual Earth Laboratory (VLab)',
'related_platform-zbwice': 'ZBW Information Centre for Economics',
'related_platform-zenodo': 'Zenodo'},
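# Languages: two-letter (mostly ISO 639-1) code -> English language name; also carries
# non-standard entries such as 'ot' (Other), 'iii' and 'sar', kept as they appear in the source data.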
'languages': {'aa': 'Afar', 'ab': 'Abkhazian', 'ae': 'Avestan', 'af': 'Afrikaans', 'ak': 'Akan', 'am': 'Amharic',
'an': 'Aragonese', 'ar': 'Arabic', 'as': 'Assamese', 'av': 'Avaric', 'ay': 'Aymara',
'az': 'Azerbaijani',
'ba': 'Bashkir', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bh': 'Bihari', 'bi': 'Bislama',
'bm': 'Bambara',
'bn': 'Bengali', 'bo': 'Tibetan', 'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'ce': 'Chechen',
'ch': 'Chamorro', 'co': 'Corsican', 'cr': 'Cree', 'cs': 'Czech', 'cu': 'Old Church Slavonic',
'cv': 'Chuvash', 'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dv': 'Divehi', 'dz': 'Dzongkha',
'ee': 'Ewe', 'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish', 'et': 'Estonian',
'eu': 'Basque', 'fa': 'Persian', 'ff': 'Fula', 'fi': 'Finnish', 'fj': 'Fijian', 'fo': 'Faroese',
'fr': 'French', 'fy': 'Western Frisian', 'ga': 'Irish', 'gd': 'Gaelic', 'gl': 'Galician',
'gn': 'Guarani', 'gu': 'Gujarati', 'gv': 'Manx', 'ha': 'Hausa', 'he': 'Hebrew', 'hi': 'Hindi',
'ho': 'Hiri Motu', 'hr': 'Croatian', 'ht': 'Haitian', 'hu': 'Hungarian', 'hy': 'Armenian',
'hz': 'Herero', 'ia': 'Interlingua', 'id': 'Indonesian', 'ie': 'Interlingue', 'ig': 'Igbo',
'ii': 'Nuosu', 'iii': 'Sichuan Yi', 'ik': 'Inupiak', 'io': 'Ido', 'is': 'Icelandic', 'it': 'Italian',
'iu': 'Inuktitut', 'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kg': 'Kongo',
'ki': 'Kikuyu',
'kj': 'Kwanyama', 'kk': 'Kazakh', 'kl': 'Kalaallisut', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
'kr': 'Kanuri', 'ks': 'Kashmiri', 'ku': 'Kurdish', 'kv': 'Komi', 'kw': 'Cornish', 'ky': 'Kyrgyz',
'la': 'Latin', 'lb': 'Luxembourgish', 'li': 'Limburgish', 'ln': 'Lingala', 'lo': 'Lao',
'lt': 'Lithuanian', 'lu': 'Luba-Katanga', 'lv': 'Latvian', 'mg': 'Malagasy', 'mh': 'Marshallese',
'mi': 'Maori', 'mk': 'Macedonian', 'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi',
'ms': 'Malay',
'mt': 'Maltese', 'my': 'Burmese', 'na': 'Nauru', 'nb': 'Norwegian Bokmål', 'nd': 'Northern Ndebele',
'ne': 'Nepali', 'ng': 'Ndonga', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
'nr': 'Southern Ndebele', 'nv': 'Navajo', 'ny': 'Chichewa', 'oc': 'Occitan', 'oj': 'Ojibwe',
'om': 'Oromo', 'or': 'Oriya', 'os': 'Ossetian', 'ot': 'Other', 'pa': 'Panjabi', 'pi': 'Pāli',
'pl': 'Polish', 'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'rm': 'Romansh', 'rn': 'Kirundi',
'ro': 'Romanian', 'ru': 'Russian', 'rw': 'Kinyarwanda', 'sa': 'Sanskrit', 'sar': 'Sardinian',
'sd': 'Sindhi', 'se': 'Sami', 'sg': 'Sango', 'si': 'Sinhalese', 'sk': 'Slovak', 'sl': 'Slovenian',
'sm': 'Samoan', 'sn': 'Shona', 'so': 'Somali', 'sq': 'Albanian', 'sr': 'Serbian', 'ss': 'Swati',
'st': 'Sesotho', 'su': 'Sundanese', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
'tg': 'Tajik', 'th': 'Thai', 'ti': 'Tigrinya', 'tk': 'Turkmen', 'tl': 'Tagalog', 'tn': 'Setswana',
'to': 'Tonga', 'tr': 'Turkish', 'ts': 'Tsonga', 'tt': 'Tatar', 'tw': 'Twi', 'ty': 'Tahitian',
'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'uz': 'Uzbek', 've': 'Venda', 'vi': 'Vietnamese',
'vo': 'Volapük', 'wa': 'Walloon', 'wo': 'Wolof', 'xh': 'Xhosa', 'yi': 'Yiddish', 'yo': 'Yoruba',
'za': 'Zhuang', 'zh': 'Chinese', 'zu': 'Zulu'}, 'ig': {},
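# NOTE: 'ig' (above) and 'service-ig' (further below) are empty vocabularies in this snapshot.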
'qualification': {'tr_qualification-badge': 'Badge', 'tr_qualification-certification': 'Certification',
'tr_qualification-accreditation': 'Accreditation'}, 'subcategories': {
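# Subcategory ids encode the hierarchy 'subcategory-<category>-<subcategory>-<leaf>';
# only the leaf's display label is stored as the value.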
'subcategory-access_physical_and_eInfrastructures-compute-container_management': 'Container Management',
'subcategory-access_physical_and_eInfrastructures-compute-job_execution': 'Job Execution',
'subcategory-access_physical_and_eInfrastructures-compute-orchestration': 'Orchestration',
'subcategory-access_physical_and_eInfrastructures-compute-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-compute-serverless_applications_repository': 'Serverless Applications Repository',
'subcategory-access_physical_and_eInfrastructures-compute-virtual_machine_management': 'Virtual Machine Management',
'subcategory-access_physical_and_eInfrastructures-compute-workload_management': 'Workload Management',
'subcategory-access_physical_and_eInfrastructures-data_storage-archive': 'Archive',
'subcategory-access_physical_and_eInfrastructures-data_storage-backup': 'Backup',
'subcategory-access_physical_and_eInfrastructures-data_storage-data': 'Data',
'subcategory-access_physical_and_eInfrastructures-data_storage-digital_preservation': 'Digital Preservation',
'subcategory-access_physical_and_eInfrastructures-data_storage-disk': 'Disk',
'subcategory-access_physical_and_eInfrastructures-data_storage-file': 'File',
'subcategory-access_physical_and_eInfrastructures-data_storage-online': 'Online',
'subcategory-access_physical_and_eInfrastructures-data_storage-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-data_storage-queue': 'Queue',
'subcategory-access_physical_and_eInfrastructures-data_storage-recovery': 'Recovery',
'subcategory-access_physical_and_eInfrastructures-data_storage-replicated': 'Replicated',
'subcategory-access_physical_and_eInfrastructures-data_storage-synchronised': 'Synchronised',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-chromatographer': 'Chromatographer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-cytometer': 'Cytometer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-digitisation_equipment': 'Digitisation Equipment',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-geophysical': 'Geophysical',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-laser': 'Laser',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-microscopy': 'Microscopy',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-monument_maintenance_equipment': 'Monument Maintenance Equipment',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-radiation': 'Radiation',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-spectrometer': 'Spectrometer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-spectrophotometer': 'Spectrophotometer',
'subcategory-access_physical_and_eInfrastructures-material_storage-archiving': 'Archiving',
'subcategory-access_physical_and_eInfrastructures-material_storage-assembly': 'Assembly',
'subcategory-access_physical_and_eInfrastructures-material_storage-disposal': 'Disposal',
'subcategory-access_physical_and_eInfrastructures-material_storage-fulfilment': 'Fulfilment',
'subcategory-access_physical_and_eInfrastructures-material_storage-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-material_storage-packaging': 'Packaging',
'subcategory-access_physical_and_eInfrastructures-material_storage-preservation': 'Preservation',
'subcategory-access_physical_and_eInfrastructures-material_storage-quality_inspecting': 'Quality Inspecting',
'subcategory-access_physical_and_eInfrastructures-material_storage-repository': 'Repository',
'subcategory-access_physical_and_eInfrastructures-material_storage-reworking': 'Reworking',
'subcategory-access_physical_and_eInfrastructures-material_storage-sorting': 'Sorting',
'subcategory-access_physical_and_eInfrastructures-material_storage-warehousing': 'Warehousing',
'subcategory-access_physical_and_eInfrastructures-network-content_delivery_network': 'Content Delivery Network',
'subcategory-access_physical_and_eInfrastructures-network-direct_connect': 'Direct Connect',
'subcategory-access_physical_and_eInfrastructures-network-exchange': 'Exchange',
'subcategory-access_physical_and_eInfrastructures-network-load_balancer': 'Load Balancer',
'subcategory-access_physical_and_eInfrastructures-network-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-network-traffic_manager': 'Traffic Manager',
'subcategory-access_physical_and_eInfrastructures-network-virtual_nework': 'Virtual Network',
'subcategory-access_physical_and_eInfrastructures-network-vpn_gateway': 'VPN Gateway',
'subcategory-access_physical_and_eInfrastructures-network-dns': 'DNS',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-applications': 'Applications',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-data': 'Data',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-other': 'Other',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-services': 'Services',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-software': 'Software',
'subcategory-other-other-other': 'Other',
'subcategory-processing_and_analysis-data_analysis-2d_3d_digitisation': '2D/3D Digitisation',
'subcategory-processing_and_analysis-data_analysis-artificial_intelligence': 'Artificial Intelligence',
'subcategory-processing_and_analysis-data_analysis-data_extrapolation': 'Data Extrapolation',
'subcategory-processing_and_analysis-data_analysis-forecast': 'Forecast',
'subcategory-processing_and_analysis-data_analysis-image_data_analysis': 'Image/Data Analysis',
'subcategory-processing_and_analysis-data_analysis-machine_learning': 'Machine Learning',
'subcategory-processing_and_analysis-data_analysis-other': 'Other',
'subcategory-processing_and_analysis-data_analysis-visualization': 'Visualization',
'subcategory-processing_and_analysis-data_analysis-workflows': 'Workflows',
'subcategory-processing_and_analysis-data_analysis-quality_assesment': 'Quality Assessment',
'subcategory-processing_and_analysis-data_management-access': 'Access',
'subcategory-processing_and_analysis-data_management-annotation': 'Annotation',
'subcategory-processing_and_analysis-data_management-anonymisation': 'Anonymisation',
'subcategory-processing_and_analysis-data_management-brokering': 'Brokering',
'subcategory-processing_and_analysis-data_management-digitisation': 'Digitisation',
'subcategory-processing_and_analysis-data_management-discovery': 'Discovery',
'subcategory-processing_and_analysis-data_management-embargo': 'Embargo',
'subcategory-processing_and_analysis-data_management-interlinking': 'Interlinking',
'subcategory-processing_and_analysis-data_management-maintenance': 'Maintenance',
'subcategory-processing_and_analysis-data_management-mining': 'Mining',
'subcategory-processing_and_analysis-data_management-other': 'Other',
'subcategory-processing_and_analysis-data_management-persistent_identifier': 'Persistent Identifier',
'subcategory-processing_and_analysis-data_management-preservation': 'Preservation',
'subcategory-processing_and_analysis-data_management-publishing': 'Publishing',
'subcategory-processing_and_analysis-data_management-registration': 'Registration',
'subcategory-processing_and_analysis-data_management-transfer': 'Transfer',
'subcategory-processing_and_analysis-data_management-validation': 'Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-analysis': 'Analysis',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-characterisation': 'Characterisation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-maintenance_and_modification': 'Maintenance & Modification',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-other': 'Other',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-production': 'Production',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-testing_and_validation': 'Testing & Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-validation': 'Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-workflows': 'Workflows',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-accounting': 'Accounting',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-analysis': 'Analysis',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-billing': 'Billing',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-configuration': 'Configuration',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-coordination': 'Coordination',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-helpdesk': 'Helpdesk',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-monitoring': 'Monitoring',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-order_management': 'Order Management',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-other': 'Other',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-transportation': 'Transportation',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-utilities': 'Utilities',
'subcategory-security_and_operations-security_and_identity-certification_authority': 'Certification Authority',
'subcategory-security_and_operations-security_and_identity-coordination': 'Coordination',
'subcategory-security_and_operations-security_and_identity-firewall': 'Firewall',
'subcategory-security_and_operations-security_and_identity-group_management': 'Group Management',
'subcategory-security_and_operations-security_and_identity-identity_and_access_management': 'Identity & Access Management',
'subcategory-security_and_operations-security_and_identity-other': 'Other',
'subcategory-security_and_operations-security_and_identity-single_sign_on': 'Single Sign-On',
'subcategory-security_and_operations-security_and_identity-threat_protection': 'Threat Protection',
'subcategory-security_and_operations-security_and_identity-tools': 'Tools',
'subcategory-security_and_operations-security_and_identity-user_authentication': 'User Authentication',
'subcategory-sharing_and_discovery-applications-applications_repository': 'Applications Repository',
'subcategory-sharing_and_discovery-applications-business': 'Business',
'subcategory-sharing_and_discovery-applications-collaboration': 'Collaboration',
'subcategory-sharing_and_discovery-applications-communication': 'Communication',
'subcategory-sharing_and_discovery-applications-education': 'Education',
'subcategory-sharing_and_discovery-applications-other': 'Other',
'subcategory-sharing_and_discovery-applications-productivity': 'Productivity',
'subcategory-sharing_and_discovery-applications-social_networking': 'Social/Networking',
'subcategory-sharing_and_discovery-applications-utilities': 'Utilities',
'subcategory-sharing_and_discovery-data-clinical_trial_data': 'Clinical Trial Data',
'subcategory-sharing_and_discovery-data-data_archives': 'Data Archives',
'subcategory-sharing_and_discovery-data-epidemiological_data': 'Epidemiological Data',
'subcategory-sharing_and_discovery-data-government_and_agency_data': 'Government & Agency Data',
'subcategory-sharing_and_discovery-data-online_service_data': 'Online Service Data',
'subcategory-sharing_and_discovery-data-other': 'Other',
'subcategory-sharing_and_discovery-data-scientific_research_data': 'Scientific/Research Data',
'subcategory-sharing_and_discovery-data-statistical_data': 'Statistical Data',
'subcategory-sharing_and_discovery-data-metadata': 'Metadata',
'subcategory-sharing_and_discovery-development_resources-apis_repository_gateway': 'APIs Repository/Gateway',
'subcategory-sharing_and_discovery-development_resources-developer_tools': 'Developer Tools',
'subcategory-sharing_and_discovery-development_resources-other': 'Other',
'subcategory-sharing_and_discovery-development_resources-software_development_kits': 'Software Development Kits',
'subcategory-sharing_and_discovery-development_resources-software_libraries': 'Software Libraries',
'subcategory-sharing_and_discovery-development_resources-simulation_tools': 'Simulation Tools',
'subcategory-sharing_and_discovery-samples-biological_samples': 'Biological Samples',
'subcategory-sharing_and_discovery-samples-characterisation': 'Characterisation',
'subcategory-sharing_and_discovery-samples-chemical_compounds_library': 'Chemical Compounds Library',
'subcategory-sharing_and_discovery-samples-other': 'Other',
'subcategory-sharing_and_discovery-samples-preparation': 'Preparation',
'subcategory-sharing_and_discovery-scholarly_communication-analysis': 'Analysis',
'subcategory-sharing_and_discovery-scholarly_communication-assessment': 'Assessment',
'subcategory-sharing_and_discovery-scholarly_communication-discovery': 'Discovery',
'subcategory-sharing_and_discovery-scholarly_communication-other': 'Other',
'subcategory-sharing_and_discovery-scholarly_communication-outreach': 'Outreach',
'subcategory-sharing_and_discovery-scholarly_communication-preparation': 'Preparation',
'subcategory-sharing_and_discovery-scholarly_communication-publication': 'Publication',
'subcategory-sharing_and_discovery-scholarly_communication-writing': 'Writing',
'subcategory-sharing_and_discovery-software-libraries': 'Libraries',
'subcategory-sharing_and_discovery-software-other': 'Other',
'subcategory-sharing_and_discovery-software-platform': 'Platform',
'subcategory-sharing_and_discovery-software-software_package': 'Software Package',
'subcategory-sharing_and_discovery-software-software_repository': 'Software Repository',
'subcategory-training_and_support-consultancy_and_support-application_optimisation': 'Application Optimisation',
'subcategory-training_and_support-consultancy_and_support-application_porting': 'Application Porting',
'subcategory-training_and_support-consultancy_and_support-application_scaling': 'Application Scaling',
'subcategory-training_and_support-consultancy_and_support-audit_and_assessment': 'Audit & Assessment',
'subcategory-training_and_support-consultancy_and_support-benchmarking': 'Benchmarking',
'subcategory-training_and_support-consultancy_and_support-calibration': 'Calibration',
'subcategory-training_and_support-consultancy_and_support-certification': 'Certification',
'subcategory-training_and_support-consultancy_and_support-consulting': 'Consulting',
'subcategory-training_and_support-consultancy_and_support-methodology_development': 'Methodology Development',
'subcategory-training_and_support-consultancy_and_support-modeling_and_simulation': 'Modeling & Simulation',
'subcategory-training_and_support-consultancy_and_support-other': 'Other',
'subcategory-training_and_support-consultancy_and_support-prototype_development': 'Prototype Development',
'subcategory-training_and_support-consultancy_and_support-software_development': 'Software Development',
'subcategory-training_and_support-consultancy_and_support-software_improvement': 'Software Improvement',
'subcategory-training_and_support-consultancy_and_support-technology_transfer': 'Technology Transfer',
'subcategory-training_and_support-consultancy_and_support-testing': 'Testing',
'subcategory-training_and_support-education_and_training-in_house_courses': 'In-House Courses',
'subcategory-training_and_support-education_and_training-online_courses': 'Online Courses',
'subcategory-training_and_support-education_and_training-open_registration_courses': 'Open Registration Courses',
'subcategory-training_and_support-education_and_training-other': 'Other',
'subcategory-training_and_support-education_and_training-related_training': 'Related Training',
'subcategory-training_and_support-education_and_training-required_training': 'Required Training',
'subcategory-training_and_support-education_and_training-training_platform': 'Training Platform',
'subcategory-training_and_support-education_and_training-training_tool': 'Training Tool'}, 'service-ig': {},
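# Providers: '<catalogue>.<provider>' id (observed prefixes: eosc, ni4os, eosc-nordic) -> provider display name.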
'providers': {'eosc.ess': 'European Spallation Source ERIC', 'eosc.openaire': 'OpenAIRE',
'eosc.ierek': 'International Experts for Research Enrichment and Knowledge Exchange',
'eosc.centerdata': 'Centerdata',
'ni4os.ukim_fcse': 'University Ss. Cyril and Methodius, Faculty of Computer Science and Engineering',
'ni4os.sanu': 'Serbian Academy of Sciences and Arts', 'eosc.ds-wizard': 'Data Stewardship Wizard',
'eosc.ubi': 'Ubitech', 'eosc.eosc-dih': 'EOSC DIH - Digital Innovation Hub',
'eosc.vamdc': 'Virtual Atomic and Molecular Data Centre',
'eosc.dariah_eric': 'DARIAH ERIC (Digital Research Infrastructure for the Arts and Humanities)',
'eosc-nordic.rtu': 'Riga Technical University',
'eosc.vito': 'VITO NV (Vlaamse Instelling voor Technologisch Onderzoek NV)',
'eosc.unifl': 'University of Florence, DISIT lab', 'eosc.mi': 'Mandat International',
'eosc.lida': 'Lithuanian Data Archive for Social Sciences and Humanities',
'eosc.epos': 'European Plate Observing System', 'eosc.gbif-es': 'GBIF Spain',
'eosc.materialscloud': 'Materials Cloud', 'eosc.vilnius-university': 'Vilnius University',
'eosc.vecma': 'Verified Exascale Computing for Multiscale Applications', 'eosc.hn': 'Huma-Num',
'eosc.instruct-eric': 'Instruct-ERIC',
'eosc.bbmri-eric': 'Biobanking and BioMolecular resources Research Infrastructure European Research Infrastructure Consortium',
'eosc.cut_library': 'Cracow University of Technology. The Library',
'eosc.cnrsin2p3': 'Centre National de la Recherche Scientifique',
'eosc.forschungsdaten': 'forschungsdaten.info', 'eosc.odatis': 'Pôle Odatis',
'eosc.cy-biobank': 'biobank.cy Center of Excellence in Biobanking and Biomedical Research, University of Cyprus',
'eosc.up': 'Ubiquity Press Ltd',
'eosc.ceric-eric': 'Central European Research Infrastructure Consortium',
'eosc.ccsd': 'Center for direct scientific communication',
'eosc.lnec': 'Laboratório Nacional de Engenharia Civil',
'eosc.t-systems': 'T-Systems International GmbH',
'eosc.icos_eric': 'Integrated Carbon Observation System European Research Infrastructure Consortium',
'eosc.srce': 'University of Zagreb University Computing Centre',
'eosc.crem': 'Centre de recherche Crem',
'eosc.carbonneutrallng': 'Horizon Europe Project Truly Carbon Neutral electricity enhanced Synthesis of Liquefied Natural Gas (LNG) from biomass',
'eosc.rb': 'Reportbrain Limited',
'ni4os.ibceb': 'Ivane Beritashvili Center of Experimental Biomedicine',
'eosc.ehealth_graz': 'Institute of eHealth', 'eosc.ku_leuven': 'KU Leuven',
'eosc.creatis': "Centre de Recherche en Acquisition et Traitement de l'Image pour la Santé",
'eosc.elixir-belgium': 'ELIXIR Belgium',
'eosc.earthwatch': 'Conservation Education and Research Trust',
'eosc.meeo': 'Meteorological Environmental Earth Observation', 'eosc.vib': 'VIB',
'eosc.inbelixir-es': 'INB: The Spanish National Bioinformatics Institute, the Spanish node for ELIXIR',
'eosc.iagos': 'In-service Aircraft for a Global Observing System AISBL',
'eosc-nordic.vu': 'Vilnius University',
'eosc.ifin-hh': 'Horia Hulubei National Institute for R&D in Physics and Nuclear Engineering',
'eosc.max_iv_laboratory': 'MAX IV Laboratory, Lund University',
'eosc.e-cam': 'E-CAM Centre of Excellence', 'eosc.scai': 'Fraunhofer SCAI',
'eosc.ehri': 'European Holocaust Research Infrastructure', 'eosc.rli': 'Reiner Lemoine Institute',
'eosc.expertai': 'expert.ai', 'eosc.sensing_clues': 'Sensing Clues Foundation',
'eosc.cerm-cirmmp': 'Magnetic Resonance Center of the University of Florence - CERM, Interuniversity consortium CIRMMP',
'eosc.rcisd': 'Regional Centre for Information and Scientific Development Ltd.',
'ni4os.brfaa': 'Biomedical Research Foundation, Academy of Athens',
'ni4os.ibiss': 'Institute for Biological Research Siniša Stanković, University of Belgrade',
'eosc.astron': 'NWO-I Netherlands Institute for Radio Astronomy (ASTRON)',
'eosc.bih_-_center_digital_health': 'Berlin Institute of Health at Charité Universitätsmedizin Berlin, Center of Digital Health',
'eosc.net7': 'Net7 S.r.l.', 'eosc.csuc': 'Consorci de Serveis Universitaris de Catalunya',
'eosc.iasa': 'Institute of Accelerating Systems and Applications',
'eosc.elixir-italy': 'ELIXIR Italy',
'eosc.rolos': 'Rolos Machine Intelligence Platform for academia and business with Consulting and Applications',
'eosc.readcoop': 'READ-COOP SCE mit beschränkter Haftung',
'eosc.slices': 'Scientific Large Scale Infrastructure for Computing/Communication Experimental Studies',
'eosc.emphasis': 'European Infrastructure for Plant Phenotyping',
'eosc.usv': 'Stefan cel Mare University of Suceava', 'eosc.enhancer': 'EnhanceR',
'eosc.asgc': 'Academia Sinica Grid Computing Centre', 'eosc.msw': 'MyScienceWork',
'eosc.oipub': 'Omni Iota Science Limited',
'ni4os.ichtm': 'Institute of Chemistry, Technology and Metallurgy, University of Belgrade',
'eosc.surf-nl': 'SURF', 'eosc.esrf': 'European Synchrotron Radiation Facility',
'eosc.ensam': 'Arts et Metiers Institute of Technology',
'eosc.desy': 'Deutsches Elektronen-Synchrotron',
'eosc.ifremer': 'Ifremer, the French National Institute for Ocean Science',
'eosc.inria': 'Institut national de recherche en informatique et en automatique',
'eosc.gbif_portugal': 'Portuguese Node of GBIF',
'eosc.mobile_observation_integration_service': 'DDQ B.V.',
'eosc.awi_bremerhaven': 'Alfred Wegener Institute for Polar and Marine Research in cooperation with MARUM, Center for Marine Environmental Sciences',
'eosc.tib': 'Leibniz Information Centre for Science and Technology',
'eosc.obp': 'Open Book Publishers',
'eosc.diamond_light_source': 'Diamond Light Source Ltd.',
'eosc.kit-scc': 'KIT - Scientific Computing Center',
'eosc.sites': 'Swedish Infrastructure for Ecosystem Science',
'eosc.crg': 'Centre for Genomic Regulation',
'eosc.naes_of_ukraine': 'National Academy of Educational Sciences of Ukraine',
'eosc.soleil': 'Synchrotron SOLEIL', 'eosc.eiscat': 'EISCAT Scientific Association',
'eosc.teledyne': 'Teledyne Marine', 'eosc.uni-freiburg': 'University of Freiburg',
'eosc.lago': 'Latin American Giant Observatory',
'eosc.sios': 'The Svalbard Integrated Arctic Earth Observing System',
'eosc.upc': 'Universitat Politècnica de Catalunya',
'eosc.ess_eric': 'European Social Survey, European Research Infrastructure Consortium',
'eosc.arkivum': 'Arkivum Limited', 'eosc.enermaps': 'EnerMaps',
'eosc.cineca': 'Cineca Consorzio Interuniversitario', 'eosc.bi_insight': 'BI INSIGHT S.A.',
'eosc.embl-ebi': 'European Molecular Biology Laboratory - European Bioinformatics Institute',
'eosc.ifca-csic': 'Institute of Physics of Cantabria (IFCA)',
'eosc.kue': 'Krakow University of Economics, Main Library',
'eosc.ulb-sa': 'University and State Library of Saxony Anhalt',
'eosc-nordic.llu': 'Latvia University of Life Sciences and Technologies',
'eosc.fairmat': 'Consortium FAIRmat', 'eosc.authenix': 'Secure Dimensions GmbH',
'eosc.cnr-iia': 'Institute of Atmospheric Pollution Research - National Research Council of Italy',
'eosc.blue-cloud': 'Blue-Cloud - Piloting innovative services for Marine Research & the Blue Economy',
'eosc.upekrl': 'University of Physical Education in Krakow, Library',
'eosc.oxford_e-research_centre': 'Oxford e-Research Centre, University of Oxford, UK',
'eosc.fir': 'FIR e. V. at RWTH Aachen University', 'eosc.lab1100': 'LAB1100',
'eosc.capsh': 'Committee for the Accessibility of Publications in Sciences and Humanities',
'eosc.kit': 'Karlsruhe Institute of Technology',
'eosc.ciemat-tic': 'Scientific IT Research Activities and Knowledge, ICT Division, CIEMAT',
'eosc.operas': 'OPERAS AISBL',
'ni4os.grena': 'Georgian Research and Educational Networking Association',
'eosc.riga_stradins_university': 'Riga Stradins University',
'eosc.hostkey': 'HOSTKEY B.V. - Dedicated servers in Amsterdam DC', 'eosc.ubiwhere': 'Ubiwhere',
'eosc.bsc-es': 'Barcelona Supercomputing Center - Centro Nacional de Supercomputación',
'eosc.euro-argo': 'Euro-Argo ERIC, the European contribution to Argo programme',
'eosc.cnag': 'Consorcio para la Explotación del Centro Nacional de Análisis Genómico',
'eosc.hzdr': 'Helmholtz-Zentrum Dresden-Rossendorf e.V.',
'eosc.eosc.grnet': 'National Infrastructures for Research and Technology',
'eosc.embrc-eric': 'European Marine Biological Resource Centre', 'eosc.dynaikon': 'DynAikon Limited',
'ni4os.nsl-ge': 'National Science Library at Tbilisi State University',
'eosc.ktu': 'Kaunas University of Technology', 'eosc.sj-ucp': 'Universidade Católica Portuguesa',
'eosc.gcc_umcg': 'Genomics Coordination Center, University Medical Center Groningen',
'eosc.psnc': 'Poznan Supercomputing and Networking Center',
'eosc.consorci_cee_lab_llum_sincrotro': 'CONSORCI PER A LA CONSTRUCCIÓ, EQUIPAMENT I EXPLOTACIÓ DEL LABORATORI DE LLUM SINCROTRÓ',
'eosc.ei': 'Earlham Institute', 'eosc.psi': 'Paul Scherrer Institute',
'eosc.seadatanet': 'SeaDataNet',
'eosc.uit': 'UiT The Arctic University of Norway', 'eosc.ukaea': 'UK Atomic Energy Authority',
'eosc.switch': 'SWITCH', 'eosc.bkh': 'Biodiversity Knowledge Hub',
'eosc.fzj': 'Forschungszentrum Jülich',
'eosc.grycap': 'Institute of Instrumentation for Molecular Imaging - Grid and High Performance Computing - Universitat Politècnica de València',
'eosc.infrafrontier': 'INFRAFRONTIER', 'eosc.siris_academic': 'SIRIS Academic SL',
'eosc.ill': 'Institut Laue Langevin',
'eosc.lindatclariah-cz': 'LINDAT/CLARIAH-CZ Research Infrastructure for Language Resources and Digital Arts and Humanities in the Czech Republic',
'eosc.mediprospectsai': 'MediprospectsAI ltd',
'eosc.coard': 'Collaborative Open Access Research and Development', 'eosc.elixir-europe': 'ELIXIR',
'eosc.jsc-de': 'Jülich Supercomputing Centre', 'eosc.fh_joanneum': 'FH JOANNEUM Gesellschaft mbH',
'eosc.dsmz': 'Leibniz Institute DSMZ - German Collection of Microorganisms and Cell Cultures',
'eosc.data_revenue': 'Data Revenue', 'eosc.openbiomaps': 'OpenBioMaps Consortium',
'eosc.edelweiss_connect': 'Edelweiss Connect GmbH', 'eosc.egi-fed': 'EGI Foundation',
'ni4os.ipb': 'Institute of Physics Belgrade', 'eosc.upf': 'Universitat Pompeu Fabra',
'eosc.infn': 'Italian National Institute of Nuclear Physics',
'eosc.sks': 'Scientific Knowledge Services', 'eosc.cds': 'Strasbourg astronomical Data Centre',
'eosc.geant': 'GÉANT Association',
'eosc.emso_eric': 'European Multidisciplinary Seafloor and water column Observatory',
'eosc.upv-es': 'Universitat Politècnica de València',
'eosc.csi_piemonte': 'Consorzio per il Sistema Informativo',
'eosc.bifi_-_unizar': 'Institute for Biocomputation and Physics of Complex Systems - University of Zaragoza',
'eosc.wenmr': 'A Worldwide e-Infrastructure for Structural Biology',
'eosc.bioexcel': 'BioExcel Centre of Excellence', 'eosc.ubora': 'UBORA association',
'ni4os.fcub': 'University of Belgrade - Faculty of Chemistry',
'eosc.coronis_computing_sl': 'CORONIS COMPUTING SL',
'eosc.jagiellonian_library': 'Jagiellonian University, Jagiellonian Library',
'eosc.data_centre': 'Centre for Data Analysis and Archiving',
'eosc.elettra_sincrotrone_trieste': 'Elettra Sincrotrone Trieste',
'eosc.fairdi': 'FAIR Data Infrastructure for Physics, Chemistry, Materials Science, and Astronomy',
'eosc.embimos': 'EMBIMOS (Environmental and Sustainability Participatory Information Systems)',
'eosc.mz': 'Materials Zone',
'eosc.charite_bih_brain_simulation': 'Charité University Medicine Berlin, Berlin Institute of Health, Brain Simulation Section',
'eosc.ici_bucharest': 'National Institute for Research and Development in Informatics - ICI Bucharest',
'eosc.ibiom-cnrhttpwwwibiomcnrit': 'Institute of Biomembranes, Bioenergetics and Molecular Biotechnologies, National Research Council',
'eosc.bineo': 'Bineo Consulting SL', 'eosc.uniwersytet_opolski': 'University of Opole',
'eosc.oasees': 'Open autonomous programmable cloud apps & smart sensors', 'eosc.datacite': 'DataCite',
'eosc.idea': 'IDEAconsult', 'eosc.iict': 'Institute of Information and Communication Technologies',
'eosc.unibo': 'Alma Mater Studiorum - Università di Bologna',
'eosc.iasa_of_nasu': 'Institute for Applied System Analysis of the National Academy of Sciences of Ukraine',
'eosc.cyberbotics': 'Cyberbotics',
'eosc.cite': 'Communication & Information Technologies Experts SA Consulting and Development Services',
'eosc.gesis': 'GESIS Leibniz Institute for the Social Sciences', 'eosc.unipd': 'University of Padua',
'eosc.smartsmear': 'Institute for Atmospheric and Earth System Research',
'eosc.euro-bioimaging': 'Euro-BioImaging', 'eosc.gft': 'GFT Italy',
'eosc.cc-in2p3cnrs': 'Computing Centre of the National Institute of Nuclear Physics and Particle Physics, CNRS',
'eosc.ror-org': 'Research Organization Registry',
'eosc.bijvoetcenter': 'Bijvoet Centre - Utrecht University', 'eosc.d4science': 'D4Science',
'eosc.terradue': 'Terradue', 'eosc.gbif': 'Global Biodiversity Information Facility (GBIF)',
'eosc.csc-fi': 'CSC IT CENTER FOR SCIENCE',
'eosc.cesga': 'Fundacion Centro Tecnologico de Supercomputacion de Galicia',
'eosc.ubfzf': 'University of Belgrade Faculty of Philosophy',
'eosc.cines': 'National Computing Center for Higher Education',
'eosc.uni_konstanz': 'University of Konstanz', 'eosc.cesnet': 'CESNET', 'eosc.cs_group': 'CS GROUP',
'eosc.treeofscience': 'Tree of Science', 'eosc.cscs': 'Swiss National Supercomputing Centre',
'eosc.denbi': 'de.NBI - German Network for Bioinformatics Infrastructure',
'eosc.gwdg': 'Gesellschaft für wissenschaftliche Datenverarbeitung mbH Göttingen',
'eosc.sciences_po': 'Fondation Nationale des Sciences Politiques',
'eosc.cern': 'EUROPEAN ORGANIZATION FOR NUCLEAR RESEARCH',
'eosc.unibi-ub': 'Bielefeld University Library', 'eosc.sinergise': 'Sinergise',
'eosc.plantnet': 'PlantNet consortium (hosted by Inria)', 'eosc.exoscale': 'EXOSCALE',
'eosc.cmcc': 'Euro-Mediterranean Center on Climate Change',
'eosc.taltechdata': 'Tallinn University of Technology',
'eosc.tum-net': 'Technical University of Munich, Chair of Network Architectures and Services',
'eosc.cnio': 'CNIO - Spanish National Cancer Research Centre',
'eosc.hits': 'Heidelberg Institute for Theoretical Studies',
'eosc.zpid': 'Leibniz Institute for Psychology', 'eosc.fssda': 'Finnish Social Science Data Archive',
'eosc.ugr-es': 'University of Granada UGR',
'eosc.etais': 'Estonian Scientific Computing Infrastructure',
'eosc.inoe_2000': 'National Institute for Research and Development in Optoelectronics',
'eosc.northern_data_cloud_services': 'ND CS (Services) GmbH', 'eosc.eurac': 'Eurac Research',
'eosc.europeana': 'Europeana Foundation', 'eosc.kit-lib': 'KIT - Library',
'eosc.dkrz': 'Deutsches Klimarechenzentrum GmbH',
'eosc.predictia': 'Predictia Intelligent Data Solutions SL', 'eosc.scipedia': 'SCIPEDIA',
'ni4os.rbi': 'Ruđer Bošković Institute', 'eosc.jelastic': 'Virtuozzo',
'eosc.scigne': 'The SCIGNE Platform',
'eosc.ibergrid': 'IBERGRID - Iberian Distributed Computing Infrastructure',
'eosc.openedition': 'OpenEdition', 'eosc.norce': 'NORCE Norwegian Research Centre',
'eosc.lsd-ufcg': 'Laboratório de Sistemas Distribuídos - Universidade Federal de Campina Grande',
'eosc.sethsoftware': 'Seth Software spółka z ograniczoną odpowiedzialnością',
'eosc.gsi': 'GSI Helmholtzzentrum für Schwerionenforschung GmbH',
'eosc.incd': 'Portuguese National Distributed Computing Infrastructure (INCD)',
'eosc.iisas': 'Institute of Informatics - Slovak Academy of Sciences',
'eosc.100percentit': '100 Percent IT', 'eosc.f6snl': 'F6S Network',
'eosc.trust-it': 'Trust-IT Services',
'eosc.eodc': 'Earth Observation Data Centre for Water Resources Monitoring',
'ni4os.uob-rcub': 'University of Belgrade Computer Centre',
'eosc.unige': 'University of Geneva, Department of Astronomy',
'eosc.leaena': 'National Technical University of Athens', 'eosc.doabf': 'DOAB Foundation',
'eosc.rbi': 'Ruđer Bošković Institute', 'eosc.sobigdata': 'SoBigData',
'eosc.progedo': 'PROduction et GEstion des DOnnées',
'eosc.isa-ulisboa': 'Instituto Superior de Agronomia da Universidade de Lisboa',
'eosc.openknowledgemaps': 'Open Knowledge Maps - Verein zur Förderung der Sichtbarkeit wissenschaftlichen Wissens',
'eosc.fau_evt': 'Friedrich-Alexander-University Erlangen-Nürnberg, Chair of Energy Process Engineering',
'eosc.nikhef': 'Nikhef (Stichting Nederlandse Wetenschappelijk Onderzoek Instituten)',
'eosc.charles_university': 'Charles University', 'eosc.dcc-uk': 'Digital Curation Centre',
'eosc.it4i_vsb-tuo': 'VSB Technical University of Ostrava, IT4Innovations National Supercomputing Center',
'eosc.mundi_web_services': 'Mundi Web Services',
'eosc.gdansk_tech': 'Gdańsk University of Technology',
'eosc.bg_up': 'Pedagogical University of Krakow, Main Library', 'eosc.figshare': 'Figshare',
'eosc.libnova': 'LIBNOVA SL', 'eosc.pml': 'Plymouth Marine Laboratory',
'eosc.eox': 'EOX IT Services GmbH', 'eosc.dtu': 'Technical University of Denmark',
'eosc.european_xfel': 'European X-ray Free Electron Laser Facility GmbH',
'eosc.cyfronet': 'Academic Computer Centre CYFRONET AGH',
'eosc.progressive': 'Progressive Systems Srl',
'eosc.ipsl': 'Institut Pierre-Simon Laplace',
'ni4os.grnet': 'National Infrastructures for Research and Technology',
'eosc-nordic.uot': 'University of Tartu', 'eosc.sztaki': 'INSTITUTE FOR COMPUTER SCIENCE AND CONTROL',
'eosc.cnr_-_isti': 'Institute for Information Science and Technologies "Alessandro Faedo" - ISTI',
'eosc.cbra': 'Clinical Bioinformatics Area', 'eosc.beia': 'BEIA CONSULT INTERNATIONAL',
'eosc.slu': 'Swedish University of Agricultural Sciences', 'eosc.elcogen': 'Elcogen Oy',
'eosc.enoll': 'European Network of Living Labs', 'eosc.inode': 'Intelligent Open Data Exploration',
'eosc.creaf': 'Center for Research in Ecology and Forestry Applications',
'eosc.csic': 'Consejo Superior de Investigaciones Científicas (CSIC)',
'eosc.athena': 'Athena Research and Innovation Center in Information and Communication Technologies',
'eosc.carlzeissm': 'Carl Zeiss Microscopy', 'eosc.unimib': 'University of Milano-Bicocca',
'eosc.ukri_-_stfc': 'UK Research and Innovation - Science and Technology Facilities Council',
'eosc.niod': 'NIOD Institute for War, Genocide and Holocaust Studies',
'eosc.cloudferro': 'CloudFerro',
'eosc.vliz': 'Flanders Marine Institute', 'eosc.unitartu': 'University of Tartu',
'eosc.lu': 'Lund University',
'eosc.clarin-eric': 'European Research Infrastructure for Language Resources and Technology',
'eosc.ekt': 'National Documentation Centre', 'eosc.digifarm': 'DigiFarm',
'eosc.inaf': 'Istituto Nazionale di Astrofisica',
'eosc.altec': 'Aerospace Logistics Technology Engineering Company',
'eosc.hu-cms': 'Humboldt-Universität zu Berlin - Computer- und Medienservice',
'eosc.agh_university_main_library': 'AGH University of Krakow Main Library',
'eosc.ictlc': 'ICTLC S.P.A.', 'eosc.transcript': 'transcript Independent Academic Publishing',
'eosc.elixir-uk': 'ELIXIR United Kingdom',
'eosc.acdh-ch': 'Austrian Centre for Digital Humanities and Cultural Heritage',
'eosc.tubitak_ulakbim': 'Turkish Academic Network and Information Center', 'eosc.sixsq': 'SixSq',
'eosc.fzj-inm7': 'Forschungszentrum Jülich, Institute of Neurosciences and Medicine (INM) Brain and Behavior (INM-7)',
'eosc.forth': 'Foundation for Research and Technology, Hellas (FORTH)',
'eosc.grnet': 'National Infrastructures for Research and Technology',
'eosc.prace': 'Partnership For Advanced Computing in Europe aisbl',
'eosc.umr_map': 'UMR CNRS/MC 3495 MAP', 'eosc.fris': 'Flemish Research Information Space',
'eosc.komanord': 'Koma Nord', 'eosc.unparallel': 'UNPARALLEL Innovation, Lda',
'eosc.lifewatch-eric': 'LifeWatch ERIC', 'eosc.university_of_sussex': 'The University of Sussex',
'eosc.cnb-csic': 'Centro Nacional de Biotecnologia (CSIC)', 'eosc.elsevier': 'Elsevier BV',
'eosc.eudat': 'EUDAT', 'eosc.nilu': 'The Foundation NILU',
'eosc.oslo_university': 'University of Oslo',
'eosc.uo': 'University of Oulu', 'eosc.lapp': "Laboratoire d'Annecy de Physique des Particules",
'eosc.cessda-eric': 'Consortium of European Social Science Data Archives ERIC',
'eosc.olos': 'OLOS Association', 'eosc.obsparis': 'Observatoire de Paris'}, 'guideline_type': {
'ir_eosc_guideline_type-eosc_core_interoperability_guideline': 'EOSC-Core Interoperability Guideline',
'ir_eosc_guideline_type-eosc_exchange_interoperability_guideline_thematic': 'EOSC-Exchange Interoperability Guideline (Thematic)',
'ir_eosc_guideline_type-eosc_exchange_interoperability_guideline_horizontal': 'EOSC-Exchange Interoperability Guideline (Horizontal)',
'ir_eosc_guideline_type-operational_baseline': 'Operational Baseline'},
'tr_access': {'tr_access_right-open_access': 'Open Access',
'tr_access_right-restricted_access': 'Restricted Access',
'tr_access_right-metadata_only_access': 'Metadata Only Access',
'tr_access_right-paid_access': 'Paid Access'},
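# Scientific subdomains: 'scientific_subdomain-<domain>-<subdomain>' id -> display label.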
'subdomains': {
'scientific_subdomain-agricultural_sciences-agricultural_biotechnology': 'Agricultural Biotechnology',
'scientific_subdomain-agricultural_sciences-agriculture_forestry_and_fisheries': 'Agriculture, Forestry & Fisheries',
'scientific_subdomain-agricultural_sciences-animal_and_dairy_sciences': 'Animal & Dairy Sciences',
'scientific_subdomain-agricultural_sciences-other_agricultural_sciences': 'Other Agricultural Sciences',
'scientific_subdomain-agricultural_sciences-veterinary_sciences': 'Veterinary Sciences',
'scientific_subdomain-engineering_and_technology-chemical_engineering': 'Chemical Engineering',
'scientific_subdomain-engineering_and_technology-civil_engineering': 'Civil Engineering',
'scientific_subdomain-engineering_and_technology-electrical_electronic_and_information_engineering': 'Electrical, Electronic & Information Engineering',
'scientific_subdomain-engineering_and_technology-environmental_biotechnology': 'Environmental Biotechnology',
'scientific_subdomain-engineering_and_technology-environmental_engineering': 'Environmental Engineering',
'scientific_subdomain-engineering_and_technology-industrial_biotechnology': 'Industrial Biotechnology',
'scientific_subdomain-engineering_and_technology-materials_engineering': 'Materials Engineering',
'scientific_subdomain-engineering_and_technology-mechanical_engineering': 'Mechanical Engineering',
'scientific_subdomain-engineering_and_technology-medical_engineering': 'Medical Engineering',
'scientific_subdomain-engineering_and_technology-nanotechnology': 'Nanotechnology',
'scientific_subdomain-engineering_and_technology-other_engineering_and_technology_sciences': 'Other Engineering & Technology Sciences',
'scientific_subdomain-generic-generic': 'Generic', 'scientific_subdomain-humanities-arts': 'Arts',
'scientific_subdomain-humanities-history_and_archaeology': 'History & Archaeology',
'scientific_subdomain-humanities-languages_and_literature': 'Languages & Literature',
'scientific_subdomain-humanities-other_humanities': 'Other Humanities',
'scientific_subdomain-humanities-philosophy_ethics_and_religion': 'Philosophy, Ethics & Religion',
'scientific_subdomain-medical_and_health_sciences-basic_medicine': 'Basic Medicine',
'scientific_subdomain-medical_and_health_sciences-clinical_medicine': 'Clinical Medicine',
'scientific_subdomain-medical_and_health_sciences-health_sciences': 'Health Sciences',
'scientific_subdomain-medical_and_health_sciences-medical_biotechnology': 'Medical Biotechnology',
'scientific_subdomain-medical_and_health_sciences-other_medical_sciences': 'Other Medical Sciences',
'scientific_subdomain-natural_sciences-biological_sciences': 'Biological Sciences',
'scientific_subdomain-natural_sciences-chemical_sciences': 'Chemical Sciences',
'scientific_subdomain-natural_sciences-computer_and_information_sciences': 'Computer & Information Sciences',
'scientific_subdomain-natural_sciences-earth_and_related_environmental_sciences': 'Earth & Related Environmental Sciences',
'scientific_subdomain-natural_sciences-mathematics': 'Mathematics',
'scientific_subdomain-natural_sciences-other_natural_sciences': 'Other Natural Sciences',
'scientific_subdomain-natural_sciences-physical_sciences': 'Physical Sciences',
'scientific_subdomain-other-other': 'Other',
'scientific_subdomain-social_sciences-economics_and_business': 'Economics & Business',
'scientific_subdomain-social_sciences-educational_sciences': 'Educational Sciences',
'scientific_subdomain-social_sciences-law': 'Law',
'scientific_subdomain-social_sciences-media_and_communications': 'Media & Communications',
'scientific_subdomain-social_sciences-other_social_sciences': 'Other Social Sciences',
'scientific_subdomain-social_sciences-political_sciences': 'Political Sciences',
'scientific_subdomain-social_sciences-psychology': 'Psychology',
'scientific_subdomain-social_sciences-social_and_economic_geography': 'Social & Economic Geography',
'scientific_subdomain-social_sciences-sociology': 'Sociology'},
'access_type': {'access_type-mail_in': 'Mail-In', 'access_type-other': 'Other', 'access_type-physical': 'Physical',
'access_type-remote': 'Remote', 'access_type-virtual': 'Virtual'},
'expertise_level': {'tr_expertise_level-advanced': 'Advanced', 'tr_expertise_level-intermediate': 'Intermediate',
'tr_expertise_level-beginner': 'Beginner', 'tr_expertise_level-all': 'All'},
'tr_content': {'tr_content_resource_type-animation': 'Animation', 'tr_content_resource_type-audio': 'Audio',
'tr_content_resource_type-diagram': 'Diagram', 'tr_content_resource_type-game': 'Game',
'tr_content_resource_type-image': 'Image', 'tr_content_resource_type-multimedia': 'Multimedia',
'tr_content_resource_type-poster': 'Poster', 'tr_content_resource_type-slides': 'Slides',
'tr_content_resource_type-text': 'Text', 'tr_content_resource_type-video': 'Video',
'tr_content_resource_type-website': 'Website', 'tr_content_resource_type-other': 'Other'},
'domains': {'scientific_domain-agricultural_sciences': 'Agricultural Sciences',
'scientific_domain-engineering_and_technology': 'Engineering & Technology',
'scientific_domain-generic': 'Generic', 'scientific_domain-humanities': 'Humanities',
'scientific_domain-medical_and_health_sciences': 'Medical & Health Sciences',
'scientific_domain-natural_sciences': 'Natural Sciences', 'scientific_domain-other': 'Other',
'scientific_domain-social_sciences': 'Social Sciences'},
'tr_dcmi': {'tr_dcmi_type-activity_plan': 'Activity Plan', 'tr_dcmi_type-assessment': 'Assessment',
'tr_dcmi_type-assessment_item': 'Assessment Item',
'tr_dcmi_type-educator_curriculum_guide': 'Educator Curriculum Guide',
'tr_dcmi_type-lesson_plan': 'Lesson Plan',
'tr_dcmi_type-physical_learning_resource': 'Physical Learning Resource',
'tr_dcmi_type-recorded_lesson': 'Recorded Lesson',
'tr_dcmi_type-supporting_document': 'Supporting Document', 'tr_dcmi_type-textbook': 'Textbook',
'tr_dcmi_type-unit_plan': 'Unit Plan', 'tr_dcmi_type-other': 'Other'},
'funding_program': {'funding_program-afis2020': 'Anti Fraud Information System (AFIS2020)',
'funding_program-agr': 'European Agricultural Guarantee Fund (after transfers between EAGF and EAFRD) (AGR)',
'funding_program-agrnet': 'Net transfer between EAGF and EAFRD (AGRNET)',
'funding_program-amf': 'Asylum, Migration and Integration Fund (AMF)',
'funding_program-cdf2020': 'Rights, equality and citizenship programme (CDF2020)',
'funding_program-cef': 'Connecting Europe Facility (CEF)',
'funding_program-cf': 'Cohesion Fund (CF)',
'funding_program-cf_det': 'Contribution from the Cohesion Fund to the CEF programme (CF_DET)',
'funding_program-cfsp': 'Common foreign and security policy (CFSP2020)',
'funding_program-cit2020': 'Europe for Citizens (CIT2020)',
'funding_program-compreg': 'Competitiveness (more developed regions) (COMPREG)',
'funding_program-cons': 'Consumer programme (CONS)',
'funding_program-copernicus': 'European Earth Observation Programme (COPERNICUS)',
'funding_program-cosme': 'Programme for the competitiveness of enterprises and small and medium-sized enterprises (COSME)',
'funding_program-cpm_h3': 'Union Civil Protection Mechanism — Member States (CPM_H3)',
'funding_program-cpm_h4': 'Union Civil Protection Mechanism — Outside EU (CPM_H4)',
'funding_program-crea': 'Creative Europe programme (CREA)',
'funding_program-cust2020': 'Action programme for customs in the European Union (CUST2020)',
'funding_program-dci2020': 'Development Cooperation Instrument (DCI2020)',
'funding_program-e4a': 'The Union programme for education, training, youth and sport (Erasmus+) (E4A)',
'funding_program-eafrd': 'European Agricultural Fund for Rural Development (after transfers between EAGF and EAFRD) (EAFRD)',
'funding_program-eafrd2020': 'European Agricultural Fund for Rural Development (EAFRD2020)',
'funding_program-eagf2020': 'European Agricultural Guarantee Fund (EAGF2020)',
'funding_program-ear2020': 'Emergency Aid Reserve (EAR2020)',
'funding_program-eerp': 'Energy projects to aid economic recovery (EERP)',
'funding_program-efsd': 'European Fund for Sustainable Development (EFSD)',
'funding_program-efsi': 'European Fund for Strategic Investments (EFSI)',
'funding_program-egf2020': 'European Globalisation Adjustment Fund (EGF2020)',
'funding_program-eidhr2020': 'European Instrument for Democracy and Human Rights (EIDHR2020)',
'funding_program-emff2020': 'European Maritime and Fisheries Fund (EMFF2020)',
'funding_program-eni': 'European Neighbourhood Instrument (ENI)',
'funding_program-erdf': 'European Regional Development Fund (ERDF)',
'funding_program-esc': 'European Solidarity Corps (ESC)',
'funding_program-esf': 'European Social Fund (ESF)',
'funding_program-esp2017': 'European statistical programme (ESP2017)',
'funding_program-esp2020': 'European statistical programme (ESP2020)',
'funding_program-euav': 'EU Aid Volunteers initiative (EUAV)',
'funding_program-euratom': 'Euratom research and training programme (EURATOM)',
'funding_program-eurodac2020': 'Comparison of fingerprints for the effective application of the Dublin Convention (EURODAC2020)',
'funding_program-eusf2020': 'European Union Solidarity Fund (EUSF2020)',
'funding_program-eusf_h3': 'European Union Solidarity Fund (EUSF) — Member States (EUSF_H3)',
'funding_program-eusf_h4': 'European Union Solidarity Fund (EUSF) — Countries negotiating for accession (EUSF_H4)',
'funding_program-fead': 'Fund for European Aid to the Most Deprived (FEAD)',
'funding_program-ff2020': 'Food and feed (FF2020)',
'funding_program-finser2020': 'Specific activities in the field of financial reporting and auditing (FINSER2020)',
'funding_program-fisc2020': 'Action programme for taxation in the European Union (FISC2020)',
'funding_program-gal2014': 'Implementation and exploitation of European satellite navigation systems (EGNOS and Galileo) (GAL2014)',
'funding_program-grld2020': 'EU cooperation with Greenland (GRLD2020)',
'funding_program-h2020': 'The framework programme for research and innovation (H2020)',
'funding_program-health': "Union's action in the field of health (Health programme) (HEALTH)",
'funding_program-herc3': "Programme to promote activities in the field of the protection of the European Union's financial interests (HERC3)",
'funding_program-hfr2015': 'Supplementary high flux reactor (HFR) programmes (HFR2015)',
'funding_program-huma2020': 'Humanitarian aid (HUMA2020)',
'funding_program-icfs': 'Enhancing consumers involvement in EU policy making in the field of financial services (ICFS)',
'funding_program-ies': 'Instrument for emergency support within the Union (IES)',
'funding_program-ifs2020': 'Instrument contributing to Stability and Peace (IFS2020)',
'funding_program-insc2020': 'Instrument for Nuclear Safety Cooperation (INSC2020)',
'funding_program-ipa2': 'Instrument for Pre-accession Assistance (IPA2)',
'funding_program-isa2015': 'Interoperability Solutions for European Public Administrations (ISA2015)',
'funding_program-isa2020': 'Interoperability Solutions for European public administrations, businesses and citizens (ISA2020)',
'funding_program-isf': 'Internal Security Fund (ISF)',
'funding_program-iter': 'International thermonuclear experimental reactor (ITER)',
'funding_program-just': 'Justice programme (JUST)',
'funding_program-life2020': 'Programme for the Environment and Climate Action (LIFE2020)',
'funding_program-loan2020': 'Guarantee Fund for external actions (LOAN2020)',
'funding_program-mfa': 'Macro financial assistance (MFA)',
'funding_program-nd': 'Nuclear decommissioning assistance programmes in Bulgaria, Lithuania and Slovakia (ND)',
'funding_program-other': 'Other',
'funding_program-outreg': 'Outermost and sparsely populated regions (OUTREG)',
'funding_program-peri2020': 'Exchange, assistance and training programme for the protection of the euro against counterfeiting (PERI2020)',
'funding_program-pi': 'Partnership instrument for cooperation with third countries (PI)',
'funding_program-psci': 'European Union programme for employment and social innovation (PSCI)',
'funding_program-regconv': 'Regional convergence (REGCONV)',
'funding_program-rfmos': 'Compulsory contributions to regional fisheries management organisations (RFMOs) and to other international organisations',
'funding_program-sfpas': 'Sustainable Fisheries Partnership Agreements (SFPAs)',
'funding_program-sis2020': 'Schengen Information System (SIS2020)',
'funding_program-ta_ia': 'Technical assistance and innovative actions (TA_IA)',
'funding_program-tcc': 'Instrument of financial support for encouraging the economic development of the Turkish Cypriot community (TCC)',
'funding_program-terrcoop': 'European territorial cooperation (TERRCOOP)',
'funding_program-transreg': 'Transition regions (TRANSREG)',
'funding_program-vis2020': 'Visa Information System (VIS2020)',
'funding_program-yei': 'Youth employment initiative (specific top-up allocation) (YEI)',
'funding_program-lripmeys': 'Large Research Infrastructures Programme of the MEYS, Czech Republic',
'funding_program-ddoict': 'Development, deployment and operation of ICT-based e-infrastructures',
'funding_program-nucleu': 'NUCLEU Programme (Romania)',
'funding_program-driltah': 'LINDAT/CLARIAH-CZ Digital Research Infrastructure for the Language Technologies, Arts and Humanities (LM2018101)',
'funding_program-esaeoep': 'ESA EO Exploitation Platforms initiative'},
'order_type': {'order_type-fully_open_access': 'Fully Open Access', 'order_type-open_access': 'Open Access',
'order_type-order_required': 'Order Required', 'order_type-other': 'Other'}, 'related_resource': {},
'related_resources': {}}

airflow/dags/dag_utils.py Normal file
View File

@ -0,0 +1,23 @@
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
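# Build an OpenSearch client from the Airflow connection named in the DAG params.
# Certificate verification is disabled, so this assumes a trusted or self-signed cluster.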
def get_opensearch_client(kwargs) -> OpenSearch:
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
return OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
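# Resolve the target bucket: use the DAG param when set, otherwise fall back to the
# default bucket configured in the S3 connection's extra args.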
def get_bucket_name(context: dict, hook: S3Hook, param_name: str):
bucket_name = context["params"][param_name]
if not bucket_name:
bucket_name = hook.extra_args['bucket_name']
return bucket_name

View File

@ -0,0 +1,43 @@
import os
from datetime import timedelta
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
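# note: the default values below are human-readable placeholders documenting each param;
# real values are expected to be supplied when the DAG is triggered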
"url": "File to download",
"dst_key": "key containing the file",
"dst_bucket": "bucket that will contain file"
},
tags=["s3"],
)
def download_to_s3():
@task
def download(**context):
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
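# Stream the HTTP body straight into S3 (r.raw) so the file is never fully buffered in memory.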
with requests.get(context["params"]["url"], stream=True) as r:
r.raise_for_status()
hook.load_file_obj(r.raw, context["params"]["dst_key"], bucket_name=context["params"]["dst_bucket"], replace=True, encrypt=False)
download()
download_to_s3()

View File

@ -0,0 +1,218 @@
from __future__ import annotations
import os
from datetime import timedelta
import opensearchpy
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.utils.helpers import chain
from opensearchpy import OpenSearch, helpers
from catalogue.RawCatalogOpensearch import RawCatalogOpensearch
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
dag_id="import_Catalogue",
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"SHARDS": 3,
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
tags=["lot1"]
)
def import_catalogue_entities():
@task
def create_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
for entity in RawCatalogOpensearch.entities:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if client.indices.exists(indexname):
client.indices.delete(indexname)
@task
def harvest_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
session = requests.session()
for entity in RawCatalogOpensearch.entities:
indexname = catalog.get_index(entity)
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
callurl = f"{baseurl}/{entity}"
params = {"draft": "false", "active": "true", "suspended": "false"}
if client.indices.exists(indexname):
client.indices.delete(indexname)
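# Page through the registry API: each page is bulk-indexed, and the 'from'/'to'/'total'
# fields of the reply drive the pagination below.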
while True:
reply = session.get(url=callurl, params=params)
reply.raise_for_status()
content = reply.json()
if 'results' not in content:
break
results = content['results']
if len(results) <= 0:
break
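# Generator yielding one index action per catalogue record, consumed by parallel_bulk.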
def streamed_results():
for r in results:
yield {"_index": indexname, "_id": r['id'], "_source": r}
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
# end of stream conditions
if content['to'] >= content['total']:
break
params['from'] = content['to']
client.indices.refresh(indexname)
@task
def map_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
for entity in RawCatalogOpensearch.mapped_entities:
mapped_index = catalog.get_mapped_index(entity)
if client.indices.exists(mapped_index):
client.indices.delete(mapped_index)
def streamed_results():
for hit in opensearchpy.helpers.scan(client,
index=catalog.get_index(entity),
query={"query": {"match_all": {}}}):
r = hit['_source']
doc = None
match entity:
case "interoperability-records":
doc = catalog.map_interoperability(r)
case "training-resources":
doc = catalog.map_training(r)
case "services":
doc = catalog.map_service(r)
yield {"_index": mapped_index, "_id": doc['id'], "_source": doc}
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
print(f"Entity: {entity} succes: {success} error: {failed}")
@task
def close_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
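# After loading: refresh each index, restore replicas and a periodic refresh interval,
# then atomically repoint the aliases to the newly suffixed indexes.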
def refresh_index(index_name):
if index_name is not None:
client.indices.refresh(index_name)
client.indices.put_settings(index=index_name, body={
"index": {
"number_of_replicas": 1,
"refresh_interval": "60s",
}
})
def update_aliases(index_name, alias_name):
if index_name is not None and alias_name is not None:
client.indices.update_aliases(
body={"actions": [
{"remove": {"index": f"{alias_name}_*", "alias": alias_name}},
{"add": {"index": index_name, "alias": alias_name}},
]}
)
for entity in RawCatalogOpensearch.entities:
refresh_index(catalog.get_index(entity))
refresh_index(catalog.get_mapped_index(entity))
update_aliases(catalog.get_index(entity), catalog.get_alias(entity))
update_aliases(catalog.get_mapped_index(entity), catalog.get_mapped_alias(entity))
# update "allresources" alias with mapped indices
actions = []
for entity in RawCatalogOpensearch.mapped_entities:
index_name = catalog.get_mapped_index(entity)
entity_alias = catalog.get_mapped_alias(entity)
actions.append({"remove": {"index": f"{entity_alias}_*", "alias": "allresources"}})
actions.append({"add": {"index": index_name, "alias": "allresources"}})
if len(actions) > 0:
client.indices.update_aliases(
body={"actions": actions}
)
chain(
create_indexes.override(task_id="create_indexes")(),
harvest_indexes.override(task_id="harvest_indexes")(),
map_indexes.override(task_id="map_indexes")(),
close_indexes.override(task_id="close_indexes")()
)
import_catalogue_entities()

View File

@ -0,0 +1,317 @@
from __future__ import annotations
import codecs
import gzip
import io
import json
import logging
import os
from datetime import timedelta
from airflow.exceptions import AirflowException
from kubernetes.client import models as k8s
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers
from EOSC_indexes import mappings
from EOSC_entity_trasform import filter_entities, transform_entities
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
configs = {
"all": {"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues", "interoperability", "services", "training"]},
"skg-if": {"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues"]},
"catalogue": {"ENTITIES": ["interoperability", "services", "training"]},
}
for config_name, config in configs.items():
dag_id = f"import_EOSC_{config_name}"
@dag(
dag_id=dag_id,
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"S3_CONN_ID": "s3_conn",
"OPENSEARCH_CONN_ID": "opensearch_default",
"KEY_PREFIX": "/",
"EOSC_CATALOG_BUCKET": "eosc-portal-import",
"BATCH_LOADERS_NUM": 10,
"ENTITIES": config["ENTITIES"],
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
tags=["lot1"]
)
def import_EOSC_entities():
@task
def create_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
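# Cluster-level tuning for the bulk load: prefer primary shards when balancing
# and enable segment-replication backpressure.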
client.cluster.put_settings(body={
"persistent": {
"cluster.routing.allocation.balance.prefer_primary": True,
"segrep.pressure.enabled": True
}
})
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if client.indices.exists(indexname):
client.indices.delete(indexname)
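# Write-optimized index settings for a one-shot bulk load: no replicas, refresh disabled,
# zstd codec and segment replication.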
client.indices.create(indexname, {
"settings": {
"index": {
"number_of_shards": 40,
"number_of_replicas": 0,
"refresh_interval": -1,
"translog.flush_threshold_size": "2048MB",
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
}
},
"mappings": mappings[entity]
})
def compute_batches(ds=None, **kwargs):
hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
pieces = []
for entity in kwargs["params"]["ENTITIES"]:
s3_path = os.path.normpath(kwargs["params"]["KEY_PREFIX"] + "/" + entity + "/")
keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=s3_path)
to_delete = list(filter(lambda key: key.endswith('.PROCESSED'), keys))
for obj in to_delete:
hook.get_conn().delete_object(Bucket=kwargs["params"]["EOSC_CATALOG_BUCKET"], Key=obj)
for key in keys:
if key.endswith(('.json.gz', '.json')):
pieces.append((entity, key))
def split_list(list_a, chunk_size):
for i in range(0, len(list_a), chunk_size):
yield {"files": list_a[i:i + chunk_size]}
if len(pieces) <= 0:
print("Nothing found in: " + kwargs["params"]["KEY_PREFIX"])
return list()
# chunk size chosen so the work splits into roughly BATCH_LOADERS_NUM parallel batches
batch_size = len(pieces)//kwargs["params"]["BATCH_LOADERS_NUM"]
if batch_size > 0:
return list(split_list(pieces, batch_size))
return list(split_list(pieces, len(pieces)))
@task(executor_config={
"pod_override": k8s.V1Pod(
spec=k8s.V1PodSpec(
containers=[
k8s.V1Container(
name="base",
resources=k8s.V1ResourceRequirements(
requests={
"cpu": "550m",
"memory": "256Mi"
}
)
)
]
)
)
})
def bulk_load(files: list[(str, str)], **kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180,
request_timeout=5*60
)
hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
retries = 0
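# Load each (entity, key) pair, retrying failed files up to 5 times; a zero-byte
# <key>.PROCESSED marker records completed keys so re-runs skip them.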
while len(files) > 0 and retries < 5:
retries += 1
retry_files = []
for (entity, key) in files:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
if hook.check_for_key(key=f"{key}.PROCESSED", bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"]):
print(f'Skipping {entity}: {key}')
continue
print(f'Processing {indexname}: {key}')
s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
with gzip.GzipFile(fileobj=s3_obj.get()["Body"], mode='rb') if key.endswith(".gz") else codecs.getreader('utf-8')(s3_obj.get()["Body"]) as s3file:
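# Each JSON line becomes an update-with-upsert action, making repeated loads of the same file idempotent.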
def _generate_data():
for line in s3file:
data: dict = json.loads(line)
if entity in transform_entities:
data = transform_entities[entity](data)
if entity in filter_entities:
if filter_entities[entity](data):
print(data["local_identifier"] + " does not meet inclusion policies")
continue
index = {"update": {"_index": indexname, "_id": data.pop("_id")}}
yield index, {"doc": data, "doc_as_upsert": True}
# disable success post logging
logging.getLogger("opensearch").setLevel(logging.WARN)
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
expand_action_callback=lambda arg: arg,
raise_on_exception=False,
raise_on_error=False,
chunk_size=5000,
max_chunk_bytes=50 * 1024 * 1024,
timeout=5*60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
print(f"Bulk report: inserted {succeeded} items, {failed} failures, {retries} tentative")
if failed > 0:
retry_files.append((entity, key))
else:
hook.load_string(
"",
f"{key}.PROCESSED",
bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"],
replace=False
)
files = retry_files # retry files with errors
# Fail the task if any files remain unprocessed after all retries
if len(files) > 0:
raise AirflowException("ERROR could not import all items from: " + str(files))
@task
def merge_curation_db(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
if "products" in kwargs["params"]["ENTITIES"]:
products_index = f'products_{kwargs["params"]["SUFFIX"]}'
curationdb_index = 'curation'
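# Merge curation decisions into the freshly loaded products index by reindexing
# the 'status' field from the curation index.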
if client.indices.exists(curationdb_index):
client.reindex(body={
"source": {
"index": curationdb_index,
"_source": ["status"]
},
"dest": {
"index": products_index
}
},
refresh=False,
requests_per_second=-1,
scroll="4h",
slices="auto",
timeout=60*60*4,
wait_for_completion=True)
@task
def delete_missing_curated(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
if "products" in kwargs["params"]["ENTITIES"]:
products_index = f'products_{kwargs["params"]["SUFFIX"]}'
client.indices.refresh(products_index)
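# Remove curation-only stubs: docs created by the reindex step with no matching
# product lack the 'local_identifier' field.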
client.delete_by_query(index=products_index,
body={"query": {"bool": {"must_not": {"exists": {"field": "local_identifier"}}}}},
refresh=True
)
@task
def close_indexes(**kwargs):
conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
client = OpenSearch(
hosts=[{'host': conn.host, 'port': conn.port}],
http_auth=(conn.login, conn.password),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
timeout=180
)
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.refresh(indexname)
# update aliases
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
client.indices.update_aliases(
body={"actions": [
{"remove": {"index": f"{entity}_*", "alias": entity}},
{"add": {"index": indexname, "alias": entity}},
]}
)
# update "allresources" alias
actions = []
for entity in kwargs["params"]["ENTITIES"]:
if entity in ['products', 'services', 'training', 'interoperability']:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
actions.append({"add": {"index": indexname, "alias": "allresources"}})
if len(actions) > 0:
client.indices.update_aliases(
body={"actions": actions}
)
parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)
chain(
create_indexes.override(task_id="create_indexes")(),
merge_curation_db.override(task_id="merge_curation_db")(),
parallel_batches,
bulk_load.expand_kwargs(parallel_batches.output),
delete_missing_curated.override(task_id="delete_missing_curated_recs")(),
close_indexes.override(task_id="close_indexes")()
)
import_EOSC_entities()

View File

@ -0,0 +1,67 @@
import requests
def init_ams(endpoint: str, project: str, token: str, reset: bool):
session = requests.session()
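# Idempotent (re)initialisation of AMS topics and subscriptions: deletes tolerate
# missing resources, creates tolerate 409 conflicts, so re-runs are safe.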
def delete_topic(topic):
print(f"Deleting projects/{project}/topics/{topic}", flush=True)
reply = session.delete(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/topics/{topic}"
)
if not (200 <= reply.status_code < 500 or reply.status_code == 504):
reply.raise_for_status()
def delete_subscription(subscription):
print(f"Deleting projects/{project}/subscriptions/{subscription}", flush=True)
reply = session.delete(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/subscriptions/{subscription}"
)
if not (200 <= reply.status_code < 500 or reply.status_code == 504):
reply.raise_for_status()
def create_topic(topic):
print(f"Creating projects/{project}/topics/{topic}", flush=True)
reply = session.put(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/topics/{topic}",
json={
"maxMessages": "1",
"returnImmediately": "false"
}
)
if not (200 <= reply.status_code < 300 or reply.status_code == 409 or reply.status_code == 504):
reply.raise_for_status()
def create_subscription(topic, subscription):
print(f"Creating projects/{project}/subscriptions/{subscription}", flush=True)
reply = session.put(
headers={"x-api-key": token},
url=f"https://{endpoint}/v1/projects/{project}/subscriptions/{subscription}",
json={
"topic": f"projects/{project}/topics/{topic}",
"ackDeadlineSeconds": 600
}
)
if not (200 <= reply.status_code < 300 or reply.status_code == 409):
reply.raise_for_status()
subscriptions = {
'curation_requests': ['curation_requests_debug', 'curation_requests_dispatcher'],
'curation_replies': ['curation_replies_rest_debug', 'curation_replies_rest'],
'curation_spam_candidates': ['curation_spam_candidates_debug', 'curation_spam_candidates_dispatcher'],
'graph_requests': ['graph_requests_debug', 'graph_requests_indexer']
}
for topic in ['curation_requests', 'curation_replies', 'curation_spam_candidates', 'graph_requests']:
if reset:
for sub in subscriptions[topic]:
delete_subscription(sub)
delete_topic(topic)
create_topic(topic)
for sub in subscriptions[topic]:
create_subscription(topic, sub)

File diff suppressed because it is too large

View File

@ -0,0 +1,141 @@
import os
import time
from datetime import timedelta
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
def delete_pending_multipart_uploads(s3_client, bucket, key):
multipart_uploads = s3_client.list_multipart_uploads(Bucket=bucket)
if 'Uploads' in multipart_uploads:
for upload in multipart_uploads['Uploads']:
if upload['Key'] == key:
upload_id = upload['UploadId']
s3_client.abort_multipart_upload(
Bucket=bucket,
Key=key,
UploadId=upload_id
)
print(f"Aborted multipart upload {upload_id} for key {key}")
else:
print("No pending multipart uploads found")
def download_uri(session: requests.Session, url: str, s3_client, bucket, key, max_retries: int = 10):
parts = []
total_size = 0
current_size = 0
part_number = 1
chunk_size = 0
response = s3_client.create_multipart_upload(Bucket=bucket,
Key=key)
upload_id = response['UploadId']
tries = 0
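# Resumable download: on error, re-request from byte offset current_size via a
# Range header and keep appending multipart parts where the stream left off.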
while tries < max_retries:
try:
with session.get(url,
headers={'Range': 'bytes=%d-' % current_size},
stream=True) as r:
if total_size == 0:
total_size = int(r.headers['Content-length'])
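# S3 multipart uploads allow at most 10,000 parts, so size chunks to stay
# under that limit (minimum 15 MiB per part)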
chunk_size = max(total_size // (10000 - 1), 15 * 1024 * 1024)
for chunk in r.iter_content(chunk_size=chunk_size):
if chunk:
response = s3_client.upload_part(
Body=chunk,
Bucket=bucket,
Key=key,
PartNumber=part_number,
UploadId=upload_id
)
parts.append({'PartNumber': part_number, 'ETag': response['ETag']})
current_size += len(chunk)
print(f"Read {current_size} of {total_size} part no {part_number}")
part_number += 1
tries = 0
break # exit the retry loop once the stream has been fully consumed
except Exception as e:
tries += 1
if tries < max_retries:
print(e)
print("Resume in 60 seconds...")
time.sleep(60)
continue
else:
print(f"ABORT: failed after {max_retries} tentatives")
s3_client.abort_multipart_upload(
Bucket=bucket,
Key=key,
UploadId=upload_id
)
raise
s3_client.complete_multipart_upload(
Bucket=bucket,
Key=key,
UploadId=upload_id,
MultipartUpload={'Parts': parts}
)
@dag(
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"file": "File to download",
"dst_bucket": "bucket that will contain file",
"max_retries": 10
},
tags=["s3"],
)
def openaire_to_s3():
@task
def download(**context):
http_conn = BaseHook.get_connection("openaire_default")
max_retries = context["params"]["max_retries"]
url = "https://" + http_conn.host + "/data/graph/" + context["params"]["file"]
bucket_name = context["params"]["dst_bucket"]
s3_key = "/data/graph/" + context["params"]["file"]
session = requests.Session()
session.headers['Connection'] = 'close'
session.auth = (http_conn.login, http_conn.password)
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
# Cleanup file and pending uploads
delete_pending_multipart_uploads(s3_client=hook.get_conn(), bucket=bucket_name, key=s3_key)
hook.delete_objects(bucket=bucket_name,
keys=[s3_key])
download_uri(session=session,
url=url,
s3_client=hook.get_conn(),
bucket=bucket_name,
key=s3_key,
max_retries=max_retries)
download()
openaire_to_s3()

View File

@ -0,0 +1,80 @@
import json
from datetime import timedelta
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import get_current_context
from dag_utils import get_opensearch_client
# Define default arguments
default_args = {
'owner': 'airflow',
'depends_on_past': False,
'email_on_failure': False,
'email_on_retry': False,
'retries': 1,
'retry_delay': timedelta(minutes=5),
}
managed_indexes = {'catalog_datasources', 'catalog_interoperability-records', 'catalog_providers',
'catalog_resource-interoperability-records', 'catalog_services', 'catalog_training-resources',
'datasource', 'grants', 'interoperability',
'organizations', 'persons', 'products',
'services', 'topics', 'training', 'venues'
}
@dag(
dag_id="remove_old_indexes",
# dag_display_name="Remove outdated MKG indexes",
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
schedule=None,
catchup=False,
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
},
tags=["opensearch", "maintenance"],
)
def remove_old_indexes():
@task
def remove_indexes():
context = get_current_context()
client = get_opensearch_client(context)
indexes = client.cat.indices(format="json")
aliases = client.cat.aliases(format="json")
print(json.dumps(aliases))
print(json.dumps(indexes))
# indexes referred by aliases
alias_index_names = {alias['index'] for alias in aliases}
# indexes ordered by timestamp
index_dict = {}
for index in indexes:
index_name = index['index']
if '_' in index_name:
base_name = '_'.join(index_name.split('_')[:-1])
timestamp = index_name.split('_')[-1]
if not (base_name in managed_indexes and timestamp.isdigit()):
continue
if base_name not in index_dict:
index_dict[base_name] = []
index_dict[base_name].append((index_name, timestamp))
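# Keep only the newest index per base name; older ones are removed below
# unless an alias still points at them.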
for base_name, index_list in index_dict.items():
index_list.sort(key=lambda x: x[1], reverse=True)
most_recent_index = index_list[0][0]
for index_name, timestamp in index_list:
if index_name != most_recent_index and index_name not in alias_index_names:
client.indices.delete(index_name)
print(f'Deleted index: {index_name}')
remove_indexes()
remove_old_indexes()

airflow/dags/test_dag.py Normal file
View File

@ -0,0 +1,111 @@
from __future__ import annotations
import os
from datetime import timedelta
import pendulum
from airflow.decorators import dag, task_group
from airflow.decorators import task
from airflow.exceptions import AirflowSkipException
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import get_current_context
from airflow.utils.helpers import chain
from kubernetes.client import models as k8s
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
default_args = {
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}
@dag(
dag_id="test_s3_openaire_dump",
# dag_display_name="(Test) Import OpenAIRE entities from S3",
schedule=None,
dagrun_timeout=None,
start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
catchup=False,
default_args=default_args,
params={
"S3_CONN_ID": "s3_conn",
"OPENSEARCH_CONN_ID": "opensearch_default",
"KEY_PREFIX": "/",
"S3_BUCKET": "kg-1",
"BATCH_LOADERS_NUM": 10,
"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues"],
"SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
},
tags=["openaire", "lot1", "mkg"]
)
def import_s3_openaire_dump():
@task
def create_indexes():
kwargs = get_current_context()
print(kwargs["params"]["ENTITIES"])
@task_group
def load_and_map_entity(entity: str):
@task(trigger_rule="none_failed")
def compute_batches():
nonlocal entity
kwargs = get_current_context()
if entity not in kwargs["params"]["ENTITIES"]:
raise AirflowSkipException(f"Skipping {entity}")
return [[(entity, '1'), (entity, '2')], [], []]
@task(executor_config={
"pod_override": k8s.V1Pod(
spec=k8s.V1PodSpec(
containers=[
k8s.V1Container(
name="base",
resources=k8s.V1ResourceRequirements(
requests={
"cpu": "550m",
"memory": "256Mi"
}
)
)
]
)
)
})
def parallel_load(files: list[(str, str)], **kwargs):
kwargs = get_current_context()
print(files)
parallel_load.expand(files=compute_batches())
@task(trigger_rule="none_failed")
def merge_curation_db(**kwargs):
pass
@task(trigger_rule="none_failed")
def delete_missing_curated(**kwargs):
pass
@task(trigger_rule="none_failed")
def close_indexes(**kwargs):
pass
chain(
create_indexes(),
# todo get checkpoint
merge_curation_db(),
load_and_map_entity("datasource"),
load_and_map_entity("grants"),
load_and_map_entity("organizations"),
load_and_map_entity("persons"),
load_and_map_entity("products"),
load_and_map_entity("topics"),
load_and_map_entity("venues"),
delete_missing_curated(),
close_indexes()
# todo ask resync
)
import_s3_openaire_dump()

View File

@ -0,0 +1,12 @@
#!/usr/bin/env -S docker build . --tag=gbloisi/curation:1.0.0 --platform linux/amd64 --push --network=host --file
FROM python:3.12-slim-bullseye
COPY requirements.txt /
RUN python -m pip install --upgrade -r /requirements.txt
COPY antispam-batch.py blacklist.txt curation-rest.py /
# Run the server
CMD python3 /curation-rest.py

View File

@ -0,0 +1,255 @@
import json
import sys
import traceback
from typing import Any, Dict, List, Optional
from jsonargparse import ArgumentParser
from openai import AsyncOpenAI
import asyncio
import enum
import instructor
from pydantic import BaseModel, Field, SecretStr
from datetime import datetime
from opensearchpy import OpenSearch, helpers, AsyncOpenSearch
class Topics(str, enum.Enum):
"""Correctly assign one of the predefined topic to the content"""
SPAM = "SPAM, advertisement, promotional"
SALES = "direct sales of goods or services"
EXPLICIT_CONTENT = "porn, violence or Harmful content"
RESEARCH = "description of a scientific research"
DATASET = "description of a scientific dataset"
OBJECT = "scientific description of an object"
BIBLIOGRAPHIC = "bibliographic record"
NA = "not available"
class ProductInfo(BaseModel):
"""
Your task is to identify SPAM content among research product descriptions.
"""
language: str = Field(description="The language of the content")
topic: Topics
reason: str = Field(description="explain why the topic was chosen")
spam_words: list[str] = Field(description="content's spam words", min_length=0, max_length=3)
main_model_schema = ProductInfo.model_json_schema()
response_schema = json.dumps(main_model_schema, indent=None)
parser = ArgumentParser(env_prefix="CURATION", default_env=True)
parser.add_argument("--opensearch.host", default='opensearch-cluster.local-dataplatform')
parser.add_argument("--opensearch.port", default=443, type=int)
parser.add_argument("--opensearch.user", default="admin", type=SecretStr)
parser.add_argument("--opensearch.password", default="admin", type=SecretStr)
parser.add_argument("--openai.host", default='localhost')
parser.add_argument("--openai.port", default=8000, type=int)
parser.add_argument("--openai.api_key", default='api_key')
parser.add_argument("--parallelism", default=36, type=int)
cfg = parser.parse_args()
with open("/blacklist.txt", "r") as text_file:
blacklist = [line.rstrip().lower() for line in text_file.readlines()]
client = AsyncOpenSearch(
hosts=[{'host': cfg.get("opensearch.host"), 'port': cfg.get("opensearch.port")}],
http_auth=(cfg.get("opensearch.user").get_secret_value(), cfg.get("opensearch.password").get_secret_value()),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20
)
oai = instructor.patch(AsyncOpenAI(base_url="http://" + cfg.get("openai.host") + ":" + str(cfg.get("openai.port")) + "/v1",
api_key=cfg.get("openai.api_key"),
timeout=2400.0*6.0),
mode=instructor.Mode.JSON_SCHEMA)
def source_txt_value(data: Dict[str, Any], labels: List[str]) -> Optional[Any]:
if len(labels) <= 0:
return None
current_value = data['_source']
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
if current_value is None:
return None
if isinstance(current_value, list):
if len(current_value) > 0:
return current_value[0]
else:
return None
return str(current_value)
async def eval_spam_candidate(hit: dict) -> ProductInfo:
response = await oai.chat.completions.create(
model="suzume-multilingual",
response_model=ProductInfo,
messages=[
{
"role": "user",
"content": hit['title']
}
],
extra_body={
"cache_prompt": True,
"json_schema": response_schema
},
temperature=0.0,
max_retries=5,
stream=False
)
return response.model_dump()
async def evaluate_hit(hit: dict):
obj = await eval_spam_candidate(hit)
if obj['topic'] in [Topics.SPAM, Topics.EXPLICIT_CONTENT, Topics.SALES]:
print("SPAM detected: " + hit['local_identifier'], flush=True)
print("AI Reponse:" + str(obj) + " for: " + hit['title'], flush=True)
obj['local_identifier'] = hit['local_identifier']
obj['trigger_word'] = hit['found']
obj['abstract'] = hit['title']
obj['timestamp'] = datetime.now().isoformat()
await client.index(
index='spam',
body=obj,
id=hit['local_identifier'],
refresh=True
)
return obj
async def get_potential_spam() -> Any:
count = 0
resume_from = 0
async for hit in helpers.async_scan(client, index="products", query={"query": {"match_all": {}}}, scroll='1d'):
count = count + 1
if count < resume_from:
continue
local_identifier = source_txt_value(hit, ["local_identifier"])
print(f"{count}:\t{local_identifier}")
title = source_txt_value(hit, ["titles", "none"])
description = source_txt_value(hit, ['abstracts', 'none'])
if title is None:
if description is None:
print("No description! {local_identifier}", flush=True)
continue
title = ""
if description is not None:
title = title + " " + description
utf8_title = title.encode('utf-8')
if len(utf8_title) > 2048:
title = utf8_title[0:2048].decode('utf-8', 'ignore')
test_string = title.lower()
split_string = test_string.split()
found = None
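# Matching rule: phrases and single characters match as substrings,
# ordinary words must match a whole whitespace-separated token.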
for badword in blacklist:
if badword in test_string:
if len(badword) == 1 or ' ' in badword or badword in split_string:
found = badword
break
if found is None:
continue
if await client.exists(index="spam", id=local_identifier):
print("cached")
continue
yield {"local_identifier": local_identifier, "title": title, "found": found}
async def worker(name, queue):
try:
while True:
# Get a "work item" out of the queue.
hit = await queue.get()
# Sleep for the "sleep_for" seconds.
await evaluate_hit(hit)
# Notify the queue that the "work item" has been processed.
queue.task_done()
except Exception as e:
print(traceback.format_exc())
sys.exit(-1)
async def main():
#if await client.indices.exists("spam"):
# await client.indices.delete("spam")
if not await client.indices.exists("spam"):
await client.indices.create("spam", {
"settings": {
"index": {
"number_of_shards": 3,
"number_of_replicas": 0,
"replication.type": "SEGMENT"
}
},
"mappings": {
"properties": {
"local_identifier": {
"type": "keyword"
},
"language": {
"type": "keyword"
},
"topic": {
"type": "keyword"
},
"abstract": {
"type": "text",
"index": False,
},
"reason": {
"type": "text",
"index": False,
},
"spam_words": {
"type": "keyword"
},
"trigger_word": {
"type": "keyword"
},
"timestamp": {
"type": "date",
"format": "date_hour_minute_second_fraction"
}
}
}
})
parallelism = cfg.get("parallelism")
queue = asyncio.Queue(parallelism)
tasks = []
for i in range(parallelism):
task = asyncio.create_task(worker(f'worker-{i}', queue))
tasks.append(task)
async for hit in get_potential_spam():
await queue.put(hit)
await queue.join()
# Cancel our worker tasks.
for task in tasks:
task.cancel()
# Wait until all worker tasks are cancelled.
await asyncio.gather(*tasks, return_exceptions=True)
if __name__ == "__main__":
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
loop.run_until_complete(main())
loop.close()

File diff suppressed because it is too large

View File

@ -0,0 +1,298 @@
from datetime import datetime
from enum import Enum
from flask_openapi3 import Info, Tag
from flask_openapi3 import OpenAPI
from jsonargparse import ArgumentParser
from opensearchpy import OpenSearch, NotFoundError, helpers
from pydantic import BaseModel, SecretStr
import logging
parser = ArgumentParser(env_prefix="CURATION", default_env=True)
parser.add_argument("--opensearch.host", default='opensearch-cluster.local-dataplatform')
parser.add_argument("--opensearch.port", default=443, type=int)
parser.add_argument("--opensearch.user", default="admin", type=SecretStr)
parser.add_argument("--opensearch.password", default="admin", type=SecretStr)
parser.add_argument("--debug", default=False, type=bool)
cfg = parser.parse_args()
print(cfg.as_dict())
client = OpenSearch(
hosts=[{'host': cfg.get("opensearch.host"), 'port': cfg.get("opensearch.port")}],
http_auth=(cfg.get("opensearch.user").get_secret_value(), cfg.get("opensearch.password").get_secret_value()),
use_ssl=True,
verify_certs=False,
ssl_show_warn=False,
pool_maxsize=20,
)
# if client.indices.exists("curation"):
# client.indices.delete("curation")
if not client.indices.exists("curation"):
client.indices.create("curation", {
"settings": {
"index": {
"number_of_shards": 10,
"number_of_replicas": 0,
"codec": "zstd_no_dict",
"replication.type": "SEGMENT"
},
},
"mappings": {
"dynamic": "strict",
"properties": {
"local_identifier": {
"type": "keyword"
},
"timestamp": {
"type": "date",
"format": "date_hour_minute_second_fraction"
},
"creator": {
"type": "keyword"
},
"status": {
"type": "keyword"
},
"note": {
"index": False,
"type": "text"
},
"log": {
"type": "object",
"properties": {
"timestamp": {
"format": "date_hour_minute_second_fraction",
"type": "date"
},
"creator": {
"type": "keyword"
},
"status": {
"index": False,
"type": "keyword"
},
"note": {
"index": False,
"type": "text"
},
}
}
}
}
})
info = Info(title="Curator API", version="1.0.0")
app = OpenAPI(__name__, info=info)
curation_tag = Tag(name="curation", description="Curator API")
class CurationStatus(str, Enum):
valid = "valid"
withdrawn = "withdrawn"
alert = "alert"
restore = "restore"
reset = "reset"
class CurationRequest(BaseModel):
local_identifier: str
creator: str
status: CurationStatus
note: str
class LogEntry(BaseModel):
timestamp: str
creator: str
status: CurationStatus
note: str
class CurationResponse(BaseModel):
local_identifier: str
timestamp: str
creator: str
status: CurationStatus
note: str
log: list[LogEntry]
@app.route('/health')
def health_check():
if all_required_services_are_running():
return 'OK', 200
else:
return 'Service Unavailable', 500
def all_required_services_are_running():
os_health = client.cluster.health()
return os_health['status'] in ['green', 'yellow'] and os_health['number_of_nodes'] > 0
@app.post("/curation", summary="set curation",
responses={200: CurationResponse},
tags=[curation_tag])
def post_curation(query: CurationRequest):
"""
set curation status
"""
curation = dict()
try:
hit = client.get(index="curation", id=query.local_identifier)
curation = hit['_source']
if query.status.name == curation['status']:
return {"msg": "status is not changed"}, 403
# move current status in history
annotations = curation['log'] if 'log' in curation else list()
if isinstance(annotations, dict):
annotations = [annotations]
annotations.insert(0, {
"timestamp": curation['timestamp'],
"creator": curation['creator'],
"status": curation['status'],
"note": curation['note'],
})
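# cap the audit log at the 100 most recent entries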
annotations = annotations[0:100]
curation['log'] = annotations
curation['timestamp'] = datetime.now().isoformat()
curation['creator'] = query.creator
curation['note'] = query.note
print(curation)
# todo check status transition
match query.status.name:
case "valid":
if curation['status'] not in ('restore', 'reset'):
return {"msg": "status cannot be updated to 'valid'"}, 403
curation['status'] = query.status.name
case "withdrawn":
curation['status'] = query.status.name
case "alert":
curation['status'] = query.status.name
case "restore":
if curation['status'] != "withdrawn":
return {"msg": "only withdrawn records can be restored'"}, 403
curation['status'] = query.status.name
case "reset":
curation['status'] = query.status.name
#TODO transactionality in case of failure?
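# Optimistic concurrency: the write is rejected if the curation doc changed since it was read.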
client.index(
index='curation',
id=query.local_identifier,
body=curation,
refresh=True,
if_primary_term=hit['_primary_term'],
if_seq_no=hit['_seq_no']
)
metadata_status = curation['status']
if metadata_status == 'reset':
client.update(
index='products',
id=query.local_identifier,
body={
"script": {"source": "ctx._source.remove(\"status\")"}
},
refresh=True
)
else:
if metadata_status == "restore":
metadata_status = 'valid'
client.update(
index='products',
id=query.local_identifier,
body={
"doc": {"status": metadata_status}
},
refresh=True
)
except NotFoundError:
curation['local_identifier'] = query.local_identifier
curation['timestamp'] = datetime.now().isoformat()
curation['status'] = query.status.name
curation['creator'] = query.creator
curation['note'] = query.note
match query.status.name:
case "restore":
return {"msg": "cannot restore: status does not exist'"}, 403
case "reset":
return {"msg": "cannot reset: status does not exist'"}, 403
client.index(
index='curation',
id=query.local_identifier,
body=curation,
refresh=True,
op_type='create'
)
client.update(
index='products',
id=query.local_identifier,
body={
"doc": {"status": curation['status']}
},
refresh=True
)
return curation
@app.get("/curation", summary="get curation", tags=[curation_tag])
def get_curation(local_identifier: str):
"""
to get a curation record
"""
try:
hit = client.get(index="curation", id=local_identifier)
return {
"code": 0,
"message": "ok",
"data": hit['_source']
}
except NotFoundError:
return {"msg": f"Cannot fetch: '{local_identifier}' does not exist'"}, 403
@app.get("/alerts", summary="get curation in alert status", tags=[curation_tag])
def get_alerts():
"""
get curation records currently in alert status
"""
query = {
"query": {
"terms": {
"status": [CurationStatus.alert]
}
}
}
return {
"code": 0,
"message": "ok",
"data": list(helpers.scan(client, index="curation", query=query))
}
if __name__ == "__main__":
debug = cfg.get("debug")
if debug:
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
app.run(debug=True)
else:
from waitress import serve
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
serve(app, host="0.0.0.0", port=5000)

View File

@ -0,0 +1,45 @@
{
"type": "object",
"required": [
"language",
"topic",
"reason"
],
"properties": {
"language": {
"type": "string"
},
"topic": {
"enum": [
"Other",
"Natural and life Sciences",
"Engineering And Technology",
"Computer Science",
"Medical And Health Sciences",
"Agricultural And Veterinary Sciences",
"Social Sciences",
"Humanities And The Arts",
"Archaeology",
"Bibliographic record",
"Porn, Violence or Harmful content",
"Direct sales of goods or services",
"SPAM, advertisement, promotional"
],
"type": "string"
},
"general_subject": {
"type": "string"
},
"reason": {
"description": "reason of the classification",
"type": "string"
},
"spam_words": {
"items": {
"type": "string"
},
"type": "array",
"maxItems": 3
}
}
}

View File

@ -0,0 +1,11 @@
langchain
langchain-community
langchain-core
instructor
pydantic
openai
opensearch-py
jsonargparse
flask
flask-openapi3
waitress