initial stage

This commit is contained in:
Giambattista Bloisi 2024-07-26 17:52:37 +02:00
parent 1ad367efcc
commit 48d2f69bc6
5 changed files with 386 additions and 383 deletions

View File

@ -6,9 +6,9 @@ mappings['datasource'] = {
"data_source_classification": {
"type": "keyword"
},
# "eoscId": {
# "type": "keyword"
# },
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -86,6 +86,9 @@ mappings['datasource'] = {
mappings['venues'] = {
"properties": {
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -124,6 +127,9 @@ mappings['venues'] = {
mappings['topics'] = {
"properties": {
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -151,6 +157,9 @@ mappings['topics'] = {
mappings['persons'] = {
"properties": {
"entity_type": {
"type": "keyword"
},
"family_name": {
"fields": {
"keyword": {
@ -197,6 +206,9 @@ mappings['organizations'] = {
"country": {
"type": "keyword"
},
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -394,9 +406,9 @@ mappings['products'] = {
"contributions": {
"type": "object",
"properties": {
# "declared_affiliations": {
# "type": "keyword"
# },
"is_listed_author": {
"type": "boolean"
},
"person": {
"type": "object",
"properties": {
@ -420,11 +432,11 @@ mappings['products'] = {
"index": False,
"type": "long"
},
# "roles": {
# "type": "keyword"
# }
}
},
"entity_type": {
"type": "keyword"
},
"funding": {
"type": "object",
"properties": {
@ -506,15 +518,9 @@ mappings['products'] = {
"end_page": {
"type": "text"
},
# "hosting_data_source": {
# "type": "text"
# },
"issue": {
"type": "text"
},
# "number": {
# "type": "text"
# },
"publisher": {
"type": "text"
},
@ -537,9 +543,6 @@ mappings['products'] = {
}
}
},
"eoscId": {
"type": "keyword"
},
"hosting_datasource": {
"type": "object",
"properties": {
@ -612,9 +615,6 @@ mappings['products'] = {
"pmid": {
"type": "keyword"
},
# "title": {
# "type": "text"
# }
}
},
"relation_type": {
@ -686,6 +686,7 @@ mappings['products'] = {
}
}
},
# Aliases
"type": {
"path": "product_type",
"type": "alias"

View File

@ -0,0 +1,283 @@
from datetime import datetime
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY
class RawCatalogOpensearch:
    """Accessor and mapper for the raw EOSC-catalogue indexes in OpenSearch.

    Resolves cross-references between the raw indexes (services, providers,
    datasources, interoperability records, ...) and maps raw records into the
    public catalogue search schema via the `map_*` methods.
    """

    def __init__(self, os_client: OpenSearch, suffix: str | None):
        self.os_client = os_client
        # Index-name suffix of the current import run; None selects the
        # un-suffixed index names.
        self.suffix = suffix

    def get_index(self, name: str) -> str:
        """Return the physical index name for the logical entity *name*."""
        return "catalog_" + name + ("" if self.suffix is None else f"_{self.suffix}")

    def get_resource_interoperability_records(self, resource_id):
        """Return the interoperability records linked to *resource_id*.

        Two-step join: resource-interoperability-records yields the linked
        interoperabilityRecordIds, which are then fetched by id.
        """
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'resourceInteroperabilityRecord.resourceId.keyword': resource_id,
                    }
                },
                "fields": [
                    "resourceInteroperabilityRecord.interoperabilityRecordIds"
                ],
                "_source": False
            },
            index=self.get_index('resource-interoperability-records')
        )
        interoperability_ids = []
        interoperability_records = []
        for hit in response['hits']['hits']:
            interoperability_ids.extend(
                extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
        if len(interoperability_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": interoperability_ids,
                        }
                    },
                },
                index=self.get_index('interoperability-records')
            )
            for hit in response['hits']['hits']:
                interoperability_records.append(extract_nested(hit, ['_source']))
        return interoperability_records

    def get_providers(self, provider_ids: list[str]) -> list:
        """Fetch raw provider records by id; [] for empty/None input."""
        provider_records = []
        if provider_ids is not None and len(provider_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            # tolerate a single id passed as a bare string
                            "values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
                        }
                    },
                },
                index=self.get_index('providers')
            )
            for hit in response['hits']['hits']:
                provider_records.append(extract_nested(hit, ['_source']))
        return provider_records

    def get_provider(self, provider_id: str):
        """Fetch a single provider record; {} when absent or id is None."""
        if provider_id is not None:
            providers = self.get_providers([provider_id])
            if providers is not None and len(providers) > 0:
                return providers[0]
        return {}

    def get_services(self, service_ids: list[str]) -> list:
        """Fetch raw service records by id; [] for empty/None input."""
        service_records = []
        if service_ids is not None and len(service_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": service_ids if isinstance(service_ids, list) else [
                                service_ids],
                        }
                    },
                },
                index=self.get_index('services')
            )
            for hit in response['hits']['hits']:
                service_records.append(extract_nested(hit, ['_source']))
        return service_records

    def get_datasource_of_service(self, service_id: str):
        """Return the datasource whose serviceId points at *service_id*, else {}."""
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'datasource.serviceId.keyword': service_id,
                    }
                }
            },
            index=self.get_index('datasources')
        )
        # at most one datasource is expected: return the first hit
        for hit in response['hits']['hits']:
            return extract_nested(hit, ['_source'])
        return {}

    def get_services_of_interoperability(self, interoperability_id: str):
        """Return ids of services linked to the given interoperability record."""
        svc_ids = []
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': interoperability_id,
                    }
                },
                "fields": [
                    "resourceInteroperabilityRecord.resourceId"
                ],
                "_source": False
            },
            index=self.get_index('resource-interoperability-records')
        )
        for hit in response['hits']['hits']:
            svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
        return svc_ids

    def map_service(self, raw_svc: dict) -> dict:
        """Map a raw service record to the public schema (None entries dropped)."""
        interoperability_records = self.get_resource_interoperability_records(raw_svc['id'])
        organization = self.get_provider(extract_nested(raw_svc, ['service', 'resourceOrganisation']))
        provider_records = self.get_providers(list(
            filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'resourceProviders']) or [])))
        related_resources_records = self.get_services(list(
            filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'relatedResources']) or [])))
        datasource = self.get_datasource_of_service(raw_svc['id'])
        res = {
            "accessRestriction": extract_nested(raw_svc,
                                                "service.geographicalAvailabilities".split(".")),
            "accessTypes": extract_map_nested(raw_svc, 'access_type', "service.accessTypes".split(".")),
            "access_modes": extract_map_nested(raw_svc, 'access_mode', "service.accessModes".split(".")),
            "category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
                                            "subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
                                 extract_nested(raw_svc, "service.categories".split(".")))),
            "description": extract_nested(raw_svc, "service.description".split(".")),
            "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                          "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                               extract_nested(raw_svc, "service.scientificDomains".split(".")))),
            "grantProjectNames": extract_nested(raw_svc, "service.grantProjectNames".split(".")),
            "helpdeskPage": extract_nested(raw_svc, "service.helpdeskPage".split(".")),
            "horizontal": extract_nested(raw_svc, "service.horizontalService".split(".")) or False,
            "id": extract_nested(raw_svc, "service.id".split(".")),
            "interoperabilityGuidelines": list(
                map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
            "language": extract_map_nested(raw_svc, 'languages', "service.languageAvailabilities".split(".")),
            "name": extract_nested(raw_svc, "service.name".split(".")),
            "orderType": extract_map_nested(raw_svc, 'order_type', "service.orderType".split(".")),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "pricing": extract_nested(raw_svc, "service.pricing".split(".")),
            "privacyPolicy": extract_nested(raw_svc, "service.privacyPolicy".split(".")),
            "providers": list(map(lambda p: p['provider']['name'], provider_records)),
            "relatedPlatforms": extract_map_nested(raw_svc, 'related_platform', "service.relatedPlatforms".split(".")),
            "relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
            "tags": extract_nested(raw_svc, "service.tags".split(".")),
            "targetUsers": extract_map_nested(raw_svc, 'target_user', "service.targetUsers".split(".")),
            "termsOfUse": extract_nested(raw_svc, "service.termsOfUse".split(".")),
            "thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
            "trl": extract_map_nested(raw_svc, 'trl', "service.trl".split(".")),
            # a service backed by a datasource record is published as 'datasource'
            "type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
            "useCases": extract_nested(raw_svc, "service.useCases".split(".")),
            "userManual": extract_nested(raw_svc, "service.userManual".split(".")),
            "webpage": extract_nested(raw_svc, "service.webpage".split(".")),
            # registeredAt is epoch milliseconds
            "year": datetime.fromtimestamp(
                int(extract_nested(raw_svc, "metadata.registeredAt".split("."))) / 1000).year,
        }
        return delete_none(res)

    def map_training(self, raw_trn: dict) -> dict:
        """Map a raw trainingResource record to the public schema."""
        organization = self.get_provider(extract_nested(raw_trn, ['trainingResource', 'resourceOrganisation']))
        res = {
            "accessRight": extract_map_nested(raw_trn, 'tr_access', "trainingResource.accessRights".split(".")),
            "alternativeIdentifiers": extract_nested(raw_trn,
                                                     "trainingResource.alternativeIdentifiers".split(".")),
            "authors": extract_nested(raw_trn,
                                      "trainingResource.authors".split(".")),
            "contentResourceType": extract_map_nested(raw_trn, 'tr_content',
                                                      "trainingResource.contentResourceTypes".split(".")),
            "description": extract_nested(raw_trn,
                                          "trainingResource.description".split(".")),
            "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                          "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                               extract_nested(raw_trn, "trainingResource.scientificDomains".split(".")))),
            "duration": extract_nested(raw_trn,
                                       "trainingResource.duration".split(".")),
            "expertiseLevel": extract_map_nested(raw_trn, 'expertise_level',
                                                 "trainingResource.expertiseLevel".split(".")),
            "id": extract_nested(raw_trn,
                                 "trainingResource.id".split(".")),
            "keyword": extract_nested(raw_trn,
                                      "trainingResource.keywords".split(".")),
            "language": extract_map_nested(raw_trn, 'languages', "trainingResource.languages".split(".")),
            "learningOutcomes": extract_nested(raw_trn,
                                               "trainingResource.learningOutcomes".split(".")),
            "learningResourceType": extract_map_nested(raw_trn, 'tr_dcmi',
                                                       "trainingResource.learningResourceTypes".split(".")),
            "license": extract_nested(raw_trn,
                                      "trainingResource.license".split(".")),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "qualifications": extract_map_nested(raw_trn, 'qualification',
                                                 "trainingResource.qualifications".split(".")),
            "targetGroup": extract_map_nested(raw_trn, 'target_user', "trainingResource.targetGroups".split(".")),
            "title": extract_nested(raw_trn,
                                    "trainingResource.title".split(".")),
            "type": 'trainingResource',
            "url": extract_nested(raw_trn,
                                  "trainingResource.url".split(".")),
            # registeredAt is epoch milliseconds
            "year": datetime.fromtimestamp(
                int(extract_nested(raw_trn, "metadata.registeredAt".split("."))) / 1000).year,
        }
        return delete_none(res)

    def map_interoperability(self, raw_itr: dict) -> dict:
        """Map a raw interoperabilityRecord to the public schema."""
        organization = self.get_provider(extract_nested(raw_itr, ['interoperabilityRecord', 'providerId']))
        service_records = self.get_services(self.get_services_of_interoperability(raw_itr['id']))
        res = {
            "alternativeIdentifiers": extract_nested(raw_itr,
                                                     "interoperabilityRecord.alternativeIdentifiers".split(".")),
            "creators": list(map(lambda c: {
                "affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
                "givenName": extract_nested(c, ['givenName']),
                "familyName": extract_nested(c, ['familyName']),
                "fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
                "type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
            }, extract_nested(raw_itr, "interoperabilityRecord.creators".split(".")))),
            "description": extract_nested(raw_itr,
                                          "interoperabilityRecord.description".split(".")),
            "doi": extract_nested(raw_itr, ['identifierInfo', 'identifier']) if
            extract_nested(raw_itr, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
            "domain": {'domain': extract_map_nested(raw_itr, 'domains',
                                                    "interoperabilityRecord.domain".split("."))},
            "guidelineType": extract_map_nested(raw_itr, 'guideline_type',
                                                "interoperabilityRecord.eoscGuidelineType".split(".")),
            "id": extract_nested(raw_itr,
                                 "interoperabilityRecord.id".split(".")),
            "license": extract_nested(raw_itr, "interoperabilityRecord.rights.rightIdentifier".split(".")),
            "licenseDetails": list(map(lambda c: {
                "identifier": extract_nested(c, ['rightIdentifier']),
                "title": extract_nested(c, ['rightTitle']),
                "uri": extract_nested(c, ['rightURI'])
            }, extract_nested(raw_itr, "interoperabilityRecord.rights".split(".")))),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "provider": extract_nested(organization, "provider.name".split(".")),
            "publicationYear": extract_nested(raw_itr, "interoperabilityRecord.publicationYear".split(".")),
            # BUG FIX: the lambda previously read from `organization` (a
            # provider record, which has no 'service' key) instead of the
            # service record `s`, so every entry was empty.
            "services": list(map(lambda s: {
                "name": extract_nested(s, "service.name".split(".")),
                # NOTE(review): raw service records carry
                # service.resourceOrganisation (an id); confirm whether
                # 'service.organization' exists on already-mapped services
                # as the original TODO comment suggested.
                "organization": extract_nested(s, "service.organization".split(".")),
            }, service_records)),
            "status": extract_nested(raw_itr, "interoperabilityRecord.status".split(".")),
            "title": extract_nested(raw_itr, "interoperabilityRecord.title".split(".")),
            "type": 'interoperabilityRecord',
            # "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
        }
        return delete_none(res)

View File

@ -0,0 +1,41 @@
from typing import Dict, Any, List
from catalogue.vocabulary import CATALOG_VOCABULARY
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
if len(labels) <= 0:
return current_value
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
return current_value
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
    """Resolve a nested value and translate it through a CATALOG_VOCABULARY table.

    A list value is translated element-wise (falsy elements map to None);
    a non-empty string is translated directly; anything else yields None.
    """
    raw = extract_nested(current_value, labels)
    if raw is None:
        return None
    if isinstance(raw, list):
        return [CATALOG_VOCABULARY[dictionary][item] if item else None for item in raw]
    if isinstance(raw, str) and raw != '':
        return CATALOG_VOCABULARY[dictionary][raw]
    return None
def delete_none(_dict):
    """Recursively drop None values (and None keys) from nested containers.

    Dicts are cleaned in place and returned; lists, sets and tuples are
    rebuilt as new containers of the same type with None items removed.
    Any other value is returned unchanged.
    """
    if isinstance(_dict, dict):
        for key in list(_dict):
            value = _dict[key]
            if isinstance(value, (list, dict, tuple, set)):
                _dict[key] = delete_none(value)
            elif key is None or value is None:
                del _dict[key]
        return _dict
    if isinstance(_dict, (list, set, tuple)):
        return type(_dict)(delete_none(item) for item in _dict if item is not None)
    return _dict

View File

@ -3,347 +3,11 @@ from typing import Dict, Any, List
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
if len(labels) <= 0:
return current_value
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
return current_value
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
    """Resolve a nested value, then map it through CATALOG_VOCABULARY[dictionary].

    Lists are mapped element-wise (falsy elements become None); non-empty
    strings are looked up directly; any other value produces None.
    """
    found = extract_nested(current_value, labels)
    if found is None:
        return None
    if isinstance(found, list):
        return [CATALOG_VOCABULARY[dictionary][entry] if entry else None for entry in found]
    if isinstance(found, str) and found != '':
        return CATALOG_VOCABULARY[dictionary][found]
    return None
def delete_none(_dict):
    """Strip None recursively from dicts, lists, sets and tuples.

    Dict arguments are mutated in place (entries whose key or value is None
    are removed) and returned; sequence/set arguments are rebuilt without
    None items; scalars pass through untouched.
    """
    if isinstance(_dict, dict):
        for key, value in list(_dict.items()):
            if isinstance(value, (list, dict, tuple, set)):
                _dict[key] = delete_none(value)
                continue
            if value is None or key is None:
                del _dict[key]
        return _dict
    if isinstance(_dict, (list, set, tuple)):
        cleaned = (delete_none(item) for item in _dict if item is not None)
        return type(_dict)(cleaned)
    return _dict
def map_service(raw_data: dict, os_client: OpenSearch) -> dict:
    """Map a raw catalogue service record to the public search schema.

    Resolves related records with extra OpenSearch lookups: interoperability
    guidelines (two-step join), the resource organisation, related services,
    resource providers and the datasource (if any) pointing at this service.
    None-valued entries are stripped from the result.

    NOTE(review): index names are hard-coded with a `_test` suffix — confirm
    this is intentional for the target environment.
    """
    # Interoperability guidelines linked to this service: first collect the
    # linked record ids, then fetch the records by id.
    response = os_client.search(
        body={
            'query': {
                'term': {
                    'resourceInteroperabilityRecord.resourceId.keyword': raw_data['id'],
                }
            },
            "fields": [
                "resourceInteroperabilityRecord.interoperabilityRecordIds"
            ],
            "_source": False
        },
        index='resource-interoperability-records_test'
    )
    ig_ids = []
    interoperability_records = []
    for hit in response['hits']['hits']:
        ig_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
    if len(ig_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": ig_ids,
                    }
                },
            },
            index='interoperability-records_test'
        )
        for hit in response['hits']['hits']:
            interoperability_records.append(extract_nested(hit, ['_source']))
    organization = {}  # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
    if extract_nested(raw_data, ['service', 'resourceOrganisation']) is not None:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": [extract_nested(raw_data, ['service', 'resourceOrganisation'])],
                    }
                },
            },
            index='providers_test'
        )
        for hit in response['hits']['hits']:
            organization = extract_nested(hit, ['_source'])
            break
    # Related services, joined by id (empty ids filtered out beforehand).
    related_resources_records = []
    related_resources_ids = list(
        filter(lambda i: len(i) > 0, extract_nested(raw_data, ['service', 'relatedResources']) or []))
    # (debug print removed)
    if related_resources_ids is not None and len(related_resources_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": related_resources_ids if isinstance(related_resources_ids, list) else [
                            related_resources_ids],
                    }
                },
            },
            index='services_test'
        )
        for hit in response['hits']['hits']:
            related_resources_records.append(extract_nested(hit, ['_source']))
    provider_records = []  # "provider.name through s.service.resourceProviders=provider.id",
    provider_ids = list(filter(lambda i: len(i) > 0, extract_nested(raw_data, ['service', 'resourceProviders']) or []))
    # (debug print removed)
    if provider_ids is not None and len(provider_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
                    }
                },
            },
            index='providers_test'
        )
        for hit in response['hits']['hits']:
            provider_records.append(extract_nested(hit, ['_source']))
    datasource = {}  # datasource that point to this via serviceID
    response = os_client.search(
        body={
            'query': {
                'term': {
                    'datasource.serviceId.keyword': raw_data['id'],
                }
            }
        },
        index='datasources_test'
    )
    for hit in response['hits']['hits']:
        datasource = extract_nested(hit, ['_source'])
        break
    res = {
        "accessRestriction": extract_nested(raw_data,
                                            "service.geographicalAvailabilities".split(".")),
        "accessTypes": extract_map_nested(raw_data, 'access_type', "service.accessTypes".split(".")),
        "access_modes": extract_map_nested(raw_data, 'access_mode', "service.accessModes".split(".")),
        "category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
                                        "subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
                             extract_nested(raw_data, "service.categories".split(".")))),
        "description": extract_nested(raw_data, "service.description".split(".")),
        "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                      "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                           extract_nested(raw_data, "service.scientificDomains".split(".")))),
        "grantProjectNames": extract_nested(raw_data, "service.grantProjectNames".split(".")),
        "helpdeskPage": extract_nested(raw_data, "service.helpdeskPage".split(".")),
        "horizontal": extract_nested(raw_data, "service.horizontalService".split(".")) or False,
        "id": extract_nested(raw_data, "service.id".split(".")),
        "interoperabilityGuidelines": list(
            map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
        "language": extract_map_nested(raw_data, 'languages', "service.languageAvailabilities".split(".")),
        "name": extract_nested(raw_data, "service.name".split(".")),
        "orderType": extract_map_nested(raw_data, 'order_type', "service.orderType".split(".")),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "pricing": extract_nested(raw_data, "service.pricing".split(".")),
        "privacyPolicy": extract_nested(raw_data, "service.privacyPolicy".split(".")),
        "providers": list(map(lambda p: p['provider']['name'], provider_records)),
        "relatedPlatforms": extract_map_nested(raw_data, 'related_platform', "service.relatedPlatforms".split(".")),
        "relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
        "tags": extract_nested(raw_data, "service.tags".split(".")),
        "targetUsers": extract_map_nested(raw_data, 'target_user', "service.targetUsers".split(".")),
        "termsOfUse": extract_nested(raw_data, "service.termsOfUse".split(".")),
        "thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
        "trl": extract_map_nested(raw_data, 'trl', "service.trl".split(".")),
        # a service backed by a datasource record is published as 'datasource'
        "type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
        "useCases": extract_nested(raw_data, "service.useCases".split(".")),
        "userManual": extract_nested(raw_data, "service.userManual".split(".")),
        "webpage": extract_nested(raw_data, "service.webpage".split(".")),
        # registeredAt is epoch milliseconds
        "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
    }
    return delete_none(res)
def map_training(raw_data: dict, os_client: OpenSearch) -> dict:
    """Map a raw trainingResource record to the public search schema.

    Looks up the resource organisation's provider record by id, translates
    vocabulary-coded fields through CATALOG_VOCABULARY, and strips
    None-valued entries from the result.
    """
    organization = {}  # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
    if extract_nested(raw_data, ['trainingResource', 'resourceOrganisation']) is not None:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": [extract_nested(raw_data, ['trainingResource', 'resourceOrganisation'])],
                    }
                },
            },
            # NOTE(review): index name hard-coded with a `_test` suffix
            index='providers_test'
        )
        # at most one provider is expected: keep the first hit
        for hit in response['hits']['hits']:
            organization = extract_nested(hit, ['_source'])
            break
    res = {
        "accessRight": extract_map_nested(raw_data, 'tr_access', "trainingResource.accessRights".split(".")),
        "alternativeIdentifiers": extract_nested(raw_data,
                                                 "trainingResource.alternativeIdentifiers".split(".")),
        "authors": extract_nested(raw_data,
                                  "trainingResource.authors".split(".")),
        "contentResourceType": extract_map_nested(raw_data, 'tr_content',
                                                  "trainingResource.contentResourceTypes".split(".")),
        "description": extract_nested(raw_data,
                                      "trainingResource.description".split(".")),
        "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                      "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                           extract_nested(raw_data, "trainingResource.scientificDomains".split(".")))),
        "duration": extract_nested(raw_data,
                                   "trainingResource.duration".split(".")),
        "expertiseLevel": extract_map_nested(raw_data, 'expertise_level', "trainingResource.expertiseLevel".split(".")),
        "id": extract_nested(raw_data,
                             "trainingResource.id".split(".")),
        "keyword": extract_nested(raw_data,
                                  "trainingResource.keywords".split(".")),
        "language": extract_map_nested(raw_data, 'languages', "trainingResource.languages".split(".")),
        "learningOutcomes": extract_nested(raw_data,
                                           "trainingResource.learningOutcomes".split(".")),
        "learningResourceType": extract_map_nested(raw_data, 'tr_dcmi',
                                                   "trainingResource.learningResourceTypes".split(".")),
        "license": extract_nested(raw_data,
                                  "trainingResource.license".split(".")),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "qualifications": extract_map_nested(raw_data, 'qualification', "trainingResource.qualifications".split(".")),
        "targetGroup": extract_map_nested(raw_data, 'target_user', "trainingResource.targetGroups".split(".")),
        "title": extract_nested(raw_data,
                                "trainingResource.title".split(".")),
        "type": 'trainingResource',
        "url": extract_nested(raw_data,
                              "trainingResource.url".split(".")),
        # registeredAt is epoch milliseconds
        "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
    }
    return delete_none(res)
def map_interoperability(raw_data: dict, os_client: OpenSearch) -> dict:
    """Map a raw interoperabilityRecord to the public search schema.

    Resolves the providing organisation and the services linked to this
    guideline (via resource-interoperability-records), then strips
    None-valued entries from the result.

    NOTE(review): index names are hard-coded with a `_test` suffix — confirm
    this is intentional for the target environment.
    """
    organization = {}  # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
    if extract_nested(raw_data, ['interoperabilityRecord', 'providerId']) is not None:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": [extract_nested(raw_data, ['interoperabilityRecord', 'providerId'])],
                    }
                },
            },
            index='providers_test'
        )
        for hit in response['hits']['hits']:
            organization = extract_nested(hit, ['_source'])
            break
    # Services referencing this guideline: collect their ids first, then
    # fetch the service records by id.
    response = os_client.search(
        body={
            'query': {
                'term': {
                    'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': raw_data['id'],
                }
            },
            "fields": [
                "resourceInteroperabilityRecord.resourceId"
            ],
            "_source": False
        },
        index='resource-interoperability-records_test'
    )
    svc_ids = []
    service_records = []
    for hit in response['hits']['hits']:
        svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
    # (debug prints of raw_data and svc_ids removed)
    if len(svc_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": svc_ids,
                    }
                },
            },
            index='services_test'
        )
        for hit in response['hits']['hits']:
            service_records.append(extract_nested(hit, ['_source']))
    res = {
        "alternativeIdentifiers": extract_nested(raw_data,
                                                 "interoperabilityRecord.alternativeIdentifiers".split(".")),
        "creators": list(map(lambda c: {
            "affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
            "givenName": extract_nested(c, ['givenName']),
            "familyName": extract_nested(c, ['familyName']),
            "fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
            "type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
        }, extract_nested(raw_data, "interoperabilityRecord.creators".split(".")))),
        "description": extract_nested(raw_data,
                                      "interoperabilityRecord.description".split(".")),
        # only DOI-typed identifiers are surfaced
        "doi": extract_nested(raw_data, ['identifierInfo', 'identifier']) if
        extract_nested(raw_data, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
        "domain": {'domain': extract_map_nested(raw_data, 'domains',
                                                "interoperabilityRecord.domain".split("."))},
        "guidelineType": extract_map_nested(raw_data, 'guideline_type',
                                            "interoperabilityRecord.eoscGuidelineType".split(".")),
        "id": extract_nested(raw_data,
                             "interoperabilityRecord.id".split(".")),
        "license": extract_nested(raw_data, "interoperabilityRecord.rights.rightIdentifier".split(".")),
        "licenseDetails": list(map(lambda c: {
            "identifier": extract_nested(c, ['rightIdentifier']),
            "title": extract_nested(c, ['rightTitle']),
            "uri": extract_nested(c, ['rightURI'])
        }, extract_nested(raw_data, "interoperabilityRecord.rights".split(".")))),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "provider": extract_nested(organization, "provider.name".split(".")),
        "publicationYear": extract_nested(raw_data, "interoperabilityRecord.publicationYear".split(".")),
        # BUG FIX: the lambda previously read from `organization` (a provider
        # record, which has no 'service' key) instead of the service record
        # `s`, so every entry was empty.
        "services": list(map(lambda s: {
            "name": extract_nested(s, "service.name".split(".")),
            # NOTE(review): raw service records carry
            # service.resourceOrganisation (an id); confirm whether
            # 'service.organization' exists on already-mapped services as
            # the original TODO comment suggested.
            "organization": extract_nested(s, "service.organization".split(".")),
        }, service_records)),
        "status": extract_nested(raw_data, "interoperabilityRecord.status".split(".")),
        "title": extract_nested(raw_data, "interoperabilityRecord.title".split(".")),
        "type": 'interoperabilityRecord',
        # "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
    }
    return delete_none(res)

View File

@ -2,8 +2,7 @@ from __future__ import annotations
import json
import os
from datetime import timedelta, datetime
from typing import Any, List, Dict
from datetime import timedelta
import opensearchpy
import pendulum
@ -12,9 +11,9 @@ from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.utils.helpers import chain
from opensearchpy import OpenSearch
from opensearchpy import OpenSearch, helpers
from catalogue.mappers import map_interoperability, map_training, map_service
from catalogue.RawCatalogOpensearch import RawCatalogOpensearch
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
@ -34,7 +33,6 @@ default_args = {
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"BATCH_LOADERS_NUM": 10,
"ENTITIES": ["datasources",
"interoperability-records",
"providers",
@ -94,14 +92,18 @@ def import_catalogue_entities():
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
session = requests.session()
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
indexname = catalog.get_index(entity)
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
callurl = f"{baseurl}/{entity}"
params = {"draft": "false", "active": "true", "suspended": "false"}
if client.indices.exists(indexname):
client.indices.delete(indexname)
while True:
reply = session.get(url=callurl, params=params)
reply.raise_for_status()
@ -112,20 +114,24 @@ def import_catalogue_entities():
if len(results) <= 0:
break
for result in results:
# TODO: mapping code
body = {"doc": result, "doc_as_upsert": True}
client.update(
index=indexname,
body=body,
id=result['id'],
refresh=True
)
def streamed_results():
for r in results:
yield {"_index": indexname, "_id": r['id'], "_source": r}
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5*60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
# end of stream conditions
if content['to'] >= content['total']:
break
params['from'] = content['to']
client.indices.refresh(indexname)
@task
def map_indexes(**kwargs):
@ -140,22 +146,31 @@ def import_catalogue_entities():
timeout=180
)
session = requests.session()
for entity in {"interoperability-records", "training-resources", "services"}.intersection(kwargs["params"]["ENTITIES"]):
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
for hit in opensearchpy.helpers.scan(client, index=indexname, query={"query": {"match_all": {}}}):
for entity in {"interoperability-records", "training-resources", "services"}.intersection(
kwargs["params"]["ENTITIES"]):
for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity),
query={"query": {"match_all": {}}}):
s = hit['_source']
doc = None
match entity:
case "interoperability-records":
print(json.dumps(map_interoperability(s, client), indent=-1))
doc = catalog.map_interoperability(s)
case "training-resources":
print(json.dumps(map_training(s, client), indent=-1))
doc = catalog.map_training(s)
case "services":
print(json.dumps(map_service(s, client), indent=-1))
case _:
pass
doc = catalog.map_service(s)
if doc is not None:
client.update(
index=f'{entity}_{kwargs["params"]["SUFFIX"]}',
body={"doc": doc, "doc_as_upsert": True},
id=doc['id'],
refresh=True
)
@task
def close_indexes(**kwargs):
@ -177,7 +192,6 @@ def import_catalogue_entities():
"number_of_replicas": 1,
"refresh_interval": "60s",
}
})
# update aliases