initial stage

This commit is contained in:
Giambattista Bloisi 2024-07-26 17:52:37 +02:00
parent 1ad367efcc
commit 48d2f69bc6
5 changed files with 386 additions and 383 deletions

View File

@ -6,9 +6,9 @@ mappings['datasource'] = {
"data_source_classification": {
"type": "keyword"
},
# "eoscId": {
# "type": "keyword"
# },
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -86,6 +86,9 @@ mappings['datasource'] = {
mappings['venues'] = {
"properties": {
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -124,6 +127,9 @@ mappings['venues'] = {
mappings['topics'] = {
"properties": {
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -151,6 +157,9 @@ mappings['topics'] = {
mappings['persons'] = {
"properties": {
"entity_type": {
"type": "keyword"
},
"family_name": {
"fields": {
"keyword": {
@ -197,6 +206,9 @@ mappings['organizations'] = {
"country": {
"type": "keyword"
},
"entity_type": {
"type": "keyword"
},
"identifiers": {
"type": "object",
"properties": {
@ -394,9 +406,9 @@ mappings['products'] = {
"contributions": {
"type": "object",
"properties": {
# "declared_affiliations": {
# "type": "keyword"
# },
"is_listed_author": {
"type": "boolean"
},
"person": {
"type": "object",
"properties": {
@ -420,11 +432,11 @@ mappings['products'] = {
"index": False,
"type": "long"
},
# "roles": {
# "type": "keyword"
# }
}
},
"entity_type": {
"type": "keyword"
},
"funding": {
"type": "object",
"properties": {
@ -506,15 +518,9 @@ mappings['products'] = {
"end_page": {
"type": "text"
},
# "hosting_data_source": {
# "type": "text"
# },
"issue": {
"type": "text"
},
# "number": {
# "type": "text"
# },
"publisher": {
"type": "text"
},
@ -537,9 +543,6 @@ mappings['products'] = {
}
}
},
"eoscId": {
"type": "keyword"
},
"hosting_datasource": {
"type": "object",
"properties": {
@ -612,9 +615,6 @@ mappings['products'] = {
"pmid": {
"type": "keyword"
},
# "title": {
# "type": "text"
# }
}
},
"relation_type": {
@ -686,6 +686,7 @@ mappings['products'] = {
}
}
},
# Aliases
"type": {
"path": "product_type",
"type": "alias"

View File

@ -0,0 +1,283 @@
from datetime import datetime
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY
class RawCatalogOpensearch:
    """Accessor and mapper for the raw EOSC-catalogue indexes in OpenSearch.

    Resolves cross-references between the raw indexes (services, providers,
    datasources, interoperability records, ...) and maps raw records into the
    public catalogue search schema via the `map_*` methods.
    """

    def __init__(self, os_client: OpenSearch, suffix: str | None):
        self.os_client = os_client
        # Index-name suffix of the current import run; None selects the
        # un-suffixed index names.
        self.suffix = suffix

    def get_index(self, name: str) -> str:
        """Return the physical index name for the logical entity *name*."""
        return "catalog_" + name + ("" if self.suffix is None else f"_{self.suffix}")

    def get_resource_interoperability_records(self, resource_id):
        """Return the interoperability records linked to *resource_id*.

        Two-step join: resource-interoperability-records yields the linked
        interoperabilityRecordIds, which are then fetched by id.
        """
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'resourceInteroperabilityRecord.resourceId.keyword': resource_id,
                    }
                },
                "fields": [
                    "resourceInteroperabilityRecord.interoperabilityRecordIds"
                ],
                "_source": False
            },
            index=self.get_index('resource-interoperability-records')
        )
        interoperability_ids = []
        interoperability_records = []
        for hit in response['hits']['hits']:
            interoperability_ids.extend(
                extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
        if len(interoperability_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": interoperability_ids,
                        }
                    },
                },
                index=self.get_index('interoperability-records')
            )
            for hit in response['hits']['hits']:
                interoperability_records.append(extract_nested(hit, ['_source']))
        return interoperability_records

    def get_providers(self, provider_ids: list[str]) -> list:
        """Fetch raw provider records by id; [] for empty/None input."""
        provider_records = []
        if provider_ids is not None and len(provider_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            # tolerate a single id passed as a bare string
                            "values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
                        }
                    },
                },
                index=self.get_index('providers')
            )
            for hit in response['hits']['hits']:
                provider_records.append(extract_nested(hit, ['_source']))
        return provider_records

    def get_provider(self, provider_id: str):
        """Fetch a single provider record; {} when absent or id is None."""
        if provider_id is not None:
            providers = self.get_providers([provider_id])
            if providers is not None and len(providers) > 0:
                return providers[0]
        return {}

    def get_services(self, service_ids: list[str]) -> list:
        """Fetch raw service records by id; [] for empty/None input."""
        service_records = []
        if service_ids is not None and len(service_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": service_ids if isinstance(service_ids, list) else [
                                service_ids],
                        }
                    },
                },
                index=self.get_index('services')
            )
            for hit in response['hits']['hits']:
                service_records.append(extract_nested(hit, ['_source']))
        return service_records

    def get_datasource_of_service(self, service_id: str):
        """Return the datasource whose serviceId points at *service_id*, else {}."""
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'datasource.serviceId.keyword': service_id,
                    }
                }
            },
            index=self.get_index('datasources')
        )
        # at most one datasource is expected: return the first hit
        for hit in response['hits']['hits']:
            return extract_nested(hit, ['_source'])
        return {}

    def get_services_of_interoperability(self, interoperability_id: str):
        """Return ids of services linked to the given interoperability record."""
        svc_ids = []
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': interoperability_id,
                    }
                },
                "fields": [
                    "resourceInteroperabilityRecord.resourceId"
                ],
                "_source": False
            },
            index=self.get_index('resource-interoperability-records')
        )
        for hit in response['hits']['hits']:
            svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
        return svc_ids

    def map_service(self, raw_svc: dict) -> dict:
        """Map a raw service record to the public schema (None entries dropped)."""
        interoperability_records = self.get_resource_interoperability_records(raw_svc['id'])
        organization = self.get_provider(extract_nested(raw_svc, ['service', 'resourceOrganisation']))
        provider_records = self.get_providers(list(
            filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'resourceProviders']) or [])))
        related_resources_records = self.get_services(list(
            filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'relatedResources']) or [])))
        datasource = self.get_datasource_of_service(raw_svc['id'])
        res = {
            "accessRestriction": extract_nested(raw_svc,
                                                "service.geographicalAvailabilities".split(".")),
            "accessTypes": extract_map_nested(raw_svc, 'access_type', "service.accessTypes".split(".")),
            "access_modes": extract_map_nested(raw_svc, 'access_mode', "service.accessModes".split(".")),
            "category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
                                            "subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
                                 extract_nested(raw_svc, "service.categories".split(".")))),
            "description": extract_nested(raw_svc, "service.description".split(".")),
            "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                          "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                               extract_nested(raw_svc, "service.scientificDomains".split(".")))),
            "grantProjectNames": extract_nested(raw_svc, "service.grantProjectNames".split(".")),
            "helpdeskPage": extract_nested(raw_svc, "service.helpdeskPage".split(".")),
            "horizontal": extract_nested(raw_svc, "service.horizontalService".split(".")) or False,
            "id": extract_nested(raw_svc, "service.id".split(".")),
            "interoperabilityGuidelines": list(
                map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
            "language": extract_map_nested(raw_svc, 'languages', "service.languageAvailabilities".split(".")),
            "name": extract_nested(raw_svc, "service.name".split(".")),
            "orderType": extract_map_nested(raw_svc, 'order_type', "service.orderType".split(".")),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "pricing": extract_nested(raw_svc, "service.pricing".split(".")),
            "privacyPolicy": extract_nested(raw_svc, "service.privacyPolicy".split(".")),
            "providers": list(map(lambda p: p['provider']['name'], provider_records)),
            "relatedPlatforms": extract_map_nested(raw_svc, 'related_platform', "service.relatedPlatforms".split(".")),
            "relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
            "tags": extract_nested(raw_svc, "service.tags".split(".")),
            "targetUsers": extract_map_nested(raw_svc, 'target_user', "service.targetUsers".split(".")),
            "termsOfUse": extract_nested(raw_svc, "service.termsOfUse".split(".")),
            "thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
            "trl": extract_map_nested(raw_svc, 'trl', "service.trl".split(".")),
            # a service backed by a datasource record is published as 'datasource'
            "type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
            "useCases": extract_nested(raw_svc, "service.useCases".split(".")),
            "userManual": extract_nested(raw_svc, "service.userManual".split(".")),
            "webpage": extract_nested(raw_svc, "service.webpage".split(".")),
            # registeredAt is epoch milliseconds
            "year": datetime.fromtimestamp(
                int(extract_nested(raw_svc, "metadata.registeredAt".split("."))) / 1000).year,
        }
        return delete_none(res)

    def map_training(self, raw_trn: dict) -> dict:
        """Map a raw trainingResource record to the public schema."""
        organization = self.get_provider(extract_nested(raw_trn, ['trainingResource', 'resourceOrganisation']))
        res = {
            "accessRight": extract_map_nested(raw_trn, 'tr_access', "trainingResource.accessRights".split(".")),
            "alternativeIdentifiers": extract_nested(raw_trn,
                                                     "trainingResource.alternativeIdentifiers".split(".")),
            "authors": extract_nested(raw_trn,
                                      "trainingResource.authors".split(".")),
            "contentResourceType": extract_map_nested(raw_trn, 'tr_content',
                                                      "trainingResource.contentResourceTypes".split(".")),
            "description": extract_nested(raw_trn,
                                          "trainingResource.description".split(".")),
            "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                          "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                               extract_nested(raw_trn, "trainingResource.scientificDomains".split(".")))),
            "duration": extract_nested(raw_trn,
                                       "trainingResource.duration".split(".")),
            "expertiseLevel": extract_map_nested(raw_trn, 'expertise_level',
                                                 "trainingResource.expertiseLevel".split(".")),
            "id": extract_nested(raw_trn,
                                 "trainingResource.id".split(".")),
            "keyword": extract_nested(raw_trn,
                                      "trainingResource.keywords".split(".")),
            "language": extract_map_nested(raw_trn, 'languages', "trainingResource.languages".split(".")),
            "learningOutcomes": extract_nested(raw_trn,
                                               "trainingResource.learningOutcomes".split(".")),
            "learningResourceType": extract_map_nested(raw_trn, 'tr_dcmi',
                                                       "trainingResource.learningResourceTypes".split(".")),
            "license": extract_nested(raw_trn,
                                      "trainingResource.license".split(".")),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "qualifications": extract_map_nested(raw_trn, 'qualification',
                                                 "trainingResource.qualifications".split(".")),
            "targetGroup": extract_map_nested(raw_trn, 'target_user', "trainingResource.targetGroups".split(".")),
            "title": extract_nested(raw_trn,
                                    "trainingResource.title".split(".")),
            "type": 'trainingResource',
            "url": extract_nested(raw_trn,
                                  "trainingResource.url".split(".")),
            # registeredAt is epoch milliseconds
            "year": datetime.fromtimestamp(
                int(extract_nested(raw_trn, "metadata.registeredAt".split("."))) / 1000).year,
        }
        return delete_none(res)

    def map_interoperability(self, raw_itr: dict) -> dict:
        """Map a raw interoperabilityRecord to the public schema."""
        organization = self.get_provider(extract_nested(raw_itr, ['interoperabilityRecord', 'providerId']))
        service_records = self.get_services(self.get_services_of_interoperability(raw_itr['id']))
        res = {
            "alternativeIdentifiers": extract_nested(raw_itr,
                                                     "interoperabilityRecord.alternativeIdentifiers".split(".")),
            "creators": list(map(lambda c: {
                "affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
                "givenName": extract_nested(c, ['givenName']),
                "familyName": extract_nested(c, ['familyName']),
                "fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
                "type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
            }, extract_nested(raw_itr, "interoperabilityRecord.creators".split(".")))),
            "description": extract_nested(raw_itr,
                                          "interoperabilityRecord.description".split(".")),
            "doi": extract_nested(raw_itr, ['identifierInfo', 'identifier']) if
            extract_nested(raw_itr, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
            "domain": {'domain': extract_map_nested(raw_itr, 'domains',
                                                    "interoperabilityRecord.domain".split("."))},
            "guidelineType": extract_map_nested(raw_itr, 'guideline_type',
                                                "interoperabilityRecord.eoscGuidelineType".split(".")),
            "id": extract_nested(raw_itr,
                                 "interoperabilityRecord.id".split(".")),
            "license": extract_nested(raw_itr, "interoperabilityRecord.rights.rightIdentifier".split(".")),
            "licenseDetails": list(map(lambda c: {
                "identifier": extract_nested(c, ['rightIdentifier']),
                "title": extract_nested(c, ['rightTitle']),
                "uri": extract_nested(c, ['rightURI'])
            }, extract_nested(raw_itr, "interoperabilityRecord.rights".split(".")))),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "provider": extract_nested(organization, "provider.name".split(".")),
            "publicationYear": extract_nested(raw_itr, "interoperabilityRecord.publicationYear".split(".")),
            # BUG FIX: the lambda previously read from `organization` (a
            # provider record, which has no 'service' key) instead of the
            # service record `s`, so every entry was empty.
            "services": list(map(lambda s: {
                "name": extract_nested(s, "service.name".split(".")),
                # NOTE(review): raw service records carry
                # service.resourceOrganisation (an id); confirm whether
                # 'service.organization' exists on already-mapped services
                # as the original TODO comment suggested.
                "organization": extract_nested(s, "service.organization".split(".")),
            }, service_records)),
            "status": extract_nested(raw_itr, "interoperabilityRecord.status".split(".")),
            "title": extract_nested(raw_itr, "interoperabilityRecord.title".split(".")),
            "type": 'interoperabilityRecord',
            # "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
        }
        return delete_none(res)

View File

@ -0,0 +1,41 @@
from typing import Dict, Any, List
from catalogue.vocabulary import CATALOG_VOCABULARY
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
if len(labels) <= 0:
return current_value
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
return current_value
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
    """Resolve a nested value and translate it through a CATALOG_VOCABULARY table.

    A list value is translated element-wise (falsy elements map to None);
    a non-empty string is translated directly; anything else yields None.
    """
    raw = extract_nested(current_value, labels)
    if raw is None:
        return None
    if isinstance(raw, list):
        return [CATALOG_VOCABULARY[dictionary][item] if item else None for item in raw]
    if isinstance(raw, str) and raw != '':
        return CATALOG_VOCABULARY[dictionary][raw]
    return None
def delete_none(_dict):
    """Recursively drop None values (and None keys) from nested containers.

    Dicts are cleaned in place and returned; lists, sets and tuples are
    rebuilt as new containers of the same type with None items removed.
    Any other value is returned unchanged.
    """
    if isinstance(_dict, dict):
        for key in list(_dict):
            value = _dict[key]
            if isinstance(value, (list, dict, tuple, set)):
                _dict[key] = delete_none(value)
            elif key is None or value is None:
                del _dict[key]
        return _dict
    if isinstance(_dict, (list, set, tuple)):
        return type(_dict)(delete_none(item) for item in _dict if item is not None)
    return _dict

View File

@ -3,347 +3,11 @@ from typing import Dict, Any, List
from opensearchpy import OpenSearch
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
if len(labels) <= 0:
return current_value
for label in labels:
if isinstance(current_value, dict) and label in current_value:
current_value = current_value[label]
else:
return None
return current_value
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
    """Resolve a nested value, then map it through CATALOG_VOCABULARY[dictionary].

    Lists are mapped element-wise (falsy elements become None); non-empty
    strings are looked up directly; any other value produces None.
    """
    found = extract_nested(current_value, labels)
    if found is None:
        return None
    if isinstance(found, list):
        return [CATALOG_VOCABULARY[dictionary][entry] if entry else None for entry in found]
    if isinstance(found, str) and found != '':
        return CATALOG_VOCABULARY[dictionary][found]
    return None
def delete_none(_dict):
    """Strip None recursively from dicts, lists, sets and tuples.

    Dict arguments are mutated in place (entries whose key or value is None
    are removed) and returned; sequence/set arguments are rebuilt without
    None items; scalars pass through untouched.
    """
    if isinstance(_dict, dict):
        for key, value in list(_dict.items()):
            if isinstance(value, (list, dict, tuple, set)):
                _dict[key] = delete_none(value)
                continue
            if value is None or key is None:
                del _dict[key]
        return _dict
    if isinstance(_dict, (list, set, tuple)):
        cleaned = (delete_none(item) for item in _dict if item is not None)
        return type(_dict)(cleaned)
    return _dict
def map_service(raw_data: dict, os_client: OpenSearch) -> dict:
    """Map a raw catalogue service record to the public search schema.

    Resolves related records with extra OpenSearch lookups: interoperability
    guidelines (two-step join), the resource organisation, related services,
    resource providers and the datasource (if any) pointing at this service.
    None-valued entries are stripped from the result.

    NOTE(review): index names are hard-coded with a `_test` suffix — confirm
    this is intentional for the target environment.
    """
    # Interoperability guidelines linked to this service: first collect the
    # linked record ids, then fetch the records by id.
    response = os_client.search(
        body={
            'query': {
                'term': {
                    'resourceInteroperabilityRecord.resourceId.keyword': raw_data['id'],
                }
            },
            "fields": [
                "resourceInteroperabilityRecord.interoperabilityRecordIds"
            ],
            "_source": False
        },
        index='resource-interoperability-records_test'
    )
    ig_ids = []
    interoperability_records = []
    for hit in response['hits']['hits']:
        ig_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
    if len(ig_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": ig_ids,
                    }
                },
            },
            index='interoperability-records_test'
        )
        for hit in response['hits']['hits']:
            interoperability_records.append(extract_nested(hit, ['_source']))
    organization = {}  # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
    if extract_nested(raw_data, ['service', 'resourceOrganisation']) is not None:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": [extract_nested(raw_data, ['service', 'resourceOrganisation'])],
                    }
                },
            },
            index='providers_test'
        )
        for hit in response['hits']['hits']:
            organization = extract_nested(hit, ['_source'])
            break
    # Related services, joined by id (empty ids filtered out beforehand).
    related_resources_records = []
    related_resources_ids = list(
        filter(lambda i: len(i) > 0, extract_nested(raw_data, ['service', 'relatedResources']) or []))
    # (debug print removed)
    if related_resources_ids is not None and len(related_resources_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": related_resources_ids if isinstance(related_resources_ids, list) else [
                            related_resources_ids],
                    }
                },
            },
            index='services_test'
        )
        for hit in response['hits']['hits']:
            related_resources_records.append(extract_nested(hit, ['_source']))
    provider_records = []  # "provider.name through s.service.resourceProviders=provider.id",
    provider_ids = list(filter(lambda i: len(i) > 0, extract_nested(raw_data, ['service', 'resourceProviders']) or []))
    # (debug print removed)
    if provider_ids is not None and len(provider_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
                    }
                },
            },
            index='providers_test'
        )
        for hit in response['hits']['hits']:
            provider_records.append(extract_nested(hit, ['_source']))
    datasource = {}  # datasource that point to this via serviceID
    response = os_client.search(
        body={
            'query': {
                'term': {
                    'datasource.serviceId.keyword': raw_data['id'],
                }
            }
        },
        index='datasources_test'
    )
    for hit in response['hits']['hits']:
        datasource = extract_nested(hit, ['_source'])
        break
    res = {
        "accessRestriction": extract_nested(raw_data,
                                            "service.geographicalAvailabilities".split(".")),
        "accessTypes": extract_map_nested(raw_data, 'access_type', "service.accessTypes".split(".")),
        "access_modes": extract_map_nested(raw_data, 'access_mode', "service.accessModes".split(".")),
        "category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
                                        "subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
                             extract_nested(raw_data, "service.categories".split(".")))),
        "description": extract_nested(raw_data, "service.description".split(".")),
        "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                      "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                           extract_nested(raw_data, "service.scientificDomains".split(".")))),
        "grantProjectNames": extract_nested(raw_data, "service.grantProjectNames".split(".")),
        "helpdeskPage": extract_nested(raw_data, "service.helpdeskPage".split(".")),
        "horizontal": extract_nested(raw_data, "service.horizontalService".split(".")) or False,
        "id": extract_nested(raw_data, "service.id".split(".")),
        "interoperabilityGuidelines": list(
            map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
        "language": extract_map_nested(raw_data, 'languages', "service.languageAvailabilities".split(".")),
        "name": extract_nested(raw_data, "service.name".split(".")),
        "orderType": extract_map_nested(raw_data, 'order_type', "service.orderType".split(".")),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "pricing": extract_nested(raw_data, "service.pricing".split(".")),
        "privacyPolicy": extract_nested(raw_data, "service.privacyPolicy".split(".")),
        "providers": list(map(lambda p: p['provider']['name'], provider_records)),
        "relatedPlatforms": extract_map_nested(raw_data, 'related_platform', "service.relatedPlatforms".split(".")),
        "relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
        "tags": extract_nested(raw_data, "service.tags".split(".")),
        "targetUsers": extract_map_nested(raw_data, 'target_user', "service.targetUsers".split(".")),
        "termsOfUse": extract_nested(raw_data, "service.termsOfUse".split(".")),
        "thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
        "trl": extract_map_nested(raw_data, 'trl', "service.trl".split(".")),
        # a service backed by a datasource record is published as 'datasource'
        "type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
        "useCases": extract_nested(raw_data, "service.useCases".split(".")),
        "userManual": extract_nested(raw_data, "service.userManual".split(".")),
        "webpage": extract_nested(raw_data, "service.webpage".split(".")),
        # registeredAt is epoch milliseconds
        "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
    }
    return delete_none(res)
def map_training(raw_data: dict, os_client: OpenSearch) -> dict:
    """Map a raw trainingResource record to the public search schema.

    Looks up the resource organisation's provider record by id, translates
    vocabulary-coded fields through CATALOG_VOCABULARY, and strips
    None-valued entries from the result.
    """
    organization = {}  # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
    if extract_nested(raw_data, ['trainingResource', 'resourceOrganisation']) is not None:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": [extract_nested(raw_data, ['trainingResource', 'resourceOrganisation'])],
                    }
                },
            },
            # NOTE(review): index name hard-coded with a `_test` suffix
            index='providers_test'
        )
        # at most one provider is expected: keep the first hit
        for hit in response['hits']['hits']:
            organization = extract_nested(hit, ['_source'])
            break
    res = {
        "accessRight": extract_map_nested(raw_data, 'tr_access', "trainingResource.accessRights".split(".")),
        "alternativeIdentifiers": extract_nested(raw_data,
                                                 "trainingResource.alternativeIdentifiers".split(".")),
        "authors": extract_nested(raw_data,
                                  "trainingResource.authors".split(".")),
        "contentResourceType": extract_map_nested(raw_data, 'tr_content',
                                                  "trainingResource.contentResourceTypes".split(".")),
        "description": extract_nested(raw_data,
                                      "trainingResource.description".split(".")),
        "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                      "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                           extract_nested(raw_data, "trainingResource.scientificDomains".split(".")))),
        "duration": extract_nested(raw_data,
                                   "trainingResource.duration".split(".")),
        "expertiseLevel": extract_map_nested(raw_data, 'expertise_level', "trainingResource.expertiseLevel".split(".")),
        "id": extract_nested(raw_data,
                             "trainingResource.id".split(".")),
        "keyword": extract_nested(raw_data,
                                  "trainingResource.keywords".split(".")),
        "language": extract_map_nested(raw_data, 'languages', "trainingResource.languages".split(".")),
        "learningOutcomes": extract_nested(raw_data,
                                           "trainingResource.learningOutcomes".split(".")),
        "learningResourceType": extract_map_nested(raw_data, 'tr_dcmi',
                                                   "trainingResource.learningResourceTypes".split(".")),
        "license": extract_nested(raw_data,
                                  "trainingResource.license".split(".")),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "qualifications": extract_map_nested(raw_data, 'qualification', "trainingResource.qualifications".split(".")),
        "targetGroup": extract_map_nested(raw_data, 'target_user', "trainingResource.targetGroups".split(".")),
        "title": extract_nested(raw_data,
                                "trainingResource.title".split(".")),
        "type": 'trainingResource',
        "url": extract_nested(raw_data,
                              "trainingResource.url".split(".")),
        # registeredAt is epoch milliseconds
        "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
    }
    return delete_none(res)
def map_interoperability(raw_data: dict, os_client: OpenSearch) -> dict:
    """Map a raw interoperabilityRecord to the public search schema.

    Resolves the providing organisation and the services linked to this
    guideline (via resource-interoperability-records), then strips
    None-valued entries from the result.

    NOTE(review): index names are hard-coded with a `_test` suffix — confirm
    this is intentional for the target environment.
    """
    organization = {}  # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
    if extract_nested(raw_data, ['interoperabilityRecord', 'providerId']) is not None:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": [extract_nested(raw_data, ['interoperabilityRecord', 'providerId'])],
                    }
                },
            },
            index='providers_test'
        )
        for hit in response['hits']['hits']:
            organization = extract_nested(hit, ['_source'])
            break
    # Services referencing this guideline: collect their ids first, then
    # fetch the service records by id.
    response = os_client.search(
        body={
            'query': {
                'term': {
                    'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': raw_data['id'],
                }
            },
            "fields": [
                "resourceInteroperabilityRecord.resourceId"
            ],
            "_source": False
        },
        index='resource-interoperability-records_test'
    )
    svc_ids = []
    service_records = []
    for hit in response['hits']['hits']:
        svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
    # (debug prints of raw_data and svc_ids removed)
    if len(svc_ids) > 0:
        response = os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": svc_ids,
                    }
                },
            },
            index='services_test'
        )
        for hit in response['hits']['hits']:
            service_records.append(extract_nested(hit, ['_source']))
    res = {
        "alternativeIdentifiers": extract_nested(raw_data,
                                                 "interoperabilityRecord.alternativeIdentifiers".split(".")),
        "creators": list(map(lambda c: {
            "affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
            "givenName": extract_nested(c, ['givenName']),
            "familyName": extract_nested(c, ['familyName']),
            "fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
            "type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
        }, extract_nested(raw_data, "interoperabilityRecord.creators".split(".")))),
        "description": extract_nested(raw_data,
                                      "interoperabilityRecord.description".split(".")),
        # only DOI-typed identifiers are surfaced
        "doi": extract_nested(raw_data, ['identifierInfo', 'identifier']) if
        extract_nested(raw_data, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
        "domain": {'domain': extract_map_nested(raw_data, 'domains',
                                                "interoperabilityRecord.domain".split("."))},
        "guidelineType": extract_map_nested(raw_data, 'guideline_type',
                                            "interoperabilityRecord.eoscGuidelineType".split(".")),
        "id": extract_nested(raw_data,
                             "interoperabilityRecord.id".split(".")),
        "license": extract_nested(raw_data, "interoperabilityRecord.rights.rightIdentifier".split(".")),
        "licenseDetails": list(map(lambda c: {
            "identifier": extract_nested(c, ['rightIdentifier']),
            "title": extract_nested(c, ['rightTitle']),
            "uri": extract_nested(c, ['rightURI'])
        }, extract_nested(raw_data, "interoperabilityRecord.rights".split(".")))),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "provider": extract_nested(organization, "provider.name".split(".")),
        "publicationYear": extract_nested(raw_data, "interoperabilityRecord.publicationYear".split(".")),
        # BUG FIX: the lambda previously read from `organization` (a provider
        # record, which has no 'service' key) instead of the service record
        # `s`, so every entry was empty.
        "services": list(map(lambda s: {
            "name": extract_nested(s, "service.name".split(".")),
            # NOTE(review): raw service records carry
            # service.resourceOrganisation (an id); confirm whether
            # 'service.organization' exists on already-mapped services as
            # the original TODO comment suggested.
            "organization": extract_nested(s, "service.organization".split(".")),
        }, service_records)),
        "status": extract_nested(raw_data, "interoperabilityRecord.status".split(".")),
        "title": extract_nested(raw_data, "interoperabilityRecord.title".split(".")),
        "type": 'interoperabilityRecord',
        # "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
    }
    return delete_none(res)

View File

@ -2,8 +2,7 @@ from __future__ import annotations
import json
import os
from datetime import timedelta, datetime
from typing import Any, List, Dict
from datetime import timedelta
import opensearchpy
import pendulum
@ -12,9 +11,9 @@ from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.utils.helpers import chain
from opensearchpy import OpenSearch
from opensearchpy import OpenSearch, helpers
from catalogue.mappers import map_interoperability, map_training, map_service
from catalogue.RawCatalogOpensearch import RawCatalogOpensearch
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
@ -34,7 +33,6 @@ default_args = {
default_args=default_args,
params={
"OPENSEARCH_CONN_ID": "opensearch_default",
"BATCH_LOADERS_NUM": 10,
"ENTITIES": ["datasources",
"interoperability-records",
"providers",
@ -94,14 +92,18 @@ def import_catalogue_entities():
pool_maxsize=20,
timeout=180
)
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
session = requests.session()
for entity in kwargs["params"]["ENTITIES"]:
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
indexname = catalog.get_index(entity)
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
callurl = f"{baseurl}/{entity}"
params = {"draft": "false", "active": "true", "suspended": "false"}
if client.indices.exists(indexname):
client.indices.delete(indexname)
while True:
reply = session.get(url=callurl, params=params)
reply.raise_for_status()
@ -112,20 +114,24 @@ def import_catalogue_entities():
if len(results) <= 0:
break
for result in results:
# TODO: mapping code
body = {"doc": result, "doc_as_upsert": True}
client.update(
index=indexname,
body=body,
id=result['id'],
refresh=True
)
def streamed_results():
for r in results:
yield {"_index": indexname, "_id": r['id'], "_source": r}
succeeded = 0
failed = 0
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5*60):
if success:
succeeded = succeeded + 1
else:
print("error: " + str(item))
failed = failed + 1
# end of stream conditions
if content['to'] >= content['total']:
break
params['from'] = content['to']
client.indices.refresh(indexname)
@task
def map_indexes(**kwargs):
@ -140,22 +146,31 @@ def import_catalogue_entities():
timeout=180
)
session = requests.session()
for entity in {"interoperability-records", "training-resources", "services"}.intersection(kwargs["params"]["ENTITIES"]):
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
for hit in opensearchpy.helpers.scan(client, index=indexname, query={"query": {"match_all": {}}}):
for entity in {"interoperability-records", "training-resources", "services"}.intersection(
kwargs["params"]["ENTITIES"]):
for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity),
query={"query": {"match_all": {}}}):
s = hit['_source']
doc = None
match entity:
case "interoperability-records":
print(json.dumps(map_interoperability(s, client), indent=-1))
doc = catalog.map_interoperability(s)
case "training-resources":
print(json.dumps(map_training(s, client), indent=-1))
doc = catalog.map_training(s)
case "services":
print(json.dumps(map_service(s, client), indent=-1))
case _:
pass
doc = catalog.map_service(s)
if doc is not None:
client.update(
index=f'{entity}_{kwargs["params"]["SUFFIX"]}',
body={"doc": doc, "doc_as_upsert": True},
id=doc['id'],
refresh=True
)
@task
def close_indexes(**kwargs):
@ -177,7 +192,6 @@ def import_catalogue_entities():
"number_of_replicas": 1,
"refresh_interval": "60s",
}
})
# update aliases