initial stage
This commit is contained in:
parent
1ad367efcc
commit
48d2f69bc6
|
@ -6,9 +6,9 @@ mappings['datasource'] = {
|
|||
"data_source_classification": {
|
||||
"type": "keyword"
|
||||
},
|
||||
# "eoscId": {
|
||||
# "type": "keyword"
|
||||
# },
|
||||
"entity_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"identifiers": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -86,6 +86,9 @@ mappings['datasource'] = {
|
|||
|
||||
mappings['venues'] = {
|
||||
"properties": {
|
||||
"entity_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"identifiers": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -124,6 +127,9 @@ mappings['venues'] = {
|
|||
|
||||
mappings['topics'] = {
|
||||
"properties": {
|
||||
"entity_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"identifiers": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -151,6 +157,9 @@ mappings['topics'] = {
|
|||
|
||||
mappings['persons'] = {
|
||||
"properties": {
|
||||
"entity_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"family_name": {
|
||||
"fields": {
|
||||
"keyword": {
|
||||
|
@ -197,6 +206,9 @@ mappings['organizations'] = {
|
|||
"country": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"entity_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"identifiers": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -394,9 +406,9 @@ mappings['products'] = {
|
|||
"contributions": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
# "declared_affiliations": {
|
||||
# "type": "keyword"
|
||||
# },
|
||||
"is_listed_author": {
|
||||
"type": "boolean"
|
||||
},
|
||||
"person": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -420,11 +432,11 @@ mappings['products'] = {
|
|||
"index": False,
|
||||
"type": "long"
|
||||
},
|
||||
# "roles": {
|
||||
# "type": "keyword"
|
||||
# }
|
||||
}
|
||||
},
|
||||
"entity_type": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"funding": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -506,15 +518,9 @@ mappings['products'] = {
|
|||
"end_page": {
|
||||
"type": "text"
|
||||
},
|
||||
# "hosting_data_source": {
|
||||
# "type": "text"
|
||||
# },
|
||||
"issue": {
|
||||
"type": "text"
|
||||
},
|
||||
# "number": {
|
||||
# "type": "text"
|
||||
# },
|
||||
"publisher": {
|
||||
"type": "text"
|
||||
},
|
||||
|
@ -537,9 +543,6 @@ mappings['products'] = {
|
|||
}
|
||||
}
|
||||
},
|
||||
"eoscId": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"hosting_datasource": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
|
@ -612,9 +615,6 @@ mappings['products'] = {
|
|||
"pmid": {
|
||||
"type": "keyword"
|
||||
},
|
||||
# "title": {
|
||||
# "type": "text"
|
||||
# }
|
||||
}
|
||||
},
|
||||
"relation_type": {
|
||||
|
@ -686,6 +686,7 @@ mappings['products'] = {
|
|||
}
|
||||
}
|
||||
},
|
||||
# Aliases
|
||||
"type": {
|
||||
"path": "product_type",
|
||||
"type": "alias"
|
||||
|
|
|
@ -0,0 +1,283 @@
|
|||
from datetime import datetime
|
||||
|
||||
from opensearchpy import OpenSearch
|
||||
|
||||
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
|
||||
from catalogue.vocabulary import CATALOG_VOCABULARY
|
||||
|
||||
|
||||
class RawCatalogOpensearch:
    """Read-side helper over the raw EOSC-catalogue indices in OpenSearch."""

    def __init__(self, os_client: OpenSearch, suffix: str | None):
        # Keep the client and the optional index-name suffix (None = no suffix)
        # for every query this helper issues later.
        self.os_client, self.suffix = os_client, suffix
|
||||
|
||||
def get_index(self, name: str):
    """Build the full index name for *name*: 'catalog_<name>[_<suffix>]'."""
    parts = ["catalog_", name]
    if self.suffix is not None:
        parts.append(f"_{self.suffix}")
    return "".join(parts)
|
||||
|
||||
def get_resource_interoperability_records(self, resource_id):
|
||||
response = self.os_client.search(
|
||||
body={
|
||||
'query': {
|
||||
'term': {
|
||||
'resourceInteroperabilityRecord.resourceId.keyword': resource_id,
|
||||
}
|
||||
},
|
||||
"fields": [
|
||||
"resourceInteroperabilityRecord.interoperabilityRecordIds"
|
||||
],
|
||||
"_source": False
|
||||
},
|
||||
index=self.get_index('resource-interoperability-records')
|
||||
)
|
||||
|
||||
interoperability_ids = []
|
||||
interoperability_records = []
|
||||
for hit in response['hits']['hits']:
|
||||
interoperability_ids.extend(
|
||||
extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
|
||||
|
||||
if len(interoperability_ids) > 0:
|
||||
response = self.os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": interoperability_ids,
|
||||
}
|
||||
},
|
||||
},
|
||||
index=self.get_index('interoperability-records')
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
interoperability_records.append(extract_nested(hit, ['_source']))
|
||||
|
||||
return interoperability_records
|
||||
|
||||
def get_providers(self, provider_ids: list[str]) -> list:
    """Fetch provider documents by id; returns [] for a None or empty input."""
    records = []
    if provider_ids is not None and len(provider_ids) > 0:
        # Tolerate a single bare id as well as a list of ids.
        id_values = provider_ids if isinstance(provider_ids, list) else [provider_ids]
        response = self.os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": id_values,
                    }
                },
            },
            index=self.get_index('providers')
        )
        records = [extract_nested(hit, ['_source']) for hit in response['hits']['hits']]
    return records
|
||||
|
||||
def get_provider(self, provider_id: str):
    """Return the first provider document matching *provider_id*, or {}."""
    if provider_id is None:
        return {}
    matches = self.get_providers([provider_id])
    return matches[0] if matches else {}
|
||||
|
||||
def get_services(self, service_ids: list[str]) -> list:
    """Fetch service documents by id; returns [] for a None or empty input."""
    records = []
    if service_ids is not None and len(service_ids) > 0:
        # Tolerate a single bare id as well as a list of ids.
        id_values = service_ids if isinstance(service_ids, list) else [service_ids]
        response = self.os_client.search(
            body={
                "query": {
                    "ids": {
                        "values": id_values,
                    }
                },
            },
            index=self.get_index('services')
        )
        records = [extract_nested(hit, ['_source']) for hit in response['hits']['hits']]
    return records
|
||||
|
||||
def get_datasource_of_service(self, service_id: str):
    """Return the datasource document whose serviceId equals *service_id*, or {}.

    Only the first hit is used; a service is expected to have at most one
    datasource (assumption — TODO confirm against index contents).
    """
    response = self.os_client.search(
        body={
            'query': {
                'term': {
                    'datasource.serviceId.keyword': service_id,
                }
            }
        },
        index=self.get_index('datasources')
    )

    hits = response['hits']['hits']
    if hits:
        return extract_nested(hits[0], ['_source'])
    return {}
|
||||
|
||||
def get_services_of_interoperability(self, interoperability_id: str):
    """Return the ids of services referencing the given interoperability record."""
    response = self.os_client.search(
        body={
            'query': {
                'term': {
                    'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': interoperability_id,
                }
            },
            "fields": [
                "resourceInteroperabilityRecord.resourceId"
            ],
            "_source": False
        },
        index=self.get_index('resource-interoperability-records')
    )

    service_ids = []
    for hit in response['hits']['hits']:
        service_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])

    return service_ids
|
||||
|
||||
def map_service(self, raw_svc: dict) -> dict:
    """Map a raw catalogue service document to the public search shape.

    Joins the service against related indices (interoperability records,
    providers, related services, datasources) and translates coded fields
    through CATALOG_VOCABULARY.  None-valued entries are stripped from the
    result by delete_none.

    :param raw_svc: raw service hit _source; must contain 'id' and a
        nested 'service' object (assumed schema — confirm against index).
    :return: mapped dict with all None values removed.
    """
    # Related records are resolved up front via the helper queries.
    interoperability_records = self.get_resource_interoperability_records(raw_svc['id'])
    organization = self.get_provider(extract_nested(raw_svc, ['service', 'resourceOrganisation']))
    # Empty-string ids are filtered out before the lookup.
    provider_records = self.get_providers(list(
        filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'resourceProviders']) or [])))
    related_resources_records = self.get_services(list(
        filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'relatedResources']) or [])))
    datasource = self.get_datasource_of_service(raw_svc['id'])

    res = {
        "accessRestriction": extract_nested(raw_svc,
                                            "service.geographicalAvailabilities".split(".")),
        "accessTypes": extract_map_nested(raw_svc, 'access_type', "service.accessTypes".split(".")),
        "access_modes": extract_map_nested(raw_svc, 'access_mode', "service.accessModes".split(".")),
        # Category/domain codes are translated through the vocabulary tables;
        # a code missing from the vocabulary would raise KeyError here.
        "category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
                                        "subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
                             extract_nested(raw_svc, "service.categories".split(".")))),
        "description": extract_nested(raw_svc, "service.description".split(".")),
        "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                      "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                           extract_nested(raw_svc, "service.scientificDomains".split(".")))),
        "grantProjectNames": extract_nested(raw_svc, "service.grantProjectNames".split(".")),
        "helpdeskPage": extract_nested(raw_svc, "service.helpdeskPage".split(".")),
        # Missing flag defaults to False rather than being dropped.
        "horizontal": extract_nested(raw_svc, "service.horizontalService".split(".")) or False,
        "id": extract_nested(raw_svc, "service.id".split(".")),
        "interoperabilityGuidelines": list(
            map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
        "language": extract_map_nested(raw_svc, 'languages', "service.languageAvailabilities".split(".")),
        "name": extract_nested(raw_svc, "service.name".split(".")),
        "orderType": extract_map_nested(raw_svc, 'order_type', "service.orderType".split(".")),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "pricing": extract_nested(raw_svc, "service.pricing".split(".")),
        "privacyPolicy": extract_nested(raw_svc, "service.privacyPolicy".split(".")),
        "providers": list(map(lambda p: p['provider']['name'], provider_records)),
        "relatedPlatforms": extract_map_nested(raw_svc, 'related_platform', "service.relatedPlatforms".split(".")),
        "relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
        "tags": extract_nested(raw_svc, "service.tags".split(".")),
        "targetUsers": extract_map_nested(raw_svc, 'target_user', "service.targetUsers".split(".")),
        "termsOfUse": extract_nested(raw_svc, "service.termsOfUse".split(".")),
        "thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
        "trl": extract_map_nested(raw_svc, 'trl', "service.trl".split(".")),
        # A service backed by a datasource document is surfaced as 'datasource'.
        "type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
        "useCases": extract_nested(raw_svc, "service.useCases".split(".")),
        "userManual": extract_nested(raw_svc, "service.userManual".split(".")),
        "webpage": extract_nested(raw_svc, "service.webpage".split(".")),
        # registeredAt is presumably epoch milliseconds (divided by 1000) —
        # TODO confirm; a missing value would raise TypeError in int().
        "year": datetime.fromtimestamp(
            int(extract_nested(raw_svc, "metadata.registeredAt".split("."))) / 1000).year,
    }

    return delete_none(res)
|
||||
|
||||
def map_training(self, raw_trn: dict) -> dict:
    """Map a raw training-resource document to the public search shape.

    Resolves the owning organisation via the providers index and translates
    coded fields through CATALOG_VOCABULARY; None entries are stripped by
    delete_none.

    :param raw_trn: raw training-resource hit _source; must contain a
        nested 'trainingResource' object (assumed schema — confirm).
    :return: mapped dict with all None values removed.
    """
    organization = self.get_provider(extract_nested(raw_trn, ['trainingResource', 'resourceOrganisation']))

    res = {
        "accessRight": extract_map_nested(raw_trn, 'tr_access', "trainingResource.accessRights".split(".")),
        "alternativeIdentifiers": extract_nested(raw_trn,
                                                 "trainingResource.alternativeIdentifiers".split(".")),
        "authors": extract_nested(raw_trn,
                                  "trainingResource.authors".split(".")),
        "contentResourceType": extract_map_nested(raw_trn, 'tr_content',
                                                  "trainingResource.contentResourceTypes".split(".")),
        "description": extract_nested(raw_trn,
                                      "trainingResource.description".split(".")),
        # Domain codes go through the vocabulary tables; an unknown code
        # would raise KeyError here.
        "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                      "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                           extract_nested(raw_trn, "trainingResource.scientificDomains".split(".")))),
        "duration": extract_nested(raw_trn,
                                   "trainingResource.duration".split(".")),
        "expertiseLevel": extract_map_nested(raw_trn, 'expertise_level',
                                             "trainingResource.expertiseLevel".split(".")),
        "id": extract_nested(raw_trn,
                             "trainingResource.id".split(".")),
        "keyword": extract_nested(raw_trn,
                                  "trainingResource.keywords".split(".")),
        "language": extract_map_nested(raw_trn, 'languages', "trainingResource.languages".split(".")),
        "learningOutcomes": extract_nested(raw_trn,
                                           "trainingResource.learningOutcomes".split(".")),
        "learningResourceType": extract_map_nested(raw_trn, 'tr_dcmi',
                                                   "trainingResource.learningResourceTypes".split(".")),
        "license": extract_nested(raw_trn,
                                  "trainingResource.license".split(".")),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "qualifications": extract_map_nested(raw_trn, 'qualification',
                                             "trainingResource.qualifications".split(".")),
        "targetGroup": extract_map_nested(raw_trn, 'target_user', "trainingResource.targetGroups".split(".")),
        "title": extract_nested(raw_trn,
                                "trainingResource.title".split(".")),
        "type": 'trainingResource',
        "url": extract_nested(raw_trn,
                              "trainingResource.url".split(".")),
        # registeredAt is presumably epoch milliseconds — TODO confirm;
        # a missing value would raise TypeError in int().
        "year": datetime.fromtimestamp(
            int(extract_nested(raw_trn, "metadata.registeredAt".split("."))) / 1000).year,
    }

    return delete_none(res)
|
||||
|
||||
def map_interoperability(self, raw_itr: dict) -> dict:
    """Map a raw interoperability-record document to the public search shape.

    Resolves the owning provider and the services that reference this
    guideline, then translates coded fields through CATALOG_VOCABULARY.
    None entries are stripped by delete_none.

    :param raw_itr: raw interoperability-record hit _source; must contain
        'id' and a nested 'interoperabilityRecord' object (assumed schema).
    :return: mapped dict with all None values removed.
    """
    organization = self.get_provider(extract_nested(raw_itr, ['interoperabilityRecord', 'providerId']))
    service_records = self.get_services(self.get_services_of_interoperability(raw_itr['id']))

    res = {
        "alternativeIdentifiers": extract_nested(raw_itr,
                                                 "interoperabilityRecord.alternativeIdentifiers".split(".")),
        "creators": list(map(lambda c: {
            "affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
            "givenName": extract_nested(c, ['givenName']),
            "familyName": extract_nested(c, ['familyName']),
            "fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
            "type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
        }, extract_nested(raw_itr, "interoperabilityRecord.creators".split(".")))),
        "description": extract_nested(raw_itr,
                                      "interoperabilityRecord.description".split(".")),
        # DOI is only surfaced when the identifier is explicitly typed as one.
        "doi": extract_nested(raw_itr, ['identifierInfo', 'identifier']) if
        extract_nested(raw_itr, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
        "domain": {'domain': extract_map_nested(raw_itr, 'domains',
                                                "interoperabilityRecord.domain".split("."))},
        "guidelineType": extract_map_nested(raw_itr, 'guideline_type',
                                            "interoperabilityRecord.eoscGuidelineType".split(".")),
        "id": extract_nested(raw_itr,
                             "interoperabilityRecord.id".split(".")),
        # NOTE(review): 'rights' appears to be a list (see licenseDetails), so
        # this path lookup likely yields None — confirm intended behavior.
        "license": extract_nested(raw_itr, "interoperabilityRecord.rights.rightIdentifier".split(".")),
        "licenseDetails": list(map(lambda c: {
            "identifier": extract_nested(c, ['rightIdentifier']),
            "title": extract_nested(c, ['rightTitle']),
            "uri": extract_nested(c, ['rightURI'])
        }, extract_nested(raw_itr, "interoperabilityRecord.rights".split(".")))),
        "organization": extract_nested(organization, "provider.name".split(".")),
        "provider": extract_nested(organization, "provider.name".split(".")),
        "publicationYear": extract_nested(raw_itr, "interoperabilityRecord.publicationYear".split(".")),
        # BUG FIX: these fields must be read from the service hit `s`; the
        # original read from `organization` (the guideline's provider), whose
        # document has no 'service' object, so name/organization were always
        # None and got stripped by delete_none.
        "services": list(map(lambda s: {
            "name": extract_nested(s, "service.name".split(".")),
            # NOTE(review): raw service docs may not carry 'service.organization'
            # (the original comment says it exists on already-mapped services) —
            # confirm which document shape reaches this point.
            "organization": extract_nested(s, "service.organization".split(".")),
        }, service_records)),
        "status": extract_nested(raw_itr, "interoperabilityRecord.status".split(".")),
        "title": extract_nested(raw_itr, "interoperabilityRecord.title".split(".")),
        "type": 'interoperabilityRecord',
    }

    return delete_none(res)
|
|
@ -0,0 +1,41 @@
|
|||
from typing import Dict, Any, List
|
||||
|
||||
from catalogue.vocabulary import CATALOG_VOCABULARY
|
||||
|
||||
|
||||
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
|
||||
if len(labels) <= 0:
|
||||
return current_value
|
||||
for label in labels:
|
||||
if isinstance(current_value, dict) and label in current_value:
|
||||
current_value = current_value[label]
|
||||
else:
|
||||
return None
|
||||
|
||||
return current_value
|
||||
|
||||
|
||||
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
    """Extract a nested value and translate it via CATALOG_VOCABULARY[dictionary].

    A list value is translated element-wise (falsy elements map to None);
    a non-empty string is translated directly; anything else yields None.
    """
    raw = extract_nested(current_value, labels)
    if raw is None:
        return None
    if isinstance(raw, list):
        return [CATALOG_VOCABULARY[dictionary][item] if item else None for item in raw]
    if isinstance(raw, str) and raw != '':
        return CATALOG_VOCABULARY[dictionary][raw]
    return None
|
||||
|
||||
|
||||
def delete_none(_dict):
    """Delete None values recursively from all of the dictionaries, tuples, lists, sets.

    Dicts are mutated in place: entries whose value (or key) is None are
    removed, and container values are cleaned recursively.  Lists, sets and
    tuples are rebuilt as the same type without their None items.  Any other
    value is returned untouched.
    """
    if isinstance(_dict, dict):
        # Snapshot the keys so deletion during iteration is safe.
        for key in list(_dict):
            value = _dict[key]
            if isinstance(value, (list, dict, tuple, set)):
                _dict[key] = delete_none(value)
            elif value is None or key is None:
                del _dict[key]
        return _dict

    if isinstance(_dict, (list, set, tuple)):
        return type(_dict)(delete_none(item) for item in _dict if item is not None)

    return _dict
|
|
@ -3,347 +3,11 @@ from typing import Dict, Any, List
|
|||
|
||||
from opensearchpy import OpenSearch
|
||||
|
||||
from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
|
||||
from catalogue.vocabulary import CATALOG_VOCABULARY
|
||||
|
||||
|
||||
def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
|
||||
if len(labels) <= 0:
|
||||
return current_value
|
||||
for label in labels:
|
||||
if isinstance(current_value, dict) and label in current_value:
|
||||
current_value = current_value[label]
|
||||
else:
|
||||
return None
|
||||
|
||||
return current_value
|
||||
|
||||
|
||||
def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
|
||||
value = extract_nested(current_value, labels)
|
||||
if value is None:
|
||||
return None
|
||||
if isinstance(value, list):
|
||||
return list(map(lambda d: CATALOG_VOCABULARY[dictionary][d] if d else None, value))
|
||||
if isinstance(value, str) and value != '':
|
||||
return CATALOG_VOCABULARY[dictionary][value]
|
||||
return None
|
||||
|
||||
|
||||
def delete_none(_dict):
|
||||
"""Delete None values recursively from all of the dictionaries, tuples, lists, sets"""
|
||||
if isinstance(_dict, dict):
|
||||
for key, value in list(_dict.items()):
|
||||
if isinstance(value, (list, dict, tuple, set)):
|
||||
_dict[key] = delete_none(value)
|
||||
elif value is None or key is None:
|
||||
del _dict[key]
|
||||
|
||||
elif isinstance(_dict, (list, set, tuple)):
|
||||
_dict = type(_dict)(delete_none(item) for item in _dict if item is not None)
|
||||
|
||||
return _dict
|
||||
|
||||
|
||||
def map_service(raw_data: dict, os_client: OpenSearch) -> dict:
|
||||
response = os_client.search(
|
||||
body={
|
||||
'query': {
|
||||
'term': {
|
||||
'resourceInteroperabilityRecord.resourceId.keyword': raw_data['id'],
|
||||
}
|
||||
},
|
||||
"fields": [
|
||||
"resourceInteroperabilityRecord.interoperabilityRecordIds"
|
||||
],
|
||||
"_source": False
|
||||
},
|
||||
index='resource-interoperability-records_test'
|
||||
)
|
||||
|
||||
ig_ids = []
|
||||
interoperability_records = []
|
||||
for hit in response['hits']['hits']:
|
||||
ig_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])
|
||||
|
||||
if len(ig_ids) > 0:
|
||||
response = os_client.search(
|
||||
body={
|
||||
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": ig_ids,
|
||||
}
|
||||
},
|
||||
},
|
||||
index='interoperability-records_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
interoperability_records.append(extract_nested(hit, ['_source']))
|
||||
|
||||
organization = {} # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
|
||||
if extract_nested(raw_data, ['service', 'resourceOrganisation']) is not None:
|
||||
response = os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": [extract_nested(raw_data, ['service', 'resourceOrganisation'])],
|
||||
}
|
||||
},
|
||||
},
|
||||
index='providers_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
organization = extract_nested(hit, ['_source'])
|
||||
break
|
||||
|
||||
# INNER JOIN! extract_map_nested(raw_data, 'rel_service', "service.relatedResources".split(".")),
|
||||
# TODO: relatedResources via query
|
||||
related_resources_records = []
|
||||
related_resources_ids = list(
|
||||
filter(lambda i: len(i) > 0, extract_nested(raw_data, ['service', 'relatedResources']) or []))
|
||||
print(related_resources_ids)
|
||||
if related_resources_ids is not None and len(related_resources_ids) > 0:
|
||||
response = os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": related_resources_ids if isinstance(related_resources_ids, list) else [
|
||||
related_resources_ids],
|
||||
}
|
||||
},
|
||||
},
|
||||
index='services_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
related_resources_records.append(extract_nested(hit, ['_source']))
|
||||
|
||||
provider_records = [] # "provider.name through s.service.resourceProviders=provider.id",
|
||||
provider_ids = list(filter(lambda i: len(i) > 0, extract_nested(raw_data, ['service', 'resourceProviders']) or []))
|
||||
print(provider_ids)
|
||||
if provider_ids is not None and len(provider_ids) > 0:
|
||||
response = os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
|
||||
}
|
||||
},
|
||||
},
|
||||
index='providers_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
provider_records.append(extract_nested(hit, ['_source']))
|
||||
|
||||
datasource = {} # datasource that point to this via serviceID
|
||||
response = os_client.search(
|
||||
body={
|
||||
'query': {
|
||||
'term': {
|
||||
'datasource.serviceId.keyword': raw_data['id'],
|
||||
}
|
||||
}
|
||||
},
|
||||
index='datasources_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
datasource = extract_nested(hit, ['_source'])
|
||||
break
|
||||
|
||||
res = {
|
||||
"accessRestriction": extract_nested(raw_data,
|
||||
"service.geographicalAvailabilities".split(".")),
|
||||
"accessTypes": extract_map_nested(raw_data, 'access_type', "service.accessTypes".split(".")),
|
||||
"access_modes": extract_map_nested(raw_data, 'access_mode', "service.accessModes".split(".")),
|
||||
"category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
|
||||
"subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
|
||||
extract_nested(raw_data, "service.categories".split(".")))),
|
||||
"description": extract_nested(raw_data, "service.description".split(".")),
|
||||
"domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
|
||||
"subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
|
||||
extract_nested(raw_data, "service.scientificDomains".split(".")))),
|
||||
"grantProjectNames": extract_nested(raw_data, "service.grantProjectNames".split(".")),
|
||||
"helpdeskPage": extract_nested(raw_data, "service.helpdeskPage".split(".")),
|
||||
"horizontal": extract_nested(raw_data, "service.horizontalService".split(".")) or False,
|
||||
"id": extract_nested(raw_data, "service.id".split(".")),
|
||||
"interoperabilityGuidelines": list(
|
||||
map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
|
||||
"language": extract_map_nested(raw_data, 'languages', "service.languageAvailabilities".split(".")),
|
||||
"name": extract_nested(raw_data, "service.name".split(".")),
|
||||
"orderType": extract_map_nested(raw_data, 'order_type', "service.orderType".split(".")),
|
||||
"organization": extract_nested(organization, "provider.name".split(".")),
|
||||
"pricing": extract_nested(raw_data, "service.pricing".split(".")),
|
||||
"privacyPolicy": extract_nested(raw_data, "service.privacyPolicy".split(".")),
|
||||
"providers": list(map(lambda p: p['provider']['name'], provider_records)),
|
||||
"relatedPlatforms": extract_map_nested(raw_data, 'related_platform', "service.relatedPlatforms".split(".")),
|
||||
"relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
|
||||
"tags": extract_nested(raw_data, "service.tags".split(".")),
|
||||
"targetUsers": extract_map_nested(raw_data, 'target_user', "service.targetUsers".split(".")),
|
||||
"termsOfUse": extract_nested(raw_data, "service.termsOfUse".split(".")),
|
||||
"thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
|
||||
"trl": extract_map_nested(raw_data, 'trl', "service.trl".split(".")),
|
||||
"type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
|
||||
"useCases": extract_nested(raw_data, "service.useCases".split(".")),
|
||||
"userManual": extract_nested(raw_data, "service.userManual".split(".")),
|
||||
"webpage": extract_nested(raw_data, "service.webpage".split(".")),
|
||||
"year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
|
||||
}
|
||||
|
||||
return delete_none(res)
|
||||
|
||||
|
||||
def map_training(raw_data: dict, os_client: OpenSearch) -> dict:
|
||||
organization = {} # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
|
||||
if extract_nested(raw_data, ['trainingResource', 'resourceOrganisation']) is not None:
|
||||
response = os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": [extract_nested(raw_data, ['trainingResource', 'resourceOrganisation'])],
|
||||
}
|
||||
},
|
||||
},
|
||||
index='providers_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
organization = extract_nested(hit, ['_source'])
|
||||
break
|
||||
|
||||
res = {
|
||||
"accessRight": extract_map_nested(raw_data, 'tr_access', "trainingResource.accessRights".split(".")),
|
||||
"alternativeIdentifiers": extract_nested(raw_data,
|
||||
"trainingResource.alternativeIdentifiers".split(".")),
|
||||
"authors": extract_nested(raw_data,
|
||||
"trainingResource.authors".split(".")),
|
||||
|
||||
"contentResourceType": extract_map_nested(raw_data, 'tr_content',
|
||||
"trainingResource.contentResourceTypes".split(".")),
|
||||
|
||||
"description": extract_nested(raw_data,
|
||||
"trainingResource.description".split(".")),
|
||||
"domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
|
||||
"subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
|
||||
extract_nested(raw_data, "trainingResource.scientificDomains".split(".")))),
|
||||
"duration": extract_nested(raw_data,
|
||||
"trainingResource.duration".split(".")),
|
||||
"expertiseLevel": extract_map_nested(raw_data, 'expertise_level', "trainingResource.expertiseLevel".split(".")),
|
||||
|
||||
"id": extract_nested(raw_data,
|
||||
"trainingResource.id".split(".")),
|
||||
"keyword": extract_nested(raw_data,
|
||||
"trainingResource.keywords".split(".")),
|
||||
"language": extract_map_nested(raw_data, 'languages', "trainingResource.languages".split(".")),
|
||||
"learningOutcomes": extract_nested(raw_data,
|
||||
"trainingResource.learningOutcomes".split(".")),
|
||||
"learningResourceType": extract_map_nested(raw_data, 'tr_dcmi',
|
||||
"trainingResource.learningResourceTypes".split(".")),
|
||||
|
||||
"license": extract_nested(raw_data,
|
||||
"trainingResource.license".split(".")),
|
||||
"organization": extract_nested(organization, "provider.name".split(".")),
|
||||
"qualifications": extract_map_nested(raw_data, 'qualification', "trainingResource.qualifications".split(".")),
|
||||
"targetGroup": extract_map_nested(raw_data, 'target_user', "trainingResource.targetGroups".split(".")),
|
||||
"title": extract_nested(raw_data,
|
||||
"trainingResource.title".split(".")),
|
||||
"type": 'trainingResource',
|
||||
"url": extract_nested(raw_data,
|
||||
"trainingResource.url".split(".")),
|
||||
"year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
|
||||
}
|
||||
|
||||
return delete_none(res)
|
||||
|
||||
|
||||
def map_interoperability(raw_data: dict, os_client: OpenSearch) -> dict:
|
||||
organization = {} # "provider.name through service.resourceOrganisation=provider.id", for service.Organization
|
||||
if extract_nested(raw_data, ['interoperabilityRecord', 'providerId']) is not None:
|
||||
response = os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": [extract_nested(raw_data, ['interoperabilityRecord', 'providerId'])],
|
||||
}
|
||||
},
|
||||
},
|
||||
index='providers_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
organization = extract_nested(hit, ['_source'])
|
||||
break
|
||||
|
||||
response = os_client.search(
|
||||
body={
|
||||
'query': {
|
||||
'term': {
|
||||
'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': raw_data['id'],
|
||||
}
|
||||
},
|
||||
"fields": [
|
||||
"resourceInteroperabilityRecord.resourceId"
|
||||
],
|
||||
"_source": False
|
||||
},
|
||||
index='resource-interoperability-records_test'
|
||||
)
|
||||
svc_ids = []
|
||||
service_records = []
|
||||
for hit in response['hits']['hits']:
|
||||
svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])
|
||||
|
||||
print(raw_data)
|
||||
print(svc_ids)
|
||||
if len(svc_ids) > 0:
|
||||
response = os_client.search(
|
||||
body={
|
||||
"query": {
|
||||
"ids": {
|
||||
"values": svc_ids,
|
||||
}
|
||||
},
|
||||
},
|
||||
index='services_test'
|
||||
)
|
||||
for hit in response['hits']['hits']:
|
||||
service_records.append(extract_nested(hit, ['_source']))
|
||||
|
||||
res = {
|
||||
"alternativeIdentifiers": extract_nested(raw_data,
|
||||
"interoperabilityRecord.alternativeIdentifiers".split(".")),
|
||||
"creators": list(map(lambda c: {
|
||||
"affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
|
||||
"givenName": extract_nested(c, ['givenName']),
|
||||
"familyName": extract_nested(c, ['familyName']),
|
||||
"fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
|
||||
"type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
|
||||
}, extract_nested(raw_data, "interoperabilityRecord.creators".split(".")))),
|
||||
|
||||
"description": extract_nested(raw_data,
|
||||
"interoperabilityRecord.description".split(".")),
|
||||
"doi": extract_nested(raw_data, ['identifierInfo', 'identifier']) if
|
||||
extract_nested(raw_data, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
|
||||
"domain": {'domain': extract_map_nested(raw_data, 'domains',
|
||||
"interoperabilityRecord.domain".split("."))},
|
||||
"guidelineType": extract_map_nested(raw_data, 'guideline_type',
|
||||
"interoperabilityRecord.eoscGuidelineType".split(".")),
|
||||
"id": extract_nested(raw_data,
|
||||
"interoperabilityRecord.id".split(".")),
|
||||
"license": extract_nested(raw_data, "interoperabilityRecord.rights.rightIdentifier".split(".")),
|
||||
"licenseDetails": list(map(lambda c: {
|
||||
"identifier": extract_nested(c, ['rightIdentifier']),
|
||||
"title": extract_nested(c, ['rightTitle']),
|
||||
"uri": extract_nested(c, ['rightURI'])
|
||||
}, extract_nested(raw_data, "interoperabilityRecord.rights".split(".")))),
|
||||
"organization": extract_nested(organization, "provider.name".split(".")),
|
||||
"provider": extract_nested(organization, "provider.name".split(".")),
|
||||
"publicationYear": extract_nested(raw_data, "interoperabilityRecord.publicationYear".split(".")),
|
||||
"services": list(map(lambda s: {
|
||||
"name": extract_nested(organization, "service.name".split(".")),
|
||||
"organization": extract_nested(organization, "service.organization".split(".")),
|
||||
# s.organization on already mapped services
|
||||
}, service_records)),
|
||||
"status": extract_nested(raw_data, "interoperabilityRecord.status".split(".")),
|
||||
"title": extract_nested(raw_data, "interoperabilityRecord.title".split(".")),
|
||||
"type": 'interoperabilityRecord',
|
||||
# "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
|
||||
}
|
||||
|
||||
return delete_none(res)
|
||||
|
|
|
@ -2,8 +2,7 @@ from __future__ import annotations
|
|||
|
||||
import json
|
||||
import os
|
||||
from datetime import timedelta, datetime
|
||||
from typing import Any, List, Dict
|
||||
from datetime import timedelta
|
||||
|
||||
import opensearchpy
|
||||
import pendulum
|
||||
|
@ -12,9 +11,9 @@ from airflow.decorators import dag
|
|||
from airflow.decorators import task
|
||||
from airflow.hooks.base import BaseHook
|
||||
from airflow.utils.helpers import chain
|
||||
from opensearchpy import OpenSearch
|
||||
from opensearchpy import OpenSearch, helpers
|
||||
|
||||
from catalogue.mappers import map_interoperability, map_training, map_service
|
||||
from catalogue.RawCatalogOpensearch import RawCatalogOpensearch
|
||||
|
||||
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
|
||||
|
||||
|
@ -34,7 +33,6 @@ default_args = {
|
|||
default_args=default_args,
|
||||
params={
|
||||
"OPENSEARCH_CONN_ID": "opensearch_default",
|
||||
"BATCH_LOADERS_NUM": 10,
|
||||
"ENTITIES": ["datasources",
|
||||
"interoperability-records",
|
||||
"providers",
|
||||
|
@ -94,14 +92,18 @@ def import_catalogue_entities():
|
|||
pool_maxsize=20,
|
||||
timeout=180
|
||||
)
|
||||
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
|
||||
|
||||
session = requests.session()
|
||||
for entity in kwargs["params"]["ENTITIES"]:
|
||||
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
|
||||
indexname = catalog.get_index(entity)
|
||||
baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
|
||||
callurl = f"{baseurl}/{entity}"
|
||||
params = {"draft": "false", "active": "true", "suspended": "false"}
|
||||
|
||||
if client.indices.exists(indexname):
|
||||
client.indices.delete(indexname)
|
||||
|
||||
while True:
|
||||
reply = session.get(url=callurl, params=params)
|
||||
reply.raise_for_status()
|
||||
|
@ -112,20 +114,24 @@ def import_catalogue_entities():
|
|||
if len(results) <= 0:
|
||||
break
|
||||
|
||||
for result in results:
|
||||
# TODO: mapping code
|
||||
body = {"doc": result, "doc_as_upsert": True}
|
||||
client.update(
|
||||
index=indexname,
|
||||
body=body,
|
||||
id=result['id'],
|
||||
refresh=True
|
||||
)
|
||||
def streamed_results():
|
||||
for r in results:
|
||||
yield {"_index": indexname, "_id": r['id'], "_source": r}
|
||||
|
||||
succeeded = 0
|
||||
failed = 0
|
||||
for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5*60):
|
||||
if success:
|
||||
succeeded = succeeded + 1
|
||||
else:
|
||||
print("error: " + str(item))
|
||||
failed = failed + 1
|
||||
|
||||
# end of stream conditions
|
||||
if content['to'] >= content['total']:
|
||||
break
|
||||
params['from'] = content['to']
|
||||
client.indices.refresh(indexname)
|
||||
|
||||
@task
|
||||
def map_indexes(**kwargs):
|
||||
|
@ -140,22 +146,31 @@ def import_catalogue_entities():
|
|||
timeout=180
|
||||
)
|
||||
|
||||
session = requests.session()
|
||||
for entity in {"interoperability-records", "training-resources", "services"}.intersection(kwargs["params"]["ENTITIES"]):
|
||||
indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
|
||||
catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])
|
||||
|
||||
for hit in opensearchpy.helpers.scan(client, index=indexname, query={"query": {"match_all": {}}}):
|
||||
for entity in {"interoperability-records", "training-resources", "services"}.intersection(
|
||||
kwargs["params"]["ENTITIES"]):
|
||||
|
||||
for hit in opensearchpy.helpers.scan(client, index=catalog.get_index(entity),
|
||||
query={"query": {"match_all": {}}}):
|
||||
s = hit['_source']
|
||||
|
||||
doc = None
|
||||
match entity:
|
||||
case "interoperability-records":
|
||||
print(json.dumps(map_interoperability(s, client), indent=-1))
|
||||
doc = catalog.map_interoperability(s)
|
||||
case "training-resources":
|
||||
print(json.dumps(map_training(s, client), indent=-1))
|
||||
doc = catalog.map_training(s)
|
||||
case "services":
|
||||
print(json.dumps(map_service(s, client), indent=-1))
|
||||
case _:
|
||||
pass
|
||||
doc = catalog.map_service(s)
|
||||
|
||||
if doc is not None:
|
||||
client.update(
|
||||
index=f'{entity}_{kwargs["params"]["SUFFIX"]}',
|
||||
body={"doc": doc, "doc_as_upsert": True},
|
||||
id=doc['id'],
|
||||
refresh=True
|
||||
)
|
||||
|
||||
@task
|
||||
def close_indexes(**kwargs):
|
||||
|
@ -177,7 +192,6 @@ def import_catalogue_entities():
|
|||
"number_of_replicas": 1,
|
||||
"refresh_interval": "60s",
|
||||
}
|
||||
|
||||
})
|
||||
|
||||
# update aliases
|
||||
|
|
Loading…
Reference in New Issue