From 2eb2a94da5183d78ce52820ebf78471eed7a1ace Mon Sep 17 00:00:00 2001
From: Giambattista Bloisi
Date: Fri, 5 Apr 2024 17:41:22 +0200
Subject: [PATCH] initial stage

---
 airflow/dags/EOSC_entity_trasform.py | 56 ++++++++++++++++++++++++++++++++++++++-
 airflow/dags/EOSC_indexes.py         | 22 ++++++++++++++++++++
 2 files changed, 77 insertions(+), 1 deletion(-)

diff --git a/airflow/dags/EOSC_entity_trasform.py b/airflow/dags/EOSC_entity_trasform.py
index c910333..8ea86f3 100644
--- a/airflow/dags/EOSC_entity_trasform.py
+++ b/airflow/dags/EOSC_entity_trasform.py
@@ -23,19 +23,73 @@ def trasform_catalog_entity(p: dict) -> dict:
     p['_id'] = p['id']
     return p
 
+
+# Two-character FOS code prefix -> top-level domain label.
+_FOS_DOMAINS = {
+    '01': 'Natural Sciences',
+    '02': 'Engineering & Technology',
+    '03': 'Medical & Health Sciences',
+    '04': 'Agricultural Sciences',
+    '05': 'Social Sciences',
+    '06': 'Humanities',
+}
+
+
+def map_fos_topic_to_domain(fos: str):
+    """Map a FOS topic value to a ``{'domain': label}`` dict.
+
+    The label is looked up from the first two characters of ``fos``.
+    Returns None when ``fos`` is not a string or its prefix is unknown,
+    so that callers can drop unclassified topics.
+    """
+    if not isinstance(fos, str):
+        return None
+    label = _FOS_DOMAINS.get(fos[:2])
+    return None if label is None else {'domain': label}
+
+
 def trasform_product(p: dict) -> dict:
+    """Transform a product and attach its synthetic fields.
+
+    Sets 'accessRights', 'keyword', 'domain' and 'firstPublishDate' on the
+    entity returned by trasform_graph_entity and returns it.
+    """
     p = trasform_graph_entity(p)
     p['accessRights'] = list(
         filter(lambda ar: ar != '', map(lambda m: map_access_right(m.get('access_right')), p.get('manifestations'))))
+    topics = p.get('topics') or []
+    p['keyword'] = [
+        topic.get('value') for topic in topics
+        if (topic.get('topic') or {}).get('scheme') == 'keyword'
+    ]
+    # Unclassified FOS values map to None and are left out.
+    fos_domains = (
+        map_fos_topic_to_domain(topic.get('value')) for topic in topics
+        if (topic.get('topic') or {}).get('scheme') == 'FOS')
+    p['domain'] = [domain for domain in fos_domains if domain is not None]
+
+    # 'manifestations' is a list (it is iterated for accessRights above), so
+    # publishing dates are gathered from every manifestation and the smallest
+    # value is kept; ISO-8601 strings of one format sort chronologically.
+    # NOTE(review): assumes each date entry stores its date under 'value' -
+    # confirm against the manifestation schema.
+    p['firstPublishDate'] = min(
+        (date.get('value')
+         for m in (p.get('manifestations') or [])
+         for date in (m.get('dates') or [])
+         if date.get('type') == 'publishing' and date.get('value')),
+        default=None)
+
     return p
+
 
 transform_entities = {
     # SKG-IF graph entities
     "datasource": trasform_graph_entity,
     "grants": trasform_graph_entity,
     "organizations": trasform_graph_entity,
     "persons": trasform_graph_entity,
-    "products": trasform_graph_entity,
+    "products": trasform_product,
     "topics": trasform_graph_entity,
     "venues": trasform_graph_entity,
     # EOSC catalog entities
diff --git a/airflow/dags/EOSC_indexes.py b/airflow/dags/EOSC_indexes.py
index 9455c3a..365bfd5 100644
--- a/airflow/dags/EOSC_indexes.py
+++ b/airflow/dags/EOSC_indexes.py
@@ -325,6 +325,28 @@ mappings['grants'] = {
 
 mappings['products'] = {
     "properties": {
+        ### Synthetic fields
+        "accessRights": {
+            "type": "keyword"
+        },
+        "domain": {
+            "type": "object",
+            "properties": {
+                "domain": {
+                    "type": "keyword"
+                },
+                "subdomain": {
+                    "type": "keyword"
+                }
+            }
+        },
+        "firstPublishDate": {
+            "type": "date"
+        },
+        "keyword": {
+            "type": "keyword"
+        },
+        ###
         "abstracts": {
             "type": "object",
             "properties": {