lot1-kickoff/airflow/dags/EOSC_entity_trasform.py

88 lines
2.7 KiB
Python
Raw Normal View History

2024-03-25 17:54:23 +01:00
def map_access_right(ar: str) -> str:
    """Translate a raw access-right code into its display label.

    Recognised codes are 'open', 'closed', 'embargo' and 'restricted'.
    Any other value (including None) yields an empty string.
    """
    if ar == 'open':
        return 'Open Access'
    if ar == 'closed':
        return 'Closed'
    if ar == 'embargo':
        return 'Embargo'
    if ar == 'restricted':
        return 'Restricted'
    # Unknown code: callers treat '' as "no access right".
    return ''
2024-03-26 22:25:02 +01:00
def trasform_graph_entity(p: dict) -> dict:
    """Copy the record's 'local_identifier' into '_id' and return the record.

    The dict is modified in place and the same object is returned.
    A record without 'local_identifier' raises KeyError.
    """
    identifier = p['local_identifier']
    p['_id'] = identifier
    return p
def trasform_catalog_entity(p: dict) -> dict:
    """Copy the record's 'id' into '_id' and return the record.

    The dict is modified in place and the same object is returned.
    A record without 'id' raises KeyError.
    """
    identifier = p['id']
    p['_id'] = identifier
    return p
2024-04-05 17:41:22 +02:00
def map_fos_topic_to_domain(fos: str):
    """Map a FOS topic value to a domain name based on its leading digits.

    The value is matched against the prefixes '01' to '06'. The name of the
    first matching prefix is returned, or None when no prefix matches.
    """
    # Ordered (prefix, domain name) pairs, checked from top to bottom.
    domains_by_prefix = (
        ('01', 'Natural Sciences'),
        ('02', 'Engineering & Technology'),
        ('03', 'Medical & Health Sciences'),
        ('04', 'Agricultural Sciences'),
        ('05', 'Social Sciences'),
        ('06', 'Humanities'),
    )
    for prefix, domain_name in domains_by_prefix:
        if fos.startswith(prefix):
            return domain_name
    return None
2024-04-05 17:41:22 +02:00
2024-04-09 11:04:06 +02:00
def trasform_interoperability(p: dict) -> dict:
    """Reshape an interoperability record and return it.

    The record first goes through ``trasform_catalog_entity``. Then:

    * an existing 'domain' value is wrapped as ``{"domain": value}``;
    * the original 'license' value is kept under 'licenseDetails';
    * 'license' is replaced by its 'title' entry, or '' when it has none.

    A record without 'license' raises KeyError.
    """
    entity = trasform_catalog_entity(p)

    if 'domain' in entity:
        entity['domain'] = {"domain": entity['domain']}

    # Keep the full license object before flattening 'license' to its title.
    license_details = entity['license']
    entity['licenseDetails'] = license_details
    if 'title' in license_details:
        entity['license'] = license_details['title']
    else:
        entity['license'] = ''

    return entity
2024-03-25 17:54:23 +01:00
def trasform_product(p: dict) -> dict:
    """Enrich a product record with derived fields and return it.

    The record first goes through ``trasform_graph_entity`` and is then
    modified in place with these keys:

    * 'accessRights' - distinct non-empty labels that ``map_access_right``
      produces from each manifestation's 'access_right'.
    * 'keyword' - distinct values of the topics whose scheme is 'keyword'.
    * 'domain' - ``{"domain": name}`` entries for the distinct domains that
      ``map_fos_topic_to_domain`` returns for topics whose scheme is 'FOS'.
    * 'firstPublishDate' - the smallest 'publishing' date value found across
      all manifestations, or None when there is none. Values are compared
      with ``<``; presumably they are ISO-formatted strings, for which that
      order is chronological - TODO confirm against the data.

    A missing or None 'manifestations' / 'topics' entry is treated as an
    empty list for every derived field. Derived lists keep first-seen order,
    so the output is the same on every run.
    """
    p = trasform_graph_entity(p)

    # Normalise once so that all derived fields tolerate absent/None lists,
    # not only 'firstPublishDate'.
    manifestations = p.get('manifestations') or []
    topics = [entry.get('topic') for entry in (p.get('topics') or [])]

    # dict.fromkeys de-duplicates like set() but preserves insertion order,
    # which makes the resulting lists deterministic.
    access_rights = (map_access_right(m.get('access_right')) for m in manifestations)
    p['accessRights'] = list(dict.fromkeys(ar for ar in access_rights if ar != ''))

    p['keyword'] = list(dict.fromkeys(
        topic.get('value') for topic in topics if topic.get('scheme') == 'keyword'))

    domains = (
        map_fos_topic_to_domain(topic.get('value'))
        for topic in topics
        if topic.get('scheme') == 'FOS'
    )
    p['domain'] = [
        {"domain": domain}
        for domain in dict.fromkeys(domains)
        if domain is not None
    ]

    publish_dates = (
        date.get('value')
        for m in manifestations
        for date in (m.get('dates') or [])
        if date.get('type') == 'publishing'
    )
    # Skip None values: they cannot be ordered against real date values.
    # min() replaces a full sort because only the smallest value is needed.
    p['firstPublishDate'] = min(
        (value for value in publish_dates if value is not None),
        default=None)

    return p
2024-04-05 17:41:22 +02:00
2024-03-25 17:54:23 +01:00
# Dispatch table: entity name -> transform function for records of that entity.
# Every value is one of the transform functions defined earlier in this module.
transform_entities = {
    # SKG-IF graph entities
    "datasource": trasform_graph_entity,
    "grants": trasform_graph_entity,
    "organizations": trasform_graph_entity,
    "persons": trasform_graph_entity,
    # Products get additional derived fields on top of the graph-entity '_id'.
    "products": trasform_product,
    "topics": trasform_graph_entity,
    "venues": trasform_graph_entity,
    # EOSC catalog entities
    # Interoperability records also get their domain/license fields reshaped.
    "interoperability": trasform_interoperability,
    "services": trasform_catalog_entity,
    "training": trasform_catalog_entity,
}