Compare commits
310 Commits
Author | SHA1 | Date |
---|---|---|
Giambattista Bloisi | dac3849ced | |
Giambattista Bloisi | e072311240 | |
Giambattista Bloisi | 2f8c04a6a2 | |
Giambattista Bloisi | 72ddac35cb | |
Giambattista Bloisi | a2e7c4beb6 | |
Giambattista Bloisi | 118e29f462 | |
Giambattista Bloisi | b23ddd3002 | |
Giambattista Bloisi | 9581a86313 | |
Giambattista Bloisi | f22f89b54c | |
Giambattista Bloisi | 9435d23083 | |
Giambattista Bloisi | fa24f85997 | |
Giambattista Bloisi | 88076612b7 | |
Giambattista Bloisi | 10c27e578d | |
Giambattista Bloisi | 792d069234 | |
Giambattista Bloisi | 47665d151e | |
Giambattista Bloisi | afc31fd17c | |
Giambattista Bloisi | 4f41c48e0d | |
Giambattista Bloisi | 97fa9a986b | |
Giambattista Bloisi | 2ede28e998 | |
Giambattista Bloisi | c9365d18e1 | |
Giambattista Bloisi | fcc5344cc8 | |
Giambattista Bloisi | 4063a33550 | |
Giambattista Bloisi | 169505c75e | |
Giambattista Bloisi | 625eaaf1af | |
Giambattista Bloisi | 828ddb2f82 | |
Giambattista Bloisi | d15bc299c0 | |
Giambattista Bloisi | c6a12baeba | |
Giambattista Bloisi | 19bde3bcef | |
Giambattista Bloisi | 77e4ddbe79 | |
Giambattista Bloisi | 43743187ba | |
Giambattista Bloisi | 48d2f69bc6 | |
Giambattista Bloisi | 1ad367efcc | |
Giambattista Bloisi | 418ad5e430 | |
Giambattista Bloisi | bfb1ebd349 | |
Giambattista Bloisi | 36e995f66c | |
Giambattista Bloisi | ed8bb5bc25 | |
Giambattista Bloisi | 95cd1d7573 | |
Giambattista Bloisi | eccf5d396c | |
Giambattista Bloisi | 227ec44a21 | |
Giambattista Bloisi | 8018975863 | |
Giambattista Bloisi | c7729c44e1 | |
Giambattista Bloisi | c7703bb4df | |
Giambattista Bloisi | 52dd788d15 | |
Giambattista Bloisi | 0339a92de5 | |
Giambattista Bloisi | 833ea1538a | |
Giambattista Bloisi | a07bc0da2b | |
Giambattista Bloisi | 0aba5ef69f | |
Giambattista Bloisi | 2a54a3e325 | |
Giambattista Bloisi | 0fa9e585ac | |
Giambattista Bloisi | aa38362f26 | |
Giambattista Bloisi | 1e3d7595ea | |
Giambattista Bloisi | 6f405f0cbb | |
Giambattista Bloisi | 5600a23f06 | |
Giambattista Bloisi | e7f84f9df0 | |
Giambattista Bloisi | 1db5cb5cbd | |
Giambattista Bloisi | ece4184d8a | |
Giambattista Bloisi | 69b3688ba4 | |
Giambattista Bloisi | 387ddce398 | |
Giambattista Bloisi | c683be854a | |
Giambattista Bloisi | 7c892c5d62 | |
Giambattista Bloisi | b0f8161e80 | |
Giambattista Bloisi | a4c75d072b | |
Giambattista Bloisi | 71289af27f | |
Giambattista Bloisi | 0ed1c3f762 | |
Giambattista Bloisi | ae327daa61 | |
Giambattista Bloisi | c987cdea54 | |
Giambattista Bloisi | 19509f7f60 | |
Giambattista Bloisi | 52be021867 | |
Giambattista Bloisi | df9cab15c3 | |
Giambattista Bloisi | 9378856e9a | |
Giambattista Bloisi | 430b0ac41a | |
Giambattista Bloisi | 05592343e0 | |
Giambattista Bloisi | 8307ebd406 | |
Giambattista Bloisi | a50db121d1 | |
Giambattista Bloisi | 2563f70bfe | |
Giambattista Bloisi | 47bcc93c7d | |
Giambattista Bloisi | f807ce4911 | |
Giambattista Bloisi | eca293419d | |
Giambattista Bloisi | beef14d721 | |
Giambattista Bloisi | ee97fe9659 | |
Giambattista Bloisi | 398794a080 | |
Giambattista Bloisi | 64e83a0cdd | |
Giambattista Bloisi | f048d7df03 | |
Giambattista Bloisi | 354ae6ad41 | |
Giambattista Bloisi | fe9030cdba | |
Giambattista Bloisi | 3bc48791ce | |
Giambattista Bloisi | f155b5e8d1 | |
Giambattista Bloisi | 6d3af5e50d | |
Giambattista Bloisi | 565763faac | |
Giambattista Bloisi | 56b27c940d | |
Giambattista Bloisi | 577e0fcb4d | |
Giambattista Bloisi | 23e91ec335 | |
Giambattista Bloisi | 66d09d37aa | |
Giambattista Bloisi | 8e7613625e | |
Giambattista Bloisi | 2e72b11447 | |
Giambattista Bloisi | 26e8789d30 | |
Giambattista Bloisi | 132d3a45b1 | |
Giambattista Bloisi | e0e04ac22e | |
Giambattista Bloisi | fe50bf1475 | |
Giambattista Bloisi | d7e3e7a1b7 | |
Giambattista Bloisi | 5fceeb8b61 | |
Giambattista Bloisi | 5318979b01 | |
Giambattista Bloisi | bc42ccb8ba | |
Giambattista Bloisi | 6aab7198f7 | |
Giambattista Bloisi | e3d2c52092 | |
Giambattista Bloisi | d1c08458bb | |
Giambattista Bloisi | a4d8a48c87 | |
Giambattista Bloisi | 7e12b9e3dc | |
Giambattista Bloisi | 7a08db26cd | |
Giambattista Bloisi | c1833f6d75 | |
Giambattista Bloisi | b3b0472400 | |
Giambattista Bloisi | 4068e9d702 | |
Giambattista Bloisi | 4bb806d008 | |
Giambattista Bloisi | b3a9ad8342 | |
Giambattista Bloisi | 541581c8b2 | |
Giambattista Bloisi | 21f89da1ed | |
Giambattista Bloisi | 4c7faec554 | |
Giambattista Bloisi | 6754f7bbec | |
Giambattista Bloisi | 336026b6d8 | |
Giambattista Bloisi | f77274ce4f | |
Giambattista Bloisi | 151d305417 | |
Giambattista Bloisi | 94b4add8cd | |
Giambattista Bloisi | 1bc94cd835 | |
Giambattista Bloisi | d9e7528927 | |
Giambattista Bloisi | 09b603925d | |
Giambattista Bloisi | f89898e99b | |
Giambattista Bloisi | 26b0d7219d | |
Giambattista Bloisi | 5486d48817 | |
Giambattista Bloisi | 4f4c236b19 | |
Giambattista Bloisi | e293990c27 | |
Giambattista Bloisi | bf6a9e3d61 | |
Giambattista Bloisi | 735f08aee8 | |
Giambattista Bloisi | b2329a7b63 | |
Giambattista Bloisi | 28d2e96842 | |
Giambattista Bloisi | ba37ed66eb | |
Giambattista Bloisi | 51b695c1b7 | |
Giambattista Bloisi | b89d7f2646 | |
Giambattista Bloisi | 684230b314 | |
Giambattista Bloisi | c798eb0aff | |
Giambattista Bloisi | 8461dc62cc | |
Giambattista Bloisi | 3aab558117 | |
Giambattista Bloisi | 2fe306fdae | |
Giambattista Bloisi | 3b27f4ea1c | |
Giambattista Bloisi | 801516be67 | |
Giambattista Bloisi | 2eb2a94da5 | |
Giambattista Bloisi | 32e76e9f2d | |
Giambattista Bloisi | 5fd2558a3a | |
Giambattista Bloisi | 7c919f5278 | |
Giambattista Bloisi | 33cb4ce636 | |
Giambattista Bloisi | f6fbce36e1 | |
Giambattista Bloisi | 6aa4108b2d | |
Giambattista Bloisi | e684e4cae5 | |
Giambattista Bloisi | 6c76a3e0b8 | |
Giambattista Bloisi | ab5c8a4b7f | |
Giambattista Bloisi | 08ed592711 | |
Giambattista Bloisi | 43eb5cb43d | |
Giambattista Bloisi | 1a91dcf3d6 | |
Giambattista Bloisi | f04459666a | |
Giambattista Bloisi | fc5f884f4d | |
Giambattista Bloisi | 75221b489d | |
Giambattista Bloisi | 185ca78f71 | |
Giambattista Bloisi | 26c2e3eaad | |
Giambattista Bloisi | 7e41f71d32 | |
Giambattista Bloisi | 10c29f86c2 | |
Giambattista Bloisi | 4398546095 | |
Giambattista Bloisi | c9f23d2796 | |
Giambattista Bloisi | 8594587ee5 | |
Giambattista Bloisi | b86cf359f5 | |
Giambattista Bloisi | 00514edfbd | |
Giambattista Bloisi | f79eb140eb | |
Giambattista Bloisi | 4e1955b673 | |
Giambattista Bloisi | c07ddc03d9 | |
Giambattista Bloisi | 0c27895e13 | |
Giambattista Bloisi | 349db6f602 | |
Giambattista Bloisi | 072fb76a26 | |
Giambattista Bloisi | 172703df7c | |
Giambattista Bloisi | f1e619c7fb | |
Giambattista Bloisi | 6b2ef00c25 | |
Giambattista Bloisi | 921ce0bf48 | |
Giambattista Bloisi | 99ef9b3980 | |
Giambattista Bloisi | 8bea0251f1 | |
Giambattista Bloisi | d97972b85e | |
Giambattista Bloisi | 2f5430d9c8 | |
Giambattista Bloisi | 0738f8bebc | |
Giambattista Bloisi | 83b86b50ab | |
Giambattista Bloisi | d660233e8e | |
Giambattista Bloisi | 10fedb06f1 | |
Giambattista Bloisi | a7e485a8c6 | |
Giambattista Bloisi | 587c43872b | |
Giambattista Bloisi | 0ca0da3cc9 | |
Giambattista Bloisi | dead48e9b2 | |
Giambattista Bloisi | 620c6fadea | |
Giambattista Bloisi | b71bcfabf8 | |
Giambattista Bloisi | 65daefb971 | |
Giambattista Bloisi | 1152e14920 | |
Giambattista Bloisi | 65cba81f20 | |
Giambattista Bloisi | 5502f449a5 | |
Giambattista Bloisi | cbdb6f3640 | |
Giambattista Bloisi | 68a16e6c5a | |
Giambattista Bloisi | ef67d70961 | |
Giambattista Bloisi | bf939c0254 | |
Giambattista Bloisi | fa3214dc2c | |
Giambattista Bloisi | cb4f9c838a | |
Giambattista Bloisi | 47505e885f | |
Giambattista Bloisi | 78e2aaf404 | |
Giambattista Bloisi | f4fa06a634 | |
Giambattista Bloisi | ec8e00d7a4 | |
Giambattista Bloisi | b8aa473fff | |
Giambattista Bloisi | fd53c5af5b | |
Giambattista Bloisi | fd25f9bf59 | |
Giambattista Bloisi | 0c272f7ff2 | |
Giambattista Bloisi | a7a6f8e95f | |
Giambattista Bloisi | df6cd00621 | |
Giambattista Bloisi | c221f80d1b | |
Giambattista Bloisi | d9170a0d1a | |
Giambattista Bloisi | 3406662572 | |
Giambattista Bloisi | 5cc3b050ce | |
Giambattista Bloisi | c0bfa81d97 | |
Giambattista Bloisi | 8262871be8 | |
Giambattista Bloisi | ab172a39ff | |
Giambattista Bloisi | 4c7d80a0a8 | |
Giambattista Bloisi | f1cec0cfeb | |
Giambattista Bloisi | 636a4e38e9 | |
Giambattista Bloisi | 55f3a06e1d | |
Giambattista Bloisi | 599625c472 | |
Giambattista Bloisi | c87b207ef2 | |
Giambattista Bloisi | 95cc6095c3 | |
Giambattista Bloisi | f01ba4efb2 | |
Giambattista Bloisi | 679797cfe5 | |
Giambattista Bloisi | 602fedc6cb | |
Giambattista Bloisi | c513072be9 | |
Giambattista Bloisi | 5a5aaccbeb | |
Giambattista Bloisi | 7959c1bc08 | |
Giambattista Bloisi | 0d4ef9cb1f | |
Giambattista Bloisi | d2bbaaece3 | |
Giambattista Bloisi | dd6a192da2 | |
Giambattista Bloisi | d19198c2ba | |
Giambattista Bloisi | 815ce27e34 | |
Giambattista Bloisi | f5ef2d3754 | |
Giambattista Bloisi | e2a5f3e90e | |
Giambattista Bloisi | 2cb40d2276 | |
Giambattista Bloisi | 32992c79e8 | |
Giambattista Bloisi | 8ee696c145 | |
Giambattista Bloisi | b4f8ba1bd0 | |
Giambattista Bloisi | dd07466aae | |
Giambattista Bloisi | 5f07513b35 | |
Giambattista Bloisi | d1a944b8f5 | |
Giambattista Bloisi | f8f0141d50 | |
Giambattista Bloisi | 5a181be26a | |
Giambattista Bloisi | edc6976a47 | |
Giambattista Bloisi | 4c2062e3b9 | |
Giambattista Bloisi | ddbf71cca4 | |
Giambattista Bloisi | e81e28f5f9 | |
Giambattista Bloisi | 7cfae9f1bc | |
Giambattista Bloisi | 546cc75763 | |
Giambattista Bloisi | c8ffe36fbc | |
Giambattista Bloisi | 222b5e66c6 | |
Giambattista Bloisi | 07f8645a60 | |
Giambattista Bloisi | fcbc01fed4 | |
Giambattista Bloisi | b19e4f8ae8 | |
Giambattista Bloisi | 8840091813 | |
Giambattista Bloisi | 38bbf4f449 | |
Giambattista Bloisi | 30181573cf | |
Giambattista Bloisi | 908644d005 | |
Giambattista Bloisi | a7b1d25fdb | |
Giambattista Bloisi | 027996069c | |
Giambattista Bloisi | ba99672349 | |
Giambattista Bloisi | 0a62276c42 | |
Giambattista Bloisi | ec02290442 | |
Giambattista Bloisi | d505df8d36 | |
Giambattista Bloisi | 031b11a3db | |
Giambattista Bloisi | c259c529bc | |
Giambattista Bloisi | deb6567a73 | |
Giambattista Bloisi | 6e8f2c3664 | |
Giambattista Bloisi | d281fb043a | |
Giambattista Bloisi | 3342e20571 | |
Giambattista Bloisi | a7c82b0d61 | |
Giambattista Bloisi | 5a30741e29 | |
Giambattista Bloisi | 4128d1c863 | |
Giambattista Bloisi | 7edb0c5a7e | |
Giambattista Bloisi | 1ad289e948 | |
Giambattista Bloisi | 9682e09eb4 | |
Giambattista Bloisi | 31b05ff2fb | |
Giambattista Bloisi | d4f33496aa | |
Giambattista Bloisi | 7d2da06118 | |
Giambattista Bloisi | 7fcc6a9bd0 | |
Giambattista Bloisi | 550da2c190 | |
Giambattista Bloisi | e99002329e | |
Giambattista Bloisi | 3e6c175901 | |
Giambattista Bloisi | bc50df0413 | |
Giambattista Bloisi | 2c81ded53c | |
Giambattista Bloisi | d6bfc955a3 | |
Giambattista Bloisi | 379920e21b | |
Giambattista Bloisi | 5d073deaa7 | |
Giambattista Bloisi | 2937d77cba | |
Giambattista Bloisi | 91739b26b8 | |
Giambattista Bloisi | 52179da636 | |
Giambattista Bloisi | f0169ca158 | |
Giambattista Bloisi | c3761a161e | |
Giambattista Bloisi | c80a5e6eb8 | |
Giambattista Bloisi | 76981a01ba | |
Giambattista Bloisi | e343e95a9b | |
Giambattista Bloisi | 080d30cc33 | |
Giambattista Bloisi | 991930f934 | |
Giambattista Bloisi | 4a6f8568eb | |
Giambattista Bloisi | 7b0bc4e5b4 | |
Giambattista Bloisi | 6998573b79 | |
Giambattista Bloisi | 4e6f4ee2fb | |
Giambattista Bloisi | de9796a376 | |
Giambattista Bloisi | cf1e7914ca |
@ -0,0 +1,140 @@
from typing import Dict, Any, List


def map_access_right(ar: str) -> str:
    match ar:
        case 'open':
            return 'Open Access'
        case 'closed':
            return 'Closed'
        case 'embargo':
            return 'Embargo'
        case 'restricted':
            return 'Restricted'
        case _:
            return ''


def trasform_graph_entity(p: dict) -> dict:
    p['_id'] = p['local_identifier']
    return p


def trasform_catalog_entity(p: dict) -> dict:
    p['_id'] = p['id']
    return p


def map_fos_topic_to_domain(fos: str):
    if fos.startswith('01'):
        return 'Natural Sciences'
    elif fos.startswith('02'):
        return 'Engineering & Technology'
    elif fos.startswith('03'):
        return 'Medical & Health Sciences'
    elif fos.startswith('04'):
        return 'Agricultural Sciences'
    elif fos.startswith('05'):
        return 'Social Sciences'
    elif fos.startswith('06'):
        return 'Humanities'

    return None


def trasform_interoperability(p: dict) -> dict:
    p = trasform_catalog_entity(p)
    if 'domain' in p:
        p['domain'] = {"domain": p['domain']}
    p['licenseDetails'] = p['license']
    p['license'] = p['license']['identifier'] if 'identifier' in p['license'] else ''
    return p


def trasform_product(p: dict) -> dict:
    p = trasform_graph_entity(p)
    p['accessRights'] = list(set(
        filter(lambda ar: ar != '', map(lambda m: map_access_right(m.get('access_right')), p.get('manifestations')))))
    p['keyword'] = list(set(
        map(lambda topic: topic.get('topic').get('value'),
            filter(lambda topic: topic.get('topic').get('scheme') == 'keyword', p.get('topics')))))
    p['domain'] = list(
        map(lambda fos: {"domain": fos},
            set(filter(lambda fos: fos is not None,
                       map(lambda topic: map_fos_topic_to_domain(topic.get('topic').get('value')),
                           filter(lambda topic: topic.get('topic').get('scheme') == 'FOS', p.get('topics')))))))
    p['firstPublishDate'] = next(
        iter(
            sorted(
                map(lambda date: date.get('value'),
                    filter(lambda date: date.get('type') == 'publishing',
                           [date for m in (p.get('manifestations') or []) for date in (m.get('dates') or [])])))),
        None)
    return p


transform_entities = {
    # SKG-IF graph entities
    "datasource": trasform_graph_entity,
    "grants": trasform_graph_entity,
    "organizations": trasform_graph_entity,
    "persons": trasform_graph_entity,
    "products": trasform_product,
    "topics": trasform_graph_entity,
    "venues": trasform_graph_entity,
    # EOSC catalog entities
    "interoperability": trasform_interoperability,
    "services": trasform_catalog_entity,
    "training": trasform_catalog_entity,
}


def isEmpty(current_value: Dict[str, Any], labels: List[str]) -> bool:
    if len(labels) <= 0:
        return True
    for label in labels:
        if isinstance(current_value, list) and len(current_value) > 0:
            current_value = current_value[0]

        if isinstance(current_value, dict) and label in current_value:
            current_value = current_value[label]
        else:
            return True
    if current_value is None:
        return True
    if isinstance(current_value, list):
        if len(current_value) > 0:
            return current_value[0] == ""
        else:
            return True

    return str(current_value) == ""


#
# Filter products that do not meet inclusion policy
#
def filter_product(p: dict) -> bool:
    if isEmpty(p, ["titles", "none"]):
        return True

    if isEmpty(p, ["firstPublishDate"]):
        return True

    if p['product_type'] == "literature":
        if isEmpty(p, ["abstracts", "none"]):
            return True
        if isEmpty(p, ["contributions", "person", "local_identifier"]):
            return True
    elif p['product_type'] in ["research data", "other"]:
        if isEmpty(p, ["contributions", "person", "local_identifier"]):
            return True

    return False


filter_entities = {
    "products": filter_product
}

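A minimal sketch of how the `transform_entities` and `filter_entities` tables above could be driven over a batch of records. The `prepare_for_indexing` helper and the sample product record are hypothetical (their field names only mirror the keys accessed by `trasform_product` and `filter_product`); nothing here is part of the file itself.

    # Hypothetical driver: apply the per-entity transform, drop records the filter rejects.
    def prepare_for_indexing(entity_type: str, records):
        transform = transform_entities.get(entity_type, lambda r: r)
        exclude = filter_entities.get(entity_type, lambda r: False)
        for record in records:
            record = transform(record)
            if not exclude(record):
                yield record

    # Made-up product record, shaped after the fields the functions above read.
    sample = {
        "local_identifier": "prod-1",
        "product_type": "literature",
        "titles": {"none": ["A title"]},
        "abstracts": {"none": ["An abstract"]},
        "contributions": [{"person": {"local_identifier": "pers-1"}}],
        "manifestations": [{"access_right": "open",
                            "dates": [{"type": "publishing", "value": "2020-01-01"}]}],
        "topics": [{"topic": {"scheme": "FOS", "value": "0101 mathematics"}}],
    }
    indexed = list(prepare_for_indexing("products", [sample]))
    # sample now carries _id, accessRights, keyword, domain and firstPublishDate.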
File diff suppressed because it is too large
@ -0,0 +1,53 @@
from __future__ import annotations

from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from opensearchpy import OpenSearch

import init_ams_topics
import init_opensearch_templates


@dag(
    dag_id="mkg_prepare_environment",
    #dag_display_name="Prepare MKG Environment",
    schedule=None,
    dagrun_timeout=None,
    start_date=None,
    catchup=False,
    params={
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "ARGO_CONN_ID": "ams_default",
        "RESET_AMS": False
    },
    tags=["MKG", "opensearch", "argo"]
)
def prepare_environment():
    @task
    def prepare_opensearch(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20,
            timeout=180)
        init_opensearch_templates.init_opensearch(client)

    @task
    def prepare_ams(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["ARGO_CONN_ID"])
        extra = conn.extra_dejson
        init_ams_topics.init_ams(conn.host, extra['project'], extra['token'], kwargs["params"]["RESET_AMS"])

    chain(
        prepare_opensearch.override(task_id="prepare_opensearch")(),
        # prepare_ams.override(task_id="prepare_ams")(),
    )


prepare_environment()

@ -0,0 +1,112 @@
import os
from datetime import timedelta
import time

import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    dag_id="open_data_portal_harvest",
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    dagrun_timeout=None,
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "OS_INDEX_NAME": "euodp_raw"
    },
    tags=["aggregation"]
)
def harvest():
    @task
    def everything(**context):
        index_name = context["params"]["OS_INDEX_NAME"]
        conn = BaseHook.get_connection(context["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20
        )

        if not client.indices.exists(index_name):
            client.indices.create(index_name, {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 0,
                        "codec": "zstd_no_dict",
                        "replication.type": "SEGMENT"
                    },
                },
                "mappings": {
                    "dynamic": False
                }
            })

        def store_results(hits):
            def _generate_data():
                for r in hits:
                    r['_index'] = index_name
                    r['_id'] = r['id']
                    yield r

            succeeded = 0
            failed = 0
            for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
                                                       raise_on_exception=False,
                                                       raise_on_error=False,
                                                       chunk_size=5000,
                                                       max_chunk_bytes=50 * 1024 * 1024,
                                                       timeout=180):
                if success:
                    succeeded = succeeded + 1
                else:
                    print(item["index"]["error"])
                    failed = failed + 1

        headers = {'Accept': 'application/json'}
        r = requests.get('https://data.europa.eu/api/hub/search/search?filter=dataset&aggregation=false&limit=300&showScore=true&scroll=true', headers=headers).json()
        scroll_id = r['result']['scrollId']
        results = r['result']['results']
        store_results(results)
        max_retries = 10
        while scroll_id:
            try:
                r = requests.get('https://data.europa.eu/api/hub/search/scroll?scrollId=' + scroll_id, headers=headers)
                r.raise_for_status()
            except Exception as e:
                print(f"Error:" + str(e))
                time.sleep(0.1)
                max_retries = max_retries - 1
                if max_retries == 0:
                    raise Exception("Cannot fetch data")
                continue
            max_retries = 10
            r = r.json()
            scroll_id = r['result']['scrollId']
            results = r['result']['results']
            if len(results) <= 0:
                return
            store_results(results)

    everything()


harvest()

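For reference, a standalone sketch of the data.europa.eu scroll contract the harvest task above relies on: the first search call returns a `scrollId`, follow-up scroll calls return further pages until the result list is empty. The page size and the three-page cap are arbitrary choices for illustration, and this assumes the endpoints behave as the task above expects.

    import requests

    BASE = 'https://data.europa.eu/api/hub/search'
    headers = {'Accept': 'application/json'}
    page = requests.get(f'{BASE}/search?filter=dataset&aggregation=false&limit=10&scroll=true',
                        headers=headers).json()
    scroll_id = page['result']['scrollId']
    datasets = page['result']['results']
    for _ in range(3):  # fetch a few more pages only, as a demonstration
        page = requests.get(f'{BASE}/scroll?scrollId=' + scroll_id, headers=headers).json()
        if not page['result']['results']:
            break
        datasets.extend(page['result']['results'])
        scroll_id = page['result']['scrollId']
    print(f"fetched {len(datasets)} dataset records")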
@ -0,0 +1,42 @@
import os
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "prefix": "Key prefix of files to delete",
        "bucket": "bucket containing files to delete",
    },
    tags=["s3"],
)
def s3_delete():
    @task
    def delete(**context):
        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
        keys = hook.list_keys(bucket_name=context["params"]["bucket"], prefix=context["params"]["prefix"])
        hook.delete_objects(bucket=context["params"]["bucket"], keys=keys)
        for key in keys:
            print(f"{key} deleted!")

    delete()


s3_delete()

@ -0,0 +1,98 @@
import os
import tarfile
import time
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from botocore.exceptions import ClientError

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


def check_for_key_with_backoff(hook: S3Hook, key: str, bucket: str) -> bool:
    delay = 10  # initial delay
    delay_incr = 10  # additional delay in each loop
    max_delay = 60  # max delay of one loop. Total delay is (max_delay**2)/2

    while delay < max_delay:
        try:
            return hook.check_for_key(key=key, bucket_name=bucket)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in ['NoSuchBucket']:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise err
            time.sleep(delay)
            delay += delay_incr


def load_file_obj_with_backoff(hook: S3Hook, fileobj, key: str, bucket: str, replace: bool) -> bool:
    delay = 10  # initial delay
    delay_incr = 10  # additional delay in each loop
    max_delay = 60  # max delay of one loop. Total delay is (max_delay**2)/2

    while delay < max_delay:
        try:
            return hook.load_file_obj(fileobj,
                                      key,
                                      bucket,
                                      replace=replace)
        except ClientError as err:
            code = err.response.get('Error', {}).get('Code', '')
            if code in ['NoSuchBucket']:
                print(f"Error: {code}. Check s3path: s3://{bucket}/{key}")
                raise err
            time.sleep(delay)
            delay += delay_incr


@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "src_key": "File to untar",
        "src_bucket": "bucket containing the zip file",
        "dst_key_prefix": "",
        "dst_bucket": "bucket that will contain unzipped files"
    },
    tags=["s3"],
)
def s3_untar():
    @task
    def untar(**context):
        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
        s3_obj = hook.get_key(context["params"]["src_key"], bucket_name=context["params"]["src_bucket"])
        with tarfile.open(fileobj=s3_obj.get()["Body"], mode='r|*') as tar:
            for member in tar:
                dst_key = context["params"]["dst_key_prefix"] + "/" + member.name
                dst_key = os.path.normpath(dst_key)
                # Ignore directories, links, devices, fifos, etc.
                if (not member.isfile()) or member.name.endswith('/'):
                    print(f"Skipping {member.name}: is not a file")
                    continue
                if check_for_key_with_backoff(hook, key=dst_key, bucket=context["params"]["dst_bucket"]):
                    print(f"Skipping {member.name}: already exists")
                    continue
                print(f"Extracting {member.name} to {dst_key}")
                fileobj = tar.extractfile(member)
                fileobj.seekable = lambda: False
                load_file_obj_with_backoff(hook, fileobj,
                                           dst_key,
                                           context["params"]["dst_bucket"],
                                           replace=True)

    untar()


s3_untar()

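A minimal local sketch of the streaming `mode='r|*'` read that the untar task above depends on (and the reason the extracted file object is marked non-seekable before being handed to S3): members are only valid while the stream is positioned on them, so each one is read immediately. The archive path `example.tar.gz` is hypothetical.

    import tarfile

    # Stream a local archive the same way the task streams the S3 object body.
    with open('example.tar.gz', 'rb') as stream:
        with tarfile.open(fileobj=stream, mode='r|*') as tar:
            for member in tar:
                if not member.isfile():
                    continue
                data = tar.extractfile(member)  # valid only while this member is current
                print(member.name, len(data.read()))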
@ -0,0 +1,55 @@
import os
import zipfile
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.file import TemporaryDirectory

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


def s3_dowload_unzip_upload(s3conn: str, src_key: str, src_bucket: str, dest_bucket: str):
    hook = S3Hook(s3conn, transfer_config_args={'use_threads': False})

    with TemporaryDirectory() as dwl_dir:
        with TemporaryDirectory() as tmp_dir:
            archive = f'{dwl_dir}/{src_key}'
            hook.download_file(key=src_key, bucket_name=src_bucket, local_path=dwl_dir, preserve_file_name=True,
                               use_autogenerated_subdir=False)
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                for info in zip_ref.infolist():
                    with zip_ref.open(info.filename) as file:
                        hook.load_file_obj(file, info.filename, dest_bucket, replace=True)


@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "zipfile": "File to unzip",
        "src_bucket": "bucket containing the zip file",
        "dst_bucket": "bucket that will contain unzipped files"
    },
    tags=["s3"],
)
def s3_unzip():
    @task
    def unzip(**context):
        s3_dowload_unzip_upload(S3_CONN_ID,
                                context["params"]["zipfile"],
                                context["params"]["src_bucket"],
                                context["params"]["dst_bucket"])

    unzip()


s3_unzip()

@ -0,0 +1,68 @@
import os
from datetime import timedelta, datetime

import pendulum
from airflow import DAG
from airflow.hooks.base import BaseHook
from airflow.models.baseoperator import chain
from airflow.providers.cncf.kubernetes.operators.pod import KubernetesPodOperator
from airflow.providers.cncf.kubernetes.secret import Secret

default_args = {
    "execution_timeout": timedelta(days=6),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}

conn = BaseHook.get_connection("opensearch_default")

dag = DAG(
    'antispam_batch_check',
    default_args=default_args,
    schedule=None,
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    schedule_interval=timedelta(days=1)
)


secrets = [
    Secret(
        deploy_type='env',
        deploy_target='CURATION_OPENSEARCH__USER',
        secret='opensearch-conn-secrets',
        key='username',
    ),
    Secret(
        deploy_type='env',
        deploy_target='CURATION_OPENSEARCH__PASSWORD',
        secret='opensearch-conn-secrets',
        key='password',
    ),
]

# Define the KubernetesPodOperator
task = KubernetesPodOperator(
    task_id='antispam_checker',
    name='antispam_checker',
    namespace='kg-airflow',
    image='gbloisi/curation:1.0.0',
    image_pull_policy="Always",
    cmds=['python3'],
    arguments=['/antispam-batch.py',
               "--opensearch.host", conn.host,
               "--opensearch.port", str(conn.port),
               "--openai.host", "local-ai.kg-airflow.svc.cluster.local",
               "--openai.port", "8000",
               "--parallelism", "36"
               ],
    secrets=secrets,
    is_delete_operator_pod=True,
    in_cluster=True,
    get_logs=True,
    dag=dag
)

# Set the task dependencies
chain(task)

@ -0,0 +1,314 @@
from datetime import datetime

from opensearchpy import OpenSearch

from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY


class RawCatalogOpensearch:
    entities = ["datasources",
                "interoperability-records",
                "providers",
                "resource-interoperability-records",
                "services",
                "training-resources"]
    mapped_entities = ["interoperability-records", "training-resources", "services"]

    def __init__(self, os_client: OpenSearch, suffix: str | None):
        self.os_client = os_client
        self.suffix = suffix

    def get_index(self, name: str):
        return f"catalog_{name}_{self.suffix}"

    def get_alias(self, name: str):
        return f"catalog_{name}"

    def get_mapped_index(self, name: str):
        match name:
            case "interoperability-records":
                return f"interoperability_{self.suffix}"
            case "training-resources":
                return f"training_{self.suffix}"
            case "services":
                return f"services_{self.suffix}"
        return None

    def get_mapped_alias(self, name: str):
        match name:
            case "interoperability-records":
                return f"interoperability"
            case "training-resources":
                return f"training"
            case "services":
                return f"services"
        return None

    def get_resource_interoperability_records(self, resource_id):
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'resourceInteroperabilityRecord.resourceId.keyword': resource_id,
                    }
                },
                "fields": [
                    "resourceInteroperabilityRecord.interoperabilityRecordIds"
                ],
                "_source": False
            },
            index=self.get_index('resource-interoperability-records')
        )

        interoperability_ids = []
        interoperability_records = []
        for hit in response['hits']['hits']:
            interoperability_ids.extend(
                extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.interoperabilityRecordIds']) or [])

        if len(interoperability_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": interoperability_ids,
                        }
                    },
                },
                index=self.get_index('interoperability-records')
            )
            for hit in response['hits']['hits']:
                interoperability_records.append(extract_nested(hit, ['_source']))

        return interoperability_records

    def get_providers(self, provider_ids: list[str]) -> list:
        provider_records = []
        if provider_ids is not None and len(provider_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": provider_ids if isinstance(provider_ids, list) else [provider_ids],
                        }
                    },
                },
                index=self.get_index('providers')
            )
            for hit in response['hits']['hits']:
                provider_records.append(extract_nested(hit, ['_source']))
        return provider_records

    def get_provider(self, provider_id: str):
        if provider_id is not None:
            providers = self.get_providers([provider_id])
            if providers is not None and len(providers) > 0:
                return providers[0]
        return {}

    def get_services(self, service_ids: list[str]) -> list:
        service_records = []

        if service_ids is not None and len(service_ids) > 0:
            response = self.os_client.search(
                body={
                    "query": {
                        "ids": {
                            "values": service_ids if isinstance(service_ids, list) else [service_ids],
                        }
                    },
                },
                index=self.get_index('services')
            )
            for hit in response['hits']['hits']:
                service_records.append(extract_nested(hit, ['_source']))

        return service_records

    def get_datasource_of_service(self, service_id: str):
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'datasource.serviceId.keyword': service_id,
                    }
                }
            },
            index=self.get_index('datasources')
        )

        for hit in response['hits']['hits']:
            return extract_nested(hit, ['_source'])
        return {}

    def get_services_of_interoperability(self, interoperability_id: str):
        svc_ids = []
        response = self.os_client.search(
            body={
                'query': {
                    'term': {
                        'resourceInteroperabilityRecord.interoperabilityRecordIds.keyword': interoperability_id,
                    }
                },
                "fields": [
                    "resourceInteroperabilityRecord.resourceId"
                ],
                "_source": False
            },
            index=self.get_index('resource-interoperability-records')
        )

        for hit in response['hits']['hits']:
            svc_ids.extend(extract_nested(hit, ['fields', 'resourceInteroperabilityRecord.resourceId']) or [])

        return svc_ids

    def map_service(self, raw_svc: dict) -> dict:
        interoperability_records = self.get_resource_interoperability_records(raw_svc['id'])
        organization = self.get_provider(extract_nested(raw_svc, ['service', 'resourceOrganisation']))
        provider_records = self.get_providers(list(
            filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'resourceProviders']) or [])))
        related_resources_records = self.get_services(list(
            filter(lambda i: len(i) > 0, extract_nested(raw_svc, ['service', 'relatedResources']) or [])))
        datasource = self.get_datasource_of_service(raw_svc['id'])

        res = {
            "accessRestriction": extract_nested(raw_svc,
                                                "service.geographicalAvailabilities".split(".")),
            "accessTypes": extract_map_nested(raw_svc, 'access_type', "service.accessTypes".split(".")),
            "access_modes": extract_map_nested(raw_svc, 'access_mode', "service.accessModes".split(".")),
            "category": list(map(lambda c: {"category": CATALOG_VOCABULARY['categories'][c['category']],
                                            "subcategory": CATALOG_VOCABULARY['subcategories'][c['subcategory']]},
                                 extract_nested(raw_svc, "service.categories".split(".")))),
            "description": extract_nested(raw_svc, "service.description".split(".")),
            "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                          "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                               extract_nested(raw_svc, "service.scientificDomains".split(".")))),
            "grantProjectNames": extract_nested(raw_svc, "service.grantProjectNames".split(".")),
            "helpdeskPage": extract_nested(raw_svc, "service.helpdeskPage".split(".")),
            "horizontal": extract_nested(raw_svc, "service.horizontalService".split(".")) or False,
            "id": extract_nested(raw_svc, "service.id".split(".")),
            "interoperabilityGuidelines": list(
                map(lambda ig: ig['interoperabilityRecord']['title'], interoperability_records)),
            "language": extract_map_nested(raw_svc, 'languages', "service.languageAvailabilities".split(".")),
            "name": extract_nested(raw_svc, "service.name".split(".")),
            "orderType": extract_map_nested(raw_svc, 'order_type', "service.orderType".split(".")),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "pricing": extract_nested(raw_svc, "service.pricing".split(".")),
            "privacyPolicy": extract_nested(raw_svc, "service.privacyPolicy".split(".")),
            "providers": list(map(lambda p: p['provider']['name'], provider_records)),
            "relatedPlatforms": extract_map_nested(raw_svc, 'related_platform', "service.relatedPlatforms".split(".")),
            "relatedResources": list(map(lambda p: p['service']['name'], related_resources_records)),
            "tags": extract_nested(raw_svc, "service.tags".split(".")),
            "targetUsers": extract_map_nested(raw_svc, 'target_user', "service.targetUsers".split(".")),
            "termsOfUse": extract_nested(raw_svc, "service.termsOfUse".split(".")),
            "thematic": extract_nested(datasource, "datasource.thematic".split(".")) or False,
            "trl": extract_map_nested(raw_svc, 'trl', "service.trl".split(".")),
            "type": 'datasource' if extract_nested(datasource, "datasource.id".split(".")) is not None else 'service',
            "useCases": extract_nested(raw_svc, "service.useCases".split(".")),
            "userManual": extract_nested(raw_svc, "service.userManual".split(".")),
            "webpage": extract_nested(raw_svc, "service.webpage".split(".")),
            "year": datetime.fromtimestamp(
                int(extract_nested(raw_svc, "metadata.registeredAt".split("."))) / 1000).year,
        }

        return delete_none(res)

    def map_training(self, raw_trn: dict) -> dict:
        organization = self.get_provider(extract_nested(raw_trn, ['trainingResource', 'resourceOrganisation']))

        res = {
            "accessRights": extract_map_nested(raw_trn, 'tr_access', "trainingResource.accessRights".split(".")),
            "alternativeIdentifiers": extract_nested(raw_trn,
                                                     "trainingResource.alternativeIdentifiers".split(".")),
            "authors": extract_nested(raw_trn,
                                      "trainingResource.authors".split(".")),
            "contentResourceType": extract_map_nested(raw_trn, 'tr_content',
                                                      "trainingResource.contentResourceTypes".split(".")),
            "description": extract_nested(raw_trn,
                                          "trainingResource.description".split(".")),
            "domain": list(map(lambda c: {"domain": CATALOG_VOCABULARY['domains'][c['scientificDomain']],
                                          "subdomain": CATALOG_VOCABULARY['subdomains'][c['scientificSubdomain']]},
                               extract_nested(raw_trn, "trainingResource.scientificDomains".split(".")))),
            "duration": extract_nested(raw_trn,
                                       "trainingResource.duration".split(".")),
            "expertiseLevel": extract_map_nested(raw_trn, 'expertise_level',
                                                 "trainingResource.expertiseLevel".split(".")),
            "id": extract_nested(raw_trn,
                                 "trainingResource.id".split(".")),
            "keyword": extract_nested(raw_trn,
                                      "trainingResource.keywords".split(".")),
            "language": extract_map_nested(raw_trn, 'languages', "trainingResource.languages".split(".")),
            "learningOutcomes": extract_nested(raw_trn,
                                               "trainingResource.learningOutcomes".split(".")),
            "learningResourceType": extract_map_nested(raw_trn, 'tr_dcmi',
                                                       "trainingResource.learningResourceTypes".split(".")),
            "license": extract_nested(raw_trn,
                                      "trainingResource.license".split(".")),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "qualifications": extract_map_nested(raw_trn, 'qualification',
                                                 "trainingResource.qualifications".split(".")),
            "targetGroup": extract_map_nested(raw_trn, 'target_user', "trainingResource.targetGroups".split(".")),
            "title": extract_nested(raw_trn,
                                    "trainingResource.title".split(".")),
            "type": 'trainingResource',
            "url": extract_nested(raw_trn,
                                  "trainingResource.url".split(".")),
            "year": datetime.fromtimestamp(
                int(extract_nested(raw_trn, "metadata.registeredAt".split("."))) / 1000).year,
        }

        return delete_none(res)

    def map_interoperability(self, raw_itr: dict) -> dict:
        organization = self.get_provider(extract_nested(raw_itr, ['interoperabilityRecord', 'providerId']))
        service_records = self.get_services(self.get_services_of_interoperability(raw_itr['id']))

        res = {
            "alternativeIdentifiers": extract_nested(raw_itr,
                                                     "interoperabilityRecord.alternativeIdentifiers".split(".")),
            "creators": list(map(lambda c: {
                "affiliation": extract_nested(c, ['creatorAffiliationInfo', 'affiliation']),
                "givenName": extract_nested(c, ['givenName']),
                "familyName": extract_nested(c, ['familyName']),
                "fullName": extract_nested(c, ['creatorNameTypeInfo', 'creatorName']),
                "type": extract_nested(c, ['creatorNameTypeInfo', 'nameType'])
            }, extract_nested(raw_itr, "interoperabilityRecord.creators".split(".")))),
            "description": extract_nested(raw_itr,
                                          "interoperabilityRecord.description".split(".")),
            "doi": extract_nested(raw_itr, ['identifierInfo', 'identifier']) if
            extract_nested(raw_itr, ['identifierInfo', 'identifierType']) == 'ir_identifier_type-doi' else None,
            "domain": {'domain': extract_map_nested(raw_itr, 'domains',
                                                    "interoperabilityRecord.domain".split("."))},
            "guidelineType": extract_map_nested(raw_itr, 'guideline_type',
                                                "interoperabilityRecord.eoscGuidelineType".split(".")),
            "id": extract_nested(raw_itr,
                                 "interoperabilityRecord.id".split(".")),
            "license": extract_nested(raw_itr, "interoperabilityRecord.rights.rightIdentifier".split(".")),
            "licenseDetails": list(map(lambda c: {
                "identifier": extract_nested(c, ['rightIdentifier']),
                "title": extract_nested(c, ['rightTitle']),
                "uri": extract_nested(c, ['rightURI'])
            }, extract_nested(raw_itr, "interoperabilityRecord.rights".split(".")))),
            "organization": extract_nested(organization, "provider.name".split(".")),
            "provider": extract_nested(organization, "provider.name".split(".")),
            "publicationYear": extract_nested(raw_itr, "interoperabilityRecord.publicationYear".split(".")),
            "services": list(map(lambda s: {
                "name": extract_nested(organization, "service.name".split(".")),
                "organization": extract_nested(organization, "service.organization".split(".")),
                # s.organization on already mapped services
            }, service_records)),
            "status": extract_nested(raw_itr, "interoperabilityRecord.status".split(".")),
            "title": extract_nested(raw_itr, "interoperabilityRecord.title".split(".")),
            "type": 'interoperabilityRecord',
            # "year": datetime.fromtimestamp(int(extract_nested(raw_data, "metadata.registeredAt".split("."))) / 1000).year,
        }

        return delete_none(res)

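A minimal sketch of wiring `RawCatalogOpensearch` to a client and mapping raw service documents. The host, credentials, index suffix and query size are placeholders, and it assumes the raw `catalog_services_<suffix>` index has already been populated.

    from opensearchpy import OpenSearch

    # Hypothetical connection details; replace with the real cluster settings.
    client = OpenSearch(hosts=[{'host': 'localhost', 'port': 9200}],
                        http_auth=('admin', 'admin'),
                        use_ssl=True, verify_certs=False, ssl_show_warn=False)
    raw_catalog = RawCatalogOpensearch(client, suffix='2024_01')

    # Map a handful of raw service documents into the shape produced by map_service.
    resp = client.search(index=raw_catalog.get_index('services'),
                         body={'query': {'match_all': {}}, 'size': 10})
    mapped = [raw_catalog.map_service(hit['_source']) for hit in resp['hits']['hits']]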
@ -0,0 +1,41 @@
from typing import Dict, Any, List

from catalogue.vocabulary import CATALOG_VOCABULARY


def extract_nested(current_value: Dict[str, Any], labels: List[str]) -> Any | None:
    if len(labels) <= 0:
        return current_value
    for label in labels:
        if isinstance(current_value, dict) and label in current_value:
            current_value = current_value[label]
        else:
            return None

    return current_value


def extract_map_nested(current_value: Dict[str, Any], dictionary: str, labels: List[str]) -> Any | None:
    value = extract_nested(current_value, labels)
    if value is None:
        return None
    if isinstance(value, list):
        return list(map(lambda d: CATALOG_VOCABULARY[dictionary][d] if d else None, value))
    if isinstance(value, str) and value != '':
        return CATALOG_VOCABULARY[dictionary][value]
    return None


def delete_none(_dict):
    """Delete None values recursively from all of the dictionaries, tuples, lists, sets"""
    if isinstance(_dict, dict):
        for key, value in list(_dict.items()):
            if isinstance(value, (list, dict, tuple, set)):
                _dict[key] = delete_none(value)
            elif value is None or key is None:
                del _dict[key]

    elif isinstance(_dict, (list, set, tuple)):
        _dict = type(_dict)(delete_none(item) for item in _dict if item is not None)

    return _dict

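A small illustration of the three helpers above on a toy record; the record itself is made up, and the `'trl'` vocabulary key it looks up is one of the tables defined in the `CATALOG_VOCABULARY` file further below.

    # Toy record: a raw service with a TRL code, an empty pricing field and a noisy tag list.
    record = {'service': {'trl': 'trl-8', 'pricing': None, 'tags': ['EOSC', None]}}

    extract_nested(record, ['service', 'trl'])             # -> 'trl-8'
    extract_nested(record, ['service', 'missing'])         # -> None
    extract_map_nested(record, 'trl', ['service', 'trl'])  # -> '8 - system complete and qualified'
    delete_none(record)                                    # -> {'service': {'trl': 'trl-8', 'tags': ['EOSC']}}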
@ -0,0 +1,13 @@
from datetime import datetime
from typing import Dict, Any, List

from opensearchpy import OpenSearch

from catalogue.dictutils import extract_nested, extract_map_nested, delete_none
from catalogue.vocabulary import CATALOG_VOCABULARY

@ -0,0 +1,878 @@
|
|||
CATALOG_VOCABULARY = {
|
||||
'categories': {'category-access_physical_and_eInfrastructures-compute': 'Compute',
|
||||
'category-access_physical_and_eInfrastructures-data_storage': 'Data Storage',
|
||||
'category-access_physical_and_eInfrastructures-instrument_and_equipment': 'Instrument & Equipment',
|
||||
'category-access_physical_and_eInfrastructures-material_storage': 'Material Storage',
|
||||
'category-access_physical_and_eInfrastructures-network': 'Network',
|
||||
'category-aggregators_and_integrators-aggregators_and_integrators': 'Aggregators & Integrators',
|
||||
'category-other-other': 'Other', 'category-processing_and_analysis-data_analysis': 'Data Analysis',
|
||||
'category-processing_and_analysis-data_management': 'Data Management',
|
||||
'category-processing_and_analysis-measurement_and_materials_analysis': 'Measurement & Materials Analysis',
|
||||
'category-security_and_operations-operations_and_infrastructure_management_services': 'Operations & Infrastructure Management Services',
|
||||
'category-security_and_operations-security_and_identity': 'Security & Identity',
|
||||
'category-sharing_and_discovery-applications': 'Applications',
|
||||
'category-sharing_and_discovery-data': 'Data',
|
||||
'category-sharing_and_discovery-development_resources': 'Development Resources',
|
||||
'category-sharing_and_discovery-samples': 'Samples',
|
||||
'category-sharing_and_discovery-scholarly_communication': 'Scholarly Communication',
|
||||
'category-sharing_and_discovery-software': 'Software',
|
||||
'category-training_and_support-consultancy_and_support': 'Consultancy & Support',
|
||||
'category-training_and_support-education_and_training': 'Education & Training'},
|
||||
'trl': {'trl-1': '1 - basic principles observed', 'trl-2': '2 - technology concept formulated',
|
||||
'trl-3': '3 - experimental proof of concept', 'trl-4': '4 - technology validated in lab',
|
||||
'trl-5': '5 - technology validated in relevant environment',
|
||||
'trl-6': '6 - technology demonstrated in relevant environment',
|
||||
'trl-7': '7 - system prototype demonstration in operational environment',
|
||||
'trl-8': '8 - system complete and qualified',
|
||||
'trl-9': '9 - actual system proven in operational environment'},
|
||||
'target_users': {'target_user-businesses': 'Businesses', 'target_user-funders': 'Funders',
|
||||
'target_user-innovators': 'Innovators', 'target_user-other': 'Other',
|
||||
'target_user-policy_makers': 'Policy Makers', 'target_user-providers': 'Providers',
|
||||
'target_user-research_communities': 'Research Communities',
|
||||
'target_user-research_groups': 'Research Groups',
|
||||
'target_user-research_infrastructure_managers': 'Research Infrastructure Managers',
|
||||
'target_user-research_managers': 'Research Managers',
|
||||
'target_user-research_networks': 'Research Networks',
|
||||
'target_user-research_organisations': 'Research Organisations',
|
||||
'target_user-research_projects': 'Research Projects', 'target_user-researchers': 'Researchers',
|
||||
'target_user-resource_managers': 'Resource Managers',
|
||||
'target_user-resource_provider_managers': 'Provider Managers',
|
||||
'target_user-publishers': 'Publishers',
|
||||
'target_user-students': 'Students'},
|
||||
'access_mode': {'access_mode-free': 'Free', 'access_mode-free_conditionally': 'Free Conditionally',
|
||||
'access_mode-other': 'Other', 'access_mode-paid': 'Paid',
|
||||
'access_mode-peer_reviewed': 'Peer Reviewed'},
|
||||
'funding_body': {'funding_body-ademe': 'Agency for Environment and Energy Management (ADEME)',
|
||||
'funding_body-ahrc': 'Arts and Humanities Research Council (AHRC)',
|
||||
'funding_body-aka': 'Academy of Finland (AKA)',
|
||||
'funding_body-ancs': 'National Authority for Scientific Research (ANCS)',
|
||||
'funding_body-anr': 'French National Research Agency (ANR)',
|
||||
'funding_body-apvv': 'Research and Development Agency (APVV)',
|
||||
'funding_body-arc': 'Australian Research Council (ARC)',
|
||||
'funding_body-arrs': 'Slovenian Research Agency (ARRS)',
|
||||
'funding_body-awi': 'Alfred Wegener Institute for Polar and Marine Research (AWI)',
|
||||
'funding_body-bbsrc': 'Biotechnology and Biological Sciences Research Council (BBSRC)',
|
||||
'funding_body-bf': 'Belmont Forum (BF)',
|
||||
'funding_body-bmbf': 'Federal Ministry of Education and Research (BMBF)',
|
||||
'funding_body-caixa': 'La Caixa Foundation (CAIXA)',
|
||||
'funding_body-cdti': 'Center for Industrial Technological Development (CDTI)',
|
||||
'funding_body-cea': 'Alternative Energies and Atomic Energy Commission (CEA)',
|
||||
'funding_body-cihr': 'Canadian Institutes of Health Research (CIHR)',
|
||||
'funding_body-cncsis': 'National University Research Council (CNCSIS) - Romania',
|
||||
'funding_body-cnes': 'National Centre for Space Studies (CNES)',
|
||||
'funding_body-cnpq': 'National Council for Scientific and Technological Development (CNPq)',
|
||||
'funding_body-cnr': 'National Research Council (CNR)',
|
||||
'funding_body-cnrs': 'National Centre for Scientific Research (CNRS)',
|
||||
'funding_body-csf': 'Croatian Science Foundation (CSF)',
|
||||
'funding_body-csic': 'Spanish National Research Council (CSIC)',
|
||||
'funding_body-dashe': 'Danish Agency for Science and Higher Education (DASHE)',
|
||||
'funding_body-dasti': 'Danish Agency for Science, Technology and Innovation (DASTI)',
|
||||
'funding_body-ddf': 'The Danish Council for Independent Research (DDF)',
|
||||
'funding_body-dff': 'Danish Council for Independent Research (DFF)',
|
||||
'funding_body-dfg': 'German Research Foundation (DFG)',
|
||||
'funding_body-dgo6': 'General Operational Directorate for Economy, Employment and Research (DGO6)',
|
||||
'funding_body-dlr': 'German Aerospace Center (DLR)',
|
||||
'funding_body-dnrf': 'Danish National Research Foundation (DNRF)',
|
||||
'funding_body-eaer': 'Federal Department of Economic Affairs, Education and Research (EAER)',
|
||||
'funding_body-ec': 'European Commission (EC)',
|
||||
'funding_body-epsrc': 'Engineering and Physical Sciences Research Council (EPSRC)',
|
||||
'funding_body-esa': 'European Space Agency (ESA)',
|
||||
'funding_body-esrc': 'Economic and Social Research Council (ESRC)',
|
||||
'funding_body-etag': 'Estonian Research Council (ETAG)',
|
||||
'funding_body-fapesp': 'São Paulo Research Foundation (FAPESP)',
|
||||
'funding_body-fct': 'Foundation for Science and Technology (FCT)',
|
||||
'funding_body-ffg': 'Austrian Research Promotion Agency (FFG)',
|
||||
'funding_body-fnp': 'Foundation for Polish Science (FNP)',
|
||||
'funding_body-fnr': 'National Research Fund (FNR)',
|
||||
'funding_body-fnrs': 'Fonds National de la Recherche Scientifique (FNRS)',
|
||||
'funding_body-fom': 'Foundation for Fundamental Research on Matter (FOM)',
|
||||
'funding_body-forte': 'Swedish Research Council for Health, Working Life and Welfare (FORTE)',
|
||||
'funding_body-fts': 'Fritz Thyssen Foundation (FTS)',
'funding_body-fwf': 'Austrian Science Fund (FWF)',
'funding_body-fwo': 'Research Foundation Flanders (FWO)',
'funding_body-gacr': 'Czech Science Foundation (GACR)',
'funding_body-gsrt': 'General Secretariat for Research and Technology (GSRT)',
'funding_body-ifd': 'Innovation Fund Denmark (IFD)',
'funding_body-ifremer': 'French Research Institute for Exploitation of the Sea (IFREMER)',
'funding_body-imsr': 'Innovation Fund of the Ministry of Economy of the Slovak Republic (IMSR)',
'funding_body-innoviris': 'Brussels Institute for Research and Innovation (INNOVIRIS)',
'funding_body-inra': 'National Institute of Agricultural Research (INRA)',
'funding_body-inserm': 'National Institute of Health and Medical Research (INSERM)',
'funding_body-ipev': 'French Polar Institute (IPEV)',
'funding_body-irc': 'Irish Research Council (IRC)',
'funding_body-isc': 'International Science Council (ISC)',
'funding_body-isciii': 'Carlos III Health Institute (ISCIII)',
'funding_body-isf': 'Israel Science Foundation (ISF)',
'funding_body-iwt': 'Agency for Innovation by Science and Technology (IWT)',
'funding_body-jsps': 'Japanese Society for the Promotion of Science (JSPS)',
'funding_body-jst': 'Japanese Science and Technology Agency (JST)',
'funding_body-kaws': 'Knut and Alice Wallenberg Foundation (KAWS)',
'funding_body-kks': 'Knowledge Foundation (KKS)',
'funding_body-lmt': 'Research Council of Lithuania (LMT)',
'funding_body-mcst': 'Malta Council for Science and Technology (MCST)',
'funding_body-mecr': 'Ministry for Education and Scientific Research (MECR)',
'funding_body-mesr': 'Ministry of Higher Education and Research (MESR)',
'funding_body-mestd': 'Ministry of Education, Science and Technological Development of Republic of Serbia (MESTD)',
'funding_body-mgrt': 'Ministry for Economic Development and Technology (MGRT)',
'funding_body-mineco': 'Ministry for Economy and Competitiveness (MINECO)',
'funding_body-mistra': 'Swedish Foundation for Strategic Environmental Research (MISTRA)',
'funding_body-mita': 'Agency for Science, Innovation and Technology (MITA)',
'funding_body-miur': 'Ministry for Education, University and Research (MIUR)',
'funding_body-most': "Ministry of Science and Technology of the People's Republic of China (MOST)",
'funding_body-mpg': 'Max Planck Society for the Advancement of Science (MPG)',
'funding_body-mrc': 'Medical Research Council (MRC)',
'funding_body-mse': 'Ministry of Science and Education Republic of Croatia (MSE)',
'funding_body-msvvas_sr': 'The Ministry of Education, Science, Research and Sports of the Slovak Republic (MSVVaS SR)',
'funding_body-nasa': 'National Aeronautics and Space Administration (NASA)',
'funding_body-ncbir': 'National Centre for Research and Development (NCBiR)',
'funding_body-ncn': 'National Science Center (NCN)',
'funding_body-nerc': 'Natural Environment Research Council (NERC)',
'funding_body-nhmrc': 'National Health and Medical Research Council (NHMRC)',
'funding_body-nig': 'National Institutes of Health (NIG)',
'funding_body-nkfia': 'National Research, Development and Innovation Fund (NKFIA)',
'funding_body-nrf': 'National Research Foundation (NRF)',
'funding_body-nserc': 'Natural Sciences and Engineering Research Council of Canada (NSERC)',
'funding_body-nsf': 'National Science Foundation (NSF)',
'funding_body-nwo': 'Netherlands Organisation for Scientific Research (NWO)',
'funding_body-oeaw': 'Austrian Academy of Sciences (OeAW)',
'funding_body-oenfte': 'National Foundation for Research, Technology and Development (OeNFTE)',
'funding_body-onera': 'French National Aerospace Research Center (ONERA)',
'funding_body-other': 'Other', 'funding_body-rannis': 'Icelandic Centre for Research (RANNIS)',
'funding_body-rcn': 'Research Council of Norway (RCN)',
'funding_body-rcuk': 'Research Council UK (RCUK)',
'funding_body-rj': 'The Swedish Foundation for Humanities and Social Sciences (RJ)',
'funding_body-rpf': 'Research Promotion Foundation (RPF)',
'funding_body-sea': 'Swedish Energy Agency (SEA)',
'funding_body-sepa': 'Swedish Environmental Protection Agency (SEPA)',
'funding_body-sfi': 'Science Foundation Ireland (SFI)',
'funding_body-sgpi': 'Secretariat-General for Investment (SGPI)',
'funding_body-snf': 'Swiss National Science Foundation (SNF)',
'funding_body-snsb': 'Swedish National Space Board (SNSB)',
'funding_body-srcf': 'Swedish Research Council Formas (SRCF)',
'funding_body-srsa': 'Swedish Radiation Safety Authority (SRSA)',
'funding_body-ssf': 'Swedish Foundation for Strategic Research (SSF)',
'funding_body-sshrc': 'Social Sciences and Humanities Research Council (SSHRC)',
'funding_body-stfc': 'Science and Technology Facilities Council (STFC)',
'funding_body-stw': 'Technology Foundation (STW)',
'funding_body-tacr': 'Technology Agency of the Czech Republic (TACR)',
'funding_body-tara': 'Tara Expeditions Foundation (TARA)',
'funding_body-tekes': 'Finnish Funding Agency for Technology and Innovation (TEKES)',
'funding_body-tubitak': 'Scientific and Technological Research Council of Turkey (TUBITAK)',
'funding_body-uefiscdi_cncs': 'Executive Agency for Higher Education, Research, Development and Innovation Funding (UEFISCDI - CNCS)',
'funding_body-ukri': 'UK Research and Innovation (UKRI)',
'funding_body-vega': 'Scientific Grant Agency (VEGA)',
'funding_body-viaa': 'State Education Development Agency (VIAA)',
'funding_body-vinnova': 'Swedish Governmental Agency for Innovation Systems (VINNOVA)',
'funding_body-vlaio': 'Flanders Innovation & Entrepreneurship (VLAIO)',
'funding_body-vr': 'Swedish Research Council (VR)',
'funding_body-vs': 'Volkswagen Foundation (VS)',
'funding_body-wt': 'Wellcome Trust (WT)',
'funding_body-wwtf': 'Vienna Science and Technology Fund (WWTF)',
'funding_body-meys': 'Ministry of Education, Youth and Sports of the Czech Republic (MEYS)',
'funding_body-af': 'Arcadia Fund'},
'target_user': {'target_user-businesses': 'Businesses', 'target_user-funders': 'Funders',
'target_user-innovators': 'Innovators', 'target_user-other': 'Other',
'target_user-policy_makers': 'Policy Makers', 'target_user-providers': 'Providers',
'target_user-research_communities': 'Research Communities',
'target_user-research_groups': 'Research Groups',
'target_user-research_infrastructure_managers': 'Research Infrastructure Managers',
'target_user-research_managers': 'Research Managers',
'target_user-research_networks': 'Research Networks',
'target_user-research_organisations': 'Research Organisations',
'target_user-research_projects': 'Research Projects', 'target_user-researchers': 'Researchers',
'target_user-resource_managers': 'Resource Managers',
'target_user-resource_provider_managers': 'Provider Managers',
'target_user-publishers': 'Publishers',
'target_user-students': 'Students'},
'related_platform': {'related_platform-ands': 'ANDS', 'related_platform-artportalen': 'ArtPortalen',
'related_platform-arxiv': 'arXiv', 'related_platform-ala': 'Atlas of Living Australia',
'related_platform-avp': 'AV-Portal', 'related_platform-aws': 'AWS',
'related_platform-bluecloud': 'Blue-Cloud',
'related_platform-cdl': 'California Digital Library',
'related_platform-ccdc': 'CCDC', 'related_platform-cessda': 'CESSDA',
'related_platform-collabwith': 'COLLABWITH',
'related_platform-cccs': 'Copernicus Climate Change Service',
'related_platform-crossref': 'Crossref', 'related_platform-dariahteach': 'dariahTeach',
'related_platform-dice': 'Data Infrastructure Capacity for EOSC (DICE)',
'related_platform-datacite': 'DataCite', 'related_platform-ds': 'Digital Science',
'related_platform-doab': 'DOAB', 'related_platform-einfracz': 'e-INFRA CZ',
'related_platform-eirgspp': 'e-IRGSP projects', 'related_platform-edugain': 'eduGAIN',
'related_platform-eduteams': 'eduTEAMS', 'related_platform-egi': 'EGI',
'related_platform-egifc': 'EGI Federated Cloud', 'related_platform-egiace': 'EGI-ACE',
'related_platform-elixir': 'ELIXIR', 'related_platform-emodnetc': 'EMODnet Chemistry',
'related_platform-eol': 'Encyclopedia of Life',
'related_platform-enc': 'Endemia New Caledonia',
'related_platform-envri': 'ENVRI Hub', 'related_platform-eoscl': 'EOSC-Life',
'related_platform-eoscn': 'EOSC-Nordic', 'related_platform-eoscp': 'EOSC-Pillar',
'related_platform-eudatcdi': 'EUDAT CDI', 'related_platform-elg': 'European Language Grid',
'related_platform-evs': 'European Values Study (EVS)',
'related_platform-garrcp': 'GARR Container Platform',
'related_platform-gatep': 'GATE platform',
'related_platform-gbif': 'GBIF', 'related_platform-geonames': 'GeoNames',
'related_platform-grin': 'Germplasm Resources Information Network (GRIN)',
'related_platform-geoss': 'Global Earth Observation system of Systems (GEOSS)',
'related_platform-hal': 'HAL', 'related_platform-hamelin': 'Hamelin',
'related_platform-infnc': 'INFN-Cloud', 'related_platform-ispot': 'iSpot',
'related_platform-jisc': 'JISC', 'related_platform-metacentrum': 'MetaCentrum',
'related_platform-natusfera': 'Natusfera', 'related_platform-openairee': 'OpenAIRE EXPLORE',
'related_platform-openairem': 'OpenAIRE MONITOR',
'related_platform-openairerg': 'OpenAIRE research graph',
'related_platform-oc': 'OpenCitations',
'related_platform-pogo': 'Partnership for Observation of the Global Oceans (POGO)',
'related_platform-pnp': 'Pl@ntNet platform', 'related_platform-pc': 'PolicyCloud',
'related_platform-rjb': 'Real Jardín Botánico', 'related_platform-scopus': 'Scopus',
'related_platform-seadatanet': 'SeaDataNet',
'related_platform-tsd': 'Service for Sensitive Data (TSD)',
'related_platform-sshom': 'SSH Open Marketplace', 'related_platform-surf': 'SURF',
'related_platform-share': 'Survey of Health, Ageing and Retirement in Europe (SHARE)',
'related_platform-tf': 'Taylor&Francis', 'related_platform-tb': 'Tela Botanica',
'related_platform-tdp': 'The Dataverse Project',
'related_platform-tnomadl': 'The NOMAD Laboratory', 'related_platform-tpg': 'The Plant Game',
'related_platform-tibp': 'TIB Portal', 'related_platform-tripleh': 'TRIPLE H2020 project',
'related_platform-tubitakcc': 'TÜBITAK cloud compute',
'related_platform-vlab': 'Virtual Earth Laboratory (VLab)',
'related_platform-zbwice': 'ZBW Information Centre for Economics',
'related_platform-zenodo': 'Zenodo'},
'languages': {'aa': 'Afar', 'ab': 'Abkhazian', 'ae': 'Avestan', 'af': 'Afrikaans', 'ak': 'Akan', 'am': 'Amharic',
'an': 'Aragonese', 'ar': 'Arabic', 'as': 'Assamese', 'av': 'Avaric', 'ay': 'Aymara',
'az': 'Azerbaijani',
'ba': 'Bashkir', 'be': 'Belarusian', 'bg': 'Bulgarian', 'bh': 'Bihari', 'bi': 'Bislama',
'bm': 'Bambara',
'bn': 'Bengali', 'bo': 'Tibetan', 'br': 'Breton', 'bs': 'Bosnian', 'ca': 'Catalan', 'ce': 'Chechen',
'ch': 'Chamorro', 'co': 'Corsican', 'cr': 'Cree', 'cs': 'Czech', 'cu': 'Old Church Slavonic',
'cv': 'Chuvash', 'cy': 'Welsh', 'da': 'Danish', 'de': 'German', 'dv': 'Divehi', 'dz': 'Dzongkha',
'ee': 'Ewe', 'el': 'Greek', 'en': 'English', 'eo': 'Esperanto', 'es': 'Spanish', 'et': 'Estonian',
'eu': 'Basque', 'fa': 'Persian', 'ff': 'Fula', 'fi': 'Finnish', 'fj': 'Fijian', 'fo': 'Faroese',
'fr': 'French', 'fy': 'Western Frisian', 'ga': 'Irish', 'gd': 'Galician', 'gl': 'Gaelic',
'gn': 'Guarani', 'gu': 'Gujarati', 'gv': 'Manx', 'ha': 'Hausa', 'he': 'Hebrew', 'hi': 'Hindi',
'ho': 'Hiri Motu', 'hr': 'Croatian', 'ht': 'Haitian', 'hu': 'Hungarian', 'hy': 'Armenian',
'hz': 'Herero', 'ia': 'Interlingua', 'id': 'Indonesian', 'ie': 'Interlingue', 'ig': 'Igbo',
'ii': 'Nuosu', 'iii': 'Sichuan Yi', 'ik': 'Inupiak', 'io': 'Ido', 'is': 'Icelandic', 'it': 'Italian',
'iu': 'Inuktitut', 'ja': 'Japanese', 'jv': 'Javanese', 'ka': 'Georgian', 'kg': 'Kongo',
'ki': 'Kikuyu',
'kj': 'Kwanyama', 'kk': 'Kazakh', 'kl': 'Kalaallisut', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
'kr': 'Kanuri', 'ks': 'Kashmiri', 'ku': 'Kurdish', 'kv': 'Komi', 'kw': 'Cornish', 'ky': 'Kyrgyz',
'la': 'Latin', 'lb': 'Luxembourgish', 'li': 'Limburgish', 'ln': 'Lingala', 'lo': 'Lao',
'lt': 'Lithuanian', 'lu': 'Luba-Katanga', 'lv': 'Latvian', 'mg': 'Malagasy', 'mh': 'Marshallese',
'mi': 'Maori', 'mk': 'Macedonian', 'ml': 'Malayalam', 'mn': 'Mongolian', 'mr': 'Marathi',
'ms': 'Malay',
'mt': 'Maltese', 'my': 'Burmese', 'na': 'Nauru', 'nb': 'Norwegian Bokmål', 'nd': 'Northern Ndebele',
'ne': 'Nepali', 'ng': 'Ndonga', 'nl': 'Dutch', 'nn': 'Norwegian Nynorsk', 'no': 'Norwegian',
'nr': 'Southern Ndebele', 'nv': 'Navajo', 'ny': 'Chichewa', 'oc': 'Occitan', 'oj': 'Ojibwe',
'om': 'Oromo', 'or': 'Oriya', 'os': 'Ossetian', 'ot': 'Other', 'pa': 'Panjabi', 'pi': 'Pāli',
'pl': 'Polish', 'ps': 'Pashto', 'pt': 'Portuguese', 'qu': 'Quechua', 'rm': 'Romansh', 'rn': 'Kirundi',
'ro': 'Romanian', 'ru': 'Russian', 'rw': 'Kinyarwanda', 'sa': 'Sanskrit', 'sar': 'Sardinian',
'sd': 'Sindhi', 'se': 'Sami', 'sg': 'Sango', 'si': 'Sinhalese', 'sk': 'Slovak', 'sl': 'Slovenian',
'sm': 'Samoan', 'sn': 'Shona', 'so': 'Somali', 'sq': 'Albanian', 'sr': 'Serbian', 'ss': 'Swati',
'st': 'Sesotho', 'su': 'Sundanese', 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
'tg': 'Tajik', 'th': 'Thai', 'ti': 'Tigrinya', 'tk': 'Turkmen', 'tl': 'Tagalog', 'tn': 'Setswana',
'to': 'Tonga', 'tr': 'Turkish', 'ts': 'Tsonga', 'tt': 'Tatar', 'tw': 'Twi', 'ty': 'Tahitian',
'ug': 'Uyghur', 'uk': 'Ukrainian', 'ur': 'Urdu', 'uz': 'Uzbek', 've': 'Venda', 'vi': 'Vietnamese',
'vo': 'Volapük', 'wa': 'Walloon', 'wo': 'Wolof', 'xh': 'Xhosa', 'yi': 'Yiddish', 'yo': 'Yoruba',
'za': 'Zhuang', 'zh': 'Chinese', 'zu': 'Zulu'}, 'ig': {},
'qualification': {'tr_qualification-badge': 'Badge', 'tr_qualification-certification': 'Certification',
'tr_qualification-accreditation': 'Accreditation'}, 'subcategories': {
'subcategory-access_physical_and_eInfrastructures-compute-container_management': 'Container Management',
'subcategory-access_physical_and_eInfrastructures-compute-job_execution': 'Job Execution',
'subcategory-access_physical_and_eInfrastructures-compute-orchestration': 'Orchestration',
'subcategory-access_physical_and_eInfrastructures-compute-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-compute-serverless_applications_repository': 'Serverless Applications Repository',
'subcategory-access_physical_and_eInfrastructures-compute-virtual_machine_management': 'Virtual Machine Management',
'subcategory-access_physical_and_eInfrastructures-compute-workload_management': 'Workload Management',
'subcategory-access_physical_and_eInfrastructures-data_storage-archive': 'Archive',
'subcategory-access_physical_and_eInfrastructures-data_storage-backup': 'Backup',
'subcategory-access_physical_and_eInfrastructures-data_storage-data': 'Data',
'subcategory-access_physical_and_eInfrastructures-data_storage-digital_preservation': 'Digital Preservation',
'subcategory-access_physical_and_eInfrastructures-data_storage-disk': 'Disk',
'subcategory-access_physical_and_eInfrastructures-data_storage-file': 'File',
'subcategory-access_physical_and_eInfrastructures-data_storage-online': 'Online',
'subcategory-access_physical_and_eInfrastructures-data_storage-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-data_storage-queue': 'Queue',
'subcategory-access_physical_and_eInfrastructures-data_storage-recovery': 'Recovery',
'subcategory-access_physical_and_eInfrastructures-data_storage-replicated': 'Replicated',
'subcategory-access_physical_and_eInfrastructures-data_storage-synchronised': 'Synchronised',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-chromatographer': 'Chromatographer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-cytometer': 'Cytometer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-digitisation_equipment': 'Digitisation Equipment',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-geophysical': 'Geophysical',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-laser': 'Laser',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-microscopy': 'Microscopy',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-monument_maintenance_equipment': 'Monument Maintenance Equipment',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-radiation': 'Radiation',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-spectrometer': 'Spectrometer',
'subcategory-access_physical_and_eInfrastructures-instrument_and_equipment-spectrophotometer': 'Spectrophotometer',
'subcategory-access_physical_and_eInfrastructures-material_storage-archiving': 'Archiving',
'subcategory-access_physical_and_eInfrastructures-material_storage-assembly': 'Assembly',
'subcategory-access_physical_and_eInfrastructures-material_storage-disposal': 'Disposal',
'subcategory-access_physical_and_eInfrastructures-material_storage-fulfilment': 'Fulfilment',
'subcategory-access_physical_and_eInfrastructures-material_storage-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-material_storage-packaging': 'Packaging',
'subcategory-access_physical_and_eInfrastructures-material_storage-preservation': 'Preservation',
'subcategory-access_physical_and_eInfrastructures-material_storage-quality_inspecting': 'Quality Inspecting',
'subcategory-access_physical_and_eInfrastructures-material_storage-repository': 'Repository',
'subcategory-access_physical_and_eInfrastructures-material_storage-reworking': 'Reworking',
'subcategory-access_physical_and_eInfrastructures-material_storage-sorting': 'Sorting',
'subcategory-access_physical_and_eInfrastructures-material_storage-warehousing': 'Warehousing',
'subcategory-access_physical_and_eInfrastructures-network-content_delivery_network': 'Content Delivery Network',
'subcategory-access_physical_and_eInfrastructures-network-direct_connect': 'Direct Connect',
'subcategory-access_physical_and_eInfrastructures-network-exchange': 'Exchange',
'subcategory-access_physical_and_eInfrastructures-network-load_balancer': 'Load Balancer',
'subcategory-access_physical_and_eInfrastructures-network-other': 'Other',
'subcategory-access_physical_and_eInfrastructures-network-traffic_manager': 'Traffic Manager',
'subcategory-access_physical_and_eInfrastructures-network-virtual_nework': 'Virtual Network',
'subcategory-access_physical_and_eInfrastructures-network-vpn_gateway': 'VPN Gateway',
'subcategory-access_physical_and_eInfrastructures-network-dns': 'DNS',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-applications': 'Applications',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-data': 'Data',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-other': 'Other',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-services': 'Services',
'subcategory-aggregators_and_integrators-aggregators_and_integrators-software': 'Software',
'subcategory-other-other-other': 'Other',
'subcategory-processing_and_analysis-data_analysis-2d_3d_digitisation': '2D/3D Digitisation',
'subcategory-processing_and_analysis-data_analysis-artificial_intelligence': 'Artificial Intelligence',
'subcategory-processing_and_analysis-data_analysis-data_extrapolation': 'Data Extrapolation',
'subcategory-processing_and_analysis-data_analysis-forecast': 'Forecast',
'subcategory-processing_and_analysis-data_analysis-image_data_analysis': 'Image/Data Analysis',
'subcategory-processing_and_analysis-data_analysis-machine_learning': 'Machine Learning',
'subcategory-processing_and_analysis-data_analysis-other': 'Other',
'subcategory-processing_and_analysis-data_analysis-visualization': 'Visualization',
'subcategory-processing_and_analysis-data_analysis-workflows': 'Workflows',
'subcategory-processing_and_analysis-data_analysis-quality_assesment': 'Quality Assessment',
'subcategory-processing_and_analysis-data_management-access': 'Access',
'subcategory-processing_and_analysis-data_management-annotation': 'Annotation',
'subcategory-processing_and_analysis-data_management-anonymisation': 'Anonymisation',
'subcategory-processing_and_analysis-data_management-brokering': 'Brokering',
'subcategory-processing_and_analysis-data_management-digitisation': 'Digitisation',
'subcategory-processing_and_analysis-data_management-discovery': 'Discovery',
'subcategory-processing_and_analysis-data_management-embargo': 'Embargo',
'subcategory-processing_and_analysis-data_management-interlinking': 'Interlinking',
'subcategory-processing_and_analysis-data_management-maintenance': 'Maintenance',
'subcategory-processing_and_analysis-data_management-mining': 'Mining',
'subcategory-processing_and_analysis-data_management-other': 'Other',
'subcategory-processing_and_analysis-data_management-persistent_identifier': 'Persistent Identifier',
'subcategory-processing_and_analysis-data_management-preservation': 'Preservation',
'subcategory-processing_and_analysis-data_management-publishing': 'Publishing',
'subcategory-processing_and_analysis-data_management-registration': 'Registration',
'subcategory-processing_and_analysis-data_management-transfer': 'Transfer',
'subcategory-processing_and_analysis-data_management-validation': 'Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-analysis': 'Analysis',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-characterisation': 'Characterisation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-maintenance_and_modification': 'Maintenance & Modification',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-other': 'Other',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-production': 'Production',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-testing_and_validation': 'Testing & Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-validation': 'Validation',
'subcategory-processing_and_analysis-measurement_and_materials_analysis-workflows': 'Workflows',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-accounting': 'Accounting',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-analysis': 'Analysis',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-billing': 'Billing',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-configuration': 'Configuration',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-coordination': 'Coordination',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-helpdesk': 'Helpdesk',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-monitoring': 'Monitoring',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-order_management': 'Order Management',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-other': 'Other',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-transportation': 'Transportation',
'subcategory-security_and_operations-operations_and_infrastructure_management_services-utilities': 'Utilities',
'subcategory-security_and_operations-security_and_identity-certification_authority': 'Certification Authority',
'subcategory-security_and_operations-security_and_identity-coordination': 'Coordination',
'subcategory-security_and_operations-security_and_identity-firewall': 'Firewall',
'subcategory-security_and_operations-security_and_identity-group_management': 'Group Management',
'subcategory-security_and_operations-security_and_identity-identity_and_access_management': 'Identity & Access Management',
'subcategory-security_and_operations-security_and_identity-other': 'Other',
'subcategory-security_and_operations-security_and_identity-single_sign_on': 'Single Sign-On',
'subcategory-security_and_operations-security_and_identity-threat_protection': 'Threat Protection',
'subcategory-security_and_operations-security_and_identity-tools': 'Tools',
'subcategory-security_and_operations-security_and_identity-user_authentication': 'User Authentication',
'subcategory-sharing_and_discovery-applications-applications_repository': 'Applications Repository',
'subcategory-sharing_and_discovery-applications-business': 'Business',
'subcategory-sharing_and_discovery-applications-collaboration': 'Collaboration',
'subcategory-sharing_and_discovery-applications-communication': 'Communication',
'subcategory-sharing_and_discovery-applications-education': 'Education',
'subcategory-sharing_and_discovery-applications-other': 'Other',
'subcategory-sharing_and_discovery-applications-productivity': 'Productivity',
'subcategory-sharing_and_discovery-applications-social_networking': 'Social/Networking',
'subcategory-sharing_and_discovery-applications-utilities': 'Utilities',
'subcategory-sharing_and_discovery-data-clinical_trial_data': 'Clinical Trial Data',
'subcategory-sharing_and_discovery-data-data_archives': 'Data Archives',
'subcategory-sharing_and_discovery-data-epidemiological_data': 'Epidemiological Data',
'subcategory-sharing_and_discovery-data-government_and_agency_data': 'Government & Agency Data',
'subcategory-sharing_and_discovery-data-online_service_data': 'Online Service Data',
'subcategory-sharing_and_discovery-data-other': 'Other',
'subcategory-sharing_and_discovery-data-scientific_research_data': 'Scientific/Research Data',
'subcategory-sharing_and_discovery-data-statistical_data': 'Statistical Data',
'subcategory-sharing_and_discovery-data-metadata': 'Metadata',
'subcategory-sharing_and_discovery-development_resources-apis_repository_gateway': 'APIs Repository/Gateway',
'subcategory-sharing_and_discovery-development_resources-developer_tools': 'Developer Tools',
'subcategory-sharing_and_discovery-development_resources-other': 'Other',
'subcategory-sharing_and_discovery-development_resources-software_development_kits': 'Software Development Kits',
'subcategory-sharing_and_discovery-development_resources-software_libraries': 'Software Libraries',
'subcategory-sharing_and_discovery-development_resources-simulation_tools': 'Simulation Tools',
'subcategory-sharing_and_discovery-samples-biological_samples': 'Biological Samples',
'subcategory-sharing_and_discovery-samples-characterisation': 'Characterisation',
'subcategory-sharing_and_discovery-samples-chemical_compounds_library': 'Chemical Compounds Library',
'subcategory-sharing_and_discovery-samples-other': 'Other',
'subcategory-sharing_and_discovery-samples-preparation': 'Preparation',
'subcategory-sharing_and_discovery-scholarly_communication-analysis': 'Analysis',
'subcategory-sharing_and_discovery-scholarly_communication-assessment': 'Assessment',
'subcategory-sharing_and_discovery-scholarly_communication-discovery': 'Discovery',
'subcategory-sharing_and_discovery-scholarly_communication-other': 'Other',
'subcategory-sharing_and_discovery-scholarly_communication-outreach': 'Outreach',
'subcategory-sharing_and_discovery-scholarly_communication-preparation': 'Preparation',
'subcategory-sharing_and_discovery-scholarly_communication-publication': 'Publication',
'subcategory-sharing_and_discovery-scholarly_communication-writing': 'Writing',
'subcategory-sharing_and_discovery-software-libraries': 'Libraries',
'subcategory-sharing_and_discovery-software-other': 'Other',
'subcategory-sharing_and_discovery-software-platform': 'Platform',
'subcategory-sharing_and_discovery-software-software_package': 'Software Package',
'subcategory-sharing_and_discovery-software-software_repository': 'Software Repository',
'subcategory-training_and_support-consultancy_and_support-application_optimisation': 'Application Optimisation',
'subcategory-training_and_support-consultancy_and_support-application_porting': 'Application Porting',
'subcategory-training_and_support-consultancy_and_support-application_scaling': 'Application Scaling',
'subcategory-training_and_support-consultancy_and_support-audit_and_assessment': 'Audit & Assessment',
'subcategory-training_and_support-consultancy_and_support-benchmarking': 'Benchmarking',
'subcategory-training_and_support-consultancy_and_support-calibration': 'Calibration',
'subcategory-training_and_support-consultancy_and_support-certification': 'Certification',
'subcategory-training_and_support-consultancy_and_support-consulting': 'Consulting',
'subcategory-training_and_support-consultancy_and_support-methodology_development': 'Methodology Development',
'subcategory-training_and_support-consultancy_and_support-modeling_and_simulation': 'Modeling & Simulation',
'subcategory-training_and_support-consultancy_and_support-other': 'Other',
'subcategory-training_and_support-consultancy_and_support-prototype_development': 'Prototype Development',
'subcategory-training_and_support-consultancy_and_support-software_development': 'Software Development',
'subcategory-training_and_support-consultancy_and_support-software_improvement': 'Software Improvement',
'subcategory-training_and_support-consultancy_and_support-technology_transfer': 'Technology Transfer',
'subcategory-training_and_support-consultancy_and_support-testing': 'Testing',
'subcategory-training_and_support-education_and_training-in_house_courses': 'In-House Courses',
'subcategory-training_and_support-education_and_training-online_courses': 'Online Courses',
'subcategory-training_and_support-education_and_training-open_registration_courses': 'Open Registration Courses',
'subcategory-training_and_support-education_and_training-other': 'Other',
'subcategory-training_and_support-education_and_training-related_training': 'Related Training',
'subcategory-training_and_support-education_and_training-required_training': 'Required Training',
'subcategory-training_and_support-education_and_training-training_platform': 'Training Platform',
'subcategory-training_and_support-education_and_training-training_tool': 'Training Tool'}, 'service-ig': {},
'providers': {'eosc.ess': 'European Spallation Source ERIC', 'eosc.openaire': 'OpenAIRE',
'eosc.ierek': ' International Experts for Research Enrichment and Knowledge Exchange',
'eosc.centerdata': 'Centerdata',
'ni4os.ukim_fcse': 'University Ss. Cyril and Methodius, Faculty of Computer Science and Engineering',
'ni4os.sanu': 'Serbian Academy of Sciences and Arts', 'eosc.ds-wizard': 'Data Stewardship Wizard',
'eosc.ubi': 'Ubitech', 'eosc.eosc-dih': 'EOSC DIH - Digital Innovation Hub',
'eosc.vamdc': 'Virtual Atomic and Molecular Data Centre',
'eosc.dariah_eric': 'DARIAH ERIC (Digital Research Infrastructure for the Arts and Humanities)',
'eosc-nordic.rtu': 'Riga Technical University',
'eosc.vito': 'VITO NV (Vlaamse Instelling voor Technologisch Onderzoek NV)',
'eosc.unifl': 'University of Florence, DISIT lab', 'eosc.mi': 'Mandat International',
'eosc.lida': 'Lithuanian Data Archive for Social Sciences and Humanities',
'eosc.epos': 'European Plate Observing System', 'eosc.gbif-es': 'GBIF Spain',
'eosc.materialscloud': 'Materials Cloud', 'eosc.vilnius-university': 'Vilnius University',
'eosc.vecma': 'Verified Exascale Computing for Multiscale Applications', 'eosc.hn': 'Huma-Num',
'eosc.instruct-eric': 'Instruct-ERIC',
'eosc.bbmri-eric': 'Biobanking and BioMolecular resources Research Infrastructure – European Research Infrastructure Consortium',
'eosc.cut_library': 'Cracow University of Technology. The Library',
'eosc.cnrsin2p3': ' Centre National de la Recherche Scientifique ',
'eosc.forschungsdaten': 'forschungsdaten.info', 'eosc.odatis': 'Pôle Odatis',
'eosc.cy-biobank': 'biobank.cy Center of Excellence in Biobanking and Biomedical Research, University of Cyprus',
'eosc.up': 'Ubiquity Press Ltd',
'eosc.ceric-eric': 'Central European Research Infrastructure Consortium',
'eosc.ccsd': 'Center for direct scientific communication',
'eosc.lnec': 'Laboratório Nacional de Engenharia Civil',
'eosc.t-systems': 'T-Systems International GmbH',
'eosc.icos_eric': 'Integrated Carbon Observation System European Research Infrastructure Consortium',
'eosc.srce': 'University of Zagreb University Computing Centre',
'eosc.crem': 'Centre de recherche Crem',
'eosc.carbonneutrallng': 'Horizon Europe Project Truly Carbon Neutral electricity enhanced Synthesis of Liquefied Natural Gas (LNG) from biomass',
'eosc.rb': 'Reportbrain Limited',
'ni4os.ibceb': 'Ivane Beritashvili Center of Experimental Biomedicine',
'eosc.ehealth_graz': 'Institute of eHealth', 'eosc.ku_leuven': 'KU Leuven',
'eosc.creatis': "Centre de Recherche en Acquisition et Traitement de l'Image pour la Santé",
'eosc.elixir-belgium': 'ELIXIR Belgium',
'eosc.earthwatch': 'Conservation Education and Research Trust',
'eosc.meeo': 'Meteorological Environmental Earth Observation', 'eosc.vib': 'VIB',
'eosc.inbelixir-es': 'INB: The Spanish National Bioinformatics Institute, the Spanish node for ELIXIR',
'eosc.iagos': 'In-service Aircraft for a Global Observing System AISBL',
'eosc-nordic.vu': 'Vilnius University',
'eosc.ifin-hh': 'Horia Hulubei National Institute for R&D in Physics and Nuclear Engineering',
'eosc.max_iv_laboratory': 'MAX IV Laboratory, Lund University',
'eosc.e-cam': 'E-CAM Centre of Excellence', 'eosc.scai': 'Fraunhofer SCAI',
'eosc.ehri': 'European Holocaust Research Infrastructure', 'eosc.rli': 'Reiner Lemoine Institute',
'eosc.expertai': 'expert.ai', 'eosc.sensing_clues': 'Sensing Clues Foundation',
'eosc.cerm-cirmmp': 'Magnetic Resonance Center of the University of Florence - CERM, Interuniversity consortium CIRMMP',
'eosc.rcisd': 'Regional Centre for Information and Scientific Development Ltd.',
'ni4os.brfaa': 'Biomedical Research Foundation, Academy of Athens',
'ni4os.ibiss': 'Institute for Biological Research Siniša Stanković, University of Belgrade',
'eosc.astron': 'NWO-I Netherlands Institute for Radio Astronomy (ASTRON)',
'eosc.bih_-_center_digital_health': 'Berlin Institute of Health at Charité – Universitätsmedizin Berlin, Center of Digital Health ',
'eosc.net7': 'Net7 S.r.l.', 'eosc.csuc': 'Consorci de Serveis Universitaris de Catalunya',
'eosc.iasa': 'Institute of Accelerating Systems and Applications',
'eosc.elixir-italy': 'ELIXIR Italy',
'eosc.rolos': 'Rolos Machine Intelligence Platform for academia and business with Consulting and Applications',
'eosc.readcoop': 'READ-COOP SCE mit beschränkter Haftung',
'eosc.slices': 'Scientific Large Scale Infrastructure for Computing/Communication Experimental Studies',
'eosc.emphasis': 'European Infrastructure for Plant Phenotyping',
'eosc.usv': 'Stefan cel Mare University of Suceava', 'eosc.enhancer': 'EnhanceR',
'eosc.asgc': 'Academia Sinica Grid Computing Centre', 'eosc.msw': 'MyScienceWork',
'eosc.oipub': 'Omni Iota Science Limited',
'ni4os.ichtm': 'Institute of Chemistry, Technology and Metallurgy, University of Belgrade',
'eosc.surf-nl': 'SURF', 'eosc.esrf': 'European Synchrotron Radiation Facility',
'eosc.ensam': 'Arts et Metiers Institute of Technology',
'eosc.desy': 'Deutsches Elektronen-Synchrotron',
'eosc.ifremer': 'Ifremer, the French National Institute for Ocean Science',
'eosc.inria': 'Institut national de recherche en informatique et en automatique',
'eosc.gbif_portugal': 'Portuguese Node of GBIF',
'eosc.mobile_observation_integration_service': 'DDQ B.V.',
'eosc.awi_bremerhaven': 'Alfred Wegener Institute for Polar and Marine Research in cooperation with MARUM, Center for Marine Environmental Sciences',
'eosc.tib': 'Leibniz Information Centre for Science and Technology',
'eosc.obp': 'Open Book Publishers',
'eosc.diamond_light_source': 'Diamond Light Source Ltd.',
'eosc.kit-scc': 'KIT - Scientific Computing Center',
'eosc.sites': 'Swedish Infrastructure for Ecosystem Science',
'eosc.crg': 'Centre for Genomic Regulation',
'eosc.naes_of_ukraine': ' National Academy of Educational Sciences of Ukraine',
'eosc.soleil': 'Synchrotron SOLEIL', 'eosc.eiscat': 'EISCAT Scientific Association',
'eosc.teledyne': 'Teledyne Marine', 'eosc.uni-freiburg': 'University of Freiburg',
'eosc.lago': 'Latin American Giant Observatory',
'eosc.sios': 'The Svalbard Integrated Arctic Earth Observing System',
'eosc.upc': 'Universitat Politècnica de Catalunya',
'eosc.ess_eric': 'European Social Survey, European Research Infrastructure Consortium',
'eosc.arkivum': 'Arkivum Limited', 'eosc.enermaps': 'EnerMaps',
'eosc.cineca': 'Cineca Consorzio Interuniversitario', 'eosc.bi_insight': 'BI INSIGHT S.A.',
'eosc.embl-ebi': 'European Molecular Biology Laboratory - European Bioinformatics Institute',
'eosc.ifca-csic': 'Institute of Physics of Cantabria (IFCA)',
'eosc.kue': 'Krakow University of Economics, Main Library',
'eosc.ulb-sa': 'University and State Library of Saxony Anhalt',
'eosc-nordic.llu': 'Latvia University of Life Sciences and Technologies',
'eosc.fairmat': 'Consortium FAIRmat', 'eosc.authenix': 'Secure Dimensions GmbH',
'eosc.cnr-iia': 'Institute of Atmospheric Pollution Research - National Research Council of Italy',
'eosc.blue-cloud': 'Blue-Cloud - Piloting innovative services for Marine Research & the Blue Economy',
'eosc.upekrl': 'University of Physical Education in Krakow, Library',
'eosc.oxford_e-research_centre': 'Oxford e-Research Centre, University of Oxford, UK',
'eosc.fir': 'FIR e. V. at RWTH Aachen University', 'eosc.lab1100': 'LAB1100',
'eosc.capsh': 'Committee for the Accessibility of Publications in Sciences and Humanities',
'eosc.kit': 'Karlsruhe Institute of Technology',
'eosc.ciemat-tic': 'Scientific IT Research Activities and Knowledge, ICT Division, CIEMAT',
'eosc.operas': 'OPERAS AISBL',
'ni4os.grena': 'Georgian Research and Educational Networking Association',
'eosc.riga_stradins_university': 'Riga Stradins University',
'eosc.hostkey': 'HOSTKEY B.V. - Dedicated servers in Amsterdam DC', 'eosc.ubiwhere': 'Ubiwhere ',
'eosc.bsc-es': 'Barcelona Supercomputing Center - Centro Nacional de Supercomputación',
'eosc.euro-argo': 'Euro-Argo ERIC, the European contribution to Argo programme',
'eosc.cnag': 'Consorcio para la Explotación del Centro Nacional de Análisis Genómico',
'eosc.hzdr': 'Helmholtz-Zentrum Dresden-Rossendorf e.V.',
'eosc.eosc.grnet': 'National Infrastructures for Research and Technology',
'eosc.embrc-eric': 'European Marine Biological Resource Centre', 'eosc.dynaikon': 'DynAikon Limited',
'ni4os.nsl-ge': 'National Science Library at Tbilisi State University',
'eosc.ktu': 'Kaunas University of Technology', 'eosc.sj-ucp': 'Universidade Católica Portuguesa',
'eosc.gcc_umcg': 'Genomics Coordination Center, University Medical Center Groningen',
'eosc.psnc': 'Poznan Supercomputing and Networking Center',
'eosc.consorci_cee_lab_llum_sincrotro': 'CONSORCI PER A LA CONSTRUCCIÓ, EQUIPAMENT I EXPLOTACIÓ DEL LABORATORI DE LLUM SINCROTRÓ',
'eosc.ei': 'Earlham Institute', 'eosc.psi': 'Paul Scherrer Institute',
'eosc.seadatanet': 'SeaDataNet',
'eosc.uit': 'UiT The Arctic University of Norway', 'eosc.ukaea': 'UK Atomic Energy Authority',
'eosc.switch': 'SWITCH', 'eosc.bkh': 'Biodiversity Knowledge Hub',
'eosc.fzj': 'Forschungszentrum Jülich',
'eosc.grycap': 'Institute of Instrumentation for Molecular Imaging - Grid and High Performance Computing - Universitat Politècnica de València',
'eosc.infrafrontier': 'INFRAFRONTIER', 'eosc.siris_academic': 'SIRIS Academic SL',
'eosc.ill': 'Institut Laue Langevin',
'eosc.lindatclariah-cz': 'LINDAT/CLARIAH-CZ Research Infrastructure for Language Resources and Digital Arts and Humanities in the Czech Republic',
'eosc.mediprospectsai': 'MediprospectsAI ltd',
'eosc.coard': 'Collaborative Open Access Research and Development', 'eosc.elixir-europe': 'ELIXIR',
'eosc.jsc-de': 'Jülich Supercomputing Centre', 'eosc.fh_joanneum': 'FH JOANNEUM Gesellschaft mbH',
'eosc.dsmz': 'Leibniz Institute DSMZ - German Collection of Microorganisms and Cell Cultures',
'eosc.data_revenue': 'Data Revenue', 'eosc.openbiomaps': 'OpenBioMaps Consortium',
'eosc.edelweiss_connect': 'Edelweiss Connect GmbH', 'eosc.egi-fed': 'EGI Foundation',
'ni4os.ipb': 'Institute of Physics Belgrade', 'eosc.upf': 'Universitat Pompeu Fabra',
'eosc.infn': 'Italian National Institute of Nuclear Physics',
'eosc.sks': 'Scientific Knowledge Services', 'eosc.cds': 'Strasbourg astronomical Data Centre',
'eosc.geant': 'GÉANT Association',
'eosc.emso_eric': 'European Multidisciplinary Seafloor and water column Observatory',
'eosc.upv-es': 'Universitat Politècnica de València',
'eosc.csi_piemonte': 'Consorzio per il Sistema Informativo',
'eosc.bifi_-_unizar': 'Institute for Biocomputation and Physics of Complex Systems - University of Zaragoza',
'eosc.wenmr': 'A Worldwide e-Infrastructure for Structural Biology',
'eosc.bioexcel': 'BioExcel Centre of Excellence', 'eosc.ubora': 'UBORA association',
'ni4os.fcub': 'University of Belgrade - Faculty of Chemistry',
'eosc.coronis_computing_sl': 'CORONIS COMPUTING SL',
'eosc.jagiellonian_library': 'Jagiellonian University, Jagiellonian Library',
'eosc.data_centre': 'Centre for Data Analysis and Archiving',
'eosc.elettra_sincrotrone_trieste': 'George Kourousias',
'eosc.fairdi': 'FAIR Data Infrastructure for Physics, Chemistry, Materials Science, and Astronomy',
'eosc.embimos': 'EMBIMOS (Environmental and Sustainability Participatory Information Systems)',
'eosc.mz': 'Materials Zone',
'eosc.charite_bih_brain_simulation': 'Charité University Medicine Berlin, Berlin Institute of Health, Brain Simulation Section',
'eosc.ici_bucharest': 'National Institute for Research and Development in Informatics - ICI Bucharest',
'eosc.ibiom-cnrhttpwwwibiomcnrit': 'Institute of Biomembranes, Bioenergetics and Molecular Biotechnologies, National Research Council',
'eosc.bineo': 'Bineo Consulting SL', 'eosc.uniwersytet_opolski': 'University of Opole',
'eosc.oasees': 'Open autonomous programmable cloud apps & smart sensors', 'eosc.datacite': 'DataCite',
'eosc.idea': 'IDEAconsult', 'eosc.iict': 'Institute of Information and Communication Technologies',
'eosc.unibo': 'Alma Mater Studiorum - Università di Bologna',
'eosc.iasa_of_nasu': 'Institute for Applied System Analysis of the National Academy of Sciences of Ukraine',
'eosc.cyberbotics': 'Cyberbotics',
'eosc.cite': 'Communication & Information Technologies Experts SA Consulting and Development Services',
'eosc.gesis': 'GESIS Leibniz Institute for the Social Sciences', 'eosc.unipd': 'University of Padua',
'eosc.smartsmear': 'Institute for Atmospheric and Earth System Research',
'eosc.euro-bioimaging': 'Euro-BioImaging', 'eosc.gft': 'GFT Italy',
'eosc.cc-in2p3cnrs': 'Computing Centre of the National Institute of Nuclear Physics and Particle Physics, CNRS',
'eosc.ror-org': 'Research Organization Registry',
'eosc.bijvoetcenter': 'Bijvoet Centre - Utrecht University', 'eosc.d4science': 'D4Science',
'eosc.terradue': 'Terradue', 'eosc.gbif': 'Global Biodiversity Information Facility (GBIF)',
'eosc.csc-fi': 'CSC – IT CENTER FOR SCIENCE',
'eosc.cesga': 'Fundacion Centro Tecnologico de Supercomputacion de Galicia',
'eosc.ubfzf': 'University of Belgrade – Faculty of Philosophy',
'eosc.cines': 'National Computing Center for Higher Education',
'eosc.uni_konstanz': 'University of Konstanz', 'eosc.cesnet': 'CESNET', 'eosc.cs_group': 'CS GROUP',
'eosc.treeofscience': 'Tree of Science', 'eosc.cscs': 'Swiss National Supercomputing Centre',
'eosc.denbi': 'de.NBI - German Network for Bioinformatics Infrastructure',
'eosc.gwdg': 'Gesellschaft für wissenschaftliche Datenverarbeitung mbH Göttingen',
'eosc.sciences_po': 'Fondation Nationale des Sciences Politiques',
'eosc.cern': 'EUROPEAN ORGANIZATION FOR NUCLEAR RESEARCH',
'eosc.unibi-ub': 'Bielefeld University Library', 'eosc.sinergise': 'Sinergise',
'eosc.plantnet': 'PlantNet consortium (hosted by Inria)', 'eosc.exoscale': 'EXOSCALE',
'eosc.cmcc': 'Euro-Mediterranean Center on Climate Change',
'eosc.taltechdata': 'Tallinn University of Technology',
'eosc.tum-net': 'Technical University of Munich, Chair of Network Architectures and Services',
'eosc.cnio': 'CNIO - Spanish National Cancer Research Centre',
'eosc.hits': 'Heidelberg Institute for Theoretical Studies',
'eosc.zpid': 'Leibniz Institute for Psychology', 'eosc.fssda': 'Finnish Social Science Data Archive',
'eosc.ugr-es': 'University of Granada – UGR',
'eosc.etais': 'Estonian Scientific Computing Infrastructure',
'eosc.inoe_2000': 'National Institute for Research and Development in Optoelectronics',
'eosc.northern_data_cloud_services': 'ND CS (Services) GmbH', 'eosc.eurac': 'Eurac Research',
'eosc.europeana': 'Europeana Foundation', 'eosc.kit-lib': 'KIT - Library',
'eosc.dkrz': 'Deutsches Klimarechenzentrum GmbH',
'eosc.predictia': 'Predictia Intelligent Data Solutions SL', 'eosc.scipedia': 'SCIPEDIA',
'ni4os.rbi': 'Ruđer Bošković Institute', 'eosc.jelastic': 'Virtuozzo',
'eosc.scigne': 'The SCIGNE Platform',
'eosc.ibergrid': 'IBERGRID - Iberian Distributed Computing Infrastructure',
'eosc.openedition': 'OpenEdition', 'eosc.norce': 'NORCE Norwegian Research Centre',
'eosc.lsd-ufcg': 'Laboratório de Sistemas Distribuídos - Universidade Federal de Campina Grande',
'eosc.sethsoftware': 'Seth Software spółka z ograniczoną odpowiedzialnością',
'eosc.gsi': 'GSI Helmholtzzentrum für Schwerionenforschung GmbH',
'eosc.incd': 'Portuguese National Distributed Computing Infrastructure (INCD)',
'eosc.iisas': 'Institute of Informatics - Slovak Academy of Sciences ',
'eosc.100percentit': '100 Percent IT', 'eosc.f6snl': 'F6S Network',
'eosc.trust-it': 'Trust-IT Services',
'eosc.eodc': 'Earth Observation Data Centre for Water Resources Monitoring',
'ni4os.uob-rcub': 'University of Belgrade Computer Centre',
'eosc.unige': 'University of Geneva, Department of Astronomy',
'eosc.leaena': 'National Technical University of Athens', 'eosc.doabf': 'DOAB Foundation',
'eosc.rbi': 'Ruđer Bošković Institute', 'eosc.sobigdata': 'SoBigData',
'eosc.progedo': 'PROduction et GEstion des DOnnées',
'eosc.isa-ulisboa': 'Instituto Superior de Agronomia da Universidade de Lisboa',
'eosc.openknowledgemaps': 'Open Knowledge Maps - Verein zur Förderung der Sichtbarkeit wissenschaftlichen Wissens',
'eosc.fau_evt': 'Friedrich-Alexander-University Erlangen-Nürnberg, Chair of Energy Process Engineering',
'eosc.nikhef': 'Nikhef (Stichting Nederlandse Wetenschappelijk Onderzoek Instituten)',
'eosc.charles_university': 'Charles University', 'eosc.dcc-uk': 'Digital Curation Centre',
'eosc.it4i_vsb-tuo': 'VSB – Technical University of Ostrava, IT4Innovations National Supercomputing Center',
'eosc.mundi_web_services': 'Mundi Web Services',
'eosc.gdansk_tech': 'Gdańsk University of Technology',
'eosc.bg_up': 'Pedagogical University of Krakow, Main Library', 'eosc.figshare': 'Figshare',
'eosc.libnova': 'LIBNOVA SL', 'eosc.pml': 'Plymouth Marine Laboratory',
'eosc.eox': 'EOX IT Services GmbH', 'eosc.dtu': 'Technical University of Denmark',
'eosc.european_xfel': 'European X-ray Free Electron Laser Facility GmbH ',
'eosc.cyfronet': 'Academic Computer Centre CYFRONET AGH',
'eosc.progressive': 'Progressive Systems Srl',
'eosc.ipsl': 'Institut Pierre-Simon Laplace',
'ni4os.grnet': 'National Infrastructures for Research and Technology',
'eosc-nordic.uot': 'University of Tartu', 'eosc.sztaki': 'INSTITUTE FOR COMPUTER SCIENCE AND CONTROL',
'eosc.cnr_-_isti': 'Institute for Information Science and Technologies "Alessandro Faedo" - ISTI',
'eosc.cbra': 'Clinical Bioinformatics Area', 'eosc.beia': 'BEIA CONSULT INTERNATIONAL',
'eosc.slu': 'Swedish University of Agricultural Sciences', 'eosc.elcogen': 'Elcogen Oy',
'eosc.enoll': 'European Network of Living Labs', 'eosc.inode': 'Intelligent Open Data Exploration',
'eosc.creaf': 'Center for Research in Ecology and Forestry Applications',
'eosc.csic': 'Consejo Superior de Investigaciones Científicas (CSIC)',
'eosc.athena': 'Athena Research and Innovation Center in Information and Communication Technologies',
'eosc.carlzeissm': 'Carl Zeiss Microscopy', 'eosc.unimib': 'University of Milano-Bicocca',
'eosc.ukri_-_stfc': 'UK Research and Innovation - Science and Technology Facilities Council',
'eosc.niod': 'NIOD Institute for War, Genocide and Holocaust Studies',
'eosc.cloudferro': 'CloudFerro',
'eosc.vliz': 'Flanders Marine Institute', 'eosc.unitartu': 'University of Tartu',
'eosc.lu': 'Lund University',
'eosc.clarin-eric': 'European Research Infrastructure for Language Resources and Technology',
'eosc.ekt': 'National Documentation Centre', 'eosc.digifarm': 'DigiFarm',
'eosc.inaf': 'Istituto Nazionale di Astrofisica',
'eosc.altec': 'Aerospace Logistics Technology Engineering Company',
'eosc.hu-cms': 'Humboldt-Universität zu Berlin - Computer- und Medienservice',
'eosc.agh_university_main_library': 'AGH University of Krakow Main Library ',
'eosc.ictlc': 'ICTLC S.P.A.', 'eosc.transcript': 'transcript Independent Academic Publishing ',
'eosc.elixir-uk': 'ELIXIR United Kingdom',
'eosc.acdh-ch': 'Austrian Centre for Digital Humanities and Cultural Heritage',
'eosc.tubitak_ulakbim': 'Turkish Academic Network and Information Center', 'eosc.sixsq': 'SixSq',
'eosc.fzj-inm7': 'Forschungszentrum Jülich, Institute of Neurosciences and Medicine (INM) Brain and Behavior (INM-7)',
'eosc.forth': 'Foundation for Research and Technology, Hellas (FORTH)',
'eosc.grnet': 'National Infrastructures for Research and Technology',
'eosc.prace': 'Partnership For Advanced Computing in Europe aisbl',
'eosc.umr_map': 'UMR CNRS/MC 3495 MAP', 'eosc.fris': 'Flemish Research Information Space',
'eosc.komanord': 'Koma Nord', 'eosc.unparallel': 'UNPARALLEL Innovation, Lda',
'eosc.lifewatch-eric': 'LifeWatch ERIC', 'eosc.university_of_sussex': 'The University of Sussex',
'eosc.cnb-csic': 'Centro Nacional de Biotecnologia (CSIC)', 'eosc.elsevier': 'Elsevier BV',
'eosc.eudat': 'EUDAT', 'eosc.nilu': 'The Foundation NILU',
'eosc.oslo_university': 'University of Oslo',
'eosc.uo': 'University of Oulu', 'eosc.lapp': "Laboratoire d'Annecy de Physique des Particules",
'eosc.cessda-eric': 'Consortium of European Social Science Data Archives ERIC',
'eosc.olos': 'OLOS Association', 'eosc.obsparis': 'Observatoire de Paris'}, 'guideline_type': {
'ir_eosc_guideline_type-eosc_core_interoperability_guideline': 'EOSC-Core Interoperability Guideline',
'ir_eosc_guideline_type-eosc_exchange_interoperability_guideline_thematic': 'EOSC-Exchange Interoperability Guideline (Thematic)',
'ir_eosc_guideline_type-eosc_exchange_interoperability_guideline_horizontal': 'EOSC-Exchange Interoperability Guideline (Horizontal)',
'ir_eosc_guideline_type-operational_baseline': 'Operational Baseline'},
'tr_access': {'tr_access_right-open_access': 'Open Access',
'tr_access_right-restricted_access': 'Restricted Access',
'tr_access_right-metadata_only_access': 'Metadata Only Access',
'tr_access_right-paid_access': 'Paid Access'},
'subdomains': {
'scientific_subdomain-agricultural_sciences-agricultural_biotechnology': 'Agricultural Biotechnology',
'scientific_subdomain-agricultural_sciences-agriculture_forestry_and_fisheries': 'Agriculture, Forestry & Fisheries',
'scientific_subdomain-agricultural_sciences-animal_and_dairy_sciences': 'Animal & Dairy Sciences',
'scientific_subdomain-agricultural_sciences-other_agricultural_sciences': 'Other Agricultural Sciences',
'scientific_subdomain-agricultural_sciences-veterinary_sciences': 'Veterinary Sciences',
'scientific_subdomain-engineering_and_technology-chemical_engineering': 'Chemical Engineering',
'scientific_subdomain-engineering_and_technology-civil_engineering': 'Civil Engineering',
'scientific_subdomain-engineering_and_technology-electrical_electronic_and_information_engineering': 'Electrical, Electronic & Information Engineering',
'scientific_subdomain-engineering_and_technology-environmental_biotechnology': 'Environmental Biotechnology',
'scientific_subdomain-engineering_and_technology-environmental_engineering': 'Environmental Engineering',
'scientific_subdomain-engineering_and_technology-industrial_biotechnology': 'Industrial Biotechnology',
'scientific_subdomain-engineering_and_technology-materials_engineering': 'Materials Engineering',
'scientific_subdomain-engineering_and_technology-mechanical_engineering': 'Mechanical Engineering',
'scientific_subdomain-engineering_and_technology-medical_engineering': 'Medical Engineering',
'scientific_subdomain-engineering_and_technology-nanotechnology': 'Nanotechnology',
'scientific_subdomain-engineering_and_technology-other_engineering_and_technology_sciences': 'Other Engineering & Technology Sciences',
'scientific_subdomain-generic-generic': 'Generic', 'scientific_subdomain-humanities-arts': 'Arts',
'scientific_subdomain-humanities-history_and_archaeology': 'History & Archaeology',
'scientific_subdomain-humanities-languages_and_literature': 'Languages & Literature',
'scientific_subdomain-humanities-other_humanities': 'Other Humanities',
'scientific_subdomain-humanities-philosophy_ethics_and_religion': 'Philosophy, Ethics & Religion',
'scientific_subdomain-medical_and_health_sciences-basic_medicine': 'Basic Medicine',
'scientific_subdomain-medical_and_health_sciences-clinical_medicine': 'Clinical Medicine',
'scientific_subdomain-medical_and_health_sciences-health_sciences': 'Health Sciences',
'scientific_subdomain-medical_and_health_sciences-medical_biotechnology': 'Medical Biotechnology',
'scientific_subdomain-medical_and_health_sciences-other_medical_sciences': 'Other Medical Sciences',
'scientific_subdomain-natural_sciences-biological_sciences': 'Biological Sciences',
'scientific_subdomain-natural_sciences-chemical_sciences': 'Chemical Sciences',
'scientific_subdomain-natural_sciences-computer_and_information_sciences': 'Computer & Information Sciences',
'scientific_subdomain-natural_sciences-earth_and_related_environmental_sciences': 'Earth & Related Environmental Sciences',
'scientific_subdomain-natural_sciences-mathematics': 'Mathematics',
'scientific_subdomain-natural_sciences-other_natural_sciences': 'Other Natural Sciences',
'scientific_subdomain-natural_sciences-physical_sciences': 'Physical Sciences',
'scientific_subdomain-other-other': 'Other',
'scientific_subdomain-social_sciences-economics_and_business': 'Economics & Business',
'scientific_subdomain-social_sciences-educational_sciences': 'Educational Sciences',
'scientific_subdomain-social_sciences-law': 'Law',
'scientific_subdomain-social_sciences-media_and_communications': 'Media & Communications',
'scientific_subdomain-social_sciences-other_social_sciences': 'Other Social Sciences',
'scientific_subdomain-social_sciences-political_sciences': 'Political Sciences',
'scientific_subdomain-social_sciences-psychology': 'Psychology',
'scientific_subdomain-social_sciences-social_and_economic_geography': 'Social & Economic Geography',
'scientific_subdomain-social_sciences-sociology': 'Sociology'},
'access_type': {'access_type-mail_in': 'Mail-In', 'access_type-other': 'Other', 'access_type-physical': 'Physical',
'access_type-remote': 'Remote', 'access_type-virtual': 'Virtual'},
'expertise_level': {'tr_expertise_level-advanced': 'Advanced', 'tr_expertise_level-intermediate': 'Intermediate',
'tr_expertise_level-beginner': 'Beginner', 'tr_expertise_level-all': 'All'},
'tr_content': {'tr_content_resource_type-animation': 'Animation', 'tr_content_resource_type-audio': 'Audio',
'tr_content_resource_type-diagram': 'Diagram', 'tr_content_resource_type-game': 'Game',
'tr_content_resource_type-image': 'Image', 'tr_content_resource_type-multimedia': 'Multimedia',
'tr_content_resource_type-poster': 'Poster', 'tr_content_resource_type-slides': 'Slides',
'tr_content_resource_type-text': 'Text', 'tr_content_resource_type-video': 'Video',
'tr_content_resource_type-website': 'Website', 'tr_content_resource_type-other': 'Other'},
'domains': {'scientific_domain-agricultural_sciences': 'Agricultural Sciences',
'scientific_domain-engineering_and_technology': 'Engineering & Technology',
'scientific_domain-generic': 'Generic', 'scientific_domain-humanities': 'Humanities',
'scientific_domain-medical_and_health_sciences': 'Medical & Health Sciences',
'scientific_domain-natural_sciences': 'Natural Sciences', 'scientific_domain-other': 'Other',
'scientific_domain-social_sciences': 'Social Sciences'},
'tr_dcmi': {'tr_dcmi_type-activity_plan': 'Activity Plan', 'tr_dcmi_type-assessment': 'Assessment',
'tr_dcmi_type-assessment_item': 'Assessment Item',
'tr_dcmi_type-educator_curriculum_guide': 'Educator Curriculum Guide',
'tr_dcmi_type-lesson_plan': 'Lesson Plan',
'tr_dcmi_type-physical_learning_resource': 'Physical Learning Resource',
'tr_dcmi_type-recorded_lesson': 'Recorded Lesson',
'tr_dcmi_type-supporting_document': 'Supporting Document', 'tr_dcmi_type-textbook': 'Textbook',
'tr_dcmi_type-unit_plan': 'Unit Plan', 'tr_dcmi_type-other': 'Other'},
'funding_program': {'funding_program-afis2020': 'Anti Fraud Information System (AFIS2020)',
'funding_program-agr': 'European Agricultural Guarantee Fund (after transfers between EAGF and EAFRD) (AGR)',
'funding_program-agrnet': 'Net transfer between EAGF and EAFRD (AGRNET)',
'funding_program-amf': 'Asylum, Migration and Integration Fund (AMF)',
'funding_program-cdf2020': 'Rights, equality and citizenship programme (CDF2020)',
'funding_program-cef': 'Connecting Europe Facility (CEF)',
'funding_program-cf': 'Cohesion Fund (CF)',
'funding_program-cf_det': 'Contribution from the Cohesion Fund to the CEF programme (CF_DET)',
'funding_program-cfsp': 'Common foreign and security policy (CFSP2020)',
'funding_program-cit2020': 'Europe for Citizens (CIT2020)',
'funding_program-compreg': 'Competitiveness (more developed regions) (COMPREG)',
'funding_program-cons': 'Consumer programme (CONS)',
'funding_program-copernicus': 'European Earth Observation Programme (COPERNICUS)',
'funding_program-cosme': 'Programme for the competitiveness of enterprises and small and medium-sized enterprises (COSME)',
'funding_program-cpm_h3': 'Union Civil Protection Mechanism — Member States (CPM_H3)',
'funding_program-cpm_h4': 'Union Civil Protection Mechanism — Outside EU (CPM_H4)',
'funding_program-crea': 'Creative Europe programme (CREA)',
'funding_program-cust2020': 'Action programme for customs in the European Union (CUST 2020)',
'funding_program-dci2020': 'Development Cooperation Instrument (DCI2020)',
'funding_program-e4a': 'The Union programme for education, training, youth and sport (Erasmus+) (E4A)',
'funding_program-eafrd': 'European Agricultural Fund for Rural Development (after transfers between EAGF and EAFRD) (EAFRD)',
'funding_program-eafrd2020': 'European Agricultural Fund for Rural Development (EAFRD2020)',
'funding_program-eagf2020': 'European Agricultural Guarantee Fund (EAGF2020)',
'funding_program-ear2020': 'Emergency Aid Reserve (EAR2020)',
'funding_program-eerp': 'Energy projects to aid economic recovery (EERP)',
'funding_program-efsd': 'European Fund for Sustainable Development (EFSD)',
'funding_program-efsi': 'European Fund for Strategic Investments (EFSI)',
'funding_program-egf2020': 'European Globalisation Adjustment Fund (EGF2020)',
'funding_program-eidhr2020': 'European Instrument for Democracy and Human Rights (EIDHR2020)',
|
||||
'funding_program-emff2020': 'European Maritime and Fisheries Fund (EMFF2020)',
|
||||
'funding_program-eni': 'European Neighbourhood Instrument (ENI)',
|
||||
'funding_program-erdf': 'European Regional Development Fund (ERDF)',
|
||||
'funding_program-esc': 'European Solidarity Corps (ESC)',
|
||||
'funding_program-esf': 'European Social Fund (ESF)',
|
||||
'funding_program-esp2017': 'European statistical programme (ESP2017)',
|
||||
'funding_program-esp2020': 'European statistical programme (ESP2020)',
|
||||
'funding_program-euav': 'EU Aid Volunteers initiative (EUAV)',
|
||||
'funding_program-euratom': 'Euratom research and training programme (EURATOM)',
|
||||
'funding_program-eurodac2020': 'Comparison of fingerprints for the effective application of the Dublin Convention (EURODAC2020)',
|
||||
'funding_program-eusf2020': 'European Union Solidarity Fund (EUSF2020)',
|
||||
'funding_program-eusf_h3': 'European Union Solidarity Fund (EUSF) — Member States (EUSF_H3)',
|
||||
'funding_program-eusf_h4': 'European Union Solidarity Fund (EUSF) — Countries negotiating for accession (EUSF_H4)',
|
||||
'funding_program-fead': 'Fund for European Aid to the Most Deprived (FEAD)',
|
||||
'funding_program-ff2020': 'Food and feed (FF2020)',
|
||||
'funding_program-finser2020': 'Specific activities in the field of financial reporting and auditing (FINSER2020)',
|
||||
'funding_program-fisc2020': 'Action programme for taxation in the European Union (FISC2020)',
|
||||
'funding_program-gal2014': 'Implementation and exploitation of European satellite navigation systems (EGNOS and Galileo) (GAL2014)',
|
||||
'funding_program-grld2020': 'EU cooperation with Greenland (GRLD2020)',
|
||||
'funding_program-h2020': 'The framework programme for research and innovation (H2020)',
|
||||
'funding_program-health': "Union's action in the field of health (Health programme) (HEALTH)",
|
||||
'funding_program-herc3': "Programme to promote activities in the field of the protection of the European Union's financial interests (HERC3)",
|
||||
'funding_program-hfr2015': 'Supplementary high flux reactor (HFR) programmes (HFR2015)',
|
||||
'funding_program-huma2020': 'Humanitarian aid (HUMA2020)',
|
||||
'funding_program-icfs': 'Enhancing consumers involvement in EU policy making in the field of financial services (ICFS)',
|
||||
'funding_program-ies': 'Instrument for emergency support within the Union (IES)',
|
||||
'funding_program-ifs2020': 'Instrument contributing to Stability and Peace (IFS2020)',
|
||||
'funding_program-insc2020': 'Instrument for Nuclear Safety Cooperation (INSC2020)',
|
||||
'funding_program-ipa2': 'Instrument for Pre-accession Assistance (IPA2)',
|
||||
'funding_program-isa2015': 'Interoperability Solutions for European Public Administrations (ISA2015)',
|
||||
'funding_program-isa2020': 'Interoperability Solutions for European public administrations, businesses and citizens (ISA2020)',
|
||||
'funding_program-isf': 'Internal Security Fund (ISF)',
|
||||
'funding_program-iter': 'International thermonuclear experimental reactor (ITER)',
|
||||
'funding_program-just': 'Justice programme (JUST)',
|
||||
'funding_program-life2020': 'Programme for the Environment and Climate Action (LIFE2020)',
|
||||
'funding_program-loan2020': 'Guarantee Fund for external actions (LOAN2020)',
|
||||
'funding_program-mfa': 'Macro financial assistance (MFA)',
|
||||
'funding_program-nd': 'Nuclear decommissioning assistance programmes in Bulgaria, Lithuania and Slovakia (ND)',
|
||||
'funding_program-other': 'Other',
|
||||
'funding_program-outreg': 'Outermost and sparsely populated regions (OUTREG)',
|
||||
'funding_program-peri2020': 'Exchange, assistance and training programme for the protection of the euro against counterfeiting (PERI2020)',
|
||||
'funding_program-pi': 'Partnership instrument for cooperation with third countries (PI)',
|
||||
'funding_program-psci': 'European Union programme for employment and social innovation (PSCI)',
|
||||
'funding_program-regconv': 'Regional convergence (REGCONV)',
|
||||
'funding_program-rfmos': 'Compulsory contributions to regional fisheries management organisations (RFMOs) and to other international organisations',
|
||||
'funding_program-sfpas': 'Sustainable Fisheries Partnership Agreements (SFPAs)',
|
||||
'funding_program-sis2020': 'Schengen Information System (SIS2020)',
|
||||
'funding_program-ta_ia': 'Technical assistance and innovative actions (TA_IA)',
|
||||
'funding_program-tcc': 'Instrument of financial support for encouraging the economic development of the Turkish Cypriot community (TCC)',
|
||||
'funding_program-terrcoop': 'European territorial cooperation (TERRCOOP)',
|
||||
'funding_program-transreg': 'Transition regions (TRANSREG)',
|
||||
'funding_program-vis2020': 'Visa Information System (VIS2020)',
|
||||
'funding_program-yei': 'Youth employment initiative (specific top-up allocation) (YEI)',
'funding_program-lripmeys': 'Large Research Infrastructures Programme of the MEYS, Czech Republic',
'funding_program-ddoict': 'Development, deployment and operation of ICT-based e-infrastructures',
'funding_program-nucleu': 'NUCLEU Programme (Romania)',
'funding_program-driltah': 'LINDAT/CLARIAH-CZ Digital Research Infrastructure for the Language Technologies, Arts and Humanities (LM2018101)',
'funding_program-esaeoep': 'ESA EO Exploitation Platforms initiative'},
'order_type': {'order_type-fully_open_access': 'Fully Open Access', 'order_type-open_access': 'Open Access',
'order_type-order_required': 'Order Required', 'order_type-other': 'Other'}, 'related_resource': {},
'related_resources': {}}

@@ -0,0 +1,23 @@
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch
from airflow.providers.amazon.aws.hooks.s3 import S3Hook


def get_opensearch_client(kwargs) -> OpenSearch:
    conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
    return OpenSearch(
        hosts=[{'host': conn.host, 'port': conn.port}],
        http_auth=(conn.login, conn.password),
        use_ssl=True,
        verify_certs=False,
        ssl_show_warn=False,
        pool_maxsize=20,
        timeout=180
    )


def get_bucket_name(context: dict, hook: S3Hook, param_name: str):
    bucket_name = context["params"][param_name]
    if not bucket_name:
        bucket_name = hook.extra_args['bucket_name']
    return bucket_name
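
Below is a minimal usage sketch (not part of the commits) showing how a DAG task might call these helpers; it assumes the task receives an "OPENSEARCH_CONN_ID" param as the other DAGs do, and that an S3 connection named "s3_conn" exists. The task name and the "DST_BUCKET" param are purely illustrative:

from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

from dag_utils import get_opensearch_client, get_bucket_name


@task
def ping_cluster(**context):
    # get_opensearch_client reads params["OPENSEARCH_CONN_ID"] from the task context
    client = get_opensearch_client(context)
    print(client.cluster.health())

    # get_bucket_name falls back to the hook's configured bucket_name
    # when the param (here the hypothetical "DST_BUCKET") is empty
    hook = S3Hook("s3_conn")
    print(get_bucket_name(context, hook, "DST_BUCKET"))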

@@ -0,0 +1,43 @@
import os
from datetime import timedelta

import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "url": "File to download",
        "dst_key": "key containing the file",
        "dst_bucket": "bucket that will contain file"
    },
    tags=["s3"],
)
def download_to_s3():
    @task
    def download(**context):
        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
        with requests.get(context["params"]["url"], stream=True) as r:
            r.raise_for_status()
            hook.load_file_obj(r.raw, context["params"]["dst_key"], bucket_name=context["params"]["dst_bucket"], replace=True, encrypt=False)

    download()


download_to_s3()
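
Assuming the DAG file is deployed unchanged (so the dag_id defaults to the decorated function name, download_to_s3), a run could presumably be started from the Airflow CLI with the three params overridden; the URL, key and bucket below are placeholders:

airflow dags trigger download_to_s3 \
    --conf '{"url": "https://example.org/dump.tar.gz", "dst_key": "imports/dump.tar.gz", "dst_bucket": "my-import-bucket"}'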

@@ -0,0 +1,218 @@
from __future__ import annotations

import os
from datetime import timedelta

import opensearchpy
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.utils.helpers import chain
from opensearchpy import OpenSearch, helpers

from catalogue.RawCatalogOpensearch import RawCatalogOpensearch

EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    dag_id="import_Catalogue",
    schedule=None,
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "SHARDS": 3,
        "SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
    },
    tags=["lot1"]
)
def import_catalogue_entities():
    @task
    def create_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20,
            timeout=180
        )

        for entity in RawCatalogOpensearch.entities:
            indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
            if client.indices.exists(indexname):
                client.indices.delete(indexname)

    @task
    def harvest_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20,
            timeout=180
        )
        catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])

        session = requests.session()
        for entity in RawCatalogOpensearch.entities:
            indexname = catalog.get_index(entity)
            baseurl = "http://vereniki.athenarc.gr:8080/eic-registry"
            callurl = f"{baseurl}/{entity}"
            params = {"draft": "false", "active": "true", "suspended": "false"}

            if client.indices.exists(indexname):
                client.indices.delete(indexname)

            while True:
                reply = session.get(url=callurl, params=params)
                reply.raise_for_status()
                content = reply.json()
                if 'results' not in content:
                    break
                results = content['results']
                if len(results) <= 0:
                    break

                def streamed_results():
                    for r in results:
                        yield {"_index": indexname, "_id": r['id'], "_source": r}

                succeeded = 0
                failed = 0
                for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
                    if success:
                        succeeded = succeeded + 1
                    else:
                        print("error: " + str(item))
                        failed = failed + 1

                # end of stream conditions
                if content['to'] >= content['total']:
                    break
                params['from'] = content['to']
            client.indices.refresh(indexname)

    @task
    def map_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20,
            timeout=180
        )

        catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])

        for entity in RawCatalogOpensearch.mapped_entities:
            mapped_index = catalog.get_mapped_index(entity)
            if client.indices.exists(mapped_index):
                client.indices.delete(mapped_index)

            def streamed_results():
                for hit in opensearchpy.helpers.scan(client,
                                                     index=catalog.get_index(entity),
                                                     query={"query": {"match_all": {}}}):
                    r = hit['_source']
                    doc = None
                    match entity:
                        case "interoperability-records":
                            doc = catalog.map_interoperability(r)
                        case "training-resources":
                            doc = catalog.map_training(r)
                        case "services":
                            doc = catalog.map_service(r)

                    yield {"_index": mapped_index, "_id": doc['id'], "_source": doc}

            succeeded = 0
            failed = 0
            for success, item in helpers.parallel_bulk(client, actions=streamed_results(), timeout=5 * 60):
                if success:
                    succeeded = succeeded + 1
                else:
                    print("error: " + str(item))
                    failed = failed + 1
            print(f"Entity: {entity} succeeded: {succeeded} errors: {failed}")

    @task
    def close_indexes(**kwargs):
        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20,
            timeout=180
        )
        catalog = RawCatalogOpensearch(client, kwargs["params"]["SUFFIX"])

        def refresh_index(index_name):
            if index_name is not None:
                client.indices.refresh(index_name)
                client.indices.put_settings(index=index_name, body={
                    "index": {
                        "number_of_replicas": 1,
                        "refresh_interval": "60s",
                    }
                })

        def update_aliases(index_name, alias_name):
            if index_name is not None and alias_name is not None:
                client.indices.update_aliases(
                    body={"actions": [
                        {"remove": {"index": f"{alias_name}_*", "alias": alias_name}},
                        {"add": {"index": index_name, "alias": alias_name}},
                    ]}
                )

        for entity in RawCatalogOpensearch.entities:
            refresh_index(catalog.get_index(entity))
            refresh_index(catalog.get_mapped_index(entity))
            update_aliases(catalog.get_index(entity), catalog.get_alias(entity))
            update_aliases(catalog.get_mapped_index(entity), catalog.get_mapped_alias(entity))

        # update "allresources" alias with mapped indices
        actions = []
        for entity in RawCatalogOpensearch.mapped_entities:
            index_name = catalog.get_mapped_index(entity)
            entity_alias = catalog.get_mapped_alias(entity)
            actions.append({"remove": {"index": f"{entity_alias}_*", "alias": "allresources"}})
            actions.append({"add": {"index": index_name, "alias": "allresources"}})

        if len(actions) > 0:
            client.indices.update_aliases(
                body={"actions": actions}
            )

    chain(
        create_indexes.override(task_id="create_indexes")(),
        harvest_indexes.override(task_id="harvest_indexes")(),
        map_indexes.override(task_id="map_indexes")(),
        close_indexes.override(task_id="close_indexes")()
    )


import_catalogue_entities()

@@ -0,0 +1,317 @@
from __future__ import annotations

import codecs
import gzip
import io
import json
import logging
import os
from datetime import timedelta

from airflow.exceptions import AirflowException
from kubernetes.client import models as k8s
import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.helpers import chain
from airflow.hooks.base import BaseHook

from opensearchpy import OpenSearch, helpers
from EOSC_indexes import mappings
from EOSC_entity_trasform import filter_entities, transform_entities

EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}

configs = {
    "all": {"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues", "interoperability", "services", "training"]},
    "skg-if": {"ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues"]},
    "catalogue": {"ENTITIES": ["interoperability", "services", "training"]},
}

for config_name, config in configs.items():
    dag_id = f"import_EOSC_{config_name}"

    @dag(
        dag_id=dag_id,
        schedule=None,
        dagrun_timeout=None,
        start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
        catchup=False,
        default_args=default_args,
        params={
            "S3_CONN_ID": "s3_conn",
            "OPENSEARCH_CONN_ID": "opensearch_default",
            "KEY_PREFIX": "/",
            "EOSC_CATALOG_BUCKET": "eosc-portal-import",
            "BATCH_LOADERS_NUM": 10,
            "ENTITIES": config["ENTITIES"],
            "SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
        },
        tags=["lot1"]
    )
    def import_EOSC_entities():
        @task
        def create_indexes(**kwargs):
            conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
            client = OpenSearch(
                hosts=[{'host': conn.host, 'port': conn.port}],
                http_auth=(conn.login, conn.password),
                use_ssl=True,
                verify_certs=False,
                ssl_show_warn=False,
                pool_maxsize=20,
                timeout=180
            )

            client.cluster.put_settings(body={
                "persistent": {
                    "cluster.routing.allocation.balance.prefer_primary": True,
                    "segrep.pressure.enabled": True
                }
            })

            for entity in kwargs["params"]["ENTITIES"]:
                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                if client.indices.exists(indexname):
                    client.indices.delete(indexname)

                client.indices.create(indexname, {
                    "settings": {
                        "index": {
                            "number_of_shards": 40,
                            "number_of_replicas": 0,
                            "refresh_interval": -1,

                            "translog.flush_threshold_size": "2048MB",

                            "codec": "zstd_no_dict",
                            "replication.type": "SEGMENT"
                        }

                    },
                    "mappings": mappings[entity]
                })

        def compute_batches(ds=None, **kwargs):
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})
            pieces = []
            for entity in kwargs["params"]["ENTITIES"]:
                s3_path = os.path.normpath(kwargs["params"]["KEY_PREFIX"] + "/" + entity + "/")
                keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=s3_path)
                to_delete = list(filter(lambda key: key.endswith('.PROCESSED'), keys))
                for obj in to_delete:
                    hook.get_conn().delete_object(Bucket=kwargs["params"]["EOSC_CATALOG_BUCKET"], Key=obj)
                for key in keys:
                    if key.endswith(('.json.gz', '.json')):
                        pieces.append((entity, key))

            def split_list(list_a, chunk_size):
                for i in range(0, len(list_a), chunk_size):
                    yield {"files": list_a[i:i + chunk_size]}

            if len(pieces) <= 0:
                print("Nothing found in: " + kwargs["params"]["KEY_PREFIX"])
                return list()

            num_batches = len(pieces)//kwargs["params"]["BATCH_LOADERS_NUM"]
            if num_batches > 0:
                return list(split_list(pieces, num_batches))
            return list(split_list(pieces, len(pieces)))

        @task(executor_config={
            "pod_override": k8s.V1Pod(
                spec=k8s.V1PodSpec(
                    containers=[
                        k8s.V1Container(
                            name="base",
                            resources=k8s.V1ResourceRequirements(
                                requests={
                                    "cpu": "550m",
                                    "memory": "256Mi"
                                }
                            )
                        )
                    ]
                )
            )
        })
        def bulk_load(files: list[(str, str)], **kwargs):
            conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
            client = OpenSearch(
                hosts=[{'host': conn.host, 'port': conn.port}],
                http_auth=(conn.login, conn.password),
                use_ssl=True,
                verify_certs=False,
                ssl_show_warn=False,
                pool_maxsize=20,
                timeout=180,
                request_timeout=5*60
            )
            hook = S3Hook(kwargs["params"]["S3_CONN_ID"], transfer_config_args={'use_threads': False})

            retries = 0
            while len(files) > 0 and retries < 5:
                retries += 1
                retry_files = []
                for (entity, key) in files:
                    indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                    if hook.check_for_key(key=f"{key}.PROCESSED", bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"]):
                        print(f'Skipping {entity}: {key}')
                        continue
                    print(f'Processing {indexname}: {key}')
                    s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
                    with gzip.GzipFile(fileobj=s3_obj.get()["Body"], mode='rb') if key.endswith(".gz") else codecs.getreader('utf-8')(s3_obj.get()["Body"]) as s3file:
                        def _generate_data():
                            for line in s3file:
                                data: dict = json.loads(line)
                                if entity in transform_entities:
                                    data = transform_entities[entity](data)
                                if entity in filter_entities:
                                    if filter_entities[entity](data):
                                        print(data["local_identifier"] + " does not meet inclusion policies")
                                        continue
                                index = {"update": {"_index": indexname, "_id": data.pop("_id")}}
                                yield index, {"doc": data, "doc_as_upsert": True}

                        # disable success post logging
                        logging.getLogger("opensearch").setLevel(logging.WARN)
                        succeeded = 0
                        failed = 0
                        for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
                                                                   expand_action_callback=lambda arg: arg,
                                                                   raise_on_exception=False,
                                                                   raise_on_error=False,
                                                                   chunk_size=5000,
                                                                   max_chunk_bytes=50 * 1024 * 1024,
                                                                   timeout=5*60):
                            if success:
                                succeeded = succeeded + 1
                            else:
                                print("error: " + str(item))
                                failed = failed + 1

                        print(f"Bulk report: inserted {succeeded} items, {failed} failures, attempt {retries}")

                        if failed > 0:
                            retry_files.append((entity, key))
                        else:
                            hook.load_string(
                                "",
                                f"{key}.PROCESSED",
                                bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"],
                                replace=False
                            )
                files = retry_files  # retry files with errors
            # Check whether any files could not be recovered by the retries
            if len(files) > 0:
                raise AirflowException("ERROR could not import all items from: " + str(files))

        @task
        def merge_curation_db(**kwargs):
            conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
            client = OpenSearch(
                hosts=[{'host': conn.host, 'port': conn.port}],
                http_auth=(conn.login, conn.password),
                use_ssl=True,
                verify_certs=False,
                ssl_show_warn=False,
                pool_maxsize=20,
                timeout=180
            )
            if "products" in kwargs["params"]["ENTITIES"]:
                products_index = f'products_{kwargs["params"]["SUFFIX"]}'
                curationdb_index = 'curation'
                if client.indices.exists(curationdb_index):
                    client.reindex(body={
                        "source": {
                            "index": curationdb_index,
                            "_source": ["status"]
                        },
                        "dest": {
                            "index": products_index
                        }
                    },
                        refresh=False,
                        requests_per_second=-1,
                        scroll="4h",
                        slices="auto",
                        timeout=60*60*4,
                        wait_for_completion=True)

        @task
        def delete_missing_curated(**kwargs):
            conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
            client = OpenSearch(
                hosts=[{'host': conn.host, 'port': conn.port}],
                http_auth=(conn.login, conn.password),
                use_ssl=True,
                verify_certs=False,
                ssl_show_warn=False,
                pool_maxsize=20,
                timeout=180
            )
            if "products" in kwargs["params"]["ENTITIES"]:
                products_index = f'products_{kwargs["params"]["SUFFIX"]}'
                client.indices.refresh(products_index)
                client.delete_by_query(index=products_index,
                                       body={"query": {"bool": {"must_not": {"exists": {"field": "local_identifier"}}}}},
                                       refresh=True
                                       )


        @task
        def close_indexes(**kwargs):
            conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
            client = OpenSearch(
                hosts=[{'host': conn.host, 'port': conn.port}],
                http_auth=(conn.login, conn.password),
                use_ssl=True,
                verify_certs=False,
                ssl_show_warn=False,
                pool_maxsize=20,
                timeout=180
            )
            for entity in kwargs["params"]["ENTITIES"]:
                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                client.indices.refresh(indexname)
            # update aliases
            for entity in kwargs["params"]["ENTITIES"]:
                indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                client.indices.update_aliases(
                    body={"actions": [
                        {"remove": {"index": f"{entity}_*", "alias": entity}},
                        {"add": {"index": indexname, "alias": entity}},
                    ]}
                )
            # update "allresources" alias
            actions = []
            for entity in kwargs["params"]["ENTITIES"]:
                if entity in ['products', 'services', 'training', 'interoperability']:
                    indexname = f'{entity}_{kwargs["params"]["SUFFIX"]}'
                    actions.append({"remove": {"index": f"{entity}_*", "alias": "allresources"}})
                    actions.append({"add": {"index": indexname, "alias": "allresources"}})
            if len(actions) > 0:
                client.indices.update_aliases(
                    body={"actions": actions}
                )

        parallel_batches = PythonOperator(task_id="compute_parallel_batches", python_callable=compute_batches)

        chain(
            create_indexes.override(task_id="create_indexes")(),
            merge_curation_db.override(task_id="merge_curation_db")(),
            parallel_batches,
            bulk_load.expand_kwargs(parallel_batches.output),
            delete_missing_curated.override(task_id="delete_missing_curated_recs")(),
            close_indexes.override(task_id="close_indexes")()
        )

    import_EOSC_entities()

@@ -0,0 +1,67 @@
import requests


def init_ams(endpoint: str, project: str, token: str, reset: bool):
    session = requests.session()

    def delete_topic(topic):
        print(f"Deleting projects/{project}/topics/{topic}", flush=True)
        reply = session.delete(
            headers={"x-api-key": token},
            url=f"https://{endpoint}/v1/projects/{project}/topics/{topic}"
        )
        if not (200 <= reply.status_code < 500 or reply.status_code == 504):
            reply.raise_for_status()

    def delete_subscription(subscription):
        print(f"Deleting projects/{project}/subscriptions/{subscription}", flush=True)
        reply = session.delete(
            headers={"x-api-key": token},
            url=f"https://{endpoint}/v1/projects/{project}/subscriptions/{subscription}"
        )
        if not (200 <= reply.status_code < 500 or reply.status_code == 504):
            reply.raise_for_status()

    def create_topic(topic):
        print(f"Creating projects/{project}/topics/{topic}", flush=True)
        reply = session.put(
            headers={"x-api-key": token},
            url=f"https://{endpoint}/v1/projects/{project}/topics/{topic}",
            json={
                "maxMessages": "1",
                "returnImmediately": "false"
            }
        )
        if not (200 <= reply.status_code < 300 or reply.status_code == 409 or reply.status_code == 504):
            reply.raise_for_status()

    def create_subscription(topic, subscription):
        print(f"Creating projects/{project}/subscriptions/{subscription}", flush=True)
        reply = session.put(
            headers={"x-api-key": token},
            url=f"https://{endpoint}/v1/projects/{project}/subscriptions/{subscription}",
            json={
                "topic": f"projects/{project}/topics/{topic}",
                "ackDeadlineSeconds": 600
            }
        )
        if not (200 <= reply.status_code < 300 or reply.status_code == 409):
            reply.raise_for_status()

    subscriptions = {
        'curation_requests': ['curation_requests_debug', 'curation_requests_dispatcher'],
        'curation_replies': ['curation_replies_rest_debug', 'curation_replies_rest'],
        'curation_spam_candidates': ['curation_spam_candidates_debug', 'curation_spam_candidates_dispatcher'],
        'graph_requests': ['graph_requests_debug', 'graph_requests_indexer']
    }

    for topic in ['curation_requests', 'curation_replies', 'curation_spam_candidates', 'graph_requests']:
        if reset:
            for sub in subscriptions[topic]:
                delete_subscription(sub)
            delete_topic(topic)

        create_topic(topic)

        for sub in subscriptions[topic]:
            create_subscription(topic, sub)
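
The module only defines init_ams and never calls it; a small driver such as the sketch below (not part of the diff) would be needed to provision the topics and their subscriptions. The environment variable names and values are placeholders, not taken from the commits:

if __name__ == "__main__":
    import os

    init_ams(
        endpoint=os.environ["AMS_ENDPOINT"],  # messaging service host, e.g. "msg.example.org" (placeholder)
        project=os.environ["AMS_PROJECT"],    # messaging project name (placeholder)
        token=os.environ["AMS_TOKEN"],        # value sent as the x-api-key header (placeholder)
        reset=False                           # True deletes and recreates topics and subscriptions first
    )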
File diff suppressed because it is too large

@@ -0,0 +1,141 @@
import os
import time
from datetime import timedelta

import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from airflow.providers.amazon.aws.hooks.s3 import S3Hook

S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


def delete_pending_multipart_uploads(s3_client, bucket, key):
    multipart_uploads = s3_client.list_multipart_uploads(Bucket=bucket)

    if 'Uploads' in multipart_uploads:
        for upload in multipart_uploads['Uploads']:
            if upload['Key'] == key:
                upload_id = upload['UploadId']

                s3_client.abort_multipart_upload(
                    Bucket=bucket,
                    Key=key,
                    UploadId=upload_id
                )
                print(f"Aborted multipart upload {upload_id} for key {key}")
    else:
        print("No pending multipart uploads found")


def download_uri(session: requests.Session, url: str, s3_client, bucket, key, max_retries: int = 10):
    parts = []
    total_size = 0
    current_size = 0
    part_number = 1
    chunk_size = 0

    response = s3_client.create_multipart_upload(Bucket=bucket,
                                                 Key=key)
    upload_id = response['UploadId']
    tries = 0
    while tries < max_retries:
        try:
            with session.get(url,
                             headers={'Range': 'bytes=%d-' % current_size},
                             stream=True) as r:
                if total_size == 0:
                    total_size = int(r.headers['Content-length'])
                    chunk_size = max(total_size // (10000 - 1), 15 * 1024 * 1024)
                for chunk in r.iter_content(chunk_size=chunk_size):
                    if chunk:
                        response = s3_client.upload_part(
                            Body=chunk,
                            Bucket=bucket,
                            Key=key,
                            PartNumber=part_number,
                            UploadId=upload_id
                        )
                        parts.append({'PartNumber': part_number, 'ETag': response['ETag']})
                        current_size += len(chunk)
                        print(f"Read {current_size} of {total_size} part no {part_number}")
                        part_number += 1
                        tries = 0
                break  # break the retry loop when reaches end of chunks
        except Exception as e:
            tries += 1
            if tries < max_retries:
                print(e)
                print("Resume in 60 seconds...")
                time.sleep(60)
                continue
            else:
                print(f"ABORT: failed after {max_retries} attempts")
                s3_client.abort_multipart_upload(
                    Bucket=bucket,
                    Key=key,
                    UploadId=upload_id
                )
                raise

    s3_client.complete_multipart_upload(
        Bucket=bucket,
        Key=key,
        UploadId=upload_id,
        MultipartUpload={'Parts': parts}
    )


@dag(
    schedule=None,
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "file": "File to download",
        "dst_bucket": "bucket that will contain file",
        "max_retries": 10
    },
    tags=["s3"],
)
def openaire_to_s3():
    @task
    def download(**context):
        http_conn = BaseHook.get_connection("openaire_default")

        max_retries = context["params"]["max_retries"]
        url = "https://" + http_conn.host + "/data/graph/" + context["params"]["file"]
        bucket_name = context["params"]["dst_bucket"]
        s3_key = "/data/graph/" + context["params"]["file"]

        session = requests.Session()
        session.headers['Connection'] = 'close'
        session.auth = (http_conn.login, http_conn.password)
        hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})

        # Cleanup file and pending uploads
        delete_pending_multipart_uploads(s3_client=hook.get_conn(), bucket=bucket_name, key=s3_key)
        hook.delete_objects(bucket=bucket_name,
                            keys=[s3_key])

        download_uri(session=session,
                     url=url,
                     s3_client=hook.get_conn(),
                     bucket=bucket_name,
                     key=s3_key,
                     max_retries=max_retries)

    download()


openaire_to_s3()

@@ -0,0 +1,80 @@
import json
from datetime import timedelta

import pendulum
from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import get_current_context

from dag_utils import get_opensearch_client

# Define default arguments
default_args = {
    'owner': 'airflow',
    'depends_on_past': False,
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

managed_indexes = {'catalog_datasources', 'catalog_interoperability-records', 'catalog_providers',
                   'catalog_resource-interoperability-records', 'catalog_services', 'catalog_training-resources',
                   'datasource', 'grants', 'interoperability',
                   'organizations', 'persons', 'products',
                   'services', 'topics', 'training', 'venues'
                   }


@dag(
    dag_id="remove_old_indexes",
    # dag_display_name="Remove outdated MKG indexes",
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    catchup=False,
    default_args=default_args,
    params={
        "OPENSEARCH_CONN_ID": "opensearch_default",
    },
    tags=["opensearch", "maintenance"],
)
def remove_old_indexes():
    @task
    def remove_indexes():
        context = get_current_context()
        client = get_opensearch_client(context)

        indexes = client.cat.indices(format="json")
        aliases = client.cat.aliases(format="json")

        print(json.dumps(aliases))
        print(json.dumps(indexes))

        # indexes referred by aliases
        alias_index_names = {alias['index'] for alias in aliases}
        # indexes ordered by timestamp
        index_dict = {}

        for index in indexes:
            index_name = index['index']
            if '_' in index_name:
                base_name = '_'.join(index_name.split('_')[:-1])
                timestamp = index_name.split('_')[-1]
                if not (base_name in managed_indexes and timestamp.isdigit()):
                    continue
                if base_name not in index_dict:
                    index_dict[base_name] = []
                index_dict[base_name].append((index_name, timestamp))

        for base_name, index_list in index_dict.items():
            index_list.sort(key=lambda x: x[1], reverse=True)
            most_recent_index = index_list[0][0]
            for index_name, timestamp in index_list:
                if index_name != most_recent_index and index_name not in alias_index_names:
                    # hook.run(f'/{index_name}')
                    print(f'Deleted index: {index_name}')

    remove_indexes()


remove_old_indexes()

@@ -0,0 +1,111 @@
from __future__ import annotations

import os
from datetime import timedelta

import pendulum
from airflow.decorators import dag, task_group
from airflow.decorators import task
from airflow.exceptions import AirflowSkipException
from airflow.operators.empty import EmptyOperator
from airflow.operators.python import get_current_context
from airflow.utils.helpers import chain
from kubernetes.client import models as k8s

EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}


@dag(
    dag_id="test_s3_openaire_dump",
    # dag_display_name="(Test) Import OpenAIRE entities from S3",
    schedule=None,
    dagrun_timeout=None,
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "KEY_PREFIX": "/",
        "S3_BUCKET": "kg-1",
        "BATCH_LOADERS_NUM": 10,
        "ENTITIES": ["datasource", "grants", "organizations", "persons", "products", "topics", "venues"],
        "SUFFIX": pendulum.now().format('YYYYMMDDHHmmss')
    },
    tags=["openaire", "lot1", "mkg"]
)
def import_s3_openaire_dump():
    @task
    def create_indexes():
        kwargs = get_current_context()
        print(kwargs["params"]["ENTITIES"])

    @task_group
    def load_and_map_entity(entity: str):
        @task(trigger_rule="none_failed")
        def compute_batches():
            nonlocal entity
            kwargs = get_current_context()
            if entity not in kwargs["params"]["ENTITIES"]:
                raise AirflowSkipException(f"Skipping {entity}")
            return [[(entity, '1'), (entity, '2')], [], []]

        @task(executor_config={
            "pod_override": k8s.V1Pod(
                spec=k8s.V1PodSpec(
                    containers=[
                        k8s.V1Container(
                            name="base",
                            resources=k8s.V1ResourceRequirements(
                                requests={
                                    "cpu": "550m",
                                    "memory": "256Mi"
                                }
                            )
                        )
                    ]
                )
            )
        })
        def parallel_load(files: list[(str, str)], **kwargs):
            kwargs = get_current_context()
            print(files)

        parallel_load.expand(files=compute_batches())

    @task(trigger_rule="none_failed")
    def merge_curation_db(**kwargs):
        pass

    @task(trigger_rule="none_failed")
    def delete_missing_curated(**kwargs):
        pass

    @task(trigger_rule="none_failed")
    def close_indexes(**kwargs):
        pass

    chain(
        create_indexes(),
        # todo get checkpoint
        merge_curation_db(),
        load_and_map_entity("datasource"),
        load_and_map_entity("grants"),
        load_and_map_entity("organizations"),
        load_and_map_entity("persons"),
        load_and_map_entity("products"),
        load_and_map_entity("topics"),
        load_and_map_entity("venues"),
        delete_missing_curated(),
        close_indexes()
        # todo ask resync
    )


import_s3_openaire_dump()

@@ -0,0 +1,12 @@
#!/usr/bin/env -S docker build . --tag=gbloisi/curation:1.0.0 --platform linux/amd64 --push --network=host --file

FROM python:3.12-slim-bullseye

COPY requirements.txt /

RUN python -m pip install --upgrade -r /requirements.txt

COPY antispam-batch.py blacklist.txt curation-rest.py /

# Run the server
CMD python3 /curation-rest.py

@@ -0,0 +1,255 @@
import json
import sys
import traceback
from typing import Any, Dict, List, Optional

from jsonargparse import ArgumentParser
from openai import AsyncOpenAI

import asyncio
import enum
import instructor

from pydantic import BaseModel, Field, SecretStr

from datetime import datetime
from opensearchpy import OpenSearch, helpers, AsyncOpenSearch


class Topics(str, enum.Enum):
    """Correctly assign one of the predefined topic to the content"""
    SPAM = "SPAM, advertisement, promotional"
    SALES = "direct sales of goods or services"
    EXPLICIT_CONTENT = "porn, violence or Harmful content"
    RESEARCH = "description of a scientific research"
    DATASET = "description of a scientific dataset "
    OBJECT = "scientific description of an object"
    BIBLIOGRAPHIC = "bibliographic record"
    NA = "not available"


class ProductInfo(BaseModel):
    """
    Your task is to identify SPAM content among research product descriptions.
    """
    language: str = Field(description="The language of the content")
    topic: Topics
    reason: str = Field(description="explain why the topic was chosen")
    spam_words: list[str] = Field(description="content's spam words", min_length=0, max_length=3)


main_model_schema = ProductInfo.model_json_schema()
response_schema = json.dumps(main_model_schema, indent=None)

parser = ArgumentParser(env_prefix="CURATION", default_env=True)
parser.add_argument("--opensearch.host", default='opensearch-cluster.local-dataplatform')
parser.add_argument("--opensearch.port", default=443, type=int)
parser.add_argument("--opensearch.user", default="admin", type=SecretStr)
parser.add_argument("--opensearch.password", default="admin", type=SecretStr)
parser.add_argument("--openai.host", default='localhost')
parser.add_argument("--openai.port", default=8000, type=int)
parser.add_argument("--openai.api_key", default='api_key')
parser.add_argument("--parallelism", default=36, type=int)
cfg = parser.parse_args()

with open("/blacklist.txt", "r") as text_file:
    blacklist = [line.rstrip().lower() for line in text_file.readlines()]


client = AsyncOpenSearch(
    hosts=[{'host': cfg.get("opensearch.host"), 'port': cfg.get("opensearch.port")}],
    http_auth=(cfg.get("opensearch.user").get_secret_value(), cfg.get("opensearch.password").get_secret_value()),
    use_ssl=True,
    verify_certs=False,
    ssl_show_warn=False,
    pool_maxsize=20
)

oai = instructor.patch(AsyncOpenAI(base_url="http://" + cfg.get("openai.host") + ":" + str(cfg.get("openai.port")) + "/v1",
                                   api_key=cfg.get("openai.api_key"),
                                   timeout=2400.0*6.0),
                       mode=instructor.Mode.JSON_SCHEMA)


def source_txt_value(data: Dict[str, Any], labels: List[str]) -> Optional[Any]:
    if len(labels) <= 0:
        return None
    current_value = data['_source']
    for label in labels:
        if isinstance(current_value, dict) and label in current_value:
            current_value = current_value[label]
        else:
            return None
    if current_value is None:
        return None
    if isinstance(current_value, list):
        if len(current_value) > 0:
            return current_value[0]
        else:
            return None
    return str(current_value)


async def eval_spam_candidate(hit: dict) -> ProductInfo:
    response = await oai.chat.completions.create(
        model="suzume-multilingual",
        response_model=ProductInfo,
        messages=[
            {
                "role": "user",
                "content": hit['title']
            }
        ],
        extra_body={
            "cache_prompt": True,
            "json_schema": response_schema
        },
        temperature=0.0,
        max_retries=5,
        stream=False
    )
    return response.model_dump()


async def evaluate_hit(hit: dict):
    obj = await eval_spam_candidate(hit)
    if obj['topic'] in [Topics.SPAM, Topics.EXPLICIT_CONTENT, Topics.SALES]:
        print("SPAM detected: " + hit['local_identifier'], flush=True)
        print("AI Response: " + str(obj) + " for: " + hit['title'], flush=True)
        obj['local_identifier'] = hit['local_identifier']
        obj['trigger_word'] = hit['found']
        obj['abstract'] = hit['title']
        obj['timestamp'] = datetime.now().isoformat()
        await client.index(
            index='spam',
            body=obj,
            id=hit['local_identifier'],
            refresh=True
        )
    return obj


async def get_potential_spam() -> Any:
    count = 0
    resume_from = 0
    async for hit in helpers.async_scan(client, index="products", query={"query": {"match_all": {}}}, scroll='1d'):
        count = count + 1
        if count < resume_from:
            continue
        local_identifier = source_txt_value(hit, ["local_identifier"])
        print(f"{count}:\t{local_identifier}")
        title = source_txt_value(hit, ["titles", "none"])
        description = source_txt_value(hit, ['abstracts', 'none'])

        if title is None:
            if description is None:
                print(f"No description! {local_identifier}", flush=True)
                continue
            title = ""

        if description is not None:
            title = title + " " + description

        utf8_title = title.encode('utf-8')
        if len(utf8_title) > 2048:
            title = utf8_title[0:2048].decode('utf-8', 'ignore')
        test_string = title.lower()
        split_string = test_string.split()
        found = None
        for badword in blacklist:
            if badword in test_string:
                if len(badword) == 1 or ' ' in badword or badword in split_string:
                    found = badword
                    break
        if found is None:
            continue
        if await client.exists(index="spam", id=local_identifier):
            print("cached")
            continue
        yield {"local_identifier": local_identifier, "title": title, "found": found}


async def worker(name, queue):
    try:
        while True:
            # Get a "work item" out of the queue.
            hit = await queue.get()
            # Evaluate the candidate hit with the language model.
            await evaluate_hit(hit)
            # Notify the queue that the "work item" has been processed.
            queue.task_done()
    except Exception as e:
        print(traceback.format_exc())
        sys.exit(-1)


async def main():
    # if await client.indices.exists("spam"):
    #     await client.indices.delete("spam")

    if not await client.indices.exists("spam"):
        await client.indices.create("spam", {
            "settings": {
                "index": {
                    "number_of_shards": 3,
                    "number_of_replicas": 0,
                    "replication.type": "SEGMENT"
                }

            },
            "mappings": {
                "properties": {
                    "local_identifier": {
                        "type": "keyword"
                    },
                    "language": {
                        "type": "keyword"
                    },
                    "topic": {
                        "type": "keyword"
                    },
                    "abstract": {
                        "type": "text",
                        "index": False,
                    },
                    "reason": {
                        "type": "text",
                        "index": False,
                    },
                    "spam_words": {
                        "type": "keyword"
                    },
                    "trigger_word": {
                        "type": "keyword"
                    },
                    "timestamp": {
                        "type": "date",
                        "format": "date_hour_minute_second_fraction"
                    }
                }
            }
        })

    parallelism = cfg.get("parallelism")
    queue = asyncio.Queue(parallelism)
    tasks = []
    for i in range(parallelism):
        task = asyncio.create_task(worker(f'worker-{i}', queue))
        tasks.append(task)

    async for hit in get_potential_spam():
        await queue.put(hit)

    await queue.join()
    # Cancel our worker tasks.
    for task in tasks:
        task.cancel()

    # Wait until all worker tasks are cancelled.
    await asyncio.gather(*tasks, return_exceptions=True)


if __name__ == "__main__":
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(main())
    loop.close()
File diff suppressed because it is too large
@ -0,0 +1,298 @@
|
|||
from datetime import datetime
|
||||
from enum import Enum
|
||||
from flask_openapi3 import Info, Tag
|
||||
from flask_openapi3 import OpenAPI
|
||||
from jsonargparse import ArgumentParser
|
||||
from opensearchpy import OpenSearch, NotFoundError, helpers
|
||||
from pydantic import BaseModel, SecretStr
|
||||
import logging
|
||||
|
||||
|
||||
|
||||
parser = ArgumentParser(env_prefix="CURATION", default_env=True)
|
||||
parser.add_argument("--opensearch.host", default='opensearch-cluster.local-dataplatform')
|
||||
parser.add_argument("--opensearch.port", default=443, type=int)
|
||||
parser.add_argument("--opensearch.user", default="admin", type=SecretStr)
|
||||
parser.add_argument("--opensearch.password", default="admin", type=SecretStr)
|
||||
parser.add_argument("--debug", default=False, type=bool)
|
||||
cfg = parser.parse_args()
|
||||
|
||||
print(cfg.as_dict())
|
||||
|
||||
client = OpenSearch(
|
||||
hosts=[{'host': cfg.get("opensearch.host"), 'port': cfg.get("opensearch.port")}],
|
||||
http_auth=(cfg.get("opensearch.user").get_secret_value(), cfg.get("opensearch.password").get_secret_value()),
|
||||
use_ssl=True,
|
||||
verify_certs=False,
|
||||
ssl_show_warn=False,
|
||||
pool_maxsize=20,
|
||||
)
|
||||
|
||||
# if client.indices.exists("curation"):
|
||||
# client.indices.delete("curation")
|
||||
|
||||
if not client.indices.exists("curation"):
|
||||
client.indices.create("curation", {
|
||||
"settings": {
|
||||
"index": {
|
||||
"number_of_shards": 10,
|
||||
"number_of_replicas": 0,
|
||||
"codec": "zstd_no_dict",
|
||||
"replication.type": "SEGMENT"
|
||||
},
|
||||
},
|
||||
"mappings": {
|
||||
"dynamic": "strict",
|
||||
"properties": {
|
||||
"local_identifier": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"timestamp": {
|
||||
"type": "date",
|
||||
"format": "date_hour_minute_second_fraction"
|
||||
},
|
||||
"creator": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"status": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"note": {
|
||||
"index": False,
|
||||
"type": "text"
|
||||
},
|
||||
|
||||
"log": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"timestamp": {
|
||||
"format": "date_hour_minute_second_fraction",
|
||||
"type": "date"
|
||||
},
|
||||
"creator": {
|
||||
"type": "keyword"
|
||||
},
|
||||
"status": {
|
||||
"index": False,
|
||||
"type": "keyword"
|
||||
},
|
||||
"note": {
|
||||
"index": False,
|
||||
"type": "text"
|
||||
},
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
|
||||
info = Info(title="Curator API", version="1.0.0")
|
||||
app = OpenAPI(__name__, info=info)
|
||||
curation_tag = Tag(name="curation", description="Curator API")
|
||||
|
||||
|
||||
class CurationStatus(str, Enum):
|
||||
valid = "valid"
|
||||
withdrawn = "withdrawn"
|
||||
alert = "alert"
|
||||
restore = "restore"
|
||||
reset = "reset"
|
||||
|
||||
|
||||
class CurationRequest(BaseModel):
|
||||
local_identifier: str
|
||||
creator: str
|
||||
status: CurationStatus
|
||||
note: str
|
||||
|
||||
|
||||
class LogEntry(BaseModel):
|
||||
timestamp: str
|
||||
creator: str
|
||||
status: CurationStatus
|
||||
note: str
|
||||
|
||||
|
||||
class CurationResponse(BaseModel):
|
||||
local_identifier: str
|
||||
timestamp: str
|
||||
creator: str
|
||||
status: CurationStatus
|
||||
note: str
|
||||
log: list[LogEntry]
|
||||
|
||||
|
||||
@app.route('/health')
|
||||
def health_check():
|
||||
if all_required_services_are_running():
|
||||
return 'OK', 200
|
||||
else:
|
||||
return 'Service Unavailable', 500
|
||||
|
||||
|
||||
def all_required_services_are_running():
|
||||
os_health = client.cluster.health()
|
||||
return os_health['status'] in ['green', 'yellow'] and os_health['number_of_nodes'] > 0
|
||||
|
||||
|
||||
@app.post("/curation", summary="set curation",
|
||||
responses={200: CurationResponse},
|
||||
tags=[curation_tag])
|
||||
def post_curation(query: CurationRequest):
|
||||
"""
|
||||
set curation status
|
||||
"""
|
||||
curation = dict()
|
||||
|
||||
try:
|
||||
hit = client.get(index="curation", id=query.local_identifier)
|
||||
curation = hit['_source']
|
||||
|
||||
if query.status.name == curation['status']:
|
||||
return {"msg": "status is not changed"}, 403
|
||||
|
||||
# move current status in history
|
||||
annotations = curation['log'] if 'log' in curation else list()
|
||||
if isinstance(annotations, dict):
|
||||
annotations = [annotations]
|
||||
annotations.insert(0, {
|
||||
"timestamp": curation['timestamp'],
|
||||
"creator": curation['creator'],
|
||||
"status": curation['status'],
|
||||
"note": curation['note'],
|
||||
})
|
||||
annotations = annotations[0:100]
|
||||
curation['log'] = annotations
|
||||
curation['timestamp'] = datetime.now().isoformat()
|
||||
curation['creator'] = query.creator
|
||||
curation['note'] = query.note
|
||||
|
||||
print(curation)
|
||||
|
||||
# todo check status transition
|
||||
match query.status.name:
|
||||
case "valid":
|
||||
if curation['status'] not in ('restore', 'reset'):
|
||||
return {"msg": "status cannot be updated to 'valid'"}, 403
|
||||
curation['status'] = query.status.name
|
||||
case "withdrawn":
|
||||
curation['status'] = query.status.name
|
||||
case "alert":
|
||||
curation['status'] = query.status.name
|
||||
case "restore":
|
||||
if curation['status'] != "withdrawn":
|
||||
return {"msg": "only withdrawn records can be restored'"}, 403
|
||||
curation['status'] = query.status.name
|
||||
case "reset":
|
||||
curation['status'] = query.status.name
|
||||
|
||||
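        # Transitions implemented above: 'valid' is accepted only from 'restore' or 'reset',
        # 'restore' only from 'withdrawn'; 'withdrawn', 'alert' and 'reset' are always applied.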
        # TODO transactionality in case of failure?
        # optimistic concurrency: the write is rejected if the document changed
        # since it was read (seq_no / primary_term check)
        client.index(
            index='curation',
            id=query.local_identifier,
            body=curation,
            refresh=True,
            if_primary_term=hit['_primary_term'],
            if_seq_no=hit['_seq_no']
        )
        metadata_status = curation['status']

        if metadata_status == 'reset':
            # 'reset' removes the curation status from the product document
            client.update(
                index='products',
                id=query.local_identifier,
                body={
                    "script": {"source": "ctx._source.remove(\"status\")"}
                },
                refresh=True
            )
        else:
            if metadata_status == "restore":
                metadata_status = 'valid'

            client.update(
                index='products',
                id=query.local_identifier,
                body={
                    "doc": {"status": metadata_status}
                },
                refresh=True
            )
    except NotFoundError:
        # no curation record yet: create one, unless the request is a 'restore' or 'reset'
        curation['local_identifier'] = query.local_identifier
        curation['timestamp'] = datetime.now().isoformat()
        curation['status'] = query.status.name
        curation['creator'] = query.creator
        curation['note'] = query.note

        match query.status.name:
            case "restore":
                return {"msg": "cannot restore: status does not exist"}, 403
            case "reset":
                return {"msg": "cannot reset: status does not exist"}, 403

        client.index(
            index='curation',
            id=query.local_identifier,
            body=curation,
            refresh=True,
            op_type='create'
        )
        client.update(
            index='products',
            id=query.local_identifier,
            body={
                "doc": {"status": curation['status']}
            },
            refresh=True
        )

    return curation
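# Example call (a sketch; host and field values are illustrative placeholders,
# the port matches the waitress configuration at the bottom of this file):
#
#   curl -X POST http://localhost:5000/curation \
#        -H 'Content-Type: application/json' \
#        -d '{"local_identifier": "record-1234", "creator": "some.curator",
#             "status": "withdrawn", "note": "duplicate record"}'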
@app.get("/curation", summary="get curation", tags=[curation_tag])
|
||||
def get_curation(local_identifier: str):
|
||||
"""
|
||||
to get a curation record
|
||||
"""
|
||||
try:
|
||||
hit = client.get(index="curation", id=local_identifier)
|
||||
|
||||
return {
|
||||
"code": 0,
|
||||
"message": "ok",
|
||||
"data": hit['_source']
|
||||
}
|
||||
except NotFoundError:
|
||||
return {"msg": f"Cannot fetch: '{local_identifier}' does not exist'"}, 403
|
||||
|
||||
|
||||
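# Example call (a sketch; host and identifier are illustrative placeholders):
#
#   curl 'http://localhost:5000/curation?local_identifier=record-1234'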
@app.get("/alerts", summary="get curation in alert status", tags=[curation_tag])
|
||||
def get_alerts():
|
||||
"""
|
||||
to get a curation record
|
||||
"""
|
||||
query = {
|
||||
"query": {
|
||||
"terms": {
|
||||
"status": [CurationStatus.alert]
|
||||
}
|
||||
}
|
||||
}
|
||||
return {
|
||||
"code": 0,
|
||||
"message": "ok",
|
||||
"data": list(helpers.scan(client, index="curation", query=query))
|
||||
}
|
||||
|
||||
|
||||
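# helpers.scan pages through the full result set and yields raw hits (including
# '_id', '_index' and '_source'), so "data" contains whole hit documents here.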
if __name__ == "__main__":
    debug = cfg.get("debug")  # honour the --debug flag parsed above
    if debug:
        logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(message)s')
        app.run(debug=True)
    else:
        from waitress import serve
        logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')
        serve(app, host="0.0.0.0", port=5000)
@ -0,0 +1,45 @@
{
  "type": "object",
  "required": [
    "language",
    "topic",
    "reason"
  ],
  "properties": {
    "language": {
      "type": "string"
    },
    "topic": {
      "enum": [
        "Other",
        "Natural and life Sciences",
        "Engineering And Technology",
        "Computer Science",
        "Medical And Health Sciences",
        "Agricultural And Veterinary Sciences",
        "Social Sciences",
        "Humanities And The Arts",
        "Archaeology",
        "Bibliographic record",
        "Porn, Violence or Harmful content",
        "Direct sales of goods or services",
        "SPAM, advertisement, promotional"
      ],
      "type": "string"
    },
    "general_subject": {
      "type": "string"
    },
    "reason": {
      "description": "reason of the classification",
      "type": "string"
    },
    "spam_words": {
      "items": {
        "type": "string"
      },
      "type": "array",
      "maxItems": 3
    }
  }
}
@ -0,0 +1,11 @@
langchain
langchain-community
langchain-core
instructor
pydantic
openai
opensearch-py
jsonargparse
flask
flask-openapi3
waitress