code-infrastructure-lab/workflow/dnet/build_openaire_graph.py

76 lines
3.2 KiB
Python

from __future__ import annotations
from airflow.decorators import dag
from airflow.models.baseoperator import chain
from airflow.models.param import Param
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
import dag_utils
@dag(
dag_id="build_openaire_graph",
dag_display_name="Build the OpenAIRE graph",
params={
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint"),
"GRAPH_PATH": Param("s3a://graph/tmp/prod_provision/graph", type='string', description=""),
"WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir", type='string', description=""),
"IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string',
description=""),
"DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description=""),
"ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description="")
},
tags=["openaire"]
)
def build_new_graph():
chain(
TriggerDagRunOperator(
task_id="dedup",
task_display_name="Deduplicate Research Results",
trigger_dag_id="results_deduplication",
wait_for_completion=True,
conf={
"S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
"INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["inference"],
"OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"],
"WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup",
"IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
"DEDUP_CONFIG_ID": "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}"
}
),
TriggerDagRunOperator(
task_id="consistency",
task_display_name="Enforce Consistency of Graph",
trigger_dag_id="consistency_graph",
wait_for_completion=True,
conf={
"S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
"INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"],
"OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"],
"WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup",
"IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}"
}
),
TriggerDagRunOperator(
task_id="orcid_enrichment",
task_display_name="Enrich Graph with ORCID data",
trigger_dag_id="orcid_enrichment_graph",
wait_for_completion=True,
conf={
"S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
"ORCID_PATH": "{{ dag_run.conf.get('ORCID_PATH') }}",
"INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"],
"OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["orcid_enhancement"],
"WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/orcid_enrichment"
}
)
)
build_new_graph()