2024-10-21 21:31:43 +02:00
|
|
|
from __future__ import annotations
|
|
|
|
|
|
|
|
from airflow.decorators import dag
|
|
|
|
from airflow.models.baseoperator import chain
|
|
|
|
from airflow.models.param import Param
|
|
|
|
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
|
|
|
|
|
|
|
|
import dag_utils
|
|
|
|
|
2024-10-22 10:19:40 +02:00
|
|
|
|
2024-10-21 21:31:43 +02:00
|
|
|
@dag(
    dag_id="build_openaire_graph",
    dag_display_name="Build the OpenAIRE graph",
    params={
        "S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint"),
        "GRAPH_PATH": Param("s3a://graph/tmp/prod_provision/graph", type='string', description=""),
        "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir", type='string', description=""),
        "IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string',
                               description=""),
        "DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description=""),
        "ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description="")
    },
    tags=["openaire"]
)
def build_new_graph():
    """Run the graph build phases in sequence by triggering one child DAG per phase.

    Three child DAGs are triggered in order, each waiting for completion
    before the next one starts: ``results_deduplication``,
    ``consistency_graph`` and ``orcid_enrichment_graph``. Each phase reads
    the graph written by the previous one; the per-phase sub-paths come from
    ``dag_utils.BUILD_PHASES``.

    Every value handed to a child DAG is taken from the triggering run's
    ``conf`` and falls back to the matching ``Param`` default declared on
    this DAG when the run was started without that key.
    """

    def run_param(name: str) -> str:
        """Return a Jinja expression that resolves run parameter ``name``.

        The run's ``conf`` is consulted first, so explicitly supplied values
        render exactly as before. ``params`` supplies the declared default
        otherwise; without that fallback a run triggered with an empty conf
        would render the literal text ``None``.
        """
        # %-formatting keeps the Jinja double braces literal (an f-string
        # would need them quadrupled).
        return "{{ dag_run.conf.get('%s', params.%s) }}" % (name, name)

    def graph_phase(phase: str) -> str:
        """Return the templated graph location for build phase ``phase``."""
        return run_param("GRAPH_PATH") + "/" + dag_utils.BUILD_PHASES[phase]

    def work_dir(subdir: str) -> str:
        """Return the templated working directory ``subdir`` under WRKDIR_PATH."""
        return run_param("WRKDIR_PATH") + "/" + subdir

    chain(
        TriggerDagRunOperator(
            task_id="dedup",
            task_display_name="Deduplicate Research Results",
            trigger_dag_id="results_deduplication",
            wait_for_completion=True,
            conf={
                "S3_CONN_ID": run_param("S3_CONN_ID"),

                "INPUT_PATH": graph_phase("inference"),
                "OUTPUT_PATH": graph_phase("dedup"),
                "WRKDIR_PATH": work_dir("dedup"),
                "IS_LOOKUP_URL": run_param("IS_LOOKUP_URL"),
                "DEDUP_CONFIG_ID": run_param("DEDUP_CONFIG_ID")
            }
        ),
        TriggerDagRunOperator(
            task_id="consistency",
            task_display_name="Enforce Consistency of Graph",
            trigger_dag_id="consistency_graph",
            wait_for_completion=True,
            conf={
                "S3_CONN_ID": run_param("S3_CONN_ID"),

                "INPUT_PATH": graph_phase("dedup"),
                "OUTPUT_PATH": graph_phase("consistency"),
                # NOTE(review): this step is pointed at the "dedup" working
                # directory, same as the previous task. Presumably it reuses
                # artifacts left there by deduplication -- confirm this is
                # intentional and not a copy/paste leftover.
                "WRKDIR_PATH": work_dir("dedup"),
                "IS_LOOKUP_URL": run_param("IS_LOOKUP_URL")
            }
        ),
        TriggerDagRunOperator(
            task_id="orcid_enrichment",
            task_display_name="Enrich Graph with ORCID data",
            trigger_dag_id="orcid_enrichment_graph",
            wait_for_completion=True,
            conf={
                "S3_CONN_ID": run_param("S3_CONN_ID"),

                "ORCID_PATH": run_param("ORCID_PATH"),
                "INPUT_PATH": graph_phase("consistency"),
                "OUTPUT_PATH": graph_phase("orcid_enhancement"),
                "WRKDIR_PATH": work_dir("orcid_enrichment")
            }
        )
    )
|
|
|
|
|
2024-10-22 10:19:40 +02:00
|
|
|
|
2024-10-21 21:31:43 +02:00
|
|
|
# Call the @dag-decorated function at import time so the DAG object is created
# when Airflow parses this module.
build_new_graph()
|