DAG to build the graph from a delta
This commit is contained in:
parent
e64b3ec6b1
commit
b3d7dda0c1
|
@ -0,0 +1,125 @@
|
|||
from __future__ import annotations
|
||||
|
||||
from airflow.decorators import dag
|
||||
from airflow.models.baseoperator import chain
|
||||
from airflow.models.param import Param
|
||||
from airflow.operators.trigger_dagrun import TriggerDagRunOperator
|
||||
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
||||
|
||||
from spark_configurator import SparkConfigurator
|
||||
import dag_utils
|
||||
|
||||
|
||||
# DAG "build_openaire_graph_incremental": runs three Spark-on-Kubernetes jobs in
# sequence (raw graph -> grouped-by-id graph -> copy of relations into the
# grouped graph). All paths are read from the triggering run's configuration
# through Jinja templates, so they are resolved at task run time.
#
# NOTE: the documentation is deliberately kept in comments rather than in a
# function docstring: Airflow's @dag decorator copies a docstring into the
# DAG's doc_md, which would change what the DAG exposes.
@dag(
    dag_id="build_openaire_graph_incremental",
    dag_display_name="Build the OpenAIRE graph incrementally",
    params={
        "S3_CONN_ID": Param("s3_conn", type="string", description="Airflow connection for S3 endpoint"),
        "GRAPH_PATH": Param("s3a://graph/tmp/prod_provision/graph", type="string", description=""),
        "WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir", type="string", description=""),
        "IS_LOOKUP_URL": Param(
            "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
            type="string",
            description="",
        ),
        "DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type="string", description=""),
        "ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type="string", description=""),
        "DELTA_PATH": Param("s3a://graph/data/delta", type="string", description=""),
    },
    tags=["openaire"],
)
def build_new_graph():
    # Templated locations of the two graph phases touched by this DAG. The
    # phase directory names come from dag_utils.BUILD_PHASES.
    graph_root = "{{ dag_run.conf.get('GRAPH_PATH') }}/"
    raw_graph_path = graph_root + dag_utils.BUILD_PHASES["raw"]
    grouped_graph_path = graph_root + dag_utils.BUILD_PHASES["grouped"]

    def spark_step(task_id, display_name, app_name, main_class, job_arguments):
        # Build one SparkKubernetesOperator; namespace, jar and Kubernetes
        # connection are the same for every step of this DAG.
        spec = SparkConfigurator(
            name=app_name + "-{{ ds }}-{{ task_instance.try_number }}",
            mainClass=main_class,
            jarLocation="s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar",
            arguments=job_arguments,
        ).get_configuration()
        return SparkKubernetesOperator(
            task_id=task_id,
            task_display_name=display_name,
            namespace="dnet-spark-jobs",
            template_spec=spec,
            kubernetes_conn_id="kubernetes_default",
        )

    # Step 1: read the delta at DELTA_PATH and write the raw graph phase.
    raw_graph = spark_step(
        "raw_graph",
        "Generate Raw Graph",
        "rawgraph",
        "eu.dnetlib.dhp.oa.graph.raw.CopyIncrementalOafSparkApplication",
        [
            "--inputPath", "{{ dag_run.conf.get('DELTA_PATH') }}",
            "--graphOutputPath", raw_graph_path,
        ],
    )

    # Step 2: group the raw graph entities into the grouped phase.
    grouped_graph = spark_step(
        "grouped_graph",
        "Generate Grouped-by-id Graph",
        "groupedgraph",
        "eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob",
        [
            "--graphInputPath", raw_graph_path,
            "--outputPath", grouped_graph_path,
            "--checkpointPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}/grouped_entities",
            "--isLookupUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
            "--filterInvisible", "false",
        ],
    )

    # Step 3: copy the "relation" entities from the raw phase to the grouped phase.
    copy_grouped_rels = spark_step(
        "copygroupedrels",
        "Copy relations to Grouped-by-id Graph",
        "copygroupedrels",
        "eu.dnetlib.dhp.oa.merge.CopyEntitiesSparkJob",
        [
            "--graphInputPath", raw_graph_path,
            "--outputPath", grouped_graph_path,
            "--entities", "relation",
            "--format", "text",
        ],
    )

    # Strictly sequential execution: raw -> grouped -> relations copy.
    chain(raw_graph, grouped_graph, copy_grouped_rels)

    # The downstream stages below are currently commented out; they were written
    # as further elements of the chain(...) call above.
    #
    # TriggerDagRunOperator(
    #     task_id="dedup",
    #     task_display_name="Deduplicate Research Results",
    #     trigger_dag_id="results_deduplication",
    #     wait_for_completion=True,
    #     conf={
    #         "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
    #
    #         "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["inference"],
    #         "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"],
    #         "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup",
    #         "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
    #         "DEDUP_CONFIG_ID": "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}"
    #     }
    # ),
    # TriggerDagRunOperator(
    #     task_id="consistency",
    #     task_display_name="Enforce Consistency of Graph",
    #     trigger_dag_id="consistency_graph",
    #     wait_for_completion=True,
    #
    #     conf={
    #         "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
    #
    #         "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"],
    #         "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"],
    #         "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup",
    #         "IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}"
    #     }
    # ),
    # TriggerDagRunOperator(
    #     task_id="orcid_enrichment",
    #     task_display_name="Enrich Graph with ORCID data",
    #     trigger_dag_id="orcid_enrichment_graph",
    #     wait_for_completion=True,
    #
    #     conf={
    #         "S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
    #
    #         "ORCID_PATH": "{{ dag_run.conf.get('ORCID_PATH') }}",
    #         "INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"],
    #         "OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["orcid_enhancement"],
    #         "WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/orcid_enrichment"
    #     }
    # )
|
||||
|
||||
|
||||
# Call the @dag-decorated function at import time so the DAG object is actually
# created when this module is loaded.
build_new_graph()
|
|
@ -2,6 +2,10 @@ from airflow.hooks.base import BaseHook
|
|||
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
|
||||
|
||||
BUILD_PHASES = {
|
||||
"raw": "01_graph_raw",
|
||||
"grouped": "02_graph_grouped",
|
||||
"clean": "03_graph_cleaned",
|
||||
|
||||
"inference": "05_graph_inferred",
|
||||
"dedup": "06_graph_dedup",
|
||||
"consistency": "07_graph_consistent",
|
||||
|
|
Loading…
Reference in New Issue