oct-update #1
|
@ -1,6 +1,5 @@
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import pendulum
|
|
||||||
from airflow.decorators import dag
|
from airflow.decorators import dag
|
||||||
from airflow.models.baseoperator import chain
|
from airflow.models.baseoperator import chain
|
||||||
from airflow.models.param import Param
|
from airflow.models.param import Param
|
||||||
|
@ -8,42 +7,69 @@ from airflow.operators.trigger_dagrun import TriggerDagRunOperator
|
||||||
|
|
||||||
import dag_utils
|
import dag_utils
|
||||||
|
|
||||||
|
|
||||||
@dag(
|
@dag(
|
||||||
dag_id="build_openaire_graph",
|
dag_id="build_openaire_graph",
|
||||||
dag_display_name="Build the OpenAIRE graph",
|
dag_display_name="Build the OpenAIRE graph",
|
||||||
params={
|
params={
|
||||||
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint")
|
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection for S3 endpoint"),
|
||||||
|
"GRAPH_PATH": Param("s3a://graph/tmp/prod_provision/graph", type='string', description=""),
|
||||||
|
"WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir", type='string', description=""),
|
||||||
|
"IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string',
|
||||||
|
description=""),
|
||||||
|
"DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description=""),
|
||||||
|
"ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description="")
|
||||||
},
|
},
|
||||||
tags=["openaire"]
|
tags=["openaire"]
|
||||||
)
|
)
|
||||||
def build_new_graph():
|
def build_new_graph():
|
||||||
chain(TriggerDagRunOperator(
|
chain(
|
||||||
task_id="dedup",
|
TriggerDagRunOperator(
|
||||||
trigger_dag_id="dedup_graph",
|
task_id="dedup",
|
||||||
wait_for_completion=True),
|
task_display_name="Deduplicate Research Results",
|
||||||
|
trigger_dag_id="results_deduplication",
|
||||||
|
wait_for_completion=True,
|
||||||
|
conf={
|
||||||
|
"S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
|
||||||
|
|
||||||
|
"INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["inference"],
|
||||||
|
"OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"],
|
||||||
|
"WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup",
|
||||||
|
"IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
|
"DEDUP_CONFIG_ID": "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}"
|
||||||
|
}
|
||||||
|
),
|
||||||
TriggerDagRunOperator(
|
TriggerDagRunOperator(
|
||||||
task_id="consistency",
|
task_id="consistency",
|
||||||
|
task_display_name="Enforce Consistency of Graph",
|
||||||
trigger_dag_id="consistency_graph",
|
trigger_dag_id="consistency_graph",
|
||||||
wait_for_completion=True
|
wait_for_completion=True,
|
||||||
|
|
||||||
# conf={
|
conf={
|
||||||
# "file": "{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='file_path') }}",
|
"S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
|
||||||
# "dst_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}",
|
|
||||||
# }
|
"INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["dedup"],
|
||||||
|
"OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"],
|
||||||
|
"WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/dedup",
|
||||||
|
"IS_LOOKUP_URL": "{{ dag_run.conf.get('IS_LOOKUP_URL') }}"
|
||||||
|
}
|
||||||
),
|
),
|
||||||
TriggerDagRunOperator(
|
TriggerDagRunOperator(
|
||||||
task_id="orcid_enrichment",
|
task_id="orcid_enrichment",
|
||||||
|
task_display_name="Enrich Graph with ORCID data",
|
||||||
trigger_dag_id="orcid_enrichment_graph",
|
trigger_dag_id="orcid_enrichment_graph",
|
||||||
wait_for_completion=True
|
wait_for_completion=True,
|
||||||
|
|
||||||
# conf={
|
conf={
|
||||||
# "src_key": "/data/graph/{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='file_path') }}",
|
"S3_CONN_ID": "{{ dag_run.conf.get('S3_CONN_ID') }}",
|
||||||
# "src_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}",
|
|
||||||
# "dst_key_prefix": "/data/graph/{{ task_instance.xcom_pull(task_ids='check_new_dump_availability', key='timestamp') }}",
|
"ORCID_PATH": "{{ dag_run.conf.get('ORCID_PATH') }}",
|
||||||
# "dst_bucket": "{{ dag_run.conf.get('S3_BUCKET') }}"
|
"INPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["consistency"],
|
||||||
# }
|
"OUTPUT_PATH": "{{ dag_run.conf.get('GRAPH_PATH') }}/" + dag_utils.BUILD_PHASES["orcid_enhancement"],
|
||||||
|
"WRKDIR_PATH": "{{ dag_run.conf.get('WRKDIR_PATH') }}/orcid_enrichment"
|
||||||
|
}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
build_new_graph()
|
build_new_graph()
|
||||||
|
|
|
@ -1,101 +1,71 @@
|
||||||
#
|
import os
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one
|
from datetime import timedelta
|
||||||
# or more contributor license agreements. See the NOTICE file
|
|
||||||
# distributed with this work for additional information
|
|
||||||
# regarding copyright ownership. The ASF licenses this file
|
|
||||||
# to you under the Apache License, Version 2.0 (the
|
|
||||||
# "License"); you may not use this file except in compliance
|
|
||||||
# with the License. You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
"""
|
|
||||||
This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor.
|
|
||||||
In this example, we create two tasks which execute sequentially.
|
|
||||||
The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application).
|
|
||||||
and the second task is to check the final state of the sparkApplication that submitted in the first state.
|
|
||||||
|
|
||||||
Spark-on-k8s operator is required to be already installed on Kubernetes
|
from airflow.decorators import dag
|
||||||
https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
|
from airflow.models.baseoperator import chain
|
||||||
"""
|
from airflow.models.param import Param
|
||||||
|
|
||||||
# [START import_module]
|
|
||||||
# The DAG object; we'll need this to instantiate a DAG
|
|
||||||
from airflow import DAG
|
|
||||||
# Operators; we need this to operate!
|
|
||||||
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
|
|
||||||
from spark_configurator import SparkConfigurator
|
from spark_configurator import SparkConfigurator
|
||||||
|
|
||||||
# [END import_module]
|
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
|
||||||
|
|
||||||
# [START default_args]
|
|
||||||
# These args will get passed on to each operator
|
|
||||||
# You can override them on a per-task basis during operator initialization
|
|
||||||
default_args = {
|
default_args = {
|
||||||
'owner': 'airflow',
|
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
|
||||||
'depends_on_past': False,
|
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
|
||||||
'start_date': days_ago(1),
|
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60)))
|
||||||
'email': ['airflow@example.com'],
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'max_active_runs': 1,
|
|
||||||
'retries': 3
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dag = DAG(
|
|
||||||
'consistency_graph',
|
@dag(
|
||||||
|
dag_id="consistency_graph",
|
||||||
|
dag_display_name="Enforce Consistency of Graph",
|
||||||
default_args=default_args,
|
default_args=default_args,
|
||||||
schedule_interval=None,
|
params={
|
||||||
tags=['example', 'spark']
|
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"),
|
||||||
)
|
|
||||||
|
|
||||||
propagaterel = SparkKubernetesOperator(
|
"INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/06_graph_dedup", type='string', description=""),
|
||||||
task_id='PropagateRelation',
|
"OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/07_graph_consistent", type='string', description=""),
|
||||||
namespace='dnet-spark-jobs',
|
"WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/dedup", type='string', description=""),
|
||||||
template_spec=SparkConfigurator(
|
"IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string',
|
||||||
name="propagaterels-{{ ds }}-{{ task_instance.try_number }}",
|
description="")
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation",
|
},
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
tags=["openaire"]
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup",
|
|
||||||
"--graphOutputPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent",
|
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup"
|
|
||||||
],
|
|
||||||
executor_cores=8,
|
|
||||||
executor_memory="4G",
|
|
||||||
executor_instances=1,
|
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
)
|
||||||
|
def consistency_graph_dag():
|
||||||
|
propagate_rel = SparkKubernetesOperator(
|
||||||
|
task_id='PropagateRelation',
|
||||||
|
task_display_name="Propagate Relations",
|
||||||
|
namespace='dnet-spark-jobs',
|
||||||
|
template_spec=SparkConfigurator(
|
||||||
|
name="propagaterels-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkPropagateRelation",
|
||||||
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
|
"--graphOutputPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}",
|
||||||
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}"
|
||||||
|
]).get_configuration(),
|
||||||
|
kubernetes_conn_id="kubernetes_default"
|
||||||
|
)
|
||||||
|
|
||||||
group_entities = SparkKubernetesOperator(
|
group_entities = SparkKubernetesOperator(
|
||||||
task_id='GroupEntities',
|
task_id='GroupEntities',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Group results by id",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="groupentities-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob",
|
name="groupentities-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.merge.GroupEntitiesSparkJob",
|
||||||
arguments=["--graphInputPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--checkpointPath", "s3a://graph/tmp/prod_provision/working_dir/dedup/grouped_entities",
|
arguments=["--graphInputPath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--outputPath", "s3a://graph/tmp/prod_provision/graph/07_graph_consistent",
|
"--checkpointPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}/grouped_entities",
|
||||||
"--isLookupUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
"--outputPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}",
|
||||||
"--filterInvisible", "true"
|
"--isLookupUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
],
|
"--filterInvisible", "true"
|
||||||
#
|
]).get_configuration(),
|
||||||
executor_cores=8,
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_memory="4G",
|
)
|
||||||
executor_instances=1,
|
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
propagaterel >> group_entities
|
chain(propagate_rel, group_entities)
|
||||||
|
|
||||||
|
|
||||||
|
consistency_graph_dag()
|
||||||
|
|
|
@ -1,6 +1,14 @@
|
||||||
from airflow.hooks.base import BaseHook
|
from airflow.hooks.base import BaseHook
|
||||||
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
|
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
|
||||||
|
|
||||||
|
BUILD_PHASES = {
|
||||||
|
"inference": "05_graph_inferred",
|
||||||
|
"dedup": "06_graph_dedup",
|
||||||
|
"consistency": "07_graph_consistent",
|
||||||
|
"enrichment": "08_graph_dedup_enriched", # actionset
|
||||||
|
"orcid_enhancement": "09_graph_orcid_enriched"
|
||||||
|
}
|
||||||
|
|
||||||
def get_bucket_name(context: dict, hook: S3Hook, param_name: str):
|
def get_bucket_name(context: dict, hook: S3Hook, param_name: str):
|
||||||
bucket_name = context["params"][param_name]
|
bucket_name = context["params"][param_name]
|
||||||
if not bucket_name:
|
if not bucket_name:
|
||||||
|
|
|
@ -1,228 +1,173 @@
|
||||||
#
|
import os
|
||||||
# Licensed to the Apache Software Foundation (ASF) under one
|
from datetime import timedelta
|
||||||
# or more contributor license agreements. See the NOTICE file
|
|
||||||
# distributed with this work for additional information
|
|
||||||
# regarding copyright ownership. The ASF licenses this file
|
|
||||||
# to you under the Apache License, Version 2.0 (the
|
|
||||||
# "License"); you may not use this file except in compliance
|
|
||||||
# with the License. You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing,
|
|
||||||
# software distributed under the License is distributed on an
|
|
||||||
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
|
||||||
# KIND, either express or implied. See the License for the
|
|
||||||
# specific language governing permissions and limitations
|
|
||||||
# under the License.
|
|
||||||
"""
|
|
||||||
This is an example DAG which uses SparkKubernetesOperator and SparkKubernetesSensor.
|
|
||||||
In this example, we create two tasks which execute sequentially.
|
|
||||||
The first task is to submit sparkApplication on Kubernetes cluster(the example uses spark-pi application).
|
|
||||||
and the second task is to check the final state of the sparkApplication that submitted in the first state.
|
|
||||||
|
|
||||||
Spark-on-k8s operator is required to be already installed on Kubernetes
|
from airflow.decorators import dag
|
||||||
https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
|
from airflow.models.baseoperator import chain
|
||||||
"""
|
from airflow.models.param import Param
|
||||||
|
|
||||||
# [START import_module]
|
|
||||||
# The DAG object; we'll need this to instantiate a DAG
|
|
||||||
from airflow import DAG
|
|
||||||
# Operators; we need this to operate!
|
|
||||||
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
|
|
||||||
from spark_configurator import SparkConfigurator
|
from spark_configurator import SparkConfigurator
|
||||||
|
|
||||||
# [END import_module]
|
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
|
||||||
|
|
||||||
# [START default_args]
|
|
||||||
# These args will get passed on to each operator
|
|
||||||
# You can override them on a per-task basis during operator initialization
|
|
||||||
default_args = {
|
default_args = {
|
||||||
'owner': 'airflow',
|
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
|
||||||
'depends_on_past': False,
|
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
|
||||||
'start_date': days_ago(1),
|
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60)))
|
||||||
'email': ['airflow@example.com'],
|
|
||||||
'email_on_failure': False,
|
|
||||||
'email_on_retry': False,
|
|
||||||
'max_active_runs': 1,
|
|
||||||
'retries': 3
|
|
||||||
}
|
}
|
||||||
|
|
||||||
dag = DAG(
|
|
||||||
'dedup_graph',
|
@dag(
|
||||||
|
dag_id="results_deduplication",
|
||||||
|
dag_display_name="Deduplicate Research Results",
|
||||||
default_args=default_args,
|
default_args=default_args,
|
||||||
schedule_interval=None,
|
params={
|
||||||
tags=['example', 'spark']
|
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"),
|
||||||
)
|
|
||||||
|
|
||||||
simrel = SparkKubernetesOperator(
|
"INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/05_graph_inferred", type='string', description=""),
|
||||||
task_id='CreateSimRel',
|
"OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/06_graph_dedup", type='string', description=""),
|
||||||
namespace='dnet-spark-jobs',
|
"WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/dedup", type='string', description=""),
|
||||||
template_spec=SparkConfigurator(
|
"IS_LOOKUP_URL": Param("http://services.openaire.eu:8280/is/services/isLookUp?wsdl", type='string',
|
||||||
name="createsimrels-{{ ds }}-{{ task_instance.try_number }}",
|
description=""),
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels",
|
"DEDUP_CONFIG_ID": Param("dedup-result-decisiontree-v4", type='string', description="")
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
},
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
tags=["openaire"]
|
||||||
"--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
|
||||||
"--actionSetId", "dedup-result-decisiontree-v4",
|
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
|
|
||||||
"--numPartitions", "64"
|
|
||||||
],
|
|
||||||
executor_cores=8,
|
|
||||||
executor_memory="4G",
|
|
||||||
executor_instances=1,
|
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
)
|
||||||
|
def results_deduplication_dag():
|
||||||
|
simrel = SparkKubernetesOperator(
|
||||||
|
task_id='CreateSimRel',
|
||||||
|
task_display_name="Create Similarity Relations",
|
||||||
|
namespace='dnet-spark-jobs',
|
||||||
|
template_spec=SparkConfigurator(
|
||||||
|
name="createsimrels-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateSimRels",
|
||||||
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
|
"--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
|
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}",
|
||||||
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
|
"--numPartitions", "64"
|
||||||
|
]).get_configuration(),
|
||||||
|
kubernetes_conn_id="kubernetes_default"
|
||||||
|
)
|
||||||
|
|
||||||
whitelist = SparkKubernetesOperator(
|
whitelist = SparkKubernetesOperator(
|
||||||
task_id='WhitelistSimRels',
|
task_id='WhitelistSimRels',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Add Whitelist Similarity Relations",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="whitelistsimrels-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels",
|
name="whitelistsimrels-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkWhitelistSimRels",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--actionSetId", "dedup-result-decisiontree-v4",
|
"--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
|
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}",
|
||||||
"--whiteListPath", "s3a://graph/data/dedup/whitelist_prod", # TODO: copy!
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
"--numPartitions", "64"
|
"--whiteListPath", "s3a://graph/data/dedup/whitelist_prod", # TODO: copy!
|
||||||
],
|
"--numPartitions", "64"
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
createmergerel = SparkKubernetesOperator(
|
createmergerel = SparkKubernetesOperator(
|
||||||
task_id='CreateMergeRels',
|
task_id='CreateMergeRels',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Create Merge Relations",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="createmergerels-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels",
|
name="createmergerels-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateMergeRels",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--actionSetId", "dedup-result-decisiontree-v4",
|
"--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
|
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}",
|
||||||
"--cutConnectedComponent", "200",
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
"--hiveMetastoreUris", "",
|
"--cutConnectedComponent", "200",
|
||||||
"--pivotHistoryDatabase", ""
|
"--hiveMetastoreUris", "",
|
||||||
],
|
"--pivotHistoryDatabase", ""
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
creatededuprecord = SparkKubernetesOperator(
|
creatededuprecord = SparkKubernetesOperator(
|
||||||
task_id='CreateDedupRecord',
|
task_id='CreateDedupRecord',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Create Dedup Record",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="creatededuprecord-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord",
|
name="creatededuprecord-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateDedupRecord",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--actionSetId", "dedup-result-decisiontree-v4",
|
"--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup"
|
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}",
|
||||||
],
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}"
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
copyopenorgsmergerel = SparkKubernetesOperator(
|
copyopenorgsmergerel = SparkKubernetesOperator(
|
||||||
task_id='CopyOpenorgsMergeRels',
|
task_id='CopyOpenorgsMergeRels',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Copy Openorgs Merge Relations",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="copyopenorgsmergerels-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels",
|
name="copyopenorgsmergerels-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyOpenorgsMergeRels",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--actionSetId", "dedup-result-decisiontree-v4",
|
"--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
|
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}",
|
||||||
"--numPartitions", "64"
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
],
|
"--numPartitions", "64"
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
createorgsdeduprecord = SparkKubernetesOperator(
|
createorgsdeduprecord = SparkKubernetesOperator(
|
||||||
task_id='CreateOrgsDedupRecord',
|
task_id='CreateOrgsDedupRecord',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Create Organizations Dedup Records",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="createorgsdeduprecord-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord",
|
name="createorgsdeduprecord-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCreateOrgsDedupRecord",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--isLookUpUrl", "http://services.openaire.eu:8280/is/services/isLookUp?wsdl",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--actionSetId", "dedup-result-decisiontree-v4",
|
"--isLookUpUrl", "{{ dag_run.conf.get('IS_LOOKUP_URL') }}",
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup"
|
"--actionSetId", "{{ dag_run.conf.get('DEDUP_CONFIG_ID') }}",
|
||||||
],
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}"
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
updateentity = SparkKubernetesOperator(
|
updateentity = SparkKubernetesOperator(
|
||||||
task_id='UpdateEntity',
|
task_id='UpdateEntity',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Update Entity",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="updateentity-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity",
|
name="updateentity-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkUpdateEntity",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup"
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
],
|
"--dedupGraphPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}"
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
copyrelations = SparkKubernetesOperator(
|
copyrelations = SparkKubernetesOperator(
|
||||||
task_id='copyRelations',
|
task_id='copyRelations',
|
||||||
namespace='dnet-spark-jobs',
|
task_display_name="Copy Non-Openorgs Relations",
|
||||||
template_spec=SparkConfigurator(
|
namespace='dnet-spark-jobs',
|
||||||
name="copyrelations-{{ ds }}-{{ task_instance.try_number }}",
|
template_spec=SparkConfigurator(
|
||||||
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs",
|
name="copyrelations-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
mainClass="eu.dnetlib.dhp.oa.dedup.SparkCopyRelationsNoOpenorgs",
|
||||||
arguments=["--graphBasePath", "s3a://graph/tmp/prod_provision/graph/05_graph_inferred",
|
jarLocation='s3a://binaries/dhp-shade-package-1.2.5-SNAPSHOT.jar',
|
||||||
"--workingPath", "s3a://graph/tmp/prod_provision/working_dir/dedup",
|
arguments=["--graphBasePath", "{{ dag_run.conf.get('INPUT_PATH') }}",
|
||||||
"--dedupGraphPath", "s3a://graph/tmp/prod_provision/graph/06_graph_dedup"
|
"--workingPath", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
],
|
"--dedupGraphPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}"
|
||||||
executor_cores=8,
|
]).get_configuration(),
|
||||||
executor_memory="4G",
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_instances=1,
|
)
|
||||||
executor_memoryOverhead="3G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
|
||||||
|
|
||||||
simrel >> whitelist >> createmergerel \
|
chain(simrel, whitelist, createmergerel, creatededuprecord, copyopenorgsmergerel, createorgsdeduprecord, updateentity, copyrelations)
|
||||||
>> creatededuprecord >> copyopenorgsmergerel \
|
|
||||||
>> createorgsdeduprecord \
|
|
||||||
>> updateentity >> copyrelations
|
results_deduplication_dag()
|
||||||
|
|
|
@ -1,16 +1,9 @@
|
||||||
import os
|
import os
|
||||||
import tarfile
|
|
||||||
import time
|
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
|
|
||||||
import pendulum
|
|
||||||
from airflow.decorators import dag
|
from airflow.decorators import dag
|
||||||
from airflow.decorators import task
|
|
||||||
from airflow.models.param import Param
|
from airflow.models.param import Param
|
||||||
from airflow.operators.python import get_current_context
|
|
||||||
|
|
||||||
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
from airflow.providers.cncf.kubernetes.operators.spark_kubernetes import SparkKubernetesOperator
|
||||||
from airflow.utils.dates import days_ago
|
|
||||||
|
|
||||||
from spark_configurator import SparkConfigurator
|
from spark_configurator import SparkConfigurator
|
||||||
|
|
||||||
|
@ -19,7 +12,7 @@ EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
|
||||||
default_args = {
|
default_args = {
|
||||||
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
|
"execution_timeout": timedelta(days=EXECUTION_TIMEOUT),
|
||||||
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
|
"retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
|
||||||
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
|
"retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -31,14 +24,17 @@ default_args = {
|
||||||
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"),
|
"S3_CONN_ID": Param("s3_conn", type='string', description="Airflow connection of S3 endpoint"),
|
||||||
"ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description=""),
|
"ORCID_PATH": Param("s3a://graph/data/orcid_2023/tables", type='string', description=""),
|
||||||
"INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/07_graph_consistent", type='string', description=""),
|
"INPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/07_graph_consistent", type='string', description=""),
|
||||||
"OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string', description=""),
|
"OUTPUT_PATH": Param("s3a://graph/tmp/prod_provision/graph/09_graph_orcid_enriched", type='string',
|
||||||
"WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", type='string', description=""),
|
description=""),
|
||||||
|
"WRKDIR_PATH": Param("s3a://graph/tmp/prod_provision/working_dir/orcid_enrichment", type='string',
|
||||||
|
description="")
|
||||||
},
|
},
|
||||||
tags=["openaire"],
|
tags=["openaire"]
|
||||||
)
|
)
|
||||||
def orcid_enrichment_dag():
|
def orcid_enrichment_dag():
|
||||||
orcid_enrich = SparkKubernetesOperator(
|
orcid_enrich = SparkKubernetesOperator(
|
||||||
task_id='EnrichGraphWithOrcidAuthors',
|
task_id='EnrichGraphWithOrcidAuthors',
|
||||||
|
task_display_name='Enrich Authors with ORCID',
|
||||||
namespace='dnet-spark-jobs',
|
namespace='dnet-spark-jobs',
|
||||||
template_spec=SparkConfigurator(
|
template_spec=SparkConfigurator(
|
||||||
name="orcidenrich-{{ ds }}-{{ task_instance.try_number }}",
|
name="orcidenrich-{{ ds }}-{{ task_instance.try_number }}",
|
||||||
|
@ -49,16 +45,11 @@ def orcid_enrichment_dag():
|
||||||
"--targetPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}",
|
"--targetPath", "{{ dag_run.conf.get('OUTPUT_PATH') }}",
|
||||||
"--workingDir", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
"--workingDir", "{{ dag_run.conf.get('WRKDIR_PATH') }}",
|
||||||
"--master", ""
|
"--master", ""
|
||||||
],
|
]).get_configuration(),
|
||||||
executor_cores=8,
|
kubernetes_conn_id="kubernetes_default"
|
||||||
executor_memory="16G",
|
|
||||||
executor_instances=1,
|
|
||||||
executor_memoryOverhead="8G").get_configuration(),
|
|
||||||
kubernetes_conn_id="kubernetes_default",
|
|
||||||
dag=dag
|
|
||||||
)
|
)
|
||||||
|
|
||||||
orcid_enrich()
|
orcid_enrich
|
||||||
|
|
||||||
|
|
||||||
orcid_enrichment_dag()
|
orcid_enrichment_dag()
|
||||||
|
|
|
@ -1,3 +1,4 @@
|
||||||
|
|
||||||
class SparkConfigurator:
|
class SparkConfigurator:
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
name,
|
name,
|
||||||
|
@ -9,9 +10,9 @@ class SparkConfigurator:
|
||||||
image= "dnet-spark:1.0.0",
|
image= "dnet-spark:1.0.0",
|
||||||
driver_cores=1,
|
driver_cores=1,
|
||||||
driver_memory='1G',
|
driver_memory='1G',
|
||||||
executor_cores=1,
|
executor_cores=8,
|
||||||
executor_memory="1G",
|
executor_memory="16G",
|
||||||
executor_memoryOverhead= "1G",
|
executor_memoryOverhead="8G",
|
||||||
executor_instances=1
|
executor_instances=1
|
||||||
) -> None:
|
) -> None:
|
||||||
if apiVersion:
|
if apiVersion:
|
||||||
|
|
Loading…
Reference in New Issue