# lot1-kickoff/airflow/dags/OpenDataPortal_harvest.py

import os
import time
from datetime import timedelta
import pendulum
import requests
from airflow.decorators import dag
from airflow.decorators import task
from airflow.hooks.base import BaseHook
from opensearchpy import OpenSearch, helpers
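# Task defaults (timeout, retries, retry delay) and the S3 connection id are configurable via environment variables.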
S3_CONN_ID = os.getenv("S3_CONN_ID", "s3_conn")
EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))

default_args = {
    "execution_timeout": timedelta(hours=EXECUTION_TIMEOUT),
    "retries": int(os.getenv("DEFAULT_TASK_RETRIES", 1)),
    "retry_delay": timedelta(seconds=int(os.getenv("DEFAULT_RETRY_DELAY_SECONDS", 60))),
}

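# Single-task DAG: harvests every dataset record from the data.europa.eu search API
# and bulk-indexes the raw documents into the configured OpenSearch index.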
@dag(
    dag_id="open_data_portal_harvest",
    start_date=pendulum.datetime(2021, 1, 1, tz="UTC"),
    schedule=None,
    dagrun_timeout=None,
    catchup=False,
    default_args=default_args,
    params={
        "S3_CONN_ID": "s3_conn",
        "OPENSEARCH_CONN_ID": "opensearch_default",
        "OS_INDEX_NAME": "euodp_raw",
    },
    tags=["aggregation"],
)
def harvest():
    @task
    def everything(**context):
        index_name = context["params"]["OS_INDEX_NAME"]
        conn = BaseHook.get_connection(context["params"]["OPENSEARCH_CONN_ID"])
        client = OpenSearch(
            hosts=[{'host': conn.host, 'port': conn.port}],
            http_auth=(conn.login, conn.password),
            use_ssl=True,
            verify_certs=False,
            ssl_show_warn=False,
            pool_maxsize=20,
        )
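        # Create the target index on first run; "dynamic": False keeps unmapped fields
        # out of the index mapping while the full documents remain available in _source.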
        if not client.indices.exists(index_name):
            client.indices.create(index_name, {
                "settings": {
                    "index": {
                        "number_of_shards": 3,
                        "number_of_replicas": 0,
                        "codec": "zstd_no_dict",
                        "replication.type": "SEGMENT",
                    },
                },
                "mappings": {
                    "dynamic": False,
                },
            })
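        # Bulk-index one page of search results, reusing the portal's dataset id as the
        # OpenSearch _id so re-harvesting overwrites documents instead of duplicating them.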
        def store_results(hits):
            def _generate_data():
                for r in hits:
                    r['_index'] = index_name
                    r['_id'] = r['id']
                    yield r

            succeeded = 0
            failed = 0
            for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
                                                       raise_on_exception=False,
                                                       raise_on_error=False,
                                                       chunk_size=5000,
                                                       max_chunk_bytes=50 * 1024 * 1024,
                                                       timeout=180):
                if success:
                    succeeded += 1
                else:
                    print(item["index"]["error"])
                    failed += 1
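            # Surface the per-batch bulk outcome in the task log.
            print(f"Bulk indexed batch: {succeeded} succeeded, {failed} failed")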
        headers = {'Accept': 'application/json'}
        r = requests.get('https://data.europa.eu/api/hub/search/search?filter=dataset&aggregation=false&limit=300&showScore=true&scroll=true', headers=headers).json()
        scroll_id = r['result']['scrollId']
        results = r['result']['results']
        store_results(results)
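        # Follow the scroll cursor until the API stops returning a scrollId; transient
        # request failures are retried up to max_retries times before the task fails.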
        max_retries = 10
        while scroll_id:
            try:
                r = requests.get('https://data.europa.eu/api/hub/search/scroll?scrollId=' + scroll_id, headers=headers)
                r.raise_for_status()
            except Exception as e:
                print(f"Error: {e}")
                time.sleep(0.1)
                max_retries = max_retries - 1
                if max_retries == 0:
                    raise Exception("Cannot fetch data")
                continue
            max_retries = 10
            r = r.json()
            scroll_id = r['result']['scrollId']
            results = r['result']['results']
            store_results(results)
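    # Register the single task with the DAG by calling it inside the DAG function.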
    everything()


harvest()