lot1-kickoff/airflow/dags/common.py

import gzip
import io
import json
import os
import zipfile
from datetime import timedelta

from airflow.decorators import dag
from airflow.decorators import task
from airflow.operators.python import PythonOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.utils.file import TemporaryDirectory
from airflow.utils.helpers import chain
from airflow.models import Variable
from opensearchpy import OpenSearch, helpers

from EOSC_indexes import mappings


def strip_prefix(s, p):
    if s.startswith(p):
        return s[len(p):]
    else:
        return s


def s3_dowload_unzip_upload(s3conn: str, src_key: str, src_bucket: str, dest_bucket: str):
    # Download a zip archive from the source bucket, extract it into a temporary
    # directory, and upload every extracted file to the destination bucket,
    # keyed by its path relative to the extraction directory.
    hook = S3Hook(s3conn, transfer_config_args={'use_threads': False})
    with TemporaryDirectory() as dwl_dir:
        with TemporaryDirectory() as tmp_dir:
            archive = f'{dwl_dir}/{src_key}'
            hook.download_file(key=src_key, bucket_name=src_bucket, local_path=dwl_dir,
                               preserve_file_name=True, use_autogenerated_subdir=False)
            with zipfile.ZipFile(archive, 'r') as zip_ref:
                zip_ref.extractall(tmp_dir)
            for root, _, files in os.walk(tmp_dir):
                for file in files:
                    local_file_path = os.path.join(root, file)
                    hook.load_file(local_file_path,
                                   strip_prefix(local_file_path, tmp_dir + "/"),
                                   dest_bucket,
                                   replace=True)
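

# --- Illustrative usage sketch (not part of the original module) ---
# A minimal example of how s3_dowload_unzip_upload could be called from a
# TaskFlow task. The connection id, bucket names, and object key below are
# hypothetical placeholders, not values defined in this repository.
#
# @task
# def unzip_archive_to_bucket():
#     s3_dowload_unzip_upload(
#         s3conn="s3_conn",              # hypothetical Airflow connection id
#         src_key="upload/archive.zip",  # hypothetical source object key
#         src_bucket="source-bucket",    # hypothetical source bucket
#         dest_bucket="dest-bucket",     # hypothetical destination bucket
#     )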