initial stage
This commit is contained in:
parent
10c29f86c2
commit
7e41f71d32
|
@ -44,7 +44,7 @@ default_args = {
|
||||||
},
|
},
|
||||||
tags=["lot1"],
|
tags=["lot1"],
|
||||||
)
|
)
|
||||||
def import_EOSC_catalog(**context):
|
def import_EOSC_catalog():
|
||||||
@task
|
@task
|
||||||
def create_indexes():
|
def create_indexes():
|
||||||
conn = BaseHook.get_connection('opensearch_default')
|
conn = BaseHook.get_connection('opensearch_default')
|
||||||
|
@ -83,7 +83,7 @@ def import_EOSC_catalog(**context):
|
||||||
pieces = []
|
pieces = []
|
||||||
for entity in ENTITIES:
|
for entity in ENTITIES:
|
||||||
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
|
hook = S3Hook(S3_CONN_ID, transfer_config_args={'use_threads': False})
|
||||||
keys = hook.list_keys(bucket_name=context["params"]["EOSC_CATALOG_BUCKET"], prefix=f'{entity}/')
|
keys = hook.list_keys(bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"], prefix=f'{entity}/')
|
||||||
for key in keys:
|
for key in keys:
|
||||||
pieces.append((entity, key))
|
pieces.append((entity, key))
|
||||||
|
|
||||||
|
@ -94,7 +94,7 @@ def import_EOSC_catalog(**context):
|
||||||
return list(split_list(pieces, len(pieces)//BULK_PARALLELISM))
|
return list(split_list(pieces, len(pieces)//BULK_PARALLELISM))
|
||||||
|
|
||||||
@task
|
@task
|
||||||
def bulk_load(files: list[(str, str)]):
|
def bulk_load(files: list[(str, str)], **kwargs):
|
||||||
conn = BaseHook.get_connection('opensearch_default')
|
conn = BaseHook.get_connection('opensearch_default')
|
||||||
client = OpenSearch(
|
client = OpenSearch(
|
||||||
hosts=[{'host': conn.host, 'port': conn.port}],
|
hosts=[{'host': conn.host, 'port': conn.port}],
|
||||||
|
@ -109,7 +109,7 @@ def import_EOSC_catalog(**context):
|
||||||
def _generate_data():
|
def _generate_data():
|
||||||
for (entity, key) in files:
|
for (entity, key) in files:
|
||||||
print(f'{entity}: {key}')
|
print(f'{entity}: {key}')
|
||||||
s3_obj = hook.get_key(key, bucket_name=context["params"]["EOSC_CATALOG_BUCKET"])
|
s3_obj = hook.get_key(key, bucket_name=kwargs["params"]["EOSC_CATALOG_BUCKET"])
|
||||||
with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
|
with gzip.GzipFile(fileobj=s3_obj.get()["Body"]) as gzipfile:
|
||||||
buff = io.BufferedReader(gzipfile)
|
buff = io.BufferedReader(gzipfile)
|
||||||
for line in buff:
|
for line in buff:
|
||||||
|
|
Loading…
Reference in New Issue