initial stage

commit 52be021867
parent df9cab15c3

@@ -1,3 +1,4 @@
+from typing import Dict, Any, List
 
 
 def map_access_right(ar: str) -> str:
@@ -86,3 +87,50 @@ transform_entities = {
     "services": trasform_catalog_entity,
     "training": trasform_catalog_entity,
 }
+
+def isEmpty(current_value: Dict[str, Any], labels: List[str]) -> bool:
+    if len(labels) <= 0:
+        return True
+    for label in labels:
+        if isinstance(current_value, list):
+            current_value = current_value[0]
+        if isinstance(current_value, dict) and label in current_value:
+            current_value = current_value[label]
+        else:
+            return True
+    if current_value is None:
+        return True
+    if isinstance(current_value, list):
+        if len(current_value) > 0:
+            return current_value[0] == ""
+        else:
+            return True
+
+    return str(current_value) == ""
+
+
+#
+# Filter products that do not meet inclusion policy
+#
+def filter_product(p: dict) -> bool:
+    if isEmpty(p, ["titles", "none"]):
+        return True
+
+    if isEmpty(p, ["firstPublishDate"]):
+        return True
+
+    if p['product_type'] == "literature":
+        if isEmpty(p, ["abstracts", "none"]):
+            return True
+        if isEmpty(p, ["contributions", "person", "local_identifier"]):
+            return True
+    elif p['product_type'] == "research data":
+        if isEmpty(p, ["contributions", "person", "local_identifier"]):
+            return True
+
+    return False
+
+
+filter_entities = {
+    "products": filter_product
+}
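
A minimal sketch of how the new filter hook is meant to be consumed (it mirrors the bulk-load change later in this commit). The record below is illustrative, assumes the EOSC_entity_trasform module is importable, and none of its values come from the repository:

from EOSC_entity_trasform import filter_entities

# Hypothetical product record; the field names follow the checks in filter_product.
record = {
    "local_identifier": "prod_1",
    "product_type": "literature",
    "titles": {"none": "An example title"},
    "abstracts": {"none": ""},  # empty abstract -> fails the inclusion policy
    "firstPublishDate": "2024-01-01",
    "contributions": [{"person": {"local_identifier": "person_1"}}],
}

entity = "products"
if entity in filter_entities and filter_entities[entity](record):
    # reached here because the empty abstract makes filter_product return True
    print(record["local_identifier"] + " does not meet inclusion policies")
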
@@ -6,9 +6,9 @@ mappings['datasource'] = {
         "data_source_classification": {
             "type": "keyword"
         },
-        "eoscId": {
-            "type": "keyword"
-        },
+        # "eoscId": {
+        #     "type": "keyword"
+        # },
         "identifiers": {
             "type": "object",
             "properties": {
@@ -320,6 +320,9 @@ mappings['grants'] = {
                 }
             }
         },
+        "keywords": {
+            "type": "keyword"
+        },
         "local_identifier": {
             "type": "keyword"
         },
@@ -391,9 +394,9 @@ mappings['products'] = {
         "contributions": {
             "type": "object",
             "properties": {
-                "declared_affiliations": {
-                    "type": "keyword"
-                },
+                # "declared_affiliations": {
+                #     "type": "keyword"
+                # },
                 "person": {
                     "type": "object",
                     "properties": {
@@ -414,11 +417,12 @@ mappings['products'] = {
                     }
                 },
                 "rank": {
+                    "index": False,
                     "type": "long"
                 },
-                "roles": {
-                    "type": "keyword"
-                }
+                # "roles": {
+                #     "type": "keyword"
+                # }
             }
         },
         "funding": {
@@ -455,23 +459,29 @@ mappings['products'] = {
             }
         },
         "indicator": {
+            "dynamic": False,
             "type": "object",
             "properties": {
                 "downloadsAndViews": {
+                    "dynamic": False,
                     "type": "object",
                     "properties": {
                         "downloads": {
+                            "index": False,
                             "type": "long"
                         },
                         "views": {
+                            "index": False,
                             "type": "long"
                         }
                     }
                 },
                 "impact": {
+                    "dynamic": False,
                     "type": "object",
                     "properties": {
                         "citationCount": {
+                            "index": False,
                             "type": "long"
                         }
                     }
                 }
@@ -488,6 +498,7 @@ mappings['products'] = {
             "type": "keyword"
         },
         "biblio": {
+            "index": False,
            "type": "object",
             "properties": {
                 "edition": {
@@ -496,15 +507,15 @@ mappings['products'] = {
                 "end_page": {
                     "type": "text"
                 },
-                "hosting_data_source": {
-                    "type": "text"
-                },
+                # "hosting_data_source": {
+                #     "type": "text"
+                # },
                 "issue": {
                     "type": "text"
                 },
-                "number": {
-                    "type": "text"
-                },
+                # "number": {
+                #     "type": "text"
+                # },
                 "publisher": {
                     "type": "text"
                 },
@@ -602,9 +613,9 @@ mappings['products'] = {
                 "pmid": {
                     "type": "keyword"
                 },
-                "title": {
-                    "type": "text"
-                }
+                # "title": {
+                #     "type": "text"
+                # }
             }
         },
         "relation_type": {
@@ -651,12 +662,12 @@ mappings['products'] = {
             "type": "object",
             "properties": {
                 "trust": {
-                    "type": "keyword",
-                    "index": "false"
+                    "type": "double",
+                    "index": False
                 },
                 "type": {
                     "type": "keyword",
-                    "index": "false"
+                    "index": False
                 }
             }
         },
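
The mapping edits above comment out currently unused fields (eoscId, declared_affiliations, roles, hosting_data_source, number, title), add a keywords field to the grants mapping, change trust from keyword to double, and replace the string "false" with the Python boolean False for "index", while "dynamic": False and "index": False keep indicator sub-fields in _source without making them searchable. A minimal sketch of how such a mapping ends up being applied at index-creation time; the client configuration and index name are illustrative, not part of this commit:

from opensearchpy import OpenSearch

# Illustrative local client; the DAG builds its client from an Airflow connection instead.
client = OpenSearch(hosts=[{"host": "localhost", "port": 9200}], use_ssl=False)

body = {
    "mappings": {
        "properties": {
            "local_identifier": {"type": "keyword"},
            # "index": False keeps the value in _source but does not index it for search
            "trust": {"type": "double", "index": False},
            # "dynamic": False stops unmapped sub-fields from being added automatically
            "indicator": {"dynamic": False, "type": "object", "properties": {}},
        }
    }
}
client.indices.create(index="products_example", body=body)
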
@@ -19,7 +19,7 @@ from airflow.hooks.base import BaseHook
 
 from opensearchpy import OpenSearch, helpers
 from EOSC_indexes import mappings
-from EOSC_entity_trasform import transform_entities
+from EOSC_entity_trasform import filter_entities, transform_entities
 
 EXECUTION_TIMEOUT = int(os.getenv("EXECUTION_TIMEOUT", 6))
 
@@ -163,17 +163,22 @@ for config_name, config in configs.items():
         with gzip.GzipFile(fileobj=s3_obj.get()["Body"], mode='rb') if key.endswith(".gz") else codecs.getreader('utf-8')(s3_obj.get()["Body"]) as s3file:
             def _generate_data():
                 for line in s3file:
-                    data = json.loads(line)
-                    data['_index'] = indexname
+                    data: dict = json.loads(line)
                     if entity in transform_entities:
                         data = transform_entities[entity](data)
-                    yield data
+                    if entity in filter_entities:
+                        if filter_entities[entity](data):
+                            print(data["local_identifier"] + " does not meet inclusion policies")
+                            continue
+                    index = {"update": {"_index": indexname, "_id": data.pop("_id")}}
+                    yield index, {"doc": data, "doc_as_upsert": True}
 
             # disable success post logging
             logging.getLogger("opensearch").setLevel(logging.WARN)
             succeeded = 0
             failed = 0
             for success, item in helpers.parallel_bulk(client, actions=_generate_data(),
+                                                       expand_action_callback=None,
                                                        raise_on_exception=False,
                                                        raise_on_error=False,
                                                        chunk_size=5000,
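
With this change _generate_data no longer yields plain documents: it yields pre-built (action, document) pairs, and expand_action_callback=None is passed to parallel_bulk, presumably so the pairs are used as-is rather than re-expanded. For reference, a sketch of one such pair with placeholder values; in the DAG the real _index comes from the indexname variable and the _id from data.pop("_id"):

# Placeholder values only; not taken from the repository.
action = {"update": {"_index": "products_example", "_id": "prod_1"}}
payload = {
    "doc": {"local_identifier": "prod_1", "product_type": "literature"},
    "doc_as_upsert": True,  # update the document if it exists, insert it otherwise
}
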
@@ -198,6 +203,58 @@ for config_name, config in configs.items():
             if succeeded > 0:
                 print(f"Bulk-inserted {succeeded} items (streaming_bulk).")
 
+    @task
+    def merge_curation_db(**kwargs):
+        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
+        client = OpenSearch(
+            hosts=[{'host': conn.host, 'port': conn.port}],
+            http_auth=(conn.login, conn.password),
+            use_ssl=True,
+            verify_certs=False,
+            ssl_show_warn=False,
+            pool_maxsize=20,
+            timeout=180
+        )
+        if "products" in kwargs["params"]["ENTITIES"]:
+            products_index = f'products_{kwargs["params"]["SUFFIX"]}'
+            curationdb_index = 'curation'
+            if client.indices.exists(curationdb_index):
+                client.reindex(body={
+                    "source": {
+                        "index": curationdb_index,
+                        "_source": ["status"]
+                    },
+                    "dest": {
+                        "index": products_index
+                    }
+                },
+                    refresh=False,
+                    requests_per_second=-1,
+                    scroll="4h",
+                    slices="auto",
+                    timeout=60*60*4,
+                    wait_for_completion=True)
+
+    @task
+    def delete_missing_curated(**kwargs):
+        conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
+        client = OpenSearch(
+            hosts=[{'host': conn.host, 'port': conn.port}],
+            http_auth=(conn.login, conn.password),
+            use_ssl=True,
+            verify_certs=False,
+            ssl_show_warn=False,
+            pool_maxsize=20,
+            timeout=180
+        )
+        if "products" in kwargs["params"]["ENTITIES"]:
+            products_index = f'products_{kwargs["params"]["SUFFIX"]}'
+            client.delete_by_query(index=products_index,
+                                   body={"query": {"bool": {"must_not": {"exists": {"field": "local_identifier"}}}}},
+                                   refresh=False
+                                   )
+
+
     @task
     def close_indexes(**kwargs):
         conn = BaseHook.get_connection(kwargs["params"]["OPENSEARCH_CONN_ID"])
@@ -238,8 +295,10 @@ for config_name, config in configs.items():
 
     chain(
         create_indexes.override(task_id="create_indexes")(),
+        merge_curation_db.override(task_id="merge_curation_db")(),
         parallel_batches,
         bulk_load.expand_kwargs(parallel_batches.output),
+        delete_missing_curated.override(task_id="delete_missing_curated_recs")(),
         close_indexes.override(task_id="close_indexes")()
     )
 
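
Taken together, the new tasks change the load order for products: merge_curation_db first reindexes only the status field of curation documents into the freshly created products index, bulk_load then merges harvested records into those documents through doc_as_upsert updates on the same _id, and delete_missing_curated finally removes documents that received a curation status but no harvested record and therefore have no local_identifier. A small check of that last invariant, assuming an OpenSearch client configured as in the tasks above; the index name is a placeholder:

# Count the curation-only documents that delete_missing_curated would remove.
resp = client.count(
    index="products_example",
    body={"query": {"bool": {"must_not": {"exists": {"field": "local_identifier"}}}}},
)
print(resp["count"], "documents without a local_identifier")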