Refactor the search logic to move it to its own module

This commit is contained in:
amercader 2022-10-11 10:20:45 +02:00
parent b89da29192
commit 15caaad9f1
2 changed files with 251 additions and 156 deletions

View File

@ -18,7 +18,6 @@ from ckan import plugins as p
from ckan.lib.search import SearchError from ckan.lib.search import SearchError
from ckan.lib.helpers import json from ckan.lib.helpers import json
from ckanext.spatial.lib import normalize_bbox, fit_bbox, fit_linear_ring
if tk.check_ckan_version(min_version="2.9.0"): if tk.check_ckan_version(min_version="2.9.0"):
from ckanext.spatial.plugin.flask_plugin import ( from ckanext.spatial.plugin.flask_plugin import (
@ -29,6 +28,9 @@ else:
SpatialQueryMixin, HarvestMetadataApiMixin SpatialQueryMixin, HarvestMetadataApiMixin
) )
from ckanext.spatial.lib import normalize_bbox
from ckanext.spatial.search import search_backends
config = tk.config config = tk.config
log = getLogger(__name__) log = getLogger(__name__)
@ -227,85 +229,17 @@ class SpatialQuery(SpatialQueryMixin, p.SingletonPlugin):
if not pkg_dict.get('extras_spatial'): if not pkg_dict.get('extras_spatial'):
return pkg_dict return pkg_dict
pkg_dict = search_backends[search_backend]().index_dataset(pkg_dict)
# Coupled resources are URL -> uuid links, they are not needed in SOLR # Coupled resources are URL -> uuid links, they are not needed in SOLR
# and might be huge if there are lot of coupled resources # and might be huge if there are lot of coupled resources
pkg_dict.pop('coupled-resource', None) pkg_dict.pop('coupled-resource', None)
pkg_dict.pop('extras_coupled-resource', None) pkg_dict.pop('extras_coupled-resource', None)
# spatial field is geojson coordinate data, not needed in SOLR either # spatial field is geojson coordinate data, not needed in SOLR either
geom_from_metadata = pkg_dict.pop('spatial', None) pkg_dict.pop('spatial', None)
pkg_dict.pop('extras_spatial', None) pkg_dict.pop('extras_spatial', None)
try:
geometry = json.loads(geom_from_metadata)
except (AttributeError, ValueError) as e:
log.error('Geometry not valid JSON {}, not indexing :: {}'.format(e, geom_from_metadata[:100]))
return pkg_dict
try:
shape = shapely.geometry.shape(geometry)
except GeometryTypeError as e:
log.error('{}, not indexing :: {}'.format(e, geom_from_metadata[:100]))
return pkg_dict
if search_backend == "solr-bbox":
# We always index the envelope of the geometry regardless of
# if it's an actual bounding box (polygon)
bounds = shape.bounds
bbox = fit_bbox(normalize_bbox(list(bounds)))
pkg_dict["spatial_bbox"] = "ENVELOPE({minx}, {maxx}, {maxy}, {miny})".format(
**bbox)
elif search_backend == 'solr-spatial-field':
wkt = None
# We allow multiple geometries as GeometryCollections
if geometry['type'] == 'GeometryCollection':
geometries = geometry['geometries']
else:
geometries = [geometry]
# Check potential problems with bboxes in each geometry
wkt = []
for geom in geometries:
if geom['type'] == 'Polygon' \
and len(geom['coordinates']) == 1 \
and len(geom['coordinates'][0]) == 5:
# Check wrong bboxes (4 same points)
xs = [p[0] for p in geom['coordinates'][0]]
ys = [p[1] for p in geom['coordinates'][0]]
if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
wkt.append('POINT({x} {y})'.format(x=xs[0], y=ys[0]))
else:
# Check if coordinates are defined counter-clockwise,
# otherwise we'll get wrong results from Solr
lr = shapely.geometry.polygon.LinearRing(geom['coordinates'][0])
lr_coords = (
list(lr.coords) if lr.is_ccw
else list(reversed(list(lr.coords)))
)
polygon = shapely.geometry.polygon.Polygon(
fit_linear_ring(lr_coords))
wkt.append(polygon.wkt)
if not wkt:
shape = shapely.geometry.shape(geometry)
if not shape.is_valid:
log.error('Wrong geometry, not indexing')
return pkg_dict
if shape.bounds[0] < -180 or shape.bounds[2] > 180:
log.error("""
Geometries outside the -180, -90, 180, 90 boundaries are not supported,
you need to split the geometry in order to fit the parts. Not indexing""")
return pkg_dict
wkt = shape.wkt
pkg_dict['spatial_geom'] = wkt
return pkg_dict return pkg_dict
def before_dataset_search(self, search_params): def before_dataset_search(self, search_params):
@ -316,93 +250,12 @@ you need to split the geometry in order to fit the parts. Not indexing""")
if input_bbox: if input_bbox:
bbox = normalize_bbox(input_bbox) bbox = normalize_bbox(input_bbox)
if not bbox: if not bbox:
raise SearchError('Wrong bounding box provided') raise SearchError('Wrong bounding box provided')
if search_backend in ("solr-bbox", "solr-spatial-field"): search_params = search_backends[search_backend]().search_params(
bbox, search_params)
bbox = fit_bbox(bbox)
if not search_params.get("fq_list"):
search_params["fq_list"] = []
spatial_field = (
"spatial_bbox" if search_backend == "solr-bbox" else "spatial_geom"
)
default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"
spatial_query = config.get(
"ckanext.spatial.solr_query", default_spatial_query)
search_params["fq_list"].append(
spatial_query.format(
spatial_field=spatial_field, **bbox)
)
elif search_backend == 'postgis':
search_params = self._params_for_postgis_search(bbox, search_params)
return search_params
def _params_for_postgis_search(self, bbox, search_params):
"""
Note: The PostGIS search functionality will be removed in future versions
"""
from ckanext.spatial.postgis.model import bbox_query, bbox_query_ordered
from ckan.lib.search import SearchError
# Adjust easting values
while (bbox['minx'] < -180):
bbox['minx'] += 360
bbox['maxx'] += 360
while (bbox['minx'] > 180):
bbox['minx'] -= 360
bbox['maxx'] -= 360
# Note: This will be deprecated at some point in favour of the
# Solr 4 spatial sorting capabilities
if search_params.get('sort') == 'spatial desc' and \
tk.asbool(config.get('ckanext.spatial.use_postgis_sorting', 'False')):
if search_params['q'] or search_params['fq']:
raise SearchError('Spatial ranking cannot be mixed with other search parameters')
# ...because it is too inefficient to use SOLR to filter
# results and return the entire set to this class and
# after_search do the sorting and paging.
extents = bbox_query_ordered(bbox)
are_no_results = not extents
search_params['extras']['ext_rows'] = search_params['rows']
search_params['extras']['ext_start'] = search_params['start']
# this SOLR query needs to return no actual results since
# they are in the wrong order anyway. We just need this SOLR
# query to get the count and facet counts.
rows = 0
search_params['sort'] = None # SOLR should not sort.
# Store the rankings of the results for this page, so for
# after_search to construct the correctly sorted results
rows = search_params['extras']['ext_rows'] = search_params['rows']
start = search_params['extras']['ext_start'] = search_params['start']
search_params['extras']['ext_spatial'] = [
(extent.package_id, extent.spatial_ranking) \
for extent in extents[start:start+rows]]
else:
extents = bbox_query(bbox)
are_no_results = extents.count() == 0
if are_no_results:
# We don't need to perform the search
search_params['abort_search'] = True
else:
# We'll perform the existing search but also filtering by the ids
# of datasets within the bbox
bbox_query_ids = [extent.package_id for extent in extents]
q = search_params.get('q','').strip() or '""'
# Note: `"" AND` query doesn't work in github ci
new_q = '%s AND ' % q if q and q != '""' else ''
new_q += '(%s)' % ' OR '.join(['id:%s' % id for id in bbox_query_ids])
search_params['q'] = new_q
return search_params return search_params

View File

@ -0,0 +1,242 @@
import json
import logging
import shapely.geometry
try:
from shapely.errors import GeometryTypeError
except ImportError:
# Previous version of shapely uses ValueError and TypeError
GeometryTypeError = (ValueError, TypeError)
from ckantoolkit import config, asbool
from ckanext.spatial.lib import normalize_bbox, fit_bbox, fit_linear_ring
log = logging.getLogger(__name__)
class SpatialSearchBackend:
    """Base class for all spatial search backends.

    Holds the GeoJSON parsing and Shapely conversion helpers shared by the
    concrete backends. Both helpers log the problem and return ``None``
    instead of raising, so that callers can skip a dataset whose geometry
    cannot be used.
    """

    def parse_geojson(self, geom_from_metadata):
        """Decode a serialized GeoJSON geometry.

        :param geom_from_metadata: the JSON text to decode; may be ``None``
            when the dataset has no value for the field
        :returns: the decoded JSON value, or ``None`` (after logging an
            error) when the input is missing or is not valid JSON
        """
        try:
            return json.loads(geom_from_metadata)
        except (AttributeError, TypeError, ValueError) as e:
            # json.loads raises TypeError (not ValueError) for None and other
            # non-string input, and such values cannot always be sliced, so
            # build the log excerpt defensively.
            if isinstance(geom_from_metadata, str):
                excerpt = geom_from_metadata[:100]
            else:
                excerpt = repr(geom_from_metadata)[:100]
            log.error(
                "Geometry not valid JSON {}, not indexing :: {}".format(e, excerpt)
            )
            return None

    def shape_from_geometry(self, geometry):
        """Convert a decoded GeoJSON geometry into a Shapely shape.

        :param geometry: the decoded geometry as returned by
            :meth:`parse_geojson`; ``None`` is accepted and passed through
        :returns: the object built by ``shapely.geometry.shape``, or ``None``
            when ``geometry`` is ``None`` or Shapely rejects its type (an
            error is logged in the latter case)
        """
        if geometry is None:
            # Nothing to convert; parse_geojson has already logged the reason.
            return None

        try:
            return shapely.geometry.shape(geometry)
        except GeometryTypeError as e:
            log.error("{}, not indexing :: {}".format(e, json.dumps(geometry)[:100]))
            return None
class SolrBBoxSearchBackend(SpatialSearchBackend):
    """Backend that indexes and queries the ``spatial_bbox`` field."""

    def index_dataset(self, dataset_dict):
        """
        We always index the envelope of the geometry regardless of
        if it's an actual bounding box (polygon)

        :param dataset_dict: the dataset being indexed; its ``spatial`` key
            is read as serialized GeoJSON
        :returns: the same dict, with ``spatial_bbox`` added when a usable
            geometry was found and left untouched otherwise
        """
        geometry = self.parse_geojson(dataset_dict.get("spatial"))
        if not geometry:
            # Missing or undecodable geometry: handing None on to Shapely
            # would raise instead of skipping the dataset.
            return dataset_dict

        shape = self.shape_from_geometry(geometry)
        if not shape:
            return dataset_dict

        bbox = fit_bbox(normalize_bbox(list(shape.bounds)))

        dataset_dict[
            "spatial_bbox"
        ] = "ENVELOPE({minx}, {maxx}, {maxy}, {miny})".format(**bbox)

        return dataset_dict

    def search_params(self, bbox, search_params):
        """Append a filter query on ``spatial_bbox`` to ``fq_list``.

        :param bbox: mapping with ``minx``, ``maxx``, ``miny`` and ``maxy``
            keys (these are the names the query template interpolates)
        :param search_params: the search parameters dict; modified in place
        :returns: ``search_params``
        """
        bbox = fit_bbox(bbox)

        if not search_params.get("fq_list"):
            search_params["fq_list"] = []

        default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"

        # The template can be overridden through configuration; it is
        # formatted with the field name and the bbox values.
        spatial_query = config.get("ckanext.spatial.solr_query", default_spatial_query)

        search_params["fq_list"].append(
            spatial_query.format(spatial_field="spatial_bbox", **bbox)
        )

        return search_params
class SolrSpatialFieldSearchBackend(SpatialSearchBackend):
    """Backend that indexes and queries the ``spatial_geom`` field."""

    def index_dataset(self, dataset_dict):
        """Add the dataset geometry, as WKT, under ``spatial_geom``.

        Single-ring, five-point polygons are special-cased: a ring whose
        points are all identical is indexed as a ``POINT``, any other is
        rewritten counter-clockwise and fitted with ``fit_linear_ring``.
        When at least one such polygon is found, ``spatial_geom`` is the list
        of their WKT strings; otherwise it is the WKT of the whole geometry.

        :param dataset_dict: the dataset being indexed; its ``spatial`` key
            is read as serialized GeoJSON
        :returns: the same dict, left untouched when the geometry is
            missing, not valid JSON, of an unsupported type, invalid or
            outside the -180/180 longitude range
        """
        geom_from_metadata = dataset_dict.get("spatial")
        geometry = self.parse_geojson(geom_from_metadata)
        if not geometry:
            return dataset_dict

        # We allow multiple geometries as GeometryCollections
        if geometry["type"] == "GeometryCollection":
            geometries = geometry["geometries"]
        else:
            geometries = [geometry]

        # Check potential problems with bboxes in each geometry
        wkt = []
        for geom in geometries:
            if (
                geom["type"] == "Polygon"
                and len(geom["coordinates"]) == 1
                and len(geom["coordinates"][0]) == 5
            ):
                ring = geom["coordinates"][0]

                # Check wrong bboxes (4 same points)
                xs = [point[0] for point in ring]
                ys = [point[1] for point in ring]

                if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
                    wkt.append("POINT({x} {y})".format(x=xs[0], y=ys[0]))
                else:
                    # Check if coordinates are defined counter-clockwise,
                    # otherwise we'll get wrong results from Solr
                    lr = shapely.geometry.polygon.LinearRing(ring)
                    lr_coords = (
                        list(lr.coords)
                        if lr.is_ccw
                        else list(reversed(list(lr.coords)))
                    )
                    polygon = shapely.geometry.polygon.Polygon(
                        fit_linear_ring(lr_coords)
                    )
                    wkt.append(polygon.wkt)

        # Logs and returns None on an unsupported geometry type.
        shape = self.shape_from_geometry(geometry)

        if not wkt:
            if shape is None:
                # The helper already reported the problem. Building the
                # shape a second time here would only re-raise the
                # exception it has just handled.
                return dataset_dict
            if not shape.is_valid:
                log.error("Wrong geometry, not indexing")
                return dataset_dict
            if shape.bounds[0] < -180 or shape.bounds[2] > 180:
                log.error(
                    """
Geometries outside the -180, -90, 180, 90 boundaries are not supported,
you need to split the geometry in order to fit the parts. Not indexing"""
                )
                return dataset_dict
            wkt = shape.wkt

        dataset_dict["spatial_geom"] = wkt

        return dataset_dict

    def search_params(self, bbox, search_params):
        """Append a filter query on ``spatial_geom`` to ``fq_list``.

        :param bbox: mapping with ``minx``, ``maxx``, ``miny`` and ``maxy``
            keys (these are the names the query template interpolates)
        :param search_params: the search parameters dict; modified in place
        :returns: ``search_params``
        """
        bbox = fit_bbox(bbox)

        if not search_params.get("fq_list"):
            search_params["fq_list"] = []

        default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"

        # The template can be overridden through configuration; it is
        # formatted with the field name and the bbox values.
        spatial_query = config.get("ckanext.spatial.solr_query", default_spatial_query)

        search_params["fq_list"].append(
            spatial_query.format(spatial_field="spatial_geom", **bbox)
        )

        return search_params
class PostgisSearchBackend(SpatialSearchBackend):
    """
    Note: The PostGIS search functionality will be removed in future versions
    """

    def index_dataset(self, dataset_dict):
        """Return ``dataset_dict`` unchanged; nothing is added to the index."""
        return dataset_dict

    def search_params(self, bbox, search_params):
        """Restrict the search to the datasets PostGIS finds inside ``bbox``.

        :param bbox: mapping with at least ``minx`` and ``maxx`` keys; the
            easting values are shifted in place into range
        :param search_params: the search parameters dict; modified in place
        :returns: ``search_params``, with ``abort_search`` set when no
            extent matches, or with ``q`` narrowed to the matching ids
        :raises SearchError: when spatial ranking is requested together with
            a ``q`` or ``fq`` value
        """
        # Imported here rather than at module level, as in the original.
        from ckanext.spatial.postgis.model import bbox_query, bbox_query_ordered
        from ckan.lib.search import SearchError

        # Adjust easting values
        while bbox["minx"] < -180:
            bbox["minx"] += 360
            bbox["maxx"] += 360
        while bbox["minx"] > 180:
            bbox["minx"] -= 360
            bbox["maxx"] -= 360

        # Note: This will be deprecated at some point in favour of the
        # Solr 4 spatial sorting capabilities
        if search_params.get("sort") == "spatial desc" and asbool(
            config.get("ckanext.spatial.use_postgis_sorting", "False")
        ):
            # .get() so that an absent key counts as "not provided" instead
            # of raising KeyError.
            if search_params.get("q") or search_params.get("fq"):
                raise SearchError(
                    "Spatial ranking cannot be mixed with other search parameters"
                )
                # ...because it is too inefficient to use SOLR to filter
                # results and return the entire set to this class and
                # after_search do the sorting and paging.
            extents = bbox_query_ordered(bbox)
            are_no_results = not extents

            # NOTE(review): the previous code also set a local ``rows = 0``
            # here ("this SOLR query needs to return no actual results") that
            # was overwritten before being read; ``search_params['rows']``
            # itself was never changed. Confirm whether it should be.
            search_params["sort"] = None  # SOLR should not sort.

            # Store the rankings of the results for this page, so for
            # after_search to construct the correctly sorted results
            rows = search_params["extras"]["ext_rows"] = search_params["rows"]
            start = search_params["extras"]["ext_start"] = search_params["start"]
            search_params["extras"]["ext_spatial"] = [
                (extent.package_id, extent.spatial_ranking)
                for extent in extents[start : start + rows]
            ]
        else:
            extents = bbox_query(bbox)
            are_no_results = extents.count() == 0

        if are_no_results:
            # We don't need to perform the search
            search_params["abort_search"] = True
        else:
            # We'll perform the existing search but also filtering by the ids
            # of datasets within the bbox
            bbox_query_ids = [extent.package_id for extent in extents]
            # ``or ""`` keeps an explicit ``q=None`` from breaking .strip()
            q = (search_params.get("q") or "").strip()
            # Note: `"" AND` query doesn't work in github ci
            new_q = "%s AND " % q if q and q != '""' else ""
            new_q += "(%s)" % " OR ".join(
                ["id:%s" % package_id for package_id in bbox_query_ids]
            )

            search_params["q"] = new_q

        return search_params
# Registry of the available spatial search backends, keyed by backend name.
# The values are the classes themselves, not instances.
search_backends = {
    "solr-bbox": SolrBBoxSearchBackend,
    "solr-spatial-field": SolrSpatialFieldSearchBackend,
    "postgis": PostgisSearchBackend,
}