Refactor the search logic to move it to its own module
This commit is contained in:
parent
b89da29192
commit
15caaad9f1
|
@ -18,7 +18,6 @@ from ckan import plugins as p
|
|||
from ckan.lib.search import SearchError
|
||||
|
||||
from ckan.lib.helpers import json
|
||||
from ckanext.spatial.lib import normalize_bbox, fit_bbox, fit_linear_ring
|
||||
|
||||
if tk.check_ckan_version(min_version="2.9.0"):
|
||||
from ckanext.spatial.plugin.flask_plugin import (
|
||||
|
@ -29,6 +28,9 @@ else:
|
|||
SpatialQueryMixin, HarvestMetadataApiMixin
|
||||
)
|
||||
|
||||
from ckanext.spatial.lib import normalize_bbox
|
||||
from ckanext.spatial.search import search_backends
|
||||
|
||||
config = tk.config
|
||||
|
||||
log = getLogger(__name__)
|
||||
|
@ -227,85 +229,17 @@ class SpatialQuery(SpatialQueryMixin, p.SingletonPlugin):
|
|||
if not pkg_dict.get('extras_spatial'):
|
||||
return pkg_dict
|
||||
|
||||
pkg_dict = search_backends[search_backend]().index_dataset(pkg_dict)
|
||||
|
||||
# Coupled resources are URL -> uuid links, they are not needed in SOLR
|
||||
# and might be huge if there are lot of coupled resources
|
||||
pkg_dict.pop('coupled-resource', None)
|
||||
pkg_dict.pop('extras_coupled-resource', None)
|
||||
|
||||
# spatial field is geojson coordinate data, not needed in SOLR either
|
||||
geom_from_metadata = pkg_dict.pop('spatial', None)
|
||||
pkg_dict.pop('spatial', None)
|
||||
pkg_dict.pop('extras_spatial', None)
|
||||
|
||||
try:
|
||||
geometry = json.loads(geom_from_metadata)
|
||||
except (AttributeError, ValueError) as e:
|
||||
log.error('Geometry not valid JSON {}, not indexing :: {}'.format(e, geom_from_metadata[:100]))
|
||||
return pkg_dict
|
||||
|
||||
try:
|
||||
shape = shapely.geometry.shape(geometry)
|
||||
except GeometryTypeError as e:
|
||||
log.error('{}, not indexing :: {}'.format(e, geom_from_metadata[:100]))
|
||||
return pkg_dict
|
||||
|
||||
if search_backend == "solr-bbox":
|
||||
# We always index the envelope of the geometry regardless of
|
||||
# if it's an actual bounding box (polygon)
|
||||
|
||||
bounds = shape.bounds
|
||||
bbox = fit_bbox(normalize_bbox(list(bounds)))
|
||||
|
||||
pkg_dict["spatial_bbox"] = "ENVELOPE({minx}, {maxx}, {maxy}, {miny})".format(
|
||||
**bbox)
|
||||
|
||||
elif search_backend == 'solr-spatial-field':
|
||||
wkt = None
|
||||
|
||||
# We allow multiple geometries as GeometryCollections
|
||||
if geometry['type'] == 'GeometryCollection':
|
||||
geometries = geometry['geometries']
|
||||
else:
|
||||
geometries = [geometry]
|
||||
|
||||
# Check potential problems with bboxes in each geometry
|
||||
wkt = []
|
||||
for geom in geometries:
|
||||
if geom['type'] == 'Polygon' \
|
||||
and len(geom['coordinates']) == 1 \
|
||||
and len(geom['coordinates'][0]) == 5:
|
||||
|
||||
# Check wrong bboxes (4 same points)
|
||||
xs = [p[0] for p in geom['coordinates'][0]]
|
||||
ys = [p[1] for p in geom['coordinates'][0]]
|
||||
|
||||
if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
|
||||
wkt.append('POINT({x} {y})'.format(x=xs[0], y=ys[0]))
|
||||
else:
|
||||
# Check if coordinates are defined counter-clockwise,
|
||||
# otherwise we'll get wrong results from Solr
|
||||
lr = shapely.geometry.polygon.LinearRing(geom['coordinates'][0])
|
||||
lr_coords = (
|
||||
list(lr.coords) if lr.is_ccw
|
||||
else list(reversed(list(lr.coords)))
|
||||
)
|
||||
polygon = shapely.geometry.polygon.Polygon(
|
||||
fit_linear_ring(lr_coords))
|
||||
wkt.append(polygon.wkt)
|
||||
|
||||
if not wkt:
|
||||
shape = shapely.geometry.shape(geometry)
|
||||
if not shape.is_valid:
|
||||
log.error('Wrong geometry, not indexing')
|
||||
return pkg_dict
|
||||
if shape.bounds[0] < -180 or shape.bounds[2] > 180:
|
||||
log.error("""
|
||||
Geometries outside the -180, -90, 180, 90 boundaries are not supported,
|
||||
you need to split the geometry in order to fit the parts. Not indexing""")
|
||||
return pkg_dict
|
||||
wkt = shape.wkt
|
||||
|
||||
pkg_dict['spatial_geom'] = wkt
|
||||
|
||||
return pkg_dict
|
||||
|
||||
def before_dataset_search(self, search_params):
|
||||
|
@ -316,93 +250,12 @@ you need to split the geometry in order to fit the parts. Not indexing""")
|
|||
|
||||
if input_bbox:
|
||||
bbox = normalize_bbox(input_bbox)
|
||||
|
||||
if not bbox:
|
||||
raise SearchError('Wrong bounding box provided')
|
||||
|
||||
if search_backend in ("solr-bbox", "solr-spatial-field"):
|
||||
|
||||
bbox = fit_bbox(bbox)
|
||||
|
||||
if not search_params.get("fq_list"):
|
||||
search_params["fq_list"] = []
|
||||
|
||||
spatial_field = (
|
||||
"spatial_bbox" if search_backend == "solr-bbox" else "spatial_geom"
|
||||
)
|
||||
|
||||
default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"
|
||||
|
||||
spatial_query = config.get(
|
||||
"ckanext.spatial.solr_query", default_spatial_query)
|
||||
|
||||
search_params["fq_list"].append(
|
||||
spatial_query.format(
|
||||
spatial_field=spatial_field, **bbox)
|
||||
)
|
||||
|
||||
elif search_backend == 'postgis':
|
||||
search_params = self._params_for_postgis_search(bbox, search_params)
|
||||
|
||||
return search_params
|
||||
|
||||
def _params_for_postgis_search(self, bbox, search_params):
|
||||
"""
|
||||
Note: The PostGIS search functionality will be removed in future versions
|
||||
"""
|
||||
from ckanext.spatial.postgis.model import bbox_query, bbox_query_ordered
|
||||
from ckan.lib.search import SearchError
|
||||
|
||||
# Adjust easting values
|
||||
while (bbox['minx'] < -180):
|
||||
bbox['minx'] += 360
|
||||
bbox['maxx'] += 360
|
||||
while (bbox['minx'] > 180):
|
||||
bbox['minx'] -= 360
|
||||
bbox['maxx'] -= 360
|
||||
|
||||
# Note: This will be deprecated at some point in favour of the
|
||||
# Solr 4 spatial sorting capabilities
|
||||
if search_params.get('sort') == 'spatial desc' and \
|
||||
tk.asbool(config.get('ckanext.spatial.use_postgis_sorting', 'False')):
|
||||
if search_params['q'] or search_params['fq']:
|
||||
raise SearchError('Spatial ranking cannot be mixed with other search parameters')
|
||||
# ...because it is too inefficient to use SOLR to filter
|
||||
# results and return the entire set to this class and
|
||||
# after_search do the sorting and paging.
|
||||
extents = bbox_query_ordered(bbox)
|
||||
are_no_results = not extents
|
||||
search_params['extras']['ext_rows'] = search_params['rows']
|
||||
search_params['extras']['ext_start'] = search_params['start']
|
||||
# this SOLR query needs to return no actual results since
|
||||
# they are in the wrong order anyway. We just need this SOLR
|
||||
# query to get the count and facet counts.
|
||||
rows = 0
|
||||
search_params['sort'] = None # SOLR should not sort.
|
||||
# Store the rankings of the results for this page, so for
|
||||
# after_search to construct the correctly sorted results
|
||||
rows = search_params['extras']['ext_rows'] = search_params['rows']
|
||||
start = search_params['extras']['ext_start'] = search_params['start']
|
||||
search_params['extras']['ext_spatial'] = [
|
||||
(extent.package_id, extent.spatial_ranking) \
|
||||
for extent in extents[start:start+rows]]
|
||||
else:
|
||||
extents = bbox_query(bbox)
|
||||
are_no_results = extents.count() == 0
|
||||
|
||||
if are_no_results:
|
||||
# We don't need to perform the search
|
||||
search_params['abort_search'] = True
|
||||
else:
|
||||
# We'll perform the existing search but also filtering by the ids
|
||||
# of datasets within the bbox
|
||||
bbox_query_ids = [extent.package_id for extent in extents]
|
||||
|
||||
q = search_params.get('q','').strip() or '""'
|
||||
# Note: `"" AND` query doesn't work in github ci
|
||||
new_q = '%s AND ' % q if q and q != '""' else ''
|
||||
new_q += '(%s)' % ' OR '.join(['id:%s' % id for id in bbox_query_ids])
|
||||
|
||||
search_params['q'] = new_q
|
||||
search_params = search_backends[search_backend]().search_params(
|
||||
bbox, search_params)
|
||||
|
||||
return search_params
|
||||
|
||||
|
|
|
@ -0,0 +1,242 @@
|
|||
import json
|
||||
import logging
|
||||
|
||||
import shapely.geometry
|
||||
|
||||
try:
|
||||
from shapely.errors import GeometryTypeError
|
||||
except ImportError:
|
||||
# Previous version of shapely uses ValueError and TypeError
|
||||
GeometryTypeError = (ValueError, TypeError)
|
||||
|
||||
from ckantoolkit import config, asbool
|
||||
from ckanext.spatial.lib import normalize_bbox, fit_bbox, fit_linear_ring
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class SpatialSearchBackend:
    """Base class for the spatial search backends.

    Holds the GeoJSON helpers shared by the concrete backends, which
    provide the ``index_dataset`` and ``search_params`` methods.
    """

    def parse_geojson(self, geom_from_metadata):
        """Decode the raw ``spatial`` value of a dataset.

        :param geom_from_metadata: JSON-encoded geometry (normally a string)
        :returns: the decoded JSON value, or ``None`` when the input is
            missing or cannot be decoded (an error is logged in that case)
        """
        try:
            return json.loads(geom_from_metadata)
        except (AttributeError, TypeError, ValueError) as e:
            # json.loads raises TypeError for None / non-string input.
            # str() keeps the log call itself from failing on values that
            # cannot be sliced.
            log.error(
                "Geometry not valid JSON %s, not indexing :: %s",
                e,
                str(geom_from_metadata)[:100],
            )
            return None

    def shape_from_geometry(self, geometry):
        """Build a shapely shape from a decoded GeoJSON geometry.

        :param geometry: decoded GeoJSON mapping, as returned by
            ``parse_geojson``
        :returns: the shapely geometry, or ``None`` when there is nothing
            to convert or shapely rejects the geometry type (logged)
        """
        if not geometry:
            # Nothing usable was parsed (None or an empty value); handing
            # that to shapely would raise instead of skipping the dataset.
            return None

        try:
            return shapely.geometry.shape(geometry)
        except GeometryTypeError as e:
            log.error(
                "%s, not indexing :: %s", e, json.dumps(geometry)[:100]
            )
            return None
|
||||
|
||||
|
||||
class SolrBBoxSearchBackend(SpatialSearchBackend):
    """Backend that indexes and filters on the ``spatial_bbox`` field."""

    def index_dataset(self, dataset_dict):
        """
        We always index the envelope of the geometry regardless of
        if it's an actual bounding box (polygon)

        :param dataset_dict: dataset being indexed; its ``spatial`` value
            is read as a GeoJSON string
        :returns: the same dict, with ``spatial_bbox`` added when a usable
            geometry was found, otherwise unchanged
        """
        geometry = self.parse_geojson(dataset_dict.get("spatial"))
        if not geometry or not isinstance(geometry, dict):
            # Missing, undecodable or non-object value: nothing to index.
            return dataset_dict

        shape = self.shape_from_geometry(geometry)
        if shape is None or shape.is_empty:
            return dataset_dict

        bbox = normalize_bbox(list(shape.bounds))
        if not bbox:
            # normalize_bbox can hand back a falsy value for boxes it
            # cannot interpret (the search path checks for this as well).
            log.error("Could not compute a bounding box, not indexing")
            return dataset_dict
        bbox = fit_bbox(bbox)

        dataset_dict["spatial_bbox"] = (
            "ENVELOPE({minx}, {maxx}, {maxy}, {miny})".format(**bbox)
        )

        return dataset_dict

    def search_params(self, bbox, search_params):
        """Append a filter query on ``spatial_bbox`` for ``bbox``.

        :param bbox: mapping with ``minx``, ``miny``, ``maxx``, ``maxy``
        :param search_params: search parameters dict, updated in place
        :returns: ``search_params`` with the extra ``fq_list`` entry
        """
        bbox = fit_bbox(bbox)

        if not search_params.get("fq_list"):
            search_params["fq_list"] = []

        default_spatial_query = (
            "{{!field f={spatial_field}}}"
            "Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"
        )

        # Sites may override the query template through configuration.
        spatial_query = config.get(
            "ckanext.spatial.solr_query", default_spatial_query
        )

        search_params["fq_list"].append(
            spatial_query.format(spatial_field="spatial_bbox", **bbox)
        )

        return search_params
|
||||
|
||||
|
||||
class SolrSpatialFieldSearchBackend(SpatialSearchBackend):
    """Backend that indexes and filters on the ``spatial_geom`` field."""

    def index_dataset(self, dataset_dict):
        """Add the WKT representation of the dataset geometry.

        :param dataset_dict: dataset being indexed; its ``spatial`` value
            is read as a GeoJSON string
        :returns: the same dict, with ``spatial_geom`` set to a list of WKT
            strings (simple 5-point polygons) or to a single WKT string
            (any other geometry); unchanged when the geometry is unusable
        """
        geometry = self.parse_geojson(dataset_dict.get("spatial"))
        if not geometry or not isinstance(geometry, dict):
            # Missing, undecodable or non-object value: nothing to index.
            return dataset_dict

        # Validate with shapely before touching the structure, so an
        # unsupported geometry is logged and skipped instead of raising.
        shape = self.shape_from_geometry(geometry)
        if shape is None:
            return dataset_dict

        # We allow multiple geometries as GeometryCollections
        if geometry["type"] == "GeometryCollection":
            geometries = geometry["geometries"]
        else:
            geometries = [geometry]

        # Check potential problems with bboxes in each geometry.
        # NOTE(review): once one simple polygon is found, the remaining
        # members of a collection that do not match are not indexed.
        wkt = []
        for geom in geometries:
            polygon_wkt = self._simple_polygon_wkt(geom)
            if polygon_wkt is not None:
                wkt.append(polygon_wkt)

        if not wkt:
            if not shape.is_valid:
                log.error("Wrong geometry, not indexing")
                return dataset_dict
            if shape.bounds[0] < -180 or shape.bounds[2] > 180:
                log.error(
                    "\nGeometries outside the -180, -90, 180, 90 boundaries "
                    "are not supported,\nyou need to split the geometry in "
                    "order to fit the parts. Not indexing"
                )
                return dataset_dict
            wkt = shape.wkt

        dataset_dict["spatial_geom"] = wkt

        return dataset_dict

    def _simple_polygon_wkt(self, geom):
        """Return WKT for a single-ring, 5-point polygon, else ``None``."""
        if not (
            geom["type"] == "Polygon"
            and len(geom["coordinates"]) == 1
            and len(geom["coordinates"][0]) == 5
        ):
            return None

        ring = geom["coordinates"][0]

        # Check wrong bboxes (4 same points)
        xs = [point[0] for point in ring]
        ys = [point[1] for point in ring]
        if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
            return "POINT({x} {y})".format(x=xs[0], y=ys[0])

        # Check if coordinates are defined counter-clockwise,
        # otherwise we'll get wrong results from Solr
        lr = shapely.geometry.polygon.LinearRing(ring)
        lr_coords = list(lr.coords)
        if not lr.is_ccw:
            lr_coords.reverse()

        polygon = shapely.geometry.polygon.Polygon(fit_linear_ring(lr_coords))
        return polygon.wkt

    def search_params(self, bbox, search_params):
        """Append a filter query on ``spatial_geom`` for ``bbox``.

        :param bbox: mapping with ``minx``, ``miny``, ``maxx``, ``maxy``
        :param search_params: search parameters dict, updated in place
        :returns: ``search_params`` with the extra ``fq_list`` entry
        """
        bbox = fit_bbox(bbox)

        if not search_params.get("fq_list"):
            search_params["fq_list"] = []

        default_spatial_query = (
            "{{!field f={spatial_field}}}"
            "Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"
        )

        # Sites may override the query template through configuration.
        spatial_query = config.get(
            "ckanext.spatial.solr_query", default_spatial_query
        )

        search_params["fq_list"].append(
            spatial_query.format(spatial_field="spatial_geom", **bbox)
        )

        return search_params
|
||||
|
||||
|
||||
class PostgisSearchBackend(SpatialSearchBackend):
    """
    Note: The PostGIS search functionality will be removed in future versions
    """

    def index_dataset(self, dataset_dict):
        """Return ``dataset_dict`` untouched; this backend adds no fields."""
        return dataset_dict

    def search_params(self, bbox, search_params):
        """Restrict the search to the datasets PostGIS finds in ``bbox``.

        :param bbox: mapping with ``minx``, ``miny``, ``maxx``, ``maxy``;
            it is not modified
        :param search_params: search parameters dict, updated in place
        :returns: ``search_params``, with either ``abort_search`` set (no
            dataset in the bbox) or ``q`` narrowed to the matching ids
        :raises SearchError: when spatial ranking is requested together
            with ``q`` or ``fq``
        """
        from ckanext.spatial.postgis.model import bbox_query, bbox_query_ordered
        from ckan.lib.search import SearchError

        # Work on a copy so the caller's bbox is not shifted in place.
        bbox = dict(bbox)

        # Adjust easting values
        while bbox["minx"] < -180:
            bbox["minx"] += 360
            bbox["maxx"] += 360
        while bbox["minx"] > 180:
            bbox["minx"] -= 360
            bbox["maxx"] -= 360

        # Note: This will be deprecated at some point in favour of the
        # Solr 4 spatial sorting capabilities
        if search_params.get("sort") == "spatial desc" and asbool(
            config.get("ckanext.spatial.use_postgis_sorting", "False")
        ):
            if search_params.get("q") or search_params.get("fq"):
                raise SearchError(
                    "Spatial ranking cannot be mixed with other search parameters"
                )
            # ...because it is too inefficient to use SOLR to filter
            # results and return the entire set to this class and
            # after_search do the sorting and paging.
            extents = bbox_query_ordered(bbox)
            are_no_results = not extents

            # NOTE(review): the original comments say this SOLR query
            # should return no rows (only counts and facets), but the code
            # never changed search_params["rows"]; that is kept as is.
            search_params["sort"] = None  # SOLR should not sort.

            # Store the rankings of the results for this page, so for
            # after_search to construct the correctly sorted results
            extras = search_params.setdefault("extras", {})
            rows = extras["ext_rows"] = search_params["rows"]
            start = extras["ext_start"] = search_params["start"]
            extras["ext_spatial"] = [
                (extent.package_id, extent.spatial_ranking)
                for extent in extents[start:start + rows]
            ]
        else:
            extents = bbox_query(bbox)
            are_no_results = extents.count() == 0

        if are_no_results:
            # We don't need to perform the search
            search_params["abort_search"] = True
        else:
            # We'll perform the existing search but also filtering by the ids
            # of datasets within the bbox
            id_clauses = ["id:%s" % extent.package_id for extent in extents]

            q = (search_params.get("q") or "").strip()
            # Note: `"" AND` query doesn't work in github ci
            new_q = "%s AND " % q if q and q != '""' else ""
            new_q += "(%s)" % " OR ".join(id_clauses)

            search_params["q"] = new_q

        return search_params
|
||||
|
||||
|
||||
# Registry of the available spatial search backends, keyed by backend name.
# The plugin looks the class up by name and instantiates it before calling
# index_dataset() or search_params().
search_backends = {
    "solr-bbox": SolrBBoxSearchBackend,
    "solr-spatial-field": SolrSpatialFieldSearchBackend,
    "postgis": PostgisSearchBackend,
}
|
Loading…
Reference in New Issue