Refactor the search logic to move it to its own module
This commit is contained in:
parent
b89da29192
commit
15caaad9f1
|
@ -18,7 +18,6 @@ from ckan import plugins as p
|
||||||
from ckan.lib.search import SearchError
|
from ckan.lib.search import SearchError
|
||||||
|
|
||||||
from ckan.lib.helpers import json
|
from ckan.lib.helpers import json
|
||||||
from ckanext.spatial.lib import normalize_bbox, fit_bbox, fit_linear_ring
|
|
||||||
|
|
||||||
if tk.check_ckan_version(min_version="2.9.0"):
|
if tk.check_ckan_version(min_version="2.9.0"):
|
||||||
from ckanext.spatial.plugin.flask_plugin import (
|
from ckanext.spatial.plugin.flask_plugin import (
|
||||||
|
@ -29,6 +28,9 @@ else:
|
||||||
SpatialQueryMixin, HarvestMetadataApiMixin
|
SpatialQueryMixin, HarvestMetadataApiMixin
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from ckanext.spatial.lib import normalize_bbox
|
||||||
|
from ckanext.spatial.search import search_backends
|
||||||
|
|
||||||
config = tk.config
|
config = tk.config
|
||||||
|
|
||||||
log = getLogger(__name__)
|
log = getLogger(__name__)
|
||||||
|
@ -227,85 +229,17 @@ class SpatialQuery(SpatialQueryMixin, p.SingletonPlugin):
|
||||||
if not pkg_dict.get('extras_spatial'):
|
if not pkg_dict.get('extras_spatial'):
|
||||||
return pkg_dict
|
return pkg_dict
|
||||||
|
|
||||||
|
pkg_dict = search_backends[search_backend]().index_dataset(pkg_dict)
|
||||||
|
|
||||||
# Coupled resources are URL -> uuid links, they are not needed in SOLR
|
# Coupled resources are URL -> uuid links, they are not needed in SOLR
|
||||||
# and might be huge if there are lot of coupled resources
|
# and might be huge if there are lot of coupled resources
|
||||||
pkg_dict.pop('coupled-resource', None)
|
pkg_dict.pop('coupled-resource', None)
|
||||||
pkg_dict.pop('extras_coupled-resource', None)
|
pkg_dict.pop('extras_coupled-resource', None)
|
||||||
|
|
||||||
# spatial field is geojson coordinate data, not needed in SOLR either
|
# spatial field is geojson coordinate data, not needed in SOLR either
|
||||||
geom_from_metadata = pkg_dict.pop('spatial', None)
|
pkg_dict.pop('spatial', None)
|
||||||
pkg_dict.pop('extras_spatial', None)
|
pkg_dict.pop('extras_spatial', None)
|
||||||
|
|
||||||
try:
|
|
||||||
geometry = json.loads(geom_from_metadata)
|
|
||||||
except (AttributeError, ValueError) as e:
|
|
||||||
log.error('Geometry not valid JSON {}, not indexing :: {}'.format(e, geom_from_metadata[:100]))
|
|
||||||
return pkg_dict
|
|
||||||
|
|
||||||
try:
|
|
||||||
shape = shapely.geometry.shape(geometry)
|
|
||||||
except GeometryTypeError as e:
|
|
||||||
log.error('{}, not indexing :: {}'.format(e, geom_from_metadata[:100]))
|
|
||||||
return pkg_dict
|
|
||||||
|
|
||||||
if search_backend == "solr-bbox":
|
|
||||||
# We always index the envelope of the geometry regardless of
|
|
||||||
# if it's an actual bounding box (polygon)
|
|
||||||
|
|
||||||
bounds = shape.bounds
|
|
||||||
bbox = fit_bbox(normalize_bbox(list(bounds)))
|
|
||||||
|
|
||||||
pkg_dict["spatial_bbox"] = "ENVELOPE({minx}, {maxx}, {maxy}, {miny})".format(
|
|
||||||
**bbox)
|
|
||||||
|
|
||||||
elif search_backend == 'solr-spatial-field':
|
|
||||||
wkt = None
|
|
||||||
|
|
||||||
# We allow multiple geometries as GeometryCollections
|
|
||||||
if geometry['type'] == 'GeometryCollection':
|
|
||||||
geometries = geometry['geometries']
|
|
||||||
else:
|
|
||||||
geometries = [geometry]
|
|
||||||
|
|
||||||
# Check potential problems with bboxes in each geometry
|
|
||||||
wkt = []
|
|
||||||
for geom in geometries:
|
|
||||||
if geom['type'] == 'Polygon' \
|
|
||||||
and len(geom['coordinates']) == 1 \
|
|
||||||
and len(geom['coordinates'][0]) == 5:
|
|
||||||
|
|
||||||
# Check wrong bboxes (4 same points)
|
|
||||||
xs = [p[0] for p in geom['coordinates'][0]]
|
|
||||||
ys = [p[1] for p in geom['coordinates'][0]]
|
|
||||||
|
|
||||||
if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
|
|
||||||
wkt.append('POINT({x} {y})'.format(x=xs[0], y=ys[0]))
|
|
||||||
else:
|
|
||||||
# Check if coordinates are defined counter-clockwise,
|
|
||||||
# otherwise we'll get wrong results from Solr
|
|
||||||
lr = shapely.geometry.polygon.LinearRing(geom['coordinates'][0])
|
|
||||||
lr_coords = (
|
|
||||||
list(lr.coords) if lr.is_ccw
|
|
||||||
else list(reversed(list(lr.coords)))
|
|
||||||
)
|
|
||||||
polygon = shapely.geometry.polygon.Polygon(
|
|
||||||
fit_linear_ring(lr_coords))
|
|
||||||
wkt.append(polygon.wkt)
|
|
||||||
|
|
||||||
if not wkt:
|
|
||||||
shape = shapely.geometry.shape(geometry)
|
|
||||||
if not shape.is_valid:
|
|
||||||
log.error('Wrong geometry, not indexing')
|
|
||||||
return pkg_dict
|
|
||||||
if shape.bounds[0] < -180 or shape.bounds[2] > 180:
|
|
||||||
log.error("""
|
|
||||||
Geometries outside the -180, -90, 180, 90 boundaries are not supported,
|
|
||||||
you need to split the geometry in order to fit the parts. Not indexing""")
|
|
||||||
return pkg_dict
|
|
||||||
wkt = shape.wkt
|
|
||||||
|
|
||||||
pkg_dict['spatial_geom'] = wkt
|
|
||||||
|
|
||||||
return pkg_dict
|
return pkg_dict
|
||||||
|
|
||||||
def before_dataset_search(self, search_params):
|
def before_dataset_search(self, search_params):
|
||||||
|
@ -316,93 +250,12 @@ you need to split the geometry in order to fit the parts. Not indexing""")
|
||||||
|
|
||||||
if input_bbox:
|
if input_bbox:
|
||||||
bbox = normalize_bbox(input_bbox)
|
bbox = normalize_bbox(input_bbox)
|
||||||
|
|
||||||
if not bbox:
|
if not bbox:
|
||||||
raise SearchError('Wrong bounding box provided')
|
raise SearchError('Wrong bounding box provided')
|
||||||
|
|
||||||
if search_backend in ("solr-bbox", "solr-spatial-field"):
|
search_params = search_backends[search_backend]().search_params(
|
||||||
|
bbox, search_params)
|
||||||
bbox = fit_bbox(bbox)
|
|
||||||
|
|
||||||
if not search_params.get("fq_list"):
|
|
||||||
search_params["fq_list"] = []
|
|
||||||
|
|
||||||
spatial_field = (
|
|
||||||
"spatial_bbox" if search_backend == "solr-bbox" else "spatial_geom"
|
|
||||||
)
|
|
||||||
|
|
||||||
default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"
|
|
||||||
|
|
||||||
spatial_query = config.get(
|
|
||||||
"ckanext.spatial.solr_query", default_spatial_query)
|
|
||||||
|
|
||||||
search_params["fq_list"].append(
|
|
||||||
spatial_query.format(
|
|
||||||
spatial_field=spatial_field, **bbox)
|
|
||||||
)
|
|
||||||
|
|
||||||
elif search_backend == 'postgis':
|
|
||||||
search_params = self._params_for_postgis_search(bbox, search_params)
|
|
||||||
|
|
||||||
return search_params
|
|
||||||
|
|
||||||
def _params_for_postgis_search(self, bbox, search_params):
|
|
||||||
"""
|
|
||||||
Note: The PostGIS search functionality will be removed in future versions
|
|
||||||
"""
|
|
||||||
from ckanext.spatial.postgis.model import bbox_query, bbox_query_ordered
|
|
||||||
from ckan.lib.search import SearchError
|
|
||||||
|
|
||||||
# Adjust easting values
|
|
||||||
while (bbox['minx'] < -180):
|
|
||||||
bbox['minx'] += 360
|
|
||||||
bbox['maxx'] += 360
|
|
||||||
while (bbox['minx'] > 180):
|
|
||||||
bbox['minx'] -= 360
|
|
||||||
bbox['maxx'] -= 360
|
|
||||||
|
|
||||||
# Note: This will be deprecated at some point in favour of the
|
|
||||||
# Solr 4 spatial sorting capabilities
|
|
||||||
if search_params.get('sort') == 'spatial desc' and \
|
|
||||||
tk.asbool(config.get('ckanext.spatial.use_postgis_sorting', 'False')):
|
|
||||||
if search_params['q'] or search_params['fq']:
|
|
||||||
raise SearchError('Spatial ranking cannot be mixed with other search parameters')
|
|
||||||
# ...because it is too inefficient to use SOLR to filter
|
|
||||||
# results and return the entire set to this class and
|
|
||||||
# after_search do the sorting and paging.
|
|
||||||
extents = bbox_query_ordered(bbox)
|
|
||||||
are_no_results = not extents
|
|
||||||
search_params['extras']['ext_rows'] = search_params['rows']
|
|
||||||
search_params['extras']['ext_start'] = search_params['start']
|
|
||||||
# this SOLR query needs to return no actual results since
|
|
||||||
# they are in the wrong order anyway. We just need this SOLR
|
|
||||||
# query to get the count and facet counts.
|
|
||||||
rows = 0
|
|
||||||
search_params['sort'] = None # SOLR should not sort.
|
|
||||||
# Store the rankings of the results for this page, so for
|
|
||||||
# after_search to construct the correctly sorted results
|
|
||||||
rows = search_params['extras']['ext_rows'] = search_params['rows']
|
|
||||||
start = search_params['extras']['ext_start'] = search_params['start']
|
|
||||||
search_params['extras']['ext_spatial'] = [
|
|
||||||
(extent.package_id, extent.spatial_ranking) \
|
|
||||||
for extent in extents[start:start+rows]]
|
|
||||||
else:
|
|
||||||
extents = bbox_query(bbox)
|
|
||||||
are_no_results = extents.count() == 0
|
|
||||||
|
|
||||||
if are_no_results:
|
|
||||||
# We don't need to perform the search
|
|
||||||
search_params['abort_search'] = True
|
|
||||||
else:
|
|
||||||
# We'll perform the existing search but also filtering by the ids
|
|
||||||
# of datasets within the bbox
|
|
||||||
bbox_query_ids = [extent.package_id for extent in extents]
|
|
||||||
|
|
||||||
q = search_params.get('q','').strip() or '""'
|
|
||||||
# Note: `"" AND` query doesn't work in github ci
|
|
||||||
new_q = '%s AND ' % q if q and q != '""' else ''
|
|
||||||
new_q += '(%s)' % ' OR '.join(['id:%s' % id for id in bbox_query_ids])
|
|
||||||
|
|
||||||
search_params['q'] = new_q
|
|
||||||
|
|
||||||
return search_params
|
return search_params
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,242 @@
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import shapely.geometry
|
||||||
|
|
||||||
|
try:
|
||||||
|
from shapely.errors import GeometryTypeError
|
||||||
|
except ImportError:
|
||||||
|
# Previous version of shapely uses ValueError and TypeError
|
||||||
|
GeometryTypeError = (ValueError, TypeError)
|
||||||
|
|
||||||
|
from ckantoolkit import config, asbool
|
||||||
|
from ckanext.spatial.lib import normalize_bbox, fit_bbox, fit_linear_ring
|
||||||
|
|
||||||
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SpatialSearchBackend:
    """Base class for the spatial search backends.

    Holds the GeoJSON helpers shared by the concrete backends. Both helpers
    log an error and return ``None`` instead of raising, so that a broken
    geometry never aborts the indexing of a dataset.
    """

    def parse_geojson(self, geom_from_metadata):
        """Decode a serialized GeoJSON geometry.

        :param geom_from_metadata: the GeoJSON geometry as a JSON string
        :returns: the decoded object, or ``None`` if the value is missing or
            is not valid JSON (the problem is logged)
        """
        try:
            return json.loads(geom_from_metadata)
        except (AttributeError, TypeError, ValueError) as e:
            # json.loads() raises TypeError for None and other non-string
            # values, and such values cannot be sliced for the log message
            if isinstance(geom_from_metadata, (str, bytes, bytearray)):
                snippet = geom_from_metadata[:100]
            else:
                snippet = geom_from_metadata
            log.error(
                "Geometry not valid JSON {}, not indexing :: {}".format(e, snippet)
            )
            return None

    def shape_from_geometry(self, geometry):
        """Build a shapely shape from a decoded GeoJSON geometry.

        :param geometry: the decoded GeoJSON object
        :returns: the shapely shape, or ``None`` if no geometry was provided
            or shapely rejects it (the problem is logged)
        """
        if not geometry:
            # Nothing to convert, e.g. parse_geojson() already failed
            return None

        try:
            return shapely.geometry.shape(geometry)
        except GeometryTypeError as e:
            log.error("{}, not indexing :: {}".format(e, json.dumps(geometry)[:100]))
            return None
|
||||||
|
|
||||||
|
|
||||||
|
class SolrBBoxSearchBackend(SpatialSearchBackend):
    """Spatial search backend built around the ``spatial_bbox`` Solr field."""

    def index_dataset(self, dataset_dict):
        """
        Add the ``spatial_bbox`` field to the dataset being indexed.

        We always index the envelope of the geometry regardless of
        if it's an actual bounding box (polygon)

        :param dataset_dict: the dataset dict, its ``spatial`` key holding
            the geometry as a GeoJSON string
        :returns: the same dict, left without ``spatial_bbox`` when the
            geometry is missing or can not be processed
        """
        geometry = self.parse_geojson(dataset_dict.get("spatial"))
        if not geometry:
            # Missing or invalid JSON, already logged by parse_geojson()
            return dataset_dict

        shape = self.shape_from_geometry(geometry)
        if not shape:
            return dataset_dict

        bbox = normalize_bbox(list(shape.bounds))
        if not bbox:
            # normalize_bbox() returns a falsy value for bounds it rejects
            log.error("Wrong bounding box, not indexing")
            return dataset_dict
        bbox = fit_bbox(bbox)

        dataset_dict["spatial_bbox"] = (
            "ENVELOPE({minx}, {maxx}, {maxy}, {miny})".format(**bbox)
        )

        return dataset_dict

    def search_params(self, bbox, search_params):
        """
        Append a filter query on ``spatial_bbox`` to ``search_params``.

        The query template is read from the ``ckanext.spatial.solr_query``
        config option, falling back to an ``Intersects(ENVELOPE(...))`` query.

        :param bbox: mapping with the ``minx``, ``maxx``, ``miny`` and
            ``maxy`` keys of the requested bounding box
        :param search_params: the search parameters, modified in place
        :returns: the updated ``search_params``
        """
        bbox = fit_bbox(bbox)

        if not search_params.get("fq_list"):
            search_params["fq_list"] = []

        default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"

        spatial_query = config.get("ckanext.spatial.solr_query", default_spatial_query)

        search_params["fq_list"].append(
            spatial_query.format(spatial_field="spatial_bbox", **bbox)
        )

        return search_params
|
||||||
|
|
||||||
|
|
||||||
|
class SolrSpatialFieldSearchBackend(SpatialSearchBackend):
    """Spatial search backend built around the ``spatial_geom`` Solr field."""

    def index_dataset(self, dataset_dict):
        """
        Add the ``spatial_geom`` field (WKT) to the dataset being indexed.

        Single-ring, five-point polygons are special-cased: degenerate ones
        (all points equal) are indexed as a ``POINT``, the others are
        rewritten counter-clockwise. When no geometry matched that case the
        WKT of the whole geometry is indexed instead.

        :param dataset_dict: the dataset dict, its ``spatial`` key holding
            the geometry as a GeoJSON string
        :returns: the same dict, left without ``spatial_geom`` when the
            geometry is missing or can not be processed
        """
        geometry = self.parse_geojson(dataset_dict.get("spatial"))
        if not geometry:
            return dataset_dict

        # Validate the geometry once up front; shape_from_geometry() logs
        # and returns None for geometries shapely does not understand
        shape = self.shape_from_geometry(geometry)
        if shape is None:
            return dataset_dict

        # We allow multiple geometries as GeometryCollections
        if geometry["type"] == "GeometryCollection":
            geometries = geometry["geometries"]
        else:
            geometries = [geometry]

        # Check potential problems with bboxes in each geometry
        wkt = []
        for geom in geometries:
            if (
                geom["type"] == "Polygon"
                and len(geom["coordinates"]) == 1
                and len(geom["coordinates"][0]) == 5
            ):
                ring = geom["coordinates"][0]

                # Check wrong bboxes (4 same points)
                xs = [point[0] for point in ring]
                ys = [point[1] for point in ring]

                if xs.count(xs[0]) == 5 and ys.count(ys[0]) == 5:
                    wkt.append("POINT({x} {y})".format(x=xs[0], y=ys[0]))
                else:
                    # Check if coordinates are defined counter-clockwise,
                    # otherwise we'll get wrong results from Solr
                    lr = shapely.geometry.polygon.LinearRing(ring)
                    lr_coords = list(lr.coords)
                    if not lr.is_ccw:
                        lr_coords.reverse()
                    polygon = shapely.geometry.polygon.Polygon(
                        fit_linear_ring(lr_coords)
                    )
                    wkt.append(polygon.wkt)

        if not wkt:
            # No bbox-like polygon found: index the geometry as a whole
            if not shape.is_valid:
                log.error("Wrong geometry, not indexing")
                return dataset_dict
            if shape.bounds[0] < -180 or shape.bounds[2] > 180:
                log.error(
                    "\nGeometries outside the -180, -90, 180, 90 boundaries are not supported,\n"
                    "you need to split the geometry in order to fit the parts. Not indexing"
                )
                return dataset_dict
            wkt = shape.wkt

        dataset_dict["spatial_geom"] = wkt

        return dataset_dict

    def search_params(self, bbox, search_params):
        """
        Append a filter query on ``spatial_geom`` to ``search_params``.

        The query template is read from the ``ckanext.spatial.solr_query``
        config option, falling back to an ``Intersects(ENVELOPE(...))`` query.

        :param bbox: mapping with the ``minx``, ``maxx``, ``miny`` and
            ``maxy`` keys of the requested bounding box
        :param search_params: the search parameters, modified in place
        :returns: the updated ``search_params``
        """
        bbox = fit_bbox(bbox)

        if not search_params.get("fq_list"):
            search_params["fq_list"] = []

        default_spatial_query = "{{!field f={spatial_field}}}Intersects(ENVELOPE({minx}, {maxx}, {maxy}, {miny}))"

        spatial_query = config.get("ckanext.spatial.solr_query", default_spatial_query)

        search_params["fq_list"].append(
            spatial_query.format(spatial_field="spatial_geom", **bbox)
        )

        return search_params
|
||||||
|
|
||||||
|
|
||||||
|
class PostgisSearchBackend(SpatialSearchBackend):
    """
    Spatial search backend that resolves the bounding box through the
    ``ckanext.spatial.postgis.model`` query helpers.

    Note: The PostGIS search functionality will be removed in future versions
    """

    def index_dataset(self, dataset_dict):
        """Return ``dataset_dict`` unchanged: no index field is added here."""
        return dataset_dict

    def search_params(self, bbox, search_params):
        """
        Restrict the search to the datasets returned by the bbox query.

        :param bbox: mapping with at least the ``minx`` and ``maxx`` keys;
            the easting values are adjusted in place
        :param search_params: the search parameters, modified in place
        :returns: the updated ``search_params``; ``abort_search`` is set
            when the bbox query returns nothing
        :raises SearchError: if spatial ranking is requested together with
            a ``q`` or ``fq`` parameter
        """
        from ckanext.spatial.postgis.model import bbox_query, bbox_query_ordered
        from ckan.lib.search import SearchError

        # Adjust easting values
        while bbox["minx"] < -180:
            bbox["minx"] += 360
            bbox["maxx"] += 360
        while bbox["minx"] > 180:
            bbox["minx"] -= 360
            bbox["maxx"] -= 360

        # Note: This will be deprecated at some point in favour of the
        # Solr 4 spatial sorting capabilities
        if search_params.get("sort") == "spatial desc" and asbool(
            config.get("ckanext.spatial.use_postgis_sorting", "False")
        ):
            if search_params.get("q") or search_params.get("fq"):
                raise SearchError(
                    "Spatial ranking cannot be mixed with other search parameters"
                )
                # ...because it is too inefficient to use SOLR to filter
                # results and return the entire set to this class and
                # after_search do the sorting and paging.
            extents = bbox_query_ordered(bbox)
            are_no_results = not extents

            # NOTE(review): the original comments say the SOLR query should
            # return no actual results (only counts and facets), but the
            # code only ever set an unused local ``rows = 0`` and never
            # changed search_params["rows"] -- confirm the intended behaviour
            search_params["sort"] = None  # SOLR should not sort.

            # Store the rankings of the results for this page, so for
            # after_search to construct the correctly sorted results
            extras = search_params.setdefault("extras", {})
            rows = extras["ext_rows"] = search_params["rows"]
            start = extras["ext_start"] = search_params["start"]
            extras["ext_spatial"] = [
                (extent.package_id, extent.spatial_ranking)
                for extent in extents[start:start + rows]
            ]
        else:
            extents = bbox_query(bbox)
            are_no_results = extents.count() == 0

        if are_no_results:
            # We don't need to perform the search
            search_params["abort_search"] = True
        else:
            # We'll perform the existing search but also filtering by the ids
            # of datasets within the bbox
            id_clauses = ["id:%s" % extent.package_id for extent in extents]

            # ``q`` may be missing or None
            q = (search_params.get("q") or "").strip()
            # Note: `"" AND` query doesn't work in github ci
            new_q = "%s AND " % q if q and q != '""' else ""
            new_q += "(%s)" % " OR ".join(id_clauses)

            search_params["q"] = new_q

        return search_params
|
||||||
|
|
||||||
|
|
||||||
|
# Registry of the available spatial search backends, keyed by backend name.
# The plugin looks a class up by name and instantiates it on each call
# (``search_backends[search_backend]()``).
search_backends = dict(
    (
        ("solr-bbox", SolrBBoxSearchBackend),
        ("solr-spatial-field", SolrSpatialFieldSearchBackend),
        ("postgis", PostgisSearchBackend),
    )
)
|
Loading…
Reference in New Issue