From cc60327d0b67d9032d325e1bcea35154e467fc36 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 19 Feb 2013 17:02:28 +0000 Subject: [PATCH] [#10] Improve harvested metadata API Some improvements on the endpoints that return the contents of the harvest objects: * Nicer URLs with redirects from the old ones * Returning the raw harvest object content is available on the main harvest extension, so just redirect there * Support for showing the original document of a harvest object, if present * Support for defining a custom XSLT for the HTML view, via ckanext.spatial.harvest.xslt_html_content ckanext.spatial.harvest.xslt_html_content_original --- README.rst | 24 +++++++- ckanext/spatial/controllers/api.py | 99 ++++++++++++++++++++++++------ ckanext/spatial/plugin.py | 22 +++++-- ckanext/spatial/tests/test_api.py | 90 +++++++++++++++++++++++++++ 4 files changed, 210 insertions(+), 25 deletions(-) diff --git a/README.rst b/README.rst index 95215d0..f532576 100644 --- a/README.rst +++ b/README.rst @@ -212,9 +212,29 @@ Enabled with the ``ckan.plugins = spatial_harvest_metadata_api`` (previous known To view the harvest objects (containing the harvested metadata) in the web interface, these controller locations are added: -/api/2/rest/harvestobject/<id>/xml +* raw XML document: /harvest/object/{id} +* HTML representation: /harvest/object/{id}/html -/api/2/rest/harvestobject/<id>/html +.. note:: + The old URLs are now deprecated and redirect to the ones listed above. + + /api/2/rest/harvestobject/<id>/xml + /api/2/rest/harvestobject/<id>/html + + +For those harvest objects that have an original document (which was transformed to ISO), this can be accessed via: + +* raw XML document: /harvest/object/{id}/original +* HTML representation: /harvest/object/{id}/html/original + +The HTML representation is created via an XSLT transformation.
The extension provides an XSLT file that should work +on ISO 19139 based documents, but if you want to use your own on your extension, you can override it using +the following configuration options:: + + ckanext.spatial.harvest.xslt_html_content = ckanext.myext:templates/xslt/custom.xslt + ckanext.spatial.harvest.xslt_html_content_original = ckanext.myext:templates/xslt/custom2.xslt + +If your project does not transform different metadata types you can ignore the second option. CSW Client diff --git a/ckanext/spatial/controllers/api.py b/ckanext/spatial/controllers/api.py index 5b7973d..3a66caa 100644 --- a/ckanext/spatial/controllers/api.py +++ b/ckanext/spatial/controllers/api.py @@ -1,3 +1,5 @@ +import logging + try: from cStringIO import StringIO except ImportError: from StringIO import StringIO @@ -10,9 +12,10 @@ from ckan.lib.base import request, config, abort from ckan.controllers.api import ApiController as BaseApiController from ckan.model import Session -from ckanext.harvest.model import HarvestObject +from ckanext.harvest.model import HarvestObject, HarvestObjectExtra from ckanext.spatial.lib import get_srid, validate_bbox, bbox_query +log = logging.getLogger(__name__) class ApiController(BaseApiController): @@ -46,33 +49,91 @@ class ApiController(BaseApiController): class HarvestMetadataApiController(BaseApiController): - def _get_harvest_object(self,id): + def _get_content(self, id): obj = Session.query(HarvestObject) \ .filter(HarvestObject.id==id).first() - return obj + if obj: + return obj.content + else: + return None - def display_xml(self,id): - obj = self._get_harvest_object(id) + def _get_original_content(self, id): + extra = Session.query(HarvestObjectExtra).join(HarvestObject) \ + .filter(HarvestObject.id==id) \ + .filter(HarvestObjectExtra.key=='original_document').first() + if extra: + return extra.value + else: + return None - if obj is None: - abort(404) - response.content_type = "application/xml" - response.headers["Content-Length"] = 
len(obj.content) - return obj.content + def _transform_to_html(self, content, xslt_package=None, xslt_path=None): - def display_html(self,id): - obj = self._get_harvest_object(id) + xslt_package = xslt_package or 'ckanext.spatial' + xslt_path = xslt_path or 'templates/ckanext/spatial/gemini2-html-stylesheet.xsl' - if obj is None: - abort(404) ## optimise -- read transform only once and compile rather ## than at each request - with resource_stream("ckanext.spatial", - "templates/ckanext/spatial/gemini2-html-stylesheet.xsl") as style: + with resource_stream(xslt_package, xslt_path) as style: style_xml = etree.parse(style) transformer = etree.XSLT(style_xml) - xml = etree.parse(StringIO(obj.content.encode("utf-8"))) - html = transformer(xml) - return etree.tostring(html, pretty_print=True) + xml = etree.parse(StringIO(content.encode('utf-8'))) + html = transformer(xml) + + response.headers['Content-Type'] = 'text/html; charset=utf-8' + response.headers['Content-Length'] = len(content) + + result = etree.tostring(html, pretty_print=True) + + return result + + def _get_xslt(self, original=False): + + if original: + config_option = 'ckanext.spatial.harvest.xslt_html_content_original' + else: + config_option = 'ckanext.spatial.harvest.xslt_html_content' + + xslt_package = None + xslt_path = None + xslt = config.get(config_option, None) + if xslt: + if ':' in xslt: + xslt = xslt.split(':') + xslt_package = xslt[0] + xslt_path = xslt[1] + else: + log.error('XSLT should be defined in the form :' + + ', eg ckanext.myext:templates/my.xslt') + + return xslt_package, xslt_path + + def display_xml_original(self, id): + content = self._get_original_content(id) + + if not content: + abort(404) + + response.headers['Content-Type'] = 'application/xml; charset=utf-8' + response.headers['Content-Length'] = len(content) + + return content.encode('utf-8') + + def display_html(self,id): + content = self._get_content(id) + + if not content: + abort(404) + + xslt_package, xslt_path = 
self._get_xslt() + return self._transform_to_html(content, xslt_package, xslt_path) + + def display_html_original(self, id): + content = self._get_original_content(id) + + if content is None: + abort(404) + + xslt_package, xslt_path = self._get_xslt(original=True) + return self._transform_to_html(content, xslt_package, xslt_path) diff --git a/ckanext/spatial/plugin.py b/ckanext/spatial/plugin.py index aa870f8..3a1585f 100644 --- a/ckanext/spatial/plugin.py +++ b/ckanext/spatial/plugin.py @@ -267,10 +267,24 @@ class HarvestMetadataApi(p.SingletonPlugin): def before_map(self, route_map): controller = "ckanext.spatial.controllers.api:HarvestMetadataApiController" - route_map.connect("/api/2/rest/harvestobject/:id/xml", controller=controller, - action="display_xml") - route_map.connect("/api/2/rest/harvestobject/:id/html", controller=controller, - action="display_html") + # Showing the harvest object content is an action of the default + # harvest plugin, so just redirect there + route_map.redirect('/api/2/rest/harvestobject/{id:.*}/xml', + '/harvest/object/{id}', + _redirect_code='301 Moved Permanently') + + route_map.connect('/harvest/object/{id}/original', controller=controller, + action='display_xml_original') + + route_map.connect('/harvest/object/{id}/html', controller=controller, + action='display_html') + route_map.connect('/harvest/object/{id}/html/original', controller=controller, + action='display_html_original') + + # Redirect old URL to a nicer and unversioned one + route_map.redirect('/api/2/rest/harvestobject/:id/html', + '/harvest/object/{id}/html', + _redirect_code='301 Moved Permanently') return route_map diff --git a/ckanext/spatial/tests/test_api.py b/ckanext/spatial/tests/test_api.py index 791e93f..b21e72b 100644 --- a/ckanext/spatial/tests/test_api.py +++ b/ckanext/spatial/tests/test_api.py @@ -1,6 +1,7 @@ import logging import json from pprint import pprint +from nose.plugins.skip import SkipTest from nose.tools import assert_equal, assert_raises 
from ckan.logic.action.create import package_create from ckan.logic.action.delete import package_delete @@ -125,3 +126,92 @@ class TestActionPackageSearch(SpatialTestBase,WsgiAppCase): assert_equal(result['count'], 1) assert_equal(result['results'][0]['name'], 'test-spatial-dataset-search-point-2') + +class TestHarvestedMetadataAPI(WsgiAppCase): + + + @classmethod + def setup_class(cls): + try: + from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource, HarvestObjectExtra + except ImportError: + raise SkipTest('The harvester extension is needed for these tests') + + cls.content1 = 'Content 1' + ho1 = HarvestObject(guid='test-ho-1', + job=HarvestJob(source=HarvestSource(url='http://', type='xx')), + content=cls.content1) + + cls.content2 = 'Content 2' + cls.original_content2 = 'Original Content 2' + ho2 = HarvestObject(guid='test-ho-2', + job=HarvestJob(source=HarvestSource(url='http://', type='xx')), + content=cls.content2) + + hoe = HarvestObjectExtra(key='original_document', + value=cls.original_content2, + object=ho2) + + Session.add(ho1) + Session.add(ho2) + Session.add(hoe) + Session.commit() + + cls.object_id_1 = ho1.id + cls.object_id_2 = ho2.id + + + def test_api(self): + + # Test redirects for old URLs + url = '/api/2/rest/harvestobject/{0}/xml'.format(self.object_id_1) + r = self.app.get(url) + assert r.status == 301 + assert '/harvest/object/{0}'.format(self.object_id_1) in r.header_dict['Location'] + + url = '/api/2/rest/harvestobject/{0}/html'.format(self.object_id_1) + r = self.app.get(url) + assert r.status == 301 + assert '/harvest/object/{0}/html'.format(self.object_id_1) in r.header_dict['Location'] + + + # Access object content + url = '/harvest/object/{0}'.format(self.object_id_1) + r = self.app.get(url) + assert r.status == 200 + assert r.header_dict['Content-Type'] == 'application/xml; charset=utf-8' + assert r.body == self.content1 + + # Access original content in object extra (if present) + url = 
'/harvest/object/{0}/original'.format(self.object_id_1) + r = self.app.get(url, status=404) + assert r.status == 404 + + url = '/harvest/object/{0}/original'.format(self.object_id_2) + r = self.app.get(url) + assert r.status == 200 + assert r.header_dict['Content-Type'] == 'application/xml; charset=utf-8' + assert r.body == self.original_content2 + + # Access HTML transformation + url = '/harvest/object/{0}/html'.format(self.object_id_1) + r = self.app.get(url) + assert r.status == 200 + assert r.header_dict['Content-Type'] == 'text/html; charset=utf-8' + assert 'GEMINI record about' in r.body + + url = '/harvest/object/{0}/html/original'.format(self.object_id_1) + r = self.app.get(url, status=404) + assert r.status == 404 + + url = '/harvest/object/{0}/html'.format(self.object_id_2) + r = self.app.get(url) + assert r.status == 200 + assert r.header_dict['Content-Type'] == 'text/html; charset=utf-8' + assert 'GEMINI record about' in r.body + + url = '/harvest/object/{0}/html/original'.format(self.object_id_2) + r = self.app.get(url) + assert r.status == 200 + assert r.header_dict['Content-Type'] == 'text/html; charset=utf-8' + assert 'GEMINI record about' in r.body