[#10] Improve harvested metadata API

Some improvements on the endpoints that return the contents of the
harvest objects:

* Nicer URLs with redirects to the old ones
* Returning the raw harvest object content is available on the main
 harvest extension, so just redirect there
* Support for showing the original document of a harvest object, if
 present
* Support for defining a custom XSLT for the HTML view, via

ckanext.spatial.harvest.xslt_html_content
ckanext.spatial.harvest.xslt_html_content_original
This commit is contained in:
amercader 2013-02-19 17:02:28 +00:00
parent 8647f90cb6
commit cc60327d0b
4 changed files with 210 additions and 25 deletions

View File

@ -212,9 +212,29 @@ Enabled with the ``ckan.plugins = spatial_harvest_metadata_api`` (previous known
To view the harvest objects (containing the harvested metadata) in the web interface, these controller locations are added:
/api/2/rest/harvestobject/<id>/xml
* raw XML document: /harvest/object/{id}
* HTML representation: /harvest/object/{id}/html
/api/2/rest/harvestobject/<id>/html
.. note::
The following old URLs are now deprecated and redirect to the ones defined above:
/api/2/rest/harvestobject/<id>/xml
/api/2/rest/harvestobject/<id>/html
For those harvest objects that have an original document (which was transformed to ISO), this can be accessed via:
* raw XML document: /harvest/object/{id}/original
* HTML representation: /harvest/object/{id}/html/original
The HTML representation is created via an XSLT transformation. The extension provides an XSLT file that should work
on ISO 19139 based documents, but if you want to use your own in your extension, you can override it using
the following configuration options::
ckanext.spatial.harvest.xslt_html_content = ckanext.myext:templates/xslt/custom.xslt
ckanext.spatial.harvest.xslt_html_content_original = ckanext.myext:templates/xslt/custom2.xslt
If your project does not transform different metadata types you can ignore the second option.
CSW Client

View File

@ -1,3 +1,5 @@
import logging
try: from cStringIO import StringIO
except ImportError: from StringIO import StringIO
@ -10,9 +12,10 @@ from ckan.lib.base import request, config, abort
from ckan.controllers.api import ApiController as BaseApiController
from ckan.model import Session
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObject, HarvestObjectExtra
from ckanext.spatial.lib import get_srid, validate_bbox, bbox_query
log = logging.getLogger(__name__)
class ApiController(BaseApiController):
@ -46,33 +49,91 @@ class ApiController(BaseApiController):
class HarvestMetadataApiController(BaseApiController):
    """Expose harvested metadata documents as raw XML or as HTML.

    The HTML views are produced by running the stored XML through an XSLT
    stylesheet. The stylesheet can be overridden through the
    ``ckanext.spatial.harvest.xslt_html_content`` and
    ``ckanext.spatial.harvest.xslt_html_content_original`` config options
    (see ``_get_xslt``).
    """

    def _get_content(self, id):
        """Return the ``content`` of the harvest object with the given id.

        Returns None when no harvest object has that id.
        """
        obj = Session.query(HarvestObject) \
            .filter(HarvestObject.id == id).first()
        if obj:
            return obj.content
        return None

    def _get_original_content(self, id):
        """Return the ``original_document`` extra of the given harvest object.

        Returns None when the harvest object has no such extra.
        """
        extra = Session.query(HarvestObjectExtra).join(HarvestObject) \
            .filter(HarvestObject.id == id) \
            .filter(HarvestObjectExtra.key == 'original_document').first()
        if extra:
            return extra.value
        return None

    def _transform_to_html(self, content, xslt_package=None, xslt_path=None):
        """Render an XML document to HTML with an XSLT stylesheet.

        :param content: the XML document, as a text string
        :param xslt_package: package holding the stylesheet; defaults to
            ``ckanext.spatial``
        :param xslt_path: path of the stylesheet inside that package;
            defaults to the bundled GEMINI2 stylesheet
        :returns: the serialized HTML; the response ``Content-Type`` and
            ``Content-Length`` headers are set as a side effect
        """
        xslt_package = xslt_package or 'ckanext.spatial'
        xslt_path = xslt_path or 'templates/ckanext/spatial/gemini2-html-stylesheet.xsl'

        ## optimise -- read transform only once and compile rather
        ## than at each request
        with resource_stream(xslt_package, xslt_path) as style:
            style_xml = etree.parse(style)
            transformer = etree.XSLT(style_xml)

        xml = etree.parse(StringIO(content.encode('utf-8')))
        html = transformer(xml)

        result = etree.tostring(html, pretty_print=True)

        response.headers['Content-Type'] = 'text/html; charset=utf-8'
        # The length must describe the body actually sent (the generated
        # HTML), not the XML document it was generated from.
        response.headers['Content-Length'] = len(result)

        return result

    def _get_xslt(self, original=False):
        """Read the custom XSLT location from the configuration.

        :param original: if True, read the option used for original
            documents instead of the one used for the harvested content
        :returns: a ``(package, path)`` tuple; both items are None when the
            option is not set or is not in the ``<package>:<path>`` form
        """
        if original:
            config_option = 'ckanext.spatial.harvest.xslt_html_content_original'
        else:
            config_option = 'ckanext.spatial.harvest.xslt_html_content'

        xslt_package = None
        xslt_path = None
        xslt = config.get(config_option, None)
        if xslt:
            if ':' in xslt:
                # Split on the first colon only so that the path part is
                # kept whole even if it contains a colon itself.
                xslt_package, xslt_path = xslt.split(':', 1)
            else:
                log.error('XSLT should be defined in the form <package>:<path>' +
                          ', eg ckanext.myext:templates/my.xslt')

        return xslt_package, xslt_path

    def display_xml_original(self, id):
        """Return the original document of a harvest object as XML.

        Aborts with a 404 if the harvest object has no original document.
        """
        content = self._get_original_content(id)

        if not content:
            abort(404)

        # Encode first: Content-Length counts bytes, which differs from the
        # number of characters as soon as the document is not pure ASCII.
        body = content.encode('utf-8')

        response.headers['Content-Type'] = 'application/xml; charset=utf-8'
        response.headers['Content-Length'] = len(body)
        return body

    def display_html(self, id):
        """Return the HTML representation of a harvest object's content.

        Aborts with a 404 if the harvest object does not exist or is empty.
        """
        content = self._get_content(id)

        if not content:
            abort(404)

        xslt_package, xslt_path = self._get_xslt()
        return self._transform_to_html(content, xslt_package, xslt_path)

    def display_html_original(self, id):
        """Return the HTML representation of the original document.

        Aborts with a 404 if the harvest object has no original document.
        """
        content = self._get_original_content(id)

        # Same check as the sibling actions: an empty document is answered
        # with a 404 rather than being handed to the XML parser.
        if not content:
            abort(404)

        xslt_package, xslt_path = self._get_xslt(original=True)
        return self._transform_to_html(content, xslt_package, xslt_path)

View File

@ -267,10 +267,24 @@ class HarvestMetadataApi(p.SingletonPlugin):
def before_map(self, route_map):
    """Register the harvested metadata routes and the legacy redirects.

    The old ``/api/2/rest/harvestobject/...`` URLs are only registered as
    permanent redirects. They must not also be connected to the controller,
    otherwise the earlier ``connect`` entry would take precedence and the
    redirect would never be reached.

    :param route_map: the routes mapper to extend
    :returns: the same mapper, with the new routes added
    """
    controller = "ckanext.spatial.controllers.api:HarvestMetadataApiController"

    # Showing the harvest object content is an action of the default
    # harvest plugin, so just redirect there
    route_map.redirect('/api/2/rest/harvestobject/{id:.*}/xml',
                       '/harvest/object/{id}',
                       _redirect_code='301 Moved Permanently')

    route_map.connect('/harvest/object/{id}/original', controller=controller,
                      action='display_xml_original')

    route_map.connect('/harvest/object/{id}/html', controller=controller,
                      action='display_html')
    route_map.connect('/harvest/object/{id}/html/original', controller=controller,
                      action='display_html_original')

    # Redirect old URL to a nicer and unversioned one
    route_map.redirect('/api/2/rest/harvestobject/:id/html',
                       '/harvest/object/{id}/html',
                       _redirect_code='301 Moved Permanently')

    return route_map

View File

@ -1,6 +1,7 @@
import logging
import json
from pprint import pprint
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal, assert_raises
from ckan.logic.action.create import package_create
from ckan.logic.action.delete import package_delete
@ -125,3 +126,92 @@ class TestActionPackageSearch(SpatialTestBase,WsgiAppCase):
assert_equal(result['count'], 1)
assert_equal(result['results'][0]['name'], 'test-spatial-dataset-search-point-2')
class TestHarvestedMetadataAPI(WsgiAppCase):
    """Functional tests for the harvested metadata endpoints."""

    @classmethod
    def setup_class(cls):
        """Store two harvest objects; only the second has an original document."""
        try:
            from ckanext.harvest.model import (HarvestObject, HarvestJob,
                                               HarvestSource,
                                               HarvestObjectExtra)
        except ImportError:
            raise SkipTest('The harvester extension is needed for these tests')

        def make_job():
            # Each harvest object gets its own job and source
            return HarvestJob(source=HarvestSource(url='http://', type='xx'))

        cls.content1 = '<xml>Content 1</xml>'
        plain_object = HarvestObject(guid='test-ho-1',
                                     job=make_job(),
                                     content=cls.content1)

        cls.content2 = '<xml>Content 2</xml>'
        cls.original_content2 = '<xml>Original Content 2</xml>'
        transformed_object = HarvestObject(guid='test-ho-2',
                                           job=make_job(),
                                           content=cls.content2)
        original_extra = HarvestObjectExtra(key='original_document',
                                            value=cls.original_content2,
                                            object=transformed_object)

        for record in (plain_object, transformed_object, original_extra):
            Session.add(record)
        Session.commit()

        cls.object_id_1 = plain_object.id
        cls.object_id_2 = transformed_object.id

    def test_api(self):
        """Check the redirects, the raw documents and the HTML views."""
        first_id = self.object_id_1
        second_id = self.object_id_2
        xml_type = 'application/xml; charset=utf-8'
        html_type = 'text/html; charset=utf-8'

        def assert_html_view(url):
            # A successful HTML view is served as HTML and was rendered
            # with the default stylesheet
            r = self.app.get(url)
            assert r.status == 200
            assert r.header_dict['Content-Type'] == html_type
            assert 'GEMINI record about' in r.body

        def assert_not_found(url):
            r = self.app.get(url, status=404)
            assert r.status == 404

        # Test redirects for old URLs
        r = self.app.get('/api/2/rest/harvestobject/{0}/xml'.format(first_id))
        assert r.status == 301
        assert '/harvest/object/{0}'.format(first_id) in r.header_dict['Location']

        r = self.app.get('/api/2/rest/harvestobject/{0}/html'.format(first_id))
        assert r.status == 301
        assert '/harvest/object/{0}/html'.format(first_id) in r.header_dict['Location']

        # Access object content
        r = self.app.get('/harvest/object/{0}'.format(first_id))
        assert r.status == 200
        assert r.header_dict['Content-Type'] == xml_type
        assert r.body == self.content1

        # Access original content in object extra (if present)
        assert_not_found('/harvest/object/{0}/original'.format(first_id))

        r = self.app.get('/harvest/object/{0}/original'.format(second_id))
        assert r.status == 200
        assert r.header_dict['Content-Type'] == xml_type
        assert r.body == self.original_content2

        # Access HTML transformation
        assert_html_view('/harvest/object/{0}/html'.format(first_id))
        assert_not_found('/harvest/object/{0}/html/original'.format(first_id))
        assert_html_view('/harvest/object/{0}/html'.format(second_id))
        assert_html_view('/harvest/object/{0}/html/original'.format(second_id))