try: from cStringIO import StringIO except ImportError: from StringIO import StringIO import traceback from datetime import datetime from pylons import request, response, config from lxml import etree from owslib.csw import namespaces from sqlalchemy import select,distinct,or_ from ckan.lib.base import BaseController from ckan.lib.helpers import truncate from ckan.model import Package from ckan.model.meta import Session from ckanext.harvest.model import HarvestObject, HarvestJob, HarvestSource namespaces["xlink"] = "http://www.w3.org/1999/xlink" log = __import__("logging").getLogger(__name__) LOG_XML_LENGTH = config.get('cswservice.log_xml_length', 1000) from random import random class __rlog__(object): """ Random log -- log wrapper to log a defined percentage of dialogues """ def __init__(self, threshold=config["cswservice.rndlog_threshold"]): self.threshold = threshold self.i = random() def __getattr__(self, attr): if self.i > self.threshold: return self.dummy return getattr(log, attr) def dummy(self, *av, **kw): pass def ntag(nselem): pfx, elem = nselem.split(":") return "{%s}%s" % (namespaces[pfx], elem) class CatalogueServiceWebController(BaseController): '''Basic CSW server''' def _operations(self): '''Returns a list of this class\'s methods. ''' return dict((x, getattr(self, x)) for x in dir(self) if not x.startswith("_")) def dispatch_get(self): self.rlog = __rlog__() self.rlog.info("request environ\n%s", request.environ) self.rlog.info("request headers\n%s", request.headers) ops = self._operations() req = dict(request.GET.items()) ## special cases if "REQUEST" in req: req["request"] = req["REQUEST"] if "SERVICE" in req: req["service"] = req["SERVICE"] if "request" not in req: err = self._exception(exceptionCode="MissingParameterValue", locator="request") return self._render_xml(err) if req["request"] not in ops: err = self._exception(exceptionCode="OperationNotSupported", locator=req["request"]) return self._render_xml(err) if "service" not in req: err = self._exception(exceptionCode="MissingParameterValue", locator="service") return self._render_xml(err) if req["service"] != "CSW": err = self._exception(exceptionCode="InvalidParameterValue", locator="service", text=req["service"]) return self._render_xml(err) ## fill in some defaults startPosition = req.get("startPosition", 1) try: req["startPosition"] = int(startPosition) except: err = self._exception(exceptionCode="InvalidParameterValue", locator="startPosition", text=unicode(startPosition)) return self._render_xml(err) maxRecords = req.get("maxRecords", 10) try: req["maxRecords"] = int(maxRecords) except: err = self._exception(exceptionCode="InvalidParameterValue", locator="maxRecords", text=unicode(maxRecords)) return self._render_xml(err) req["id"] = [req["id"]] if "id" in req else [] return ops[req["request"]](req) def dispatch_post(self): self.rlog = __rlog__() self.rlog.info("request environ\n%s", request.environ) self.rlog.info("request headers\n%s", request.headers) ops = self._operations() try: req = etree.parse(StringIO(request.body)) self.rlog.info(u"request body\n%s", etree.tostring(req, pretty_print=True)) except: self.rlog.info("request body\n%s", request.body) self.rlog.error("exception parsing body\n%s", traceback.format_exc()) err = self._exception(exceptionCode="MissingParameterValue", locator="request") return self._render_xml(err) root = req.getroot() _unused, op = root.tag.rsplit("}", 1) if op not in ops: err = self._exception(exceptionCode="OperationNotSupported", locator=op) return self._render_xml(err) req = self._parse_xml_request(root) if not isinstance(req, dict): return req return ops[op](req) def _render_xml(self, root): tree = etree.ElementTree(root) data = etree.tostring(tree, pretty_print=True) response.headers["Content-Length"] = len(data) response.content_type = "application/xml" response.charset="UTF-8" self.rlog.info("response headers:\n%s", response.headers) self.rlog.info("response.body:\n%s", data) return data def _exception(self, text=None, **kw): metaargs = { "nsmap": namespaces, "version": "1.0.0", ntag("xsi:schemaLocation"): "http://schemas.opengis.net/ows/1.0.0/owsExceptionReport.xsd", } root = etree.Element(ntag("ows:ExceptionReport"), **metaargs) exc = etree.SubElement(root, ntag("ows:Exception"), **kw) if text is not None: txt = etree.SubElement(exc, ntag("ows:ExceptionText")) txt.text = text return root def _parse_xml_request(self, root): """ Check common parts of the request for GetRecords, GetRecordById, etc. Return a dictionary of parameters or else an XML error message (string). The dictionary should be compatible with CSW GET requests """ service = root.get("service") if service is None: err = self._exception(exceptionCode="MissingParameterValue", locator="service") return self._render_xml(err) elif service != "CSW": err = self._exception(exceptionCode="InvalidParameterValue", locator="service", text=service) return self._render_xml(err) outputSchema = root.get("outputSchema", namespaces["gmd"]) if outputSchema != namespaces["gmd"]: err = self._exception(exceptionCode="InvalidParameterValue", locator="outputSchema", text=outputSchema) return self._render_xml(err) resultType = root.get("resultType", "results") if resultType not in ("results", "hits"): err = self._exception(exceptionCode="InvalidParameterValue", locator="resultType", text=resultType) return self._render_xml(err) outputFormat = root.get("outputFormat", "application/xml") if outputFormat != "application/xml": err = self._exception(exceptionCode="InvalidParameterValue", locator="outputFormat", text=outputFormat) return self._render_xml(err) elementSetName = root.get("elementSetName", "full") params = { "outputSchema": outputSchema, "resultType": resultType, "outputFormat": outputFormat, "elementSetName": elementSetName } startPosition = root.get("startPosition", "1") try: params["startPosition"] = int(startPosition) except: err = self._exception(exceptionCode="InvalidParameterValue", locator="startPosition") return self._render_xml(err) maxRecords = root.get("maxRecords", "10") try: params["maxRecords"] = int(maxRecords) except: err = self._exception(exceptionCode="InvalidParameterValue", locator="maxRecords") return self._render_xml(err) params["id"] = [x.text for x in root.findall(ntag("csw:Id"))] query = root.find(ntag("csw:Query")) if query is not None: params.update(self._parse_query(query)) if params["elementSetName"] not in ("full", "brief", "summary"): err = self._exception(exceptionCode="InvalidParameterValue", locator="elementSetName", text=params["elementSetName"]) return self._render_xml(err) return params def _parse_query(self, query): params = {} params["typeNames"] = query.get("typeNames", "csw:Record") esn = query.find(ntag("csw:ElementSetName")) if esn is not None: params["elementSetName"] = esn.text return params def GetCapabilities(self, req): site = request.host_url + request.path caps = etree.Element(ntag("csw:Capabilities"), nsmap=namespaces) srvid = etree.SubElement(caps, ntag("ows:ServiceIdentification")) title = etree.SubElement(srvid, ntag("ows:Title")) title.text = unicode(config["cswservice.title"]) abstract = etree.SubElement(srvid, ntag("ows:Abstract")) abstract.text = unicode(config["cswservice.abstract"]) keywords = etree.SubElement(srvid, ntag("ows:Keywords")) for word in [w.strip() for w in config["cswservice.keywords"].split(",")]: if word == "": continue kw = etree.SubElement(keywords, ntag("ows:Keyword")) kw.text = unicode(word) kwtype = etree.SubElement(keywords, ntag("ows:Type")) kwtype.text = unicode(config["cswservice.keyword_type"]) srvtype = etree.SubElement(srvid, ntag("ows:ServiceType")) srvtype.text = "CSW" srvver = etree.SubElement(srvid, ntag("ows:ServiceTypeVersion")) srvver.text = "2.0.2" ### ows:Fees, ows:AccessConstraints provider = etree.SubElement(caps, ntag("ows:ServiceProvider")) provname = etree.SubElement(provider, ntag("ows:ProviderName")) provname.text = unicode(config["cswservice.provider_name"]) attrs = { ntag("xlink:href"): site } etree.SubElement(provider, ntag("ows:ProviderSite"), **attrs) contact = etree.SubElement(provider, ntag("ows:ServiceContact")) name = etree.SubElement(contact, ntag("ows:IndividualName")) name.text = unicode(config["cswservice.contact_name"]) pos = etree.SubElement(contact, ntag("ows:PositionName")) pos.text = unicode(config["cswservice.contact_position"]) cinfo = etree.SubElement(contact, ntag("ows:ContactInfo")) phone = etree.SubElement(cinfo, ntag("ows:Phone")) voice = etree.SubElement(phone, ntag("ows:Voice")) voice.text = unicode(config["cswservice.contact_voice"]) fax = etree.SubElement(phone, ntag("ows:Fax")) fax.text = unicode(config["cswservice.contact_fax"]) addr = etree.SubElement(cinfo, ntag("ows:Address")) dpoint = etree.SubElement(addr, ntag("ows:DeliveryPoint")) dpoint.text= unicode(config["cswservice.contact_address"]) city = etree.SubElement(addr, ntag("ows:City")) city.text = unicode(config["cswservice.contact_city"]) region = etree.SubElement(addr, ntag("ows:AdministrativeArea")) region.text = unicode(config["cswservice.contact_region"]) pcode = etree.SubElement(addr, ntag("ows:PostalCode")) pcode.text = unicode(config["cswservice.contact_pcode"]) country = etree.SubElement(addr, ntag("ows:Country")) country.text = unicode(config["cswservice.contact_country"]) email = etree.SubElement(addr, ntag("ows:ElectronicMailAddress")) email.text = unicode(config["cswservice.contact_email"]) hours = etree.SubElement(cinfo, ntag("ows:HoursOfService")) hours.text = unicode(config["cswservice.contact_hours"]) instructions = etree.SubElement(cinfo, ntag("ows:ContactInstructions")) instructions.text = unicode(config["cswservice.contact_instructions"]) role = etree.SubElement(contact, ntag("ows:Role")) role.text = unicode(config["cswservice.contact_role"]) opmeta = etree.SubElement(caps, ntag("ows:OperationsMetadata")) op = etree.SubElement(opmeta, ntag("ows:Operation"), name="GetCapabilities") dcp = etree.SubElement(op, ntag("ows:DCP")) http = etree.SubElement(dcp, ntag("ows:HTTP")) attrs = { ntag("xlink:href"): site } etree.SubElement(http, ntag("ows:Get"), **attrs) post = etree.SubElement(http, ntag("ows:Post"), **attrs) pe = etree.SubElement(post, ntag("ows:Constraint"), name="PostEncoding") val = etree.SubElement(pe, ntag("ows:Value")) val.text = "XML" op = etree.SubElement(opmeta, ntag("ows:Operation"), name="GetRecords") dcp = etree.SubElement(op, ntag("ows:DCP")) http = etree.SubElement(dcp, ntag("ows:HTTP")) attrs = { ntag("xlink:href"): site } etree.SubElement(http, ntag("ows:Get"), **attrs) post = etree.SubElement(http, ntag("ows:Post"), **attrs) pe = etree.SubElement(post, ntag("ows:Constraint"), name="PostEncoding") val = etree.SubElement(pe, ntag("ows:Value")) val.text = "XML" param = etree.SubElement(op, ntag("ows:Parameter"), name="resultType") val = etree.SubElement(param, ntag("ows:Value")) val.text = "results" param = etree.SubElement(op, ntag("ows:Parameter"), name="outputFormat") val = etree.SubElement(param, ntag("ows:Value")) val.text = "application/xml" param = etree.SubElement(op, ntag("ows:Parameter"), name="outputSchema") val = etree.SubElement(param, ntag("ows:Value")) val.text = "http://www.isotc211.org/2005/gmd" param = etree.SubElement(op, ntag("ows:Parameter"), name="typeNames") val = etree.SubElement(param, ntag("ows:Value")) val.text = "gmd:MD_Metadata" op = etree.SubElement(opmeta, ntag("ows:Operation"), name="GetRecordById") dcp = etree.SubElement(op, ntag("ows:DCP")) http = etree.SubElement(dcp, ntag("ows:HTTP")) attrs = { ntag("xlink:href"): site } etree.SubElement(http, ntag("ows:Get"), **attrs) post = etree.SubElement(http, ntag("ows:Post"), **attrs) pe = etree.SubElement(post, ntag("ows:Constraint"), name="PostEncoding") val = etree.SubElement(pe, ntag("ows:Value")) val.text = "XML" param = etree.SubElement(op, ntag("ows:Parameter"), name="resultType") val = etree.SubElement(param, ntag("ows:Value")) val.text = "results" param = etree.SubElement(op, ntag("ows:Parameter"), name="outputFormat") val = etree.SubElement(param, ntag("ows:Value")) val.text = "application/xml" param = etree.SubElement(op, ntag("ows:Parameter"), name="outputSchema") val = etree.SubElement(param, ntag("ows:Value")) val.text = "http://www.isotc211.org/2005/gmd" param = etree.SubElement(op, ntag("ows:Parameter"), name="typeNames") val = etree.SubElement(param, ntag("ows:Value")) val.text = "gmd:MD_Metadata" # op = etree.SubElement(opmeta, ntag("ows:Operation"), name="Harvest") # dcp = etree.SubElement(op, ntag("ows:DCP")) # http = etree.SubElement(dcp, ntag("ows:HTTP")) # attrs = { ntag("xlink:href"): site } # post = etree.SubElement(http, ntag("ows:Post"), **attrs) # pe = etree.SubElement(post, ntag("ows:Constraint"), name="PostEncoding") # val = etree.SubElement(pe, ntag("ows:Value")) # val.text = "XML" # param = etree.SubElement(op, ntag("ows:Parameter"), name="outputFormat") # val = etree.SubElement(param, ntag("ows:Value")) # val.text = "application/xml" # param = etree.SubElement(op, ntag("ows:Parameter"), name="outputSchema") # val = etree.SubElement(param, ntag("ows:Value")) # val.text = "http://www.isotc211.org/2005/gmd" filcap = etree.SubElement(caps, ntag("ogc:Filter_Capabilities")) spacap = etree.SubElement(filcap, ntag("ogc:Spatial_Capabilities")) geomop = etree.SubElement(spacap, ntag("ogc:GeometryOperands")) spaceop = etree.SubElement(spacap, ntag("ogc:SpatialOperators")) scalcap = etree.SubElement(filcap, ntag("ogc:Scalar_Capabilities")) lop = etree.SubElement(scalcap, ntag("ogc:LogicalOperators")) cop = etree.SubElement(scalcap, ntag("ogc:ComparisonOperators")) idcap = etree.SubElement(filcap, ntag("ogc:Id_Capabilities")) eid = etree.SubElement(idcap, ntag("ogc:EID")) fid = etree.SubElement(idcap, ntag("ogc:FID")) data = self._render_xml(caps) log.info('GetCapabilities response: %r', truncate(data, LOG_XML_LENGTH)) return data def GetRecords(self, req): resp = etree.Element(ntag("csw:GetRecordsResponse"), nsmap=namespaces) etree.SubElement(resp, ntag("csw:SearchStatus"), timestamp=datetime.utcnow().isoformat()) cursor = Session.connection() q = Session.query(distinct(HarvestObject.guid)) \ .join(Package) \ .join(HarvestSource) \ .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') \ .filter(or_(HarvestSource.type=='gemini-single', \ HarvestSource.type=='gemini-waf', \ HarvestSource.type=='csw')) ### TODO Parse query instead of stupidly just returning whatever we like startPosition = req["startPosition"] if req["startPosition"] > 0 else 1 maxRecords = req["maxRecords"] if req["maxRecords"] > 0 else 10 rset = q.offset(startPosition-1).limit(maxRecords) total = q.count() attrs = { "numberOfRecordsMatched": total, "elementSet": req["elementSetName"], # we lie here. it's always really "full" } if req["resultType"] == "results": returned = rset.count() attrs["numberOfRecordsReturned"] = returned if (total-startPosition-1) > returned: attrs["nextRecord"] = startPosition + returned else: attrs["nextRecord"] = 0 else: attrs["numberOfRecordsReturned"] = 0 attrs = dict((k, unicode(v)) for k,v in attrs.items()) results = etree.SubElement(resp, ntag("csw:SearchResults"), **attrs) if req["resultType"] == "results": for guid, in Session.execute(rset): doc = Session.query(HarvestObject) \ .join(Package) \ .filter(HarvestObject.guid==guid) \ .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') \ .first() try: record = etree.parse(StringIO(doc.content.encode("utf-8"))) results.append(record.getroot()) except: log.error("exception parsing document %s:\n%s", doc.id, traceback.format_exc()) raise data = self._render_xml(resp) log.info('GetRecords response: %r', truncate(data, LOG_XML_LENGTH)) return data def GetRecordById(self, req): resp = etree.Element(ntag("csw:GetRecordByIdResponse"), nsmap=namespaces) seen = set() for ident in req["id"]: doc = Session.query(HarvestObject) \ .join(Package) \ .join(HarvestJob).join(HarvestSource) \ .filter(HarvestSource.active==True) \ .filter(HarvestObject.guid==ident) \ .filter(HarvestObject.package!=None) \ .filter(Package.state==u'active') \ .order_by(HarvestObject.gathered.desc()) \ .limit(1).first() if doc is None: continue if 'MD_Metadata' in doc.content: try: record = etree.parse(StringIO(doc.content.encode("utf-8"))) resp.append(record.getroot()) except: log.error("exception parsing document %s:\n%s", doc.id, traceback.format_exc()) raise data = self._render_xml(resp) log.info('GetRecordById response: %r', truncate(data, LOG_XML_LENGTH)) return data ### ### ### full ### ### ### ### dc:type ### dataset ### ### ### ### ###