diff --git a/ckanext/spatial/harvesters.py b/ckanext/spatial/harvesters.py index 9de6582..dfbb1c8 100644 --- a/ckanext/spatial/harvesters.py +++ b/ckanext/spatial/harvesters.py @@ -111,15 +111,19 @@ class SpatialHarvester(object): finally: log.error(message) - def _save_object_error(self,message,obj,stage=u'Fetch'): - err = HarvestObjectError(message=message,object=obj,stage=stage) + def _save_object_error(self,message,obj,stage=u'Fetch',line=None): + err = HarvestObjectError(message=message, + object=obj, + stage=stage, + line=line) try: err.save() except InvalidRequestError,e: Session.rollback() err.save() finally: - log.error(message) + log_message = '{0}, line {1}'.format(message,line) if line else message + log.debug(log_message) def _get_content(self, url): url = url.replace(' ','%20') @@ -199,11 +203,10 @@ class GeminiHarvester(SpatialHarvester): def import_gemini_object(self, gemini_string): log = logging.getLogger(__name__ + '.import') xml = etree.fromstring(gemini_string) - - valid, messages = self._get_validator().is_valid(xml) + valid, profile, errors = self._get_validator().is_valid(xml) if not valid: + out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:]) log.error('Errors found for object with GUID %s:' % self.obj.guid) - out = messages[0] + ':\n' + '\n'.join(messages[1:]) self._save_object_error(out,self.obj,'Import') unicode_gemini_string = etree.tostring(xml, encoding=unicode, pretty_print=True) @@ -570,9 +573,9 @@ class GeminiHarvester(SpatialHarvester): if gemini_xml is None: self._save_gather_error('Content is not a valid Gemini document',self.harvest_job) - valid, messages = self._get_validator().is_valid(gemini_xml) + valid, profile, errors = self._get_validator().is_valid(gemini_xml) if not valid: - out = messages[0] + ':\n' + '\n'.join(messages[1:]) + out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:]) if url: self._save_gather_error('Validation error for %s - %s'% (url,out),self.harvest_job) else: @@ -780,7 +783,6 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): self._save_gather_error('Unable to get content for URL: %s: %r' % \ (url, e),harvest_job) return None - ids = [] try: for url in self._extract_urls(content,url): @@ -816,7 +818,6 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): self._save_gather_error(msg,harvest_job) return None - if len(ids) > 0: return ids else: diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index 283deff..d4e2403 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -161,12 +161,14 @@ class CswService(OwsService): md = csw._exml.find("/{http://www.isotc211.org/2005/gmd}MD_Metadata") mdtree = etree.ElementTree(md) try: - record["xml"] = etree.tostring(mdtree, pretty_print=True, xml_declaration=True) + record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=unicode) except TypeError: # API incompatibilities between different flavours of elementtree try: - record["xml"] = etree.tostring(mdtree) + record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=unicode) except AssertionError: - record["xml"] = etree.tostring(md) + record["xml"] = etree.tostring(md, pretty_print=True, encoding=unicode) + + record["xml"] = '\n' + record["xml"] record["tree"] = mdtree return record diff --git a/ckanext/spatial/model/harvested_metadata.py b/ckanext/spatial/model/harvested_metadata.py index 5ba7ff6..debb8df 100644 --- a/ckanext/spatial/model/harvested_metadata.py +++ b/ckanext/spatial/model/harvested_metadata.py @@ -603,6 +603,13 @@ class GeminiDocument(MappedXmlDocument): ], multiplicity="*", ), + GeminiResourceLocator( + name="resource-locator-identification", + search_paths=[ + "gmd:identificationInfo//gmd:CI_OnlineResource", + ], + multiplicity="*", + ), GeminiElement( name="conformity-specification", search_paths=[ diff --git a/ckanext/spatial/tests/test_harvest.py b/ckanext/spatial/tests/test_harvest.py index 6599d57..0347338 100644 --- a/ckanext/spatial/tests/test_harvest.py +++ b/ckanext/spatial/tests/test_harvest.py @@ -449,7 +449,6 @@ class TestHarvest(HarvestFixtureBase): message = job.gather_errors[0].message assert_in('Validation error', message) - assert_in('Validating against "GEMINI 2.1 Schematron 1.2" profile failed', message) assert_in('One email address shall be provided', message) assert_in('Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names', message) assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message) diff --git a/ckanext/spatial/tests/test_validation.py b/ckanext/spatial/tests/test_validation.py index d437878..d397b8a 100644 --- a/ckanext/spatial/tests/test_validation.py +++ b/ckanext/spatial/tests/test_validation.py @@ -9,13 +9,15 @@ from ckanext.spatial import validation class TestValidation: + def _get_file_path(self, file_name): + return os.path.join(os.path.dirname(__file__), 'xml', file_name) + def get_validation_errors(self, validator, validation_test_filename): - validation_test_filepath = os.path.join(os.path.dirname(__file__), - 'xml', - validation_test_filename) + validation_test_filepath = self._get_file_path(validation_test_filename) xml = etree.parse(validation_test_filepath) is_valid, errors = validator.is_valid(xml) - return ';'.join(errors) + + return ';'.join([e[0] for e in errors]) def test_iso19139_failure(self): errors = self.get_validation_errors(validation.ISO19139Schema, @@ -117,7 +119,7 @@ class TestValidation: 'gemini2.1/validation/13_Dataset_Invalid_Element_srv.xml') assert len(errors) > 0 assert_in('(gmx.xsd)', errors) - assert_in('(u"Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.', errors) + assert_in('Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.', errors) def test_schematron_error_extraction(self): validation_error_xml = ''' @@ -137,3 +139,15 @@ class TestValidation: assert_in("srv:serviceType/*[1] = 'discovery'", details) assert_in("/*[local-name()='MD_Metadata'", details) assert_in("Service type shall be one of 'discovery'", details) + + + def test_error_line_numbers(self): + file_path = self._get_file_path('iso19139/dataset-invalid.xml') + xml = etree.parse(file_path) + is_valid, profile, errors = validation.Validators(profiles=['iso19139']).is_valid(xml) + assert not is_valid + assert len(errors) == 2 + + message, line = errors[1] + assert 'This element is not expected' in message + assert line == 3 diff --git a/ckanext/spatial/validation/validation.py b/ckanext/spatial/validation/validation.py index 8f20262..cbdbfe6 100644 --- a/ckanext/spatial/validation/validation.py +++ b/ckanext/spatial/validation/validation.py @@ -1,5 +1,5 @@ import os -from pkg_resources import resource_stream, resource_filename +from pkg_resources import resource_stream from ckanext.spatial.model import GeminiDocument from lxml import etree @@ -15,8 +15,12 @@ class BaseValidator(object): def is_valid(cls, xml): ''' Runs the validation on the supplied XML etree. + Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is a list of tuples, + each containing the error message and the error line. + Returns tuple: - (is_valid, error_message_list) + (is_valid, [(error_message_string, error_line_number)]) ''' raise NotImplementedError @@ -26,7 +30,9 @@ class XsdValidator(BaseValidator): @classmethod def _is_valid(cls, xml, xsd_filepath, xsd_name): '''Returns whether or not an XML file is valid according to - an XSD. + an XSD. Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is a list of tuples, + each containing the error message and the error line. Params: xml - etree of the XML to be validated @@ -34,7 +40,7 @@ class XsdValidator(BaseValidator): xsd_name - string describing the XSD Returns: - (is_valid_boolean, list_of_error_message_strings) + (is_valid, [(error_message_string, error_line_number)]) ''' xsd = etree.parse(xsd_filepath) schema = etree.XMLSchema(xsd) @@ -44,12 +50,13 @@ class XsdValidator(BaseValidator): # XMLSchemaParseError: local list type: A type, derived by list or union, must have the simple ur-type definition as base type, not '{http://www.opengis.net/gml/3.2}doubleList'., line 118 try: schema.assertValid(xml) - except AssertionError, e: - msg = '%s Schema Error: %s' % (xsd_name, e.args) - return False, [msg] - except etree.DocumentInvalid, e: - msg = '%s Validation Error: %s' % (xsd_name, e.args) - return False, [msg] + except etree.DocumentInvalid: + log.info('Validation errors found using schema {0}'.format(xsd_name)) + errors = [] + for error in schema.error_log: + errors.append((error.message, error.line)) + errors.insert + return False, errors return True, [] @@ -62,7 +69,11 @@ class ISO19139Schema(XsdValidator): xsd_path = 'xml/iso19139' gmx_xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'gmx/gmx.xsd') - is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, 'Dataset schema (gmx.xsd)') + xsd_name = 'Dataset schema (gmx.xsd)' + is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, xsd_name) + if not is_valid: + #TODO: not sure if we need this one, keeping for backwards compatibility + errors.insert(0, ('{0} Validation Error'.format(xsd_name), None)) return is_valid, errors class ISO19139EdenSchema(XsdValidator): @@ -78,15 +89,23 @@ class ISO19139EdenSchema(XsdValidator): if metadata_type in ('dataset', 'series'): gmx_xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'gmx/gmx.xsd') - is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, 'Dataset schema (gmx.xsd)') + xsd_name = 'Dataset schema (gmx.xsd)' + is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, xsd_name) + if not is_valid: + #TODO: not sure if we need this one, keeping for backwards compatibility + errors.insert(0, ('{0} Validation Error'.format(xsd_name), None)) elif metadata_type == 'service': gmx_and_srv_xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'gmx_and_srv.xsd') - is_valid, errors = cls._is_valid(xml, gmx_and_srv_xsd_filepath, 'Service schemas (gmx.xsd & srv.xsd)') + xsd_name = 'Service schemas (gmx.xsd & srv.xsd)' + is_valid, errors = cls._is_valid(xml, gmx_and_srv_xsd_filepath, xsd_name) + if not is_valid: + #TODO: not sure if we need this one, keeping for backwards compatibility + errors.insert(0, ('{0} Validation Error'.format(xsd_name), None)) else: is_valid = False - errors = ['Metadata type not recognised "%s" - cannot choose an ISO19139 validator.' % - metadata_type] + errors = [('Metadata type not recognised "%s" - cannot choose an ISO19139 validator.' % + metadata_type, None)] if is_valid: return True, [] @@ -121,12 +140,7 @@ class ISO19139NGDCSchema(XsdValidator): xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'schema.xsd') - is_valid, errors = cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)') - - if is_valid: - return True, [] - - return False, errors + return cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)') class FGDCSchema(XsdValidator): ''' @@ -147,12 +161,8 @@ class FGDCSchema(XsdValidator): xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'fgdc-std-001-1998.xsd') - is_valid, errors = cls._is_valid(xml, xsd_filepath, 'FGDC Schema (fgdc-std-001-1998.xsd)') + return cls._is_valid(xml, xsd_filepath, 'FGDC Schema (fgdc-std-001-1998.xsd)') - if is_valid: - return True, [] - - return False, errors class SchematronValidator(BaseValidator): '''Base class for a validator that uses Schematron.''' @@ -166,6 +176,19 @@ class SchematronValidator(BaseValidator): @classmethod def is_valid(cls, xml): + '''Returns whether or not an XML file is valid according to + a schematron. Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is a list of tuples, + each containing the error message and the error line (which defaults to + None on the schematron validation case). + + Params: + xml - etree of the XML to be validated + + Returns: + (is_valid, [(error_message_string, error_line_number)]) + ''' + if not hasattr(cls, 'schematrons'): log.info('Compiling schematron "%s"', cls.title) cls.schematrons = cls.get_schematrons() @@ -180,7 +203,8 @@ class SchematronValidator(BaseValidator): for error in errors: message, details = cls.extract_error_details(error) if not message in messages_already_reported: - error_details.append(details) + #TODO: perhaps can extract the source line from the error location + error_details.append((details,None)) messages_already_reported.add(message) return False, error_details return True, [] @@ -198,7 +222,8 @@ class SchematronValidator(BaseValidator): location = failed_assert_element.get('location') message_element = failed_assert_element.find("{http://purl.oclc.org/dsdl/svrl}text") message = message_element.text.strip() - failed_assert_element + + #TODO: Do we really need such detail on the error messages? return message, 'Error Message: "%s" Error Location: "%s" Error Assert: "%s"' % (message, location, assert_) @classmethod @@ -255,27 +280,45 @@ class Validators(object): ''' def __init__(self, profiles=["iso19139", "constraints", "gemini2"]): self.profiles = profiles + + self.validators = {} # name: class + for validator_class in all_validators: + self.validators[validator_class.name] = validator_class + + def add_validator(self, validator_class): + self.validators[validator_class.name] = validator_class def isvalid(self, xml): '''For backward compatibility''' return self.is_valid(xml) def is_valid(self, xml): - if not hasattr(self, 'validators'): - self.validators = {} # name: class - for validator_class in all_validators: - self.validators[validator_class.name] = validator_class + '''Returns whether or not an XML file is valid. + Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is the name of the profile + that failed and the third is a list of tuples, + each containing the error message and the error line if present. + + Params: + xml - etree of the XML to be validated + + Returns: + (is_valid, failed_profile_name, [(error_message_string, error_line_number)]) + ''' + + log.debug('Starting validation against profile(s) %s' % ','.join(self.profiles)) for name in self.profiles: validator = self.validators[name] is_valid, error_message_list = validator.is_valid(xml) if not is_valid: - error_message_list.insert(0, 'Validating against "%s" profile failed' % validator.title) - log.info('%r', error_message_list) - return False, error_message_list - log.info('Validated against "%s"', validator.title) + #error_message_list.insert(0, 'Validating against "%s" profile failed' % validator.title) + log.info('Validating against "%s" profile failed' % validator.title) + log.debug('%r', error_message_list) + return False, validator.name, error_message_list + log.debug('Validated against "%s"', validator.title) log.info('Validation passed') - return True, [] + return True, None, [] if __name__ == '__main__': from sys import argv