From e12e38cab0a311d4d9ed3e5cc14021e7b5e5ae85 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 19 Nov 2012 17:15:16 +0000 Subject: [PATCH 1/7] Improvements on the validation code To make easier to filter and display errors on the UI, the validators have been modified to return the message and line number separately. The return format for validators is now: (is_valid, [(error_message_string, error_line_number)]) Also the XSD based validators were returning only the last error found on the document, instead of iterating the whole error log. Harvesters should create a harvest object error for each of this validation errors. Tests have been adapted to these changes. --- ckanext/spatial/tests/test_validation.py | 24 ++++- ckanext/spatial/validation/validation.py | 114 +++++++++++++++-------- 2 files changed, 96 insertions(+), 42 deletions(-) diff --git a/ckanext/spatial/tests/test_validation.py b/ckanext/spatial/tests/test_validation.py index d437878..d397b8a 100644 --- a/ckanext/spatial/tests/test_validation.py +++ b/ckanext/spatial/tests/test_validation.py @@ -9,13 +9,15 @@ from ckanext.spatial import validation class TestValidation: + def _get_file_path(self, file_name): + return os.path.join(os.path.dirname(__file__), 'xml', file_name) + def get_validation_errors(self, validator, validation_test_filename): - validation_test_filepath = os.path.join(os.path.dirname(__file__), - 'xml', - validation_test_filename) + validation_test_filepath = self._get_file_path(validation_test_filename) xml = etree.parse(validation_test_filepath) is_valid, errors = validator.is_valid(xml) - return ';'.join(errors) + + return ';'.join([e[0] for e in errors]) def test_iso19139_failure(self): errors = self.get_validation_errors(validation.ISO19139Schema, @@ -117,7 +119,7 @@ class TestValidation: 'gemini2.1/validation/13_Dataset_Invalid_Element_srv.xml') assert len(errors) > 0 assert_in('(gmx.xsd)', errors) - assert_in('(u"Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.', errors) + assert_in('Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.', errors) def test_schematron_error_extraction(self): validation_error_xml = ''' @@ -137,3 +139,15 @@ class TestValidation: assert_in("srv:serviceType/*[1] = 'discovery'", details) assert_in("/*[local-name()='MD_Metadata'", details) assert_in("Service type shall be one of 'discovery'", details) + + + def test_error_line_numbers(self): + file_path = self._get_file_path('iso19139/dataset-invalid.xml') + xml = etree.parse(file_path) + is_valid, profile, errors = validation.Validators(profiles=['iso19139']).is_valid(xml) + assert not is_valid + assert len(errors) == 2 + + message, line = errors[1] + assert 'This element is not expected' in message + assert line == 3 diff --git a/ckanext/spatial/validation/validation.py b/ckanext/spatial/validation/validation.py index 8f20262..0cb2d56 100644 --- a/ckanext/spatial/validation/validation.py +++ b/ckanext/spatial/validation/validation.py @@ -1,5 +1,5 @@ import os -from pkg_resources import resource_stream, resource_filename +from pkg_resources import resource_stream from ckanext.spatial.model import GeminiDocument from lxml import etree @@ -15,8 +15,12 @@ class BaseValidator(object): def is_valid(cls, xml): ''' Runs the validation on the supplied XML etree. + Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is a list of tuples, + each containing the error message and the error line. + Returns tuple: - (is_valid, error_message_list) + (is_valid, [(error_message_string, error_line_number)]) ''' raise NotImplementedError @@ -26,7 +30,9 @@ class XsdValidator(BaseValidator): @classmethod def _is_valid(cls, xml, xsd_filepath, xsd_name): '''Returns whether or not an XML file is valid according to - an XSD. + an XSD. Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is a list of tuples, + each containing the error message and the error line. Params: xml - etree of the XML to be validated @@ -34,7 +40,7 @@ class XsdValidator(BaseValidator): xsd_name - string describing the XSD Returns: - (is_valid_boolean, list_of_error_message_strings) + (is_valid, [(error_message_string, error_line_number)]) ''' xsd = etree.parse(xsd_filepath) schema = etree.XMLSchema(xsd) @@ -44,12 +50,13 @@ class XsdValidator(BaseValidator): # XMLSchemaParseError: local list type: A type, derived by list or union, must have the simple ur-type definition as base type, not '{http://www.opengis.net/gml/3.2}doubleList'., line 118 try: schema.assertValid(xml) - except AssertionError, e: - msg = '%s Schema Error: %s' % (xsd_name, e.args) - return False, [msg] - except etree.DocumentInvalid, e: - msg = '%s Validation Error: %s' % (xsd_name, e.args) - return False, [msg] + except etree.DocumentInvalid: + log.info('Validation errors found using schema {0}'.format(xsd_name)) + errors = [] + for error in schema.error_log: + errors.append((error.message, error.line)) + errors.insert + return False, errors return True, [] @@ -62,7 +69,11 @@ class ISO19139Schema(XsdValidator): xsd_path = 'xml/iso19139' gmx_xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'gmx/gmx.xsd') - is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, 'Dataset schema (gmx.xsd)') + xsd_name = 'Dataset schema (gmx.xsd)' + is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, xsd_name) + if not is_valid: + #TODO: not sure if we need this one, keeping for backwards compatibility + errors.insert(0, ('{0} Validation Error'.format(xsd_name), None)) return is_valid, errors class ISO19139EdenSchema(XsdValidator): @@ -78,15 +89,23 @@ class ISO19139EdenSchema(XsdValidator): if metadata_type in ('dataset', 'series'): gmx_xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'gmx/gmx.xsd') - is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, 'Dataset schema (gmx.xsd)') + xsd_name = 'Dataset schema (gmx.xsd)' + is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, xsd_name) + if not is_valid: + #TODO: not sure if we need this one, keeping for backwards compatibility + errors.insert(0, ('{0} Validation Error'.format(xsd_name), None)) elif metadata_type == 'service': gmx_and_srv_xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'gmx_and_srv.xsd') - is_valid, errors = cls._is_valid(xml, gmx_and_srv_xsd_filepath, 'Service schemas (gmx.xsd & srv.xsd)') + xsd_name = 'Service schemas (gmx.xsd & srv.xsd)' + is_valid, errors = cls._is_valid(xml, gmx_and_srv_xsd_filepath, xsd_name) + if not is_valid: + #TODO: not sure if we need this one, keeping for backwards compatibility + errors.insert(0, ('{0} Validation Error'.format(xsd_name), None)) else: is_valid = False - errors = ['Metadata type not recognised "%s" - cannot choose an ISO19139 validator.' % - metadata_type] + errors = [('Metadata type not recognised "%s" - cannot choose an ISO19139 validator.' % + metadata_type, None)] if is_valid: return True, [] @@ -121,12 +140,7 @@ class ISO19139NGDCSchema(XsdValidator): xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'schema.xsd') - is_valid, errors = cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)') - - if is_valid: - return True, [] - - return False, errors + return cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)') class FGDCSchema(XsdValidator): ''' @@ -147,12 +161,8 @@ class FGDCSchema(XsdValidator): xsd_filepath = os.path.join(os.path.dirname(__file__), xsd_path, 'fgdc-std-001-1998.xsd') - is_valid, errors = cls._is_valid(xml, xsd_filepath, 'FGDC Schema (fgdc-std-001-1998.xsd)') + return cls._is_valid(xml, xsd_filepath, 'FGDC Schema (fgdc-std-001-1998.xsd)') - if is_valid: - return True, [] - - return False, errors class SchematronValidator(BaseValidator): '''Base class for a validator that uses Schematron.''' @@ -166,6 +176,19 @@ class SchematronValidator(BaseValidator): @classmethod def is_valid(cls, xml): + '''Returns whether or not an XML file is valid according to + a schematron. Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is a list of tuples, + each containing the error message and the error line (which defaults to + None on the schematron validation case). + + Params: + xml - etree of the XML to be validated + + Returns: + (is_valid, [(error_message_string, error_line_number)]) + ''' + if not hasattr(cls, 'schematrons'): log.info('Compiling schematron "%s"', cls.title) cls.schematrons = cls.get_schematrons() @@ -180,7 +203,8 @@ class SchematronValidator(BaseValidator): for error in errors: message, details = cls.extract_error_details(error) if not message in messages_already_reported: - error_details.append(details) + #TODO: perhaps can extract the source line from the error location + error_details.append((details,None)) messages_already_reported.add(message) return False, error_details return True, [] @@ -198,7 +222,8 @@ class SchematronValidator(BaseValidator): location = failed_assert_element.get('location') message_element = failed_assert_element.find("{http://purl.oclc.org/dsdl/svrl}text") message = message_element.text.strip() - failed_assert_element + + #TODO: Do we really need such detail on the error messages? return message, 'Error Message: "%s" Error Location: "%s" Error Assert: "%s"' % (message, location, assert_) @classmethod @@ -255,27 +280,42 @@ class Validators(object): ''' def __init__(self, profiles=["iso19139", "constraints", "gemini2"]): self.profiles = profiles + + self.validators = {} # name: class + for validator_class in all_validators: + self.validators[validator_class.name] = validator_class def isvalid(self, xml): '''For backward compatibility''' return self.is_valid(xml) def is_valid(self, xml): - if not hasattr(self, 'validators'): - self.validators = {} # name: class - for validator_class in all_validators: - self.validators[validator_class.name] = validator_class + '''Returns whether or not an XML file is valid. + Returns a tuple, the first value is a boolean indicating + whether the validation passed or not. The second is the name of the profile + that failed and the third is a list of tuples, + each containing the error message and the error line if present. + + Params: + xml - etree of the XML to be validated + + Returns: + (is_valid, failed_profile_name, [(error_message_string, error_line_number)]) + ''' + + log.debug('Starting validation against profile(s) %s' % ','.join(self.profiles)) for name in self.profiles: validator = self.validators[name] is_valid, error_message_list = validator.is_valid(xml) if not is_valid: - error_message_list.insert(0, 'Validating against "%s" profile failed' % validator.title) - log.info('%r', error_message_list) - return False, error_message_list - log.info('Validated against "%s"', validator.title) + #error_message_list.insert(0, 'Validating against "%s" profile failed' % validator.title) + log.info('Validating against "%s" profile failed' % validator.title) + log.debug('%r', error_message_list) + return False, validator.name, error_message_list + log.debug('Validated against "%s"', validator.title) log.info('Validation passed') - return True, [] + return True, None, [] if __name__ == '__main__': from sys import argv From 71134667606fdf31a5c996925f76f7385e269219 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 19 Nov 2012 18:12:40 +0000 Subject: [PATCH 2/7] Update harvesters to new validator outputs --- ckanext/spatial/harvesters.py | 11 ++++------- ckanext/spatial/tests/test_harvest.py | 1 - 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/ckanext/spatial/harvesters.py b/ckanext/spatial/harvesters.py index 9de6582..9b7354b 100644 --- a/ckanext/spatial/harvesters.py +++ b/ckanext/spatial/harvesters.py @@ -199,11 +199,10 @@ class GeminiHarvester(SpatialHarvester): def import_gemini_object(self, gemini_string): log = logging.getLogger(__name__ + '.import') xml = etree.fromstring(gemini_string) - - valid, messages = self._get_validator().is_valid(xml) + valid, profile, errors = self._get_validator().is_valid(xml) if not valid: + out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:]) log.error('Errors found for object with GUID %s:' % self.obj.guid) - out = messages[0] + ':\n' + '\n'.join(messages[1:]) self._save_object_error(out,self.obj,'Import') unicode_gemini_string = etree.tostring(xml, encoding=unicode, pretty_print=True) @@ -570,9 +569,9 @@ class GeminiHarvester(SpatialHarvester): if gemini_xml is None: self._save_gather_error('Content is not a valid Gemini document',self.harvest_job) - valid, messages = self._get_validator().is_valid(gemini_xml) + valid, profile, errors = self._get_validator().is_valid(gemini_xml) if not valid: - out = messages[0] + ':\n' + '\n'.join(messages[1:]) + out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:]) if url: self._save_gather_error('Validation error for %s - %s'% (url,out),self.harvest_job) else: @@ -780,7 +779,6 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): self._save_gather_error('Unable to get content for URL: %s: %r' % \ (url, e),harvest_job) return None - ids = [] try: for url in self._extract_urls(content,url): @@ -816,7 +814,6 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): self._save_gather_error(msg,harvest_job) return None - if len(ids) > 0: return ids else: diff --git a/ckanext/spatial/tests/test_harvest.py b/ckanext/spatial/tests/test_harvest.py index 6599d57..0347338 100644 --- a/ckanext/spatial/tests/test_harvest.py +++ b/ckanext/spatial/tests/test_harvest.py @@ -449,7 +449,6 @@ class TestHarvest(HarvestFixtureBase): message = job.gather_errors[0].message assert_in('Validation error', message) - assert_in('Validating against "GEMINI 2.1 Schematron 1.2" profile failed', message) assert_in('One email address shall be provided', message) assert_in('Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names', message) assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message) From 6cf7f79942e72245d848c541975320c74bad9e7a Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 20 Nov 2012 11:39:37 +0000 Subject: [PATCH 3/7] Save line if present when storing object errors --- ckanext/spatial/harvesters.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/ckanext/spatial/harvesters.py b/ckanext/spatial/harvesters.py index 9b7354b..9835384 100644 --- a/ckanext/spatial/harvesters.py +++ b/ckanext/spatial/harvesters.py @@ -111,15 +111,19 @@ class SpatialHarvester(object): finally: log.error(message) - def _save_object_error(self,message,obj,stage=u'Fetch'): - err = HarvestObjectError(message=message,object=obj,stage=stage) + def _save_object_error(self,message,obj,stage=u'Fetch',line=None): + err = HarvestObjectError(message=message, + object=obj, + stage=stage, + line=line) try: err.save() except InvalidRequestError,e: Session.rollback() err.save() finally: - log.error(message) + log_message = '{0}, line {1}'.format(message,line) if line else message + log.error(log_message) def _get_content(self, url): url = url.replace(' ','%20') From 9ea721e2563792f3f73232d8b5623b19f8d99a1c Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 20 Nov 2012 15:42:07 +0000 Subject: [PATCH 4/7] Encode remote documents from CSW servers as unicode --- ckanext/spatial/lib/csw_client.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/ckanext/spatial/lib/csw_client.py b/ckanext/spatial/lib/csw_client.py index 283deff..d4e2403 100644 --- a/ckanext/spatial/lib/csw_client.py +++ b/ckanext/spatial/lib/csw_client.py @@ -161,12 +161,14 @@ class CswService(OwsService): md = csw._exml.find("/{http://www.isotc211.org/2005/gmd}MD_Metadata") mdtree = etree.ElementTree(md) try: - record["xml"] = etree.tostring(mdtree, pretty_print=True, xml_declaration=True) + record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=unicode) except TypeError: # API incompatibilities between different flavours of elementtree try: - record["xml"] = etree.tostring(mdtree) + record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=unicode) except AssertionError: - record["xml"] = etree.tostring(md) + record["xml"] = etree.tostring(md, pretty_print=True, encoding=unicode) + + record["xml"] = '\n' + record["xml"] record["tree"] = mdtree return record From 615a58ce93011ecbf91c3bc4e11b675d68d24b43 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 21 Nov 2012 16:22:55 +0000 Subject: [PATCH 5/7] Reduce log level for harvest object errors --- ckanext/spatial/harvesters.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters.py b/ckanext/spatial/harvesters.py index 9835384..dfbb1c8 100644 --- a/ckanext/spatial/harvesters.py +++ b/ckanext/spatial/harvesters.py @@ -123,7 +123,7 @@ class SpatialHarvester(object): err.save() finally: log_message = '{0}, line {1}'.format(message,line) if line else message - log.error(log_message) + log.debug(log_message) def _get_content(self, url): url = url.replace(' ','%20') From c927d8b6abd526695139d58fc0fb6c4fd4632d36 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 20 Dec 2012 18:26:40 +0000 Subject: [PATCH 6/7] Add method for adding custom validators This probably needs to be done properly, adding them once on startup somehow. --- ckanext/spatial/validation/validation.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ckanext/spatial/validation/validation.py b/ckanext/spatial/validation/validation.py index 0cb2d56..cbdbfe6 100644 --- a/ckanext/spatial/validation/validation.py +++ b/ckanext/spatial/validation/validation.py @@ -285,6 +285,9 @@ class Validators(object): for validator_class in all_validators: self.validators[validator_class.name] = validator_class + def add_validator(self, validator_class): + self.validators[validator_class.name] = validator_class + def isvalid(self, xml): '''For backward compatibility''' return self.is_valid(xml) From 4e47141717b39669e9d041c5c53592bdde5dc19c Mon Sep 17 00:00:00 2001 From: kindly Date: Mon, 24 Dec 2012 10:43:44 +0000 Subject: [PATCH 7/7] add extra resource locator --- ckanext/spatial/model/harvested_metadata.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/ckanext/spatial/model/harvested_metadata.py b/ckanext/spatial/model/harvested_metadata.py index 5ba7ff6..debb8df 100644 --- a/ckanext/spatial/model/harvested_metadata.py +++ b/ckanext/spatial/model/harvested_metadata.py @@ -603,6 +603,13 @@ class GeminiDocument(MappedXmlDocument): ], multiplicity="*", ), + GeminiResourceLocator( + name="resource-locator-identification", + search_paths=[ + "gmd:identificationInfo//gmd:CI_OnlineResource", + ], + multiplicity="*", + ), GeminiElement( name="conformity-specification", search_paths=[