Merge branch '2.0-validation-changes' into release-v2.0

This commit is contained in:
amercader 2013-02-07 12:45:02 +00:00
commit 3142794524
6 changed files with 122 additions and 56 deletions

View File

@ -111,15 +111,19 @@ class SpatialHarvester(object):
finally:
log.error(message)
def _save_object_error(self,message,obj,stage=u'Fetch'):
err = HarvestObjectError(message=message,object=obj,stage=stage)
def _save_object_error(self,message,obj,stage=u'Fetch',line=None):
err = HarvestObjectError(message=message,
object=obj,
stage=stage,
line=line)
try:
err.save()
except InvalidRequestError,e:
Session.rollback()
err.save()
finally:
log.error(message)
log_message = '{0}, line {1}'.format(message,line) if line else message
log.debug(log_message)
def _get_content(self, url):
url = url.replace(' ','%20')
@ -199,11 +203,10 @@ class GeminiHarvester(SpatialHarvester):
def import_gemini_object(self, gemini_string):
log = logging.getLogger(__name__ + '.import')
xml = etree.fromstring(gemini_string)
valid, messages = self._get_validator().is_valid(xml)
valid, profile, errors = self._get_validator().is_valid(xml)
if not valid:
out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:])
log.error('Errors found for object with GUID %s:' % self.obj.guid)
out = messages[0] + ':\n' + '\n'.join(messages[1:])
self._save_object_error(out,self.obj,'Import')
unicode_gemini_string = etree.tostring(xml, encoding=unicode, pretty_print=True)
@ -570,9 +573,9 @@ class GeminiHarvester(SpatialHarvester):
if gemini_xml is None:
self._save_gather_error('Content is not a valid Gemini document',self.harvest_job)
valid, messages = self._get_validator().is_valid(gemini_xml)
valid, profile, errors = self._get_validator().is_valid(gemini_xml)
if not valid:
out = messages[0] + ':\n' + '\n'.join(messages[1:])
out = errors[0][0] + ':\n' + '\n'.join(e[0] for e in errors[1:])
if url:
self._save_gather_error('Validation error for %s - %s'% (url,out),self.harvest_job)
else:
@ -780,7 +783,6 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
self._save_gather_error('Unable to get content for URL: %s: %r' % \
(url, e),harvest_job)
return None
ids = []
try:
for url in self._extract_urls(content,url):
@ -816,7 +818,6 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
self._save_gather_error(msg,harvest_job)
return None
if len(ids) > 0:
return ids
else:

View File

@ -161,12 +161,14 @@ class CswService(OwsService):
md = csw._exml.find("/{http://www.isotc211.org/2005/gmd}MD_Metadata")
mdtree = etree.ElementTree(md)
try:
record["xml"] = etree.tostring(mdtree, pretty_print=True, xml_declaration=True)
record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=unicode)
except TypeError:
# API incompatibilities between different flavours of elementtree
try:
record["xml"] = etree.tostring(mdtree)
record["xml"] = etree.tostring(mdtree, pretty_print=True, encoding=unicode)
except AssertionError:
record["xml"] = etree.tostring(md)
record["xml"] = etree.tostring(md, pretty_print=True, encoding=unicode)
record["xml"] = '<?xml version="1.0" encoding="UTF-8"?>\n' + record["xml"]
record["tree"] = mdtree
return record

View File

@ -603,6 +603,13 @@ class GeminiDocument(MappedXmlDocument):
],
multiplicity="*",
),
GeminiResourceLocator(
name="resource-locator-identification",
search_paths=[
"gmd:identificationInfo//gmd:CI_OnlineResource",
],
multiplicity="*",
),
GeminiElement(
name="conformity-specification",
search_paths=[

View File

@ -449,7 +449,6 @@ class TestHarvest(HarvestFixtureBase):
message = job.gather_errors[0].message
assert_in('Validation error', message)
assert_in('Validating against "GEMINI 2.1 Schematron 1.2" profile failed', message)
assert_in('One email address shall be provided', message)
assert_in('Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names', message)
assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message)

View File

@ -9,13 +9,15 @@ from ckanext.spatial import validation
class TestValidation:
def _get_file_path(self, file_name):
return os.path.join(os.path.dirname(__file__), 'xml', file_name)
def get_validation_errors(self, validator, validation_test_filename):
validation_test_filepath = os.path.join(os.path.dirname(__file__),
'xml',
validation_test_filename)
validation_test_filepath = self._get_file_path(validation_test_filename)
xml = etree.parse(validation_test_filepath)
is_valid, errors = validator.is_valid(xml)
return ';'.join(errors)
return ';'.join([e[0] for e in errors])
def test_iso19139_failure(self):
errors = self.get_validation_errors(validation.ISO19139Schema,
@ -117,7 +119,7 @@ class TestValidation:
'gemini2.1/validation/13_Dataset_Invalid_Element_srv.xml')
assert len(errors) > 0
assert_in('(gmx.xsd)', errors)
assert_in('(u"Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.', errors)
assert_in('Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.', errors)
def test_schematron_error_extraction(self):
validation_error_xml = '''
@ -137,3 +139,15 @@ class TestValidation:
assert_in("srv:serviceType/*[1] = 'discovery'", details)
assert_in("/*[local-name()='MD_Metadata'", details)
assert_in("Service type shall be one of 'discovery'", details)
def test_error_line_numbers(self):
file_path = self._get_file_path('iso19139/dataset-invalid.xml')
xml = etree.parse(file_path)
is_valid, profile, errors = validation.Validators(profiles=['iso19139']).is_valid(xml)
assert not is_valid
assert len(errors) == 2
message, line = errors[1]
assert 'This element is not expected' in message
assert line == 3

View File

@ -1,5 +1,5 @@
import os
from pkg_resources import resource_stream, resource_filename
from pkg_resources import resource_stream
from ckanext.spatial.model import GeminiDocument
from lxml import etree
@ -15,8 +15,12 @@ class BaseValidator(object):
def is_valid(cls, xml):
'''
Runs the validation on the supplied XML etree.
Returns a tuple, the first value is a boolean indicating
whether the validation passed or not. The second is a list of tuples,
each containing the error message and the error line.
Returns tuple:
(is_valid, error_message_list)
(is_valid, [(error_message_string, error_line_number)])
'''
raise NotImplementedError
@ -26,7 +30,9 @@ class XsdValidator(BaseValidator):
@classmethod
def _is_valid(cls, xml, xsd_filepath, xsd_name):
'''Returns whether or not an XML file is valid according to
an XSD.
an XSD. Returns a tuple, the first value is a boolean indicating
whether the validation passed or not. The second is a list of tuples,
each containing the error message and the error line.
Params:
xml - etree of the XML to be validated
@ -34,7 +40,7 @@ class XsdValidator(BaseValidator):
xsd_name - string describing the XSD
Returns:
(is_valid_boolean, list_of_error_message_strings)
(is_valid, [(error_message_string, error_line_number)])
'''
xsd = etree.parse(xsd_filepath)
schema = etree.XMLSchema(xsd)
@ -44,12 +50,13 @@ class XsdValidator(BaseValidator):
# XMLSchemaParseError: local list type: A type, derived by list or union, must have the simple ur-type definition as base type, not '{http://www.opengis.net/gml/3.2}doubleList'., line 118
try:
schema.assertValid(xml)
except AssertionError, e:
msg = '%s Schema Error: %s' % (xsd_name, e.args)
return False, [msg]
except etree.DocumentInvalid, e:
msg = '%s Validation Error: %s' % (xsd_name, e.args)
return False, [msg]
except etree.DocumentInvalid:
log.info('Validation errors found using schema {0}'.format(xsd_name))
errors = []
for error in schema.error_log:
errors.append((error.message, error.line))
errors.insert
return False, errors
return True, []
@ -62,7 +69,11 @@ class ISO19139Schema(XsdValidator):
xsd_path = 'xml/iso19139'
gmx_xsd_filepath = os.path.join(os.path.dirname(__file__),
xsd_path, 'gmx/gmx.xsd')
is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, 'Dataset schema (gmx.xsd)')
xsd_name = 'Dataset schema (gmx.xsd)'
is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, xsd_name)
if not is_valid:
#TODO: not sure if we need this one, keeping for backwards compatibility
errors.insert(0, ('{0} Validation Error'.format(xsd_name), None))
return is_valid, errors
class ISO19139EdenSchema(XsdValidator):
@ -78,15 +89,23 @@ class ISO19139EdenSchema(XsdValidator):
if metadata_type in ('dataset', 'series'):
gmx_xsd_filepath = os.path.join(os.path.dirname(__file__),
xsd_path, 'gmx/gmx.xsd')
is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, 'Dataset schema (gmx.xsd)')
xsd_name = 'Dataset schema (gmx.xsd)'
is_valid, errors = cls._is_valid(xml, gmx_xsd_filepath, xsd_name)
if not is_valid:
#TODO: not sure if we need this one, keeping for backwards compatibility
errors.insert(0, ('{0} Validation Error'.format(xsd_name), None))
elif metadata_type == 'service':
gmx_and_srv_xsd_filepath = os.path.join(os.path.dirname(__file__),
xsd_path, 'gmx_and_srv.xsd')
is_valid, errors = cls._is_valid(xml, gmx_and_srv_xsd_filepath, 'Service schemas (gmx.xsd & srv.xsd)')
xsd_name = 'Service schemas (gmx.xsd & srv.xsd)'
is_valid, errors = cls._is_valid(xml, gmx_and_srv_xsd_filepath, xsd_name)
if not is_valid:
#TODO: not sure if we need this one, keeping for backwards compatibility
errors.insert(0, ('{0} Validation Error'.format(xsd_name), None))
else:
is_valid = False
errors = ['Metadata type not recognised "%s" - cannot choose an ISO19139 validator.' %
metadata_type]
errors = [('Metadata type not recognised "%s" - cannot choose an ISO19139 validator.' %
metadata_type, None)]
if is_valid:
return True, []
@ -121,12 +140,7 @@ class ISO19139NGDCSchema(XsdValidator):
xsd_filepath = os.path.join(os.path.dirname(__file__),
xsd_path, 'schema.xsd')
is_valid, errors = cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)')
if is_valid:
return True, []
return False, errors
return cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)')
class FGDCSchema(XsdValidator):
'''
@ -147,12 +161,8 @@ class FGDCSchema(XsdValidator):
xsd_filepath = os.path.join(os.path.dirname(__file__),
xsd_path, 'fgdc-std-001-1998.xsd')
is_valid, errors = cls._is_valid(xml, xsd_filepath, 'FGDC Schema (fgdc-std-001-1998.xsd)')
return cls._is_valid(xml, xsd_filepath, 'FGDC Schema (fgdc-std-001-1998.xsd)')
if is_valid:
return True, []
return False, errors
class SchematronValidator(BaseValidator):
'''Base class for a validator that uses Schematron.'''
@ -166,6 +176,19 @@ class SchematronValidator(BaseValidator):
@classmethod
def is_valid(cls, xml):
'''Returns whether or not an XML file is valid according to
a schematron. Returns a tuple, the first value is a boolean indicating
whether the validation passed or not. The second is a list of tuples,
each containing the error message and the error line (which defaults to
None on the schematron validation case).
Params:
xml - etree of the XML to be validated
Returns:
(is_valid, [(error_message_string, error_line_number)])
'''
if not hasattr(cls, 'schematrons'):
log.info('Compiling schematron "%s"', cls.title)
cls.schematrons = cls.get_schematrons()
@ -180,7 +203,8 @@ class SchematronValidator(BaseValidator):
for error in errors:
message, details = cls.extract_error_details(error)
if not message in messages_already_reported:
error_details.append(details)
#TODO: perhaps can extract the source line from the error location
error_details.append((details,None))
messages_already_reported.add(message)
return False, error_details
return True, []
@ -198,7 +222,8 @@ class SchematronValidator(BaseValidator):
location = failed_assert_element.get('location')
message_element = failed_assert_element.find("{http://purl.oclc.org/dsdl/svrl}text")
message = message_element.text.strip()
failed_assert_element
#TODO: Do we really need such detail on the error messages?
return message, 'Error Message: "%s" Error Location: "%s" Error Assert: "%s"' % (message, location, assert_)
@classmethod
@ -255,27 +280,45 @@ class Validators(object):
'''
def __init__(self, profiles=["iso19139", "constraints", "gemini2"]):
self.profiles = profiles
self.validators = {} # name: class
for validator_class in all_validators:
self.validators[validator_class.name] = validator_class
def add_validator(self, validator_class):
self.validators[validator_class.name] = validator_class
def isvalid(self, xml):
'''For backward compatibility'''
return self.is_valid(xml)
def is_valid(self, xml):
if not hasattr(self, 'validators'):
self.validators = {} # name: class
for validator_class in all_validators:
self.validators[validator_class.name] = validator_class
'''Returns whether or not an XML file is valid.
Returns a tuple, the first value is a boolean indicating
whether the validation passed or not. The second is the name of the profile
that failed and the third is a list of tuples,
each containing the error message and the error line if present.
Params:
xml - etree of the XML to be validated
Returns:
(is_valid, failed_profile_name, [(error_message_string, error_line_number)])
'''
log.debug('Starting validation against profile(s) %s' % ','.join(self.profiles))
for name in self.profiles:
validator = self.validators[name]
is_valid, error_message_list = validator.is_valid(xml)
if not is_valid:
error_message_list.insert(0, 'Validating against "%s" profile failed' % validator.title)
log.info('%r', error_message_list)
return False, error_message_list
log.info('Validated against "%s"', validator.title)
#error_message_list.insert(0, 'Validating against "%s" profile failed' % validator.title)
log.info('Validating against "%s" profile failed' % validator.title)
log.debug('%r', error_message_list)
return False, validator.name, error_message_list
log.debug('Validated against "%s"', validator.title)
log.info('Validation passed')
return True, []
return True, None, []
if __name__ == '__main__':
from sys import argv