2012-10-17 17:59:02 +02:00
import os
2012-11-19 18:15:16 +01:00
from pkg_resources import resource_stream
2013-02-13 20:16:36 +01:00
from ckanext . spatial . model import ISODocument
2012-10-17 17:59:02 +02:00
from lxml import etree
# Module-level logger named after this module. The logging module is obtained
# through __import__ and used inline, so only ``log`` is bound here.
log = __import__ ( " logging " ) . getLogger ( __name__ )
2017-05-23 16:19:14 +02:00
2012-10-17 17:59:02 +02:00
class BaseValidator(object):
    """Abstract parent of every validator defined in this module.

    Concrete validators are expected to give ``name`` and ``title`` real
    values and to override :meth:`is_valid`.
    """

    # Human-readable label of the validator; unset on the abstract base.
    title = None
    # Short identifier of the validator; unset on the abstract base.
    name = None

    @classmethod
    def is_valid(cls, xml):
        """Run the validation on the supplied XML etree.

        Params:
            xml - etree of the XML to be validated

        Returns a tuple:
            (is_valid, [(error_message_string, error_line_number)])

        The first value is a boolean telling whether the validation passed,
        the second a list of tuples each holding an error message and the
        line the error was found on.

        Raises:
            NotImplementedError - always, on this base class.
        """
        raise NotImplementedError()
2017-05-23 16:19:14 +02:00
2012-10-17 17:59:02 +02:00
class XsdValidator(BaseValidator):
    """Base class for validators that use an XSD schema."""

    @classmethod
    def _is_valid(cls, xml, xsd_filepath, xsd_name):
        """Return whether an XML document is valid according to an XSD.

        Params:
            xml - etree of the XML to be validated
            xsd_filepath - full path to the XSD file
            xsd_name - string describing the XSD (only used for logging)

        Returns:
            (is_valid, [(error_message_string, error_line_number)])

        The first value is a boolean telling whether the validation passed.
        The second is a list of tuples, one per entry of the schema's error
        log, each holding the error message and the error line. The list is
        empty when the document is valid.
        """
        xsd = etree.parse(xsd_filepath)
        # With libxml2 versions before 2.9, this fails with this error:
        #    gmx_schema = etree.XMLSchema(gmx_xsd)
        #  File "xmlschema.pxi", line 103, in lxml.etree.XMLSchema.__init__ (src/lxml/lxml.etree.c:116069)
        #  XMLSchemaParseError: local list type: A type, derived by list or union, must have the simple ur-type definition as base type, not '{http://www.opengis.net/gml/3.2}doubleList'., line 118
        schema = etree.XMLSchema(xsd)
        try:
            schema.assertValid(xml)
        except etree.DocumentInvalid:
            log.info(
                ' Validation errors found using schema {0} '.format(xsd_name))
            # A fresh list is returned on purpose: callers prepend their own
            # summary entry to it.
            return False, [(error.message, error.line)
                           for error in schema.error_log]
        return True, []
2012-10-19 15:23:34 +02:00
class ISO19139Schema(XsdValidator):
    """XSD validation against the ``gmx.xsd`` schema shipped next to this module."""

    name = ' iso19139 '
    title = ' ISO19139 XSD Schema '

    @classmethod
    def is_valid(cls, xml):
        """Validate ``xml`` (an etree) against the ISO19139 dataset schema.

        Returns:
            (is_valid, [(error_message_string, error_line_number)])

        On failure the list starts with a summary entry naming the schema,
        whose line number is ``None``.
        """
        xsd_name = ' Dataset schema (gmx.xsd) '
        module_dir = os.path.dirname(__file__)
        gmx_xsd_filepath = os.path.join(
            module_dir, ' xml/iso19139 ', ' gmx/gmx.xsd ')

        passed, problems = cls._is_valid(xml, gmx_xsd_filepath, xsd_name)
        if passed:
            return passed, problems

        #TODO: not sure if we need this one, keeping for backwards compatibility
        summary = (' {0} Validation Error '.format(xsd_name), None)
        problems.insert(0, summary)
        return passed, problems
2017-05-23 16:19:14 +02:00
2012-10-17 17:59:02 +02:00
class ISO19139EdenSchema(XsdValidator):
    """XSD validation that picks the EDEN schema matching the record type.

    Dataset and series records are checked against ``gmx.xsd``; service
    records against ``gmx_and_srv.xsd``. Any other record type is reported
    as a validation failure.
    """

    name = ' iso19139eden '
    title = ' ISO19139 XSD Schema (EDEN 2009-03-16) '

    @classmethod
    def is_valid(cls, xml):
        """Validate ``xml`` (an etree) against the schema for its record type.

        Returns:
            (is_valid, [(error_message_string, error_line_number)])

        On failure the list starts with a summary entry whose line number is
        ``None``; on success the list is empty.
        """
        metadata_type = cls.get_record_type(xml)
        if metadata_type in (' dataset ', ' series '):
            return cls._validate_against(
                xml, ' gmx/gmx.xsd ', ' Dataset schema (gmx.xsd) ')
        if metadata_type == ' service ':
            return cls._validate_against(
                xml, ' gmx_and_srv.xsd ',
                ' Service schemas (gmx.xsd & srv.xsd) ')
        return False, [
            (' Metadata type not recognised " %s " - cannot choose an ISO19139 validator. ' %
             metadata_type, None)]

    @classmethod
    def _validate_against(cls, xml, xsd_file, xsd_name):
        """Validate ``xml`` against one XSD file of the EDEN schema directory.

        Params:
            xml - etree of the XML to be validated
            xsd_file - path of the XSD relative to the EDEN schema directory
            xsd_name - string describing the XSD

        Returns:
            (is_valid, [(error_message_string, error_line_number)])
        """
        xsd_path = ' xml/iso19139eden '
        xsd_filepath = os.path.join(os.path.dirname(__file__),
                                    xsd_path, xsd_file)
        is_valid, errors = cls._is_valid(xml, xsd_filepath, xsd_name)
        if is_valid:
            return True, []
        #TODO: not sure if we need this one, keeping for backwards compatibility
        errors.insert(0, (' {0} Validation Error '.format(xsd_name), None))
        return False, errors

    @classmethod
    def get_record_type(cls, xml):
        """Return the "type" of an ISO19139 record.

        e.g. "dataset", "series", "service"

        Params:
            xml - etree of the ISO19139 XML record

        Returns the first resource type read from the record, or the
        dataset type when none is found.
        """
        iso_parser = ISODocument(xml_tree=xml)
        record_types = iso_parser.read_value(' resource-type ')
        # Truthiness rather than len(): an empty result and a None result
        # both fall back to the default instead of None raising TypeError.
        if record_types:
            return record_types[0]
        return ' dataset '
2012-10-17 17:59:02 +02:00
2012-10-29 15:28:58 +01:00
class ISO19139NGDCSchema(XsdValidator):
    """
    XSD based validation for ISO 19139 documents.

    Uses XSD schema from the NOAA National Geophysical Data Center:

    http://ngdc.noaa.gov/metadata/published/xsd/
    """

    name = ' iso19139ngdc '
    title = ' ISO19139 XSD Schema (NGDC) '

    @classmethod
    def is_valid(cls, xml):
        """Validate ``xml`` (an etree) against the NGDC ``schema.xsd``.

        Returns:
            (is_valid, [(error_message_string, error_line_number)])
        """
        module_dir = os.path.dirname(__file__)
        xsd_filepath = os.path.join(
            module_dir, ' xml/iso19139ngdc ', ' schema.xsd ')
        outcome = cls._is_valid(
            xml, xsd_filepath, ' NGDC Schema (schema.xsd) ')
        return outcome
2012-10-29 15:28:58 +01:00
2017-05-23 16:19:14 +02:00
2012-10-29 15:34:29 +01:00
class FGDCSchema(XsdValidator):
    """
    XSD based validation for FGDC metadata documents.

    Uses XSD schema from the Federal Geographic Data Committee:

    http://www.fgdc.gov/schemas/metadata/
    """

    name = ' fgdc '
    title = ' FGDC XSD Schema '

    @classmethod
    def is_valid(cls, xml):
        """Validate ``xml`` (an etree) against the FGDC 1998 standard XSD.

        Returns:
            (is_valid, [(error_message_string, error_line_number)])
        """
        module_dir = os.path.dirname(__file__)
        xsd_filepath = os.path.join(
            module_dir, ' xml/fgdc ', ' fgdc-std-001-1998.xsd ')
        outcome = cls._is_valid(
            xml, xsd_filepath, ' FGDC Schema (fgdc-std-001-1998.xsd) ')
        return outcome
2012-10-29 15:34:29 +01:00
2012-10-17 17:59:02 +02:00
class SchematronValidator(BaseValidator):
    """Base class for a validator that uses Schematron.

    Subclasses implement :meth:`get_schematrons`. Its result is stored on
    the subclass the first time :meth:`is_valid` runs and reused afterwards.
    """

    has_init = False

    # Clark-notation prefix of the SVRL report elements read below. It must
    # not contain whitespace, otherwise no report element ever matches and
    # every document appears valid.
    _SVRL = "{http://purl.oclc.org/dsdl/svrl}"

    @classmethod
    def get_schematrons(cls):
        """Return the compiled schematrons to run, as an iterable.

        Subclasses should override this method to implement their
        validation. Each item is called with the XML etree by
        :meth:`is_valid`.

        Raises:
            NotImplementedError - always, on this base class.
        """
        raise NotImplementedError

    @classmethod
    def is_valid(cls, xml):
        """Return whether an XML document is valid according to a schematron.

        Params:
            xml - etree of the XML to be validated

        Returns:
            (is_valid, [(error_message_string, error_line_number)])

        The first value is a boolean telling whether the validation passed.
        The second is a list of tuples, each holding the error details and
        the error line, which is always None for schematron validation.
        Checking stops at the first schematron that reports failed asserts,
        and repeated messages are reported once.
        """
        # Look in the class's own namespace: hasattr() would also find a
        # parent's cached schematrons and skip compiling this class's own.
        if 'schematrons' not in cls.__dict__:
            log.info(' Compiling schematron " %s " ', cls.title)
            cls.schematrons = cls.get_schematrons()

        for schematron in cls.schematrons:
            result = schematron(xml)
            failed_asserts = result.findall(cls._SVRL + "failed-assert")
            if not failed_asserts:
                continue
            messages_already_reported = set()
            error_details = []
            for failed_assert in failed_asserts:
                message, details = cls.extract_error_details(failed_assert)
                if message not in messages_already_reported:
                    #TODO: perhaps can extract the source line from the error location
                    error_details.append((details, None))
                    messages_already_reported.add(message)
            return False, error_details
        return True, []

    @classmethod
    def extract_error_details(cls, failed_assert_element):
        """Extract the strings describing a schematron test failure.

        Params:
            failed_assert_element - the XML Element describing the failure

        Returns:
            (error_message, fuller_error_details)

        The message is an empty string when the element carries no text.
        """
        assert_ = failed_assert_element.get('test')
        location = failed_assert_element.get('location')
        message_element = failed_assert_element.find(cls._SVRL + "text")
        # Neither the text child nor its content is guaranteed to be there;
        # report an empty message instead of failing with AttributeError.
        text = message_element.text if message_element is not None else None
        message = (text or '').strip()
        #TODO: Do we really need such detail on the error messages?
        return message, ' Error Message: " %s " Error Location: " %s " Error Assert: " %s " ' % (message, location, assert_)

    @classmethod
    def schematron(cls, schema):
        """Compile a schematron into an XSLT transform.

        Params:
            schema - an open stream containing the schematron, or an
                     already parsed etree

        Returns an ``etree.XSLT`` object, obtained by running the schema
        through the three stylesheets listed below, in order.
        """
        # NOTE(review): resource names are reproduced exactly as found,
        # surrounding whitespace included - confirm against the packaged
        # files.
        transforms = [
            " xml/schematron/iso_dsdl_include.xsl ",
            " xml/schematron/iso_abstract_expand.xsl ",
            " xml/schematron/iso_svrl_for_xslt1.xsl ",
        ]
        # Detect streams by their interface: the ``file`` builtin is gone in
        # Python 3, and a stream need not be a ``file`` instance anyway.
        if hasattr(schema, 'read'):
            compiled = etree.parse(schema)
        else:
            compiled = schema
        for filename in transforms:
            with resource_stream(__name__, filename) as stream:
                xform_xml = etree.parse(stream)
                xform = etree.XSLT(xform_xml)
                compiled = xform(compiled)
        return etree.XSLT(compiled)
2012-10-22 20:39:07 +02:00
2012-10-17 17:59:02 +02:00
class ConstraintsSchematron(SchematronValidator):
    """Schematron validator using the Medin Table A.1 constraints, v1.3."""

    name = ' constraints '
    title = ' ISO19139 Table A.1 Constraints Schematron (Medin 1.3) '

    @classmethod
    def get_schematrons(cls):
        """Return a one-item list holding the compiled v1.3 schematron."""
        rules_resource = " xml/medin/ISOTS19139A1Constraints_v1.3.sch "
        with resource_stream(__name__, rules_resource) as rules_stream:
            compiled_rules = cls.schematron(rules_stream)
        return [compiled_rules]
2017-05-23 16:19:14 +02:00
2012-12-05 12:42:57 +01:00
class ConstraintsSchematron14(SchematronValidator):
    """Schematron validator using the Medin/Parslow Table A.1 constraints, v1.4."""

    name = ' constraints-1.4 '
    title = ' ISO19139 Table A.1 Constraints Schematron (Medin/Parslow 1.4) '

    @classmethod
    def get_schematrons(cls):
        """Return a one-item list holding the compiled v1.4 schematron."""
        rules_resource = " xml/medin/ISOTS19139A1Constraints_v1.4.sch "
        with resource_stream(__name__, rules_resource) as rules_stream:
            compiled_rules = cls.schematron(rules_stream)
        return [compiled_rules]
2012-10-17 17:59:02 +02:00
class Gemini2Schematron(SchematronValidator):
    """Schematron validator using the GEMINI 2.1 rules, schematron v1.2."""

    name = ' gemini2 '
    title = ' GEMINI 2.1 Schematron 1.2 '

    @classmethod
    def get_schematrons(cls):
        """Return a one-item list holding the compiled GEMINI 2.1 v1.2 schematron."""
        rules_resource = " xml/gemini2/gemini2-schematron-20110906-v1.2.sch "
        with resource_stream(__name__, rules_resource) as rules_stream:
            compiled_rules = cls.schematron(rules_stream)
        return [compiled_rules]
2017-05-23 16:19:14 +02:00
2013-01-24 12:30:23 +01:00
class Gemini2Schematron13(SchematronValidator):
    """Schematron validator using the GEMINI 2.1 rules, schematron v1.3."""

    name = ' gemini2-1.3 '
    title = ' GEMINI 2.1 Schematron 1.3 '

    @classmethod
    def get_schematrons(cls):
        """Return a one-item list holding the compiled GEMINI 2.1 v1.3 schematron."""
        rules_resource = " xml/gemini2/Gemini2_R1r3.sch "
        with resource_stream(__name__, rules_resource) as rules_stream:
            compiled_rules = cls.schematron(rules_stream)
        return [compiled_rules]
2012-10-17 17:59:02 +02:00
# Every validator class defined in this module: XSD-based ones first, then
# the schematron-based ones.
all_validators = (
    ISO19139Schema,
    ISO19139EdenSchema,
    ISO19139NGDCSchema,
    FGDCSchema,
    ConstraintsSchematron,
    ConstraintsSchematron14,
    Gemini2Schematron,
    Gemini2Schematron13,
)
2012-10-17 17:59:02 +02:00
2012-10-19 12:19:01 +02:00
class Validators(object):
    """
    Validates XML against one or more profiles (i.e. validators).
    """

    def __init__(self, profiles=None):
        """Set up the profiles to run and register the known validators.

        Params:
            profiles - list of validator names to run, in order. When
                       omitted, the iso19139, constraints and gemini2
                       profiles are used.
        """
        if profiles is None:
            # Built per instance: a list in the signature would be shared
            # by every instance created without arguments, so changing one
            # instance's profiles would change them all.
            profiles = [" iso19139 ", " constraints ", " gemini2 "]
        self.profiles = profiles
        self.validators = {}  # name: class
        for validator_class in all_validators:
            self.validators[validator_class.name] = validator_class

    def add_validator(self, validator_class):
        """Register ``validator_class`` under its ``name``.

        A validator already registered under that name is replaced.
        """
        self.validators[validator_class.name] = validator_class

    def isvalid(self, xml):
        """For backward compatibility; same as :meth:`is_valid`."""
        return self.is_valid(xml)

    def is_valid(self, xml):
        """Return whether an XML document is valid.

        Profiles are run in order and validation stops at the first one
        that fails.

        Params:
            xml - etree of the XML to be validated

        Returns:
            (is_valid, failed_profile_name, [(error_message_string, error_line_number)])

        The first value is a boolean telling whether the validation passed.
        The second is the name of the profile that failed (None on success)
        and the third is a list of tuples, each holding the error message
        and the error line if present (empty on success).

        Raises:
            KeyError - when a profile has no registered validator.
        """
        # Arguments are passed to the logger rather than pre-formatted, so
        # the message is only built when the level is enabled.
        log.debug(' Starting validation against profile(s) %s ',
                  ' , '.join(self.profiles))
        for name in self.profiles:
            validator = self.validators[name]
            is_valid, error_message_list = validator.is_valid(xml)
            if not is_valid:
                log.info(' Validating against " %s " profile failed ',
                         validator.title)
                log.debug(' %r ', error_message_list)
                return False, validator.name, error_message_list
            log.debug(' Validated against " %s " ', validator.title)
        log.info(' Validation passed ')
        return True, None, []
2012-10-22 20:39:07 +02:00
2012-10-17 17:59:02 +02:00
# Command-line entry point:  <script> XML_FILE [PROFILES]
# Validates XML_FILE and pretty-prints the result tuple of
# Validators.is_valid().
# The guard compares against the exact value Python gives __name__ when the
# module is run as a script; any padding inside the literal would make this
# block unreachable.
if __name__ == '__main__':
    from sys import argv
    import logging
    from pprint import pprint

    logging.basicConfig()
    if len(argv) < 2:
        # Fail with a readable message instead of an IndexError below.
        raise SystemExit('usage: %s XML_FILE [PROFILES]' % argv[0])
    # NOTE(review): the separator and the default profile names are kept
    # exactly as found, surrounding whitespace included - confirm they match
    # the validators' ``name`` values.
    if len(argv) == 3:
        profiles = argv[2].split(' , ')
    else:
        profiles = [" iso19139 ", " constraints ", " gemini2 "]
    v = Validators(profiles)
    # Binary mode leaves decoding to the XML parser, and the context manager
    # closes the file once parsing is done.
    with open(argv[1], 'rb') as xml_file:
        result = v.is_valid(etree.parse(xml_file))
    pprint(result)