Added the ability to produce a report on validation errors, for use when changing the validators. Added report infrastructure.

This commit is contained in:
David Read 2012-10-19 18:20:32 +01:00
parent 0e8a62fe1e
commit d90114cf07
4 changed files with 207 additions and 0 deletions

View File

@ -0,0 +1,71 @@
import sys
import re
from pprint import pprint
import logging
from lxml import etree
from ckan.lib.cli import CkanCommand
log = logging.getLogger(__name__)
class Validation(CkanCommand):
    '''Validation commands

    Usage:
        validation report [package-name]
            Performs validation on the harvested metadata, either for all
            packages or one specified.

        validation report-csv <filename>.csv
            Performs validation on all the harvested metadata and saves
            the report as CSV to the given file.
    '''
    # The first docstring line doubles as the one-line command summary and
    # the whole docstring is what gets printed as the usage text.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 3
    min_args = 0

    def command(self):
        '''Run the sub-command named by the first command-line argument.

        Prints the usage text and exits with status 1 when no argument or
        a help flag is given. An unknown sub-command is only reported.
        '''
        if not self.args or self.args[0] in ['--help', '-h', 'help']:
            print(self.usage)
            sys.exit(1)

        self._load_config()

        cmd = self.args[0]
        if cmd == 'report':
            self.report()
        elif cmd == 'report-csv':
            self.report_csv()
        else:
            print('Command %s not recognized' % cmd)

    def report(self):
        '''Print the validation report, one block of lines per row.

        If a second argument is given it is looked up with
        ``model.Package.get`` and the report is restricted to that
        package; otherwise the report is unrestricted. Exits with
        status 1 if the package reference is not found.
        '''
        from ckan import model
        from ckanext.spatial.lib.reports import validation_report

        if len(self.args) >= 2:
            package_ref = unicode(self.args[1])
            pkg = model.Package.get(package_ref)
            if not pkg:
                print('Package ref "%s" not recognised' % package_ref)
                sys.exit(1)
            package_id = pkg.id
        else:
            # No package given: there is no package object to read an id
            # from, so ask for the unfiltered report.
            package_id = None

        report = validation_report(package_id=package_id)
        for row in report.get_rows_html_formatted():
            # Blank line between rows
            print('')
            for col_name, cell in zip(report.column_names, row):
                print(' %s: %s' % (col_name, cell))

    def report_csv(self):
        '''Write the unfiltered validation report to a CSV file.

        Expects exactly one argument after the sub-command: the path of
        the file to write. Exits with status 1 otherwise.
        '''
        from ckanext.spatial.lib.reports import validation_report

        if len(self.args) != 2:
            print('Wrong number of arguments')
            sys.exit(1)
        csv_filepath = self.args[1]
        report = validation_report()
        with open(csv_filepath, 'wb') as f:
            f.write(report.get_csv())

View File

@ -0,0 +1,69 @@
'''
Library for creating reports that can be displayed easily in an HTML table
and then saved as a CSV.
'''
import datetime
import csv
try: from cStringIO import StringIO
except ImportError: from StringIO import StringIO
class ReportTable(object):
    '''A table of rows under a fixed list of column names.

    Each row is stored in ``self.rows`` as a list with one cell per
    column, in column order. Rows can be rendered as display-ready values
    (for an HTML table) or as CSV text.
    '''

    def __init__(self, column_names):
        '''
        :param column_names: list or tuple of column names, in the order
                             the cells of each row are stored
        '''
        assert isinstance(column_names, (list, tuple))
        self.column_names = column_names
        self.rows = []

    def add_row_dict(self, row_dict):
        '''Adds a row to the report table.

        :param row_dict: dict of column name -> cell value. Columns that
                         are not present are stored as None. The dict is
                         not modified.
        :raises Exception: if row_dict has a key that is not a column
                           name; in that case no row is added.
        '''
        leftovers = dict((key, value) for key, value in row_dict.items()
                         if key not in self.column_names)
        if leftovers:
            raise Exception('Have left-over keys not under a column: %s' % leftovers)
        self.rows.append([row_dict.get(col_name)
                          for col_name in self.column_names])

    def get_rows_html_formatted(self, date_format='%d/%m/%y %H:%M',
                                blank_cell_html=''):
        '''Yields each row as a new list with display-ready cells.

        :param date_format: strftime format applied to datetime cells
        :param blank_cell_html: value substituted for None cells

        All other cells are passed through unchanged. The stored rows are
        not modified.
        '''
        for row in self.rows:
            row_formatted = []
            for cell in row:
                if isinstance(cell, datetime.datetime):
                    cell = cell.strftime(date_format)
                elif cell is None:
                    cell = blank_cell_html
                row_formatted.append(cell)
            yield row_formatted

    @staticmethod
    def _format_csv_cell(cell):
        '''Returns the string form of one cell for writing to CSV.'''
        if cell is None:
            return ''
        if isinstance(cell, datetime.datetime):
            return cell.strftime('%Y-%m-%d %H:%M')
        if isinstance(cell, str):
            # Already a native string - write it as it is.
            return cell
        if hasattr(cell, 'encode'):
            # Python 2 unicode: the csv module there works on byte
            # strings, so encode as UTF-8.
            return cell.encode('utf8')
        # Numbers, lists, tuples and anything else
        return str(cell)

    def get_csv(self):
        '''Returns the whole table, header row first, as CSV text.

        Every cell is converted to a string first (datetimes as
        ``YYYY-MM-DD HH:MM``, None as an empty string), so with
        QUOTE_NONNUMERIC every field comes out quoted.

        :raises Exception: if the csv writer fails on a row; the message
                           includes the original error and the row.
        '''
        csvout = StringIO()
        csvwriter = csv.writer(
            csvout,
            dialect='excel',
            quoting=csv.QUOTE_NONNUMERIC
            )
        csvwriter.writerow(self.column_names)
        for row in self.rows:
            row_formatted = [self._format_csv_cell(cell) for cell in row]
            try:
                csvwriter.writerow(row_formatted)
            except Exception as e:
                raise Exception("%s: %s, %s" % (e, row, row_formatted))
        return csvout.getvalue()

View File

@ -0,0 +1,66 @@
import logging
from lxml import etree
from ckanext.spatial.harvesters import SpatialHarvester
from ckanext.spatial.lib.report import ReportTable
from ckan import model
from ckanext.harvest.model import HarvestObject
log = logging.getLogger(__name__)
def validation_report(package_id=None):
    '''
    Builds a report that puts, side by side, the validation errors
    recorded against each current harvest object and the errors produced
    by validating its content again with the current validators. Useful
    when about to update the validators.

    :param package_id: if given, only harvest objects with this
                       package_id are included
    :returns: a ReportTable with one row per harvest object, most
              recently fetched first
    '''
    validators = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validators.profiles)

    query = model.Session.query(HarvestObject)
    query = query.filter_by(current=True)
    query = query.order_by(HarvestObject.fetch_finished.desc())
    if package_id:
        query = query.filter(HarvestObject.package_id==package_id)

    column_names = [
        'Harvest Object id',
        'GEMINI2 id',
        'Date fetched',
        'Dataset name',
        'Publisher',
        'Source URL',
        'Old validation errors',
        'New validation errors']
    report = ReportTable(column_names)

    # Recorded error messages containing either of these are treated as
    # validation errors
    validation_markers = ('not a valid Gemini', 'Validating against')

    for harvest_object in query:
        old_errors = [
            err.message for err in harvest_object.errors
            if any(marker in err.message for marker in validation_markers)]

        groups = harvest_object.package.get_groups()
        if groups:
            publisher = groups[0].title
        else:
            publisher = '(none)'

        # Validate the stored content again with the current validators
        content = harvest_object.content.encode("utf-8")
        xml = etree.fromstring(content)
        valid, new_errors = validators.is_valid(xml)

        row = {
            'Harvest Object id': harvest_object.id,
            'GEMINI2 id': harvest_object.guid,
            'Date fetched': harvest_object.fetch_finished,
            'Dataset name': harvest_object.package.name,
            'Publisher': publisher,
            'Source URL': harvest_object.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(new_errors),
            }
        report.add_row_dict(row)

    log.debug('%i results', query.count())
    return report

View File

@ -41,5 +41,6 @@ setup(
[paste.paster_command]
spatial=ckanext.spatial.commands.spatial:Spatial
validation=ckanext.spatial.commands.validation:Validation
""",
)