Added the ability to produce a report on validation errors, for use when changing the validators. Added report infrastructure.
This commit is contained in:
parent
0e8a62fe1e
commit
d90114cf07
|
@ -0,0 +1,71 @@
|
|||
import sys
|
||||
import re
|
||||
from pprint import pprint
|
||||
import logging
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ckan.lib.cli import CkanCommand
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
||||
|
||||
class Validation(CkanCommand):
    '''Validation commands

    Usage:
        validation report [package-name]
            Performs validation on the harvested metadata, either for all
            packages or one specified.

        validation report-csv <filename>.csv
            Performs validation on all the harvested metadata and writes
            the report to the given file in CSV format.
    '''
    # The first docstring line is the one-line summary; the whole docstring
    # doubles as the usage text printed by command().
    summary = __doc__.split('\n')[0]
    usage = __doc__
    # NOTE(review): presumably consumed by CkanCommand to bound the number
    # of positional arguments - confirm against ckan.lib.cli.
    max_args = 3
    min_args = 0

    def command(self):
        '''Dispatch to the sub-command named by the first argument.

        Prints the usage text and exits with status 1 when no argument or
        a help flag is given. Also exits with status 1 when the
        sub-command is not recognised, matching the other error paths.
        '''
        # Single-argument print(...) produces the same output whether it is
        # parsed as a Python 2 print statement or as a function call.
        if not self.args or self.args[0] in ['--help', '-h', 'help']:
            print(self.usage)
            sys.exit(1)

        self._load_config()

        cmd = self.args[0]
        if cmd == 'report':
            self.report()
        elif cmd == 'report-csv':
            self.report_csv()
        else:
            print('Command %s not recognized' % cmd)
            sys.exit(1)

    def report(self):
        '''Print the validation report to stdout, one column per line.

        When a second argument is given it is looked up with
        model.Package.get() and the report is restricted to that package;
        the command exits with status 1 if the lookup finds nothing.
        Without a second argument the report is requested with
        package_id=None.
        '''
        # Imported here rather than at module level, after command() has
        # called _load_config().
        from ckan import model
        from ckanext.spatial.lib.reports import validation_report

        # Stays None when no package was named; the original dereferenced
        # pkg.id on None in that case.
        package_id = None
        if len(self.args) >= 2:
            package_ref = unicode(self.args[1])
            pkg = model.Package.get(package_ref)
            if not pkg:
                print('Package ref "%s" not recognised' % package_ref)
                sys.exit(1)
            package_id = pkg.id

        report = validation_report(package_id=package_id)
        for row in report.get_rows_html_formatted():
            # Blank line between records.
            print('')
            for i, col_name in enumerate(report.column_names):
                print(' %s: %s' % (col_name, row[i]))

    def report_csv(self):
        '''Write the validation report to a CSV file.

        Expects exactly one argument after the sub-command: the path of
        the file to write. Exits with status 1 otherwise.
        '''
        from ckanext.spatial.lib.reports import validation_report

        if len(self.args) != 2:
            print('Wrong number of arguments')
            sys.exit(1)

        csv_filepath = self.args[1]
        report = validation_report()
        with open(csv_filepath, 'wb') as f:
            f.write(report.get_csv())
|
|
@ -0,0 +1,69 @@
|
|||
'''
|
||||
Library for creating reports that can be displayed easily in an HTML table
|
||||
and then saved as a CSV.
|
||||
'''
|
||||
|
||||
import datetime
|
||||
import csv
|
||||
try: from cStringIO import StringIO
|
||||
except ImportError: from StringIO import StringIO
|
||||
|
||||
class ReportTable(object):
    '''A table of rows under a fixed, ordered list of column names.

    Rows are stored as lists in self.rows, one cell per column, and can be
    rendered either as display-ready rows or as CSV text.
    '''

    def __init__(self, column_names):
        '''column_names: list or tuple of column titles, in display order.'''
        assert isinstance(column_names, (list, tuple))
        self.column_names = column_names
        self.rows = []

    def add_row_dict(self, row_dict):
        '''Adds a row to the report table.

        row_dict maps column name to cell value. Columns missing from
        row_dict get a None cell. The dict passed in is not modified.

        Raises Exception if row_dict has keys that are not column names;
        in that case no row is added.
        '''
        leftover = dict((key, value) for key, value in row_dict.items()
                        if key not in self.column_names)
        if leftover:
            raise Exception('Have left-over keys not under a column: %s' % leftover)
        self.rows.append([row_dict.get(col_name)
                          for col_name in self.column_names])

    def get_rows_html_formatted(self, date_format='%d/%m/%y %H:%M',
                                blank_cell_html=''):
        '''Yields a copy of each row with cells prepared for display.

        datetime cells are formatted with date_format and None cells are
        replaced with blank_cell_html; other cells are passed through
        unchanged. The stored rows are not modified.
        '''
        for row in self.rows:
            row_formatted = row[:]
            for i, cell in enumerate(row):
                if isinstance(cell, datetime.datetime):
                    row_formatted[i] = cell.strftime(date_format)
                elif cell is None:
                    row_formatted[i] = blank_cell_html
            yield row_formatted

    @staticmethod
    def _format_csv_cell(cell):
        '''Converts one cell to the string written to the CSV.'''
        if cell is None:
            return ''
        if isinstance(cell, datetime.datetime):
            return cell.strftime('%Y-%m-%d %H:%M')
        if isinstance(cell, str):
            # Already a byte string under Python 2: written as-is instead
            # of being implicitly ASCII-decoded by .encode().
            return cell
        if isinstance(cell, unicode):
            return cell.encode('utf8')
        # Numbers, lists, tuples and any other type (the original raised
        # AttributeError on e.g. floats).
        return str(cell)

    def get_csv(self):
        '''Returns the whole table as CSV text, header row first.

        Uses the 'excel' dialect with QUOTE_NONNUMERIC. Every cell is
        converted to a string before writing, so all fields come out
        quoted.

        Raises Exception, with the offending row in the message, if the
        csv writer rejects a row.
        '''
        csvout = StringIO()
        csvwriter = csv.writer(
            csvout,
            dialect='excel',
            quoting=csv.QUOTE_NONNUMERIC
        )
        csvwriter.writerow(self.column_names)
        for row in self.rows:
            row_formatted = [self._format_csv_cell(cell) for cell in row]
            try:
                csvwriter.writerow(row_formatted)
            except Exception as e:
                raise Exception("%s: %s, %s" % (e, row, row_formatted))
        return csvout.getvalue()
|
||||
|
|
@ -0,0 +1,66 @@
|
|||
import logging
|
||||
|
||||
from lxml import etree
|
||||
|
||||
from ckanext.spatial.harvesters import SpatialHarvester
|
||||
from ckanext.spatial.lib.report import ReportTable
|
||||
from ckan import model
|
||||
from ckanext.harvest.model import HarvestObject
|
||||
|
||||
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
||||
|
||||
def validation_report(package_id=None):
    '''
    Looks at every harvested metadata record and compares the
    validation errors that it had on last import and what it would be with
    the current validators. Useful when going to update the validators.

    package_id: when given, only harvest objects with this package id are
    included; otherwise every harvest object flagged current is included.

    Returns a ReportTable with one row per harvest object, most recently
    fetched first.
    '''

    validators = SpatialHarvester()._get_validator()
    log.debug('Validators: %r', validators.profiles)

    query = (model.Session.query(HarvestObject)
             .filter_by(current=True)
             .order_by(HarvestObject.fetch_finished.desc()))

    if package_id:
        query = query.filter(HarvestObject.package_id == package_id)

    report = ReportTable([
        'Harvest Object id',
        'GEMINI2 id',
        'Date fetched',
        'Dataset name',
        'Publisher',
        'Source URL',
        'Old validation errors',
        'New validation errors'])

    for harvest_object in query:
        # Of the errors recorded against the object, keep only those whose
        # message matches one of the two validation-related phrases.
        old_errors = [err.message for err in harvest_object.errors
                      if 'not a valid Gemini' in err.message
                      or 'Validating against' in err.message]

        groups = harvest_object.package.get_groups()
        publisher = groups[0].title if groups else '(none)'

        # Re-validate the stored content with the current validators.
        xml = etree.fromstring(harvest_object.content.encode("utf-8"))
        valid, errors = validators.is_valid(xml)

        report.add_row_dict({
            'Harvest Object id': harvest_object.id,
            'GEMINI2 id': harvest_object.guid,
            'Date fetched': harvest_object.fetch_finished,
            'Dataset name': harvest_object.package.name,
            'Publisher': publisher,
            'Source URL': harvest_object.source.url,
            'Old validation errors': '; '.join(old_errors),
            'New validation errors': '; '.join(errors),
        })

    # One row was added per harvest object, so the row count is the result
    # count. query.count() would be evaluated even with debug logging off
    # and would issue a second database query.
    log.debug('%i results', len(report.rows))
    return report
|
Loading…
Reference in New Issue