Added the ability to produce a report on validation errors, for use when changing the validation. Added report infrastructure.

2012-10-19 18:20:32 +01:00 · 2012-10-19 18:20:32 +01:00 · d90114cf07
parent 0e8a62fe1e
commit d90114cf07
4 changed files with 207 additions and 0 deletions
--- a/ckanext/spatial/commands/validation.py
+++ b/ckanext/spatial/commands/validation.py
@ -0,0 +1,71 @@
+import sys
+import re
+from pprint import pprint
+import logging
+
+from lxml import etree
+
+from ckan.lib.cli import CkanCommand
+
+log = logging.getLogger(__name__)
+
+class Validation(CkanCommand):
+    '''Validation commands
+
+    Usage:
+        validation report [package-name]
+            Performs validation on the harvested metadata, either for all
+            packages or one specified.
+
+        validation report-csv <filename>.csv
+      
+    '''
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    max_args = 3
+    min_args = 0
+
+    def command(self):
+        if not self.args or self.args[0] in ['--help', '-h', 'help']:
+            print self.usage
+            sys.exit(1)
+
+        self._load_config()
+
+        cmd = self.args[0]
+        if cmd == 'report':
+            self.report()
+        elif cmd == 'report-csv':
+            self.report_csv()
+        else:
+            print 'Command %s not recognized' % cmd
+
+    def report(self):
+        from ckan import model
+        from ckanext.harvest.model import HarvestObject
+        from ckanext.spatial.lib.reports import validation_report
+
+        if len(self.args) >= 2:
+            package_ref = unicode(self.args[1])
+            pkg = model.Package.get(package_ref)
+            if not pkg:
+                print 'Package ref "%s" not recognised' % package_ref
+                sys.exit(1)
+        else:
+            pkg = None
+
+        report = validation_report(package_id=pkg.id)
+        for row in report.get_rows_html_formatted():
+            print
+            for i, col_name in enumerate(report.column_names):
+                print '  %s: %s' % (col_name, row[i])
+
+    def report_csv(self):
+        from ckanext.spatial.lib.reports import validation_report
+        if len(self.args) != 2:
+            print 'Wrong number of arguments'
+            sys.exit(1)
+        csv_filepath = self.args[1]
+        report = validation_report()
+        with open(csv_filepath, 'wb') as f:
+            f.write(report.get_csv())
--- a/ckanext/spatial/lib/report.py
+++ b/ckanext/spatial/lib/report.py
@ -0,0 +1,69 @@
+'''
+Library for creating reports that can be displayed easily in an HTML table
+and then saved as a CSV.
+'''
+
+import datetime
+import csv
+try: from cStringIO import StringIO
+except ImportError: from StringIO import StringIO
+
+class ReportTable(object):
+    def __init__(self, column_names):
+        assert isinstance(column_names, (list, tuple))
+        self.column_names = column_names
+        self.rows = []
+
+    def add_row_dict(self, row_dict):
+        '''Adds a row to the report table'''
+        row = []
+        for col_name in self.column_names:
+            if col_name in row_dict:
+                value = row_dict.pop(col_name)
+            else:
+                value = None
+            row.append(value)
+        if row_dict:
+            raise Exception('Have left-over keys not under a column: %s' % row_dict)
+        self.rows.append(row)
+
+    def get_rows_html_formatted(self, date_format='%d/%m/%y %H:%M',
+                                blank_cell_html=''):
+        for row in self.rows:
+            row_formatted = row[:]
+            for i, cell in enumerate(row):
+                if isinstance(cell, datetime.datetime):
+                    row_formatted[i] = cell.strftime(date_format)
+                elif cell is None:
+                    row_formatted[i] = blank_cell_html
+            yield row_formatted
+
+    def get_csv(self):
+        csvout = StringIO()
+        csvwriter = csv.writer(
+            csvout,
+            dialect='excel',
+            quoting=csv.QUOTE_NONNUMERIC
+        )
+        csvwriter.writerow(self.column_names)
+        for row in self.rows:
+            row_formatted = []
+            for cell in row:
+                if isinstance(cell, datetime.datetime):
+                    cell = cell.strftime('%Y-%m-%d %H:%M')
+                elif isinstance(cell, (int, long)):
+                    cell = str(cell)
+                elif isinstance(cell, (list, tuple)):
+                    cell = str(cell)
+                elif cell is None:
+                    cell = ''
+                else:
+                    cell = cell.encode('utf8')
+                row_formatted.append(cell)
+            try:
+                csvwriter.writerow(row_formatted)
+            except Exception, e:
+                raise Exception("%s: %s, %s"%(e, row, row_formatted))
+        csvout.seek(0)
+        return csvout.read()
+        
--- a/ckanext/spatial/lib/reports.py
+++ b/ckanext/spatial/lib/reports.py
@ -0,0 +1,66 @@
+import logging
+
+from lxml import etree
+
+from ckanext.spatial.harvesters import SpatialHarvester
+from ckanext.spatial.lib.report import ReportTable
+from ckan import model
+from ckanext.harvest.model import HarvestObject
+
+log = logging.getLogger(__name__)
+
+def validation_report(package_id=None):
+    '''
+    Looks at every harvested metadata record and compares the
+    validation errors that it had on last import and what it would be with
+    the current validators. Useful when going to update the validators.
+
+    Returns a ReportTable.
+    '''
+
+    validators = SpatialHarvester()._get_validator()
+    log.debug('Validators: %r', validators.profiles)
+
+    query = model.Session.query(HarvestObject).\
+            filter_by(current=True).\
+            order_by(HarvestObject.fetch_finished.desc())
+
+    if package_id:
+        query = query.filter(HarvestObject.package_id==package_id)
+
+    report = ReportTable([
+        'Harvest Object id',
+        'GEMINI2 id',
+        'Date fetched',
+        'Dataset name',
+        'Publisher',
+        'Source URL',
+        'Old validation errors',
+        'New validation errors'])
+
+    for harvest_object in query:
+        validation_errors = []
+        for err in harvest_object.errors:
+            if 'not a valid Gemini' in err.message or \
+                   'Validating against' in err.message:
+                validation_errors.append(err.message)
+
+        groups = harvest_object.package.get_groups()
+        publisher = groups[0].title if groups else '(none)'
+
+        xml = etree.fromstring(harvest_object.content.encode("utf-8"))
+        valid, errors = validators.is_valid(xml)
+                         
+        report.add_row_dict({
+                             'Harvest Object id': harvest_object.id,
+                             'GEMINI2 id': harvest_object.guid,
+                             'Date fetched': harvest_object.fetch_finished,
+                             'Dataset name': harvest_object.package.name,
+                             'Publisher': publisher,
+                             'Source URL': harvest_object.source.url,
+                             'Old validation errors': '; '.join(validation_errors),
+                             'New validation errors': '; '.join(errors),
+                             })
+
+    log.debug('%i results', query.count())
+    return report
--- a/setup.py
+++ b/setup.py
@ -41,5 +41,6 @@ setup(

    [paste.paster_command]
    spatial=ckanext.spatial.commands.spatial:Spatial
+    validation=ckanext.spatial.commands.validation:Validation
 	""",
 )