From 9ba6e8f3b323713869b43ef29d8b2df3a3de8b8f Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Tue, 22 Jan 2013 13:13:24 +0000
Subject: [PATCH] [#5] Add error summary to harvest_job_dictize

It will return the counts for the 20 most common errors for that
particular job. These will be available when calling harvest_job_show.

Also refactor the harvest source status object to just call
harvest_job_dictize on the 'last_job' key, as it has all the
interesting fields anyway.
---
 ckanext/harvest/logic/action/get.py  | 25 +++++--------------------
 ckanext/harvest/logic/dictization.py | 20 +++++++++++++-------
 2 files changed, 18 insertions(+), 27 deletions(-)

diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py
index 2212d18..a499ecf 100644
--- a/ckanext/harvest/logic/action/get.py
+++ b/ckanext/harvest/logic/action/get.py
@@ -65,9 +65,8 @@ def harvest_source_show_status(context, data_dict):
 
     out = {
            'job_count': 0,
-           'next_harvest': p.toolkit._('Not yet scheduled'),
-           'last_harvest_request': '',
-           'last_harvest_statistics': {'new': 0, 'updated': 0, 'deleted': 0,'errored': 0},
+           'next_job': p.toolkit._('Not yet scheduled'),
+           'last_job': None,
            'total_datasets': 0,
            }
 
@@ -82,31 +81,16 @@ def harvest_source_show_status(context, data_dict):
     # Get next scheduled job
     next_job = harvest_model.HarvestJob.filter(source=source,status=u'New').first()
     if next_job:
-        out['next_harvest'] = p.toolkit._('Scheduled')
+        out['next_job'] = p.toolkit._('Scheduled')
 
     # Get the last finished job
     last_job = harvest_model.HarvestJob.filter(source=source,status=u'Finished') \
                .order_by(harvest_model.HarvestJob.created.desc()).first()
 
     if not last_job:
-        out['last_harvest_request'] = p.toolkit._('Not yet harvested')
         return out
 
-    out['last_job_id'] = last_job.id
-    out['last_harvest_request'] = str(last_job.gather_finished)
-
-    last_job_report = model.Session.query(
-                harvest_model.HarvestObject.report_status,
-                func.count(harvest_model.HarvestObject.report_status)) \
-            .filter(harvest_model.HarvestObject.harvest_job_id==last_job.id) \
-            .group_by(harvest_model.HarvestObject.report_status)
-
-    for row in last_job_report:
-        if row[0]:
-            out['last_harvest_statistics'][row[0]] = row[1]
-
-    # Add the gather stage errors
-    out['last_harvest_statistics']['errored'] += len(last_job.gather_errors)
+    out['last_job'] = harvest_job_dictize(last_job, context)
 
     # Overall statistics
     packages = model.Session.query(model.Package) \
@@ -166,6 +150,7 @@ def harvest_job_show(context,data_dict):
 
     return harvest_job_dictize(job,context)
 
+
 def harvest_job_list(context,data_dict):
 
     check_access('harvest_job_list',context,data_dict)
diff --git a/ckanext/harvest/logic/dictization.py b/ckanext/harvest/logic/dictization.py
index c38559b..bbee5a5 100644
--- a/ckanext/harvest/logic/dictization.py
+++ b/ckanext/harvest/logic/dictization.py
@@ -24,16 +24,11 @@ def harvest_source_dictize(source, context):
 
 def harvest_job_dictize(job, context):
     out = job.as_dict()
-    out['source'] = job.source_id
-    out['objects'] = []
-    out['gather_errors'] = []
 
-    if context.get('return_objects', True):
-        for obj in job.objects:
-            out['objects'].append(obj.as_dict())
+    model = context['model']
 
     if context.get('return_stats', True):
-        stats = context['model'].Session.query(
+        stats = model.Session.query(
             HarvestObject.report_status,
             func.count(HarvestObject.id).label('total_objects'))\
                 .filter_by(harvest_job_id=job.id)\
@@ -42,9 +37,20 @@ def harvest_job_dictize(job, context):
         for status, count in stats:
             out['stats'][status] = count
 
+    out['gather_errors'] = []
     for error in job.gather_errors:
         out['gather_errors'].append(error.as_dict())
 
+    q = model.Session.query(HarvestObjectError.message, \
+                            func.count(HarvestObjectError.message).label('error_count')) \
+                      .join(HarvestObject) \
+                      .filter(HarvestObject.harvest_job_id==job.id) \
+                      .group_by(HarvestObjectError.message) \
+                      .order_by('error_count desc') \
+                      .limit(context.get('error_summmary_limit', 20))
+
+    out['error_summary'] = q.all()
+
     return out
 
 def harvest_object_dictize(obj, context):