From 9ba6e8f3b323713869b43ef29d8b2df3a3de8b8f Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 22 Jan 2013 13:13:24 +0000 Subject: [PATCH] [#5] Add error summary to harvest_job_dictize It will return the counts for the 20 most common errors for that particular job. These will be available when calling harvest_job_show. Also refactor the harvest source status object to just call harvest_job_dictize on the 'last_job' key, as it has all the interesting fields anyway. --- ckanext/harvest/logic/action/get.py | 25 +++++-------------------- ckanext/harvest/logic/dictization.py | 20 +++++++++++++------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py index 2212d18..a499ecf 100644 --- a/ckanext/harvest/logic/action/get.py +++ b/ckanext/harvest/logic/action/get.py @@ -65,9 +65,8 @@ def harvest_source_show_status(context, data_dict): out = { 'job_count': 0, - 'next_harvest': p.toolkit._('Not yet scheduled'), - 'last_harvest_request': '', - 'last_harvest_statistics': {'new': 0, 'updated': 0, 'deleted': 0,'errored': 0}, + 'next_job': p.toolkit._('Not yet scheduled'), + 'last_job': None, 'total_datasets': 0, } @@ -82,31 +81,16 @@ def harvest_source_show_status(context, data_dict): # Get next scheduled job next_job = harvest_model.HarvestJob.filter(source=source,status=u'New').first() if next_job: - out['next_harvest'] = p.toolkit._('Scheduled') + out['next_job'] = p.toolkit._('Scheduled') # Get the last finished job last_job = harvest_model.HarvestJob.filter(source=source,status=u'Finished') \ .order_by(harvest_model.HarvestJob.created.desc()).first() if not last_job: - out['last_harvest_request'] = p.toolkit._('Not yet harvested') return out - out['last_job_id'] = last_job.id - out['last_harvest_request'] = str(last_job.gather_finished) - - last_job_report = model.Session.query( - harvest_model.HarvestObject.report_status, - func.count(harvest_model.HarvestObject.report_status)) \ - 
.filter(harvest_model.HarvestObject.harvest_job_id==last_job.id) \ - .group_by(harvest_model.HarvestObject.report_status) - - for row in last_job_report: - if row[0]: - out['last_harvest_statistics'][row[0]] = row[1] - - # Add the gather stage errors - out['last_harvest_statistics']['errored'] += len(last_job.gather_errors) + out['last_job'] = harvest_job_dictize(last_job, context) # Overall statistics packages = model.Session.query(model.Package) \ @@ -166,6 +150,7 @@ def harvest_job_show(context,data_dict): return harvest_job_dictize(job,context) + def harvest_job_list(context,data_dict): check_access('harvest_job_list',context,data_dict) diff --git a/ckanext/harvest/logic/dictization.py b/ckanext/harvest/logic/dictization.py index c38559b..bbee5a5 100644 --- a/ckanext/harvest/logic/dictization.py +++ b/ckanext/harvest/logic/dictization.py @@ -24,16 +24,11 @@ def harvest_source_dictize(source, context): def harvest_job_dictize(job, context): out = job.as_dict() - out['source'] = job.source_id - out['objects'] = [] - out['gather_errors'] = [] - if context.get('return_objects', True): - for obj in job.objects: - out['objects'].append(obj.as_dict()) + model = context['model'] if context.get('return_stats', True): - stats = context['model'].Session.query( + stats = model.Session.query( HarvestObject.report_status, func.count(HarvestObject.id).label('total_objects'))\ .filter_by(harvest_job_id=job.id)\ @@ -42,9 +37,20 @@ def harvest_job_dictize(job, context): for status, count in stats: out['stats'][status] = count + out['gather_errors'] = [] for error in job.gather_errors: out['gather_errors'].append(error.as_dict()) + q = model.Session.query(HarvestObjectError.message, \ + func.count(HarvestObjectError.message).label('error_count')) \ + .join(HarvestObject) \ + .filter(HarvestObject.harvest_job_id==job.id) \ + .group_by(HarvestObjectError.message) \ + .order_by('error_count desc') \ + .limit(context.get('error_summmary_limit', 20)) + + out['error_summary'] = q.all() 
+ return out def harvest_object_dictize(obj, context):