Improve harvest source status creation

Use report_status field to improve speed, remove unnecessary fields.
This commit is contained in:
amercader 2013-01-17 15:43:45 +00:00
parent bfce5185f0
commit 30c9eedf5f
1 changed files with 36 additions and 59 deletions

View File

@ -1,5 +1,5 @@
import logging
from sqlalchemy import or_, distinct
from sqlalchemy import or_, func
from ckan.model import User
import datetime
@ -7,6 +7,7 @@ from ckan import logic
from ckan.plugins import PluginImplementations
from ckanext.harvest.interfaces import IHarvester
import ckan.plugins as p
from ckan.logic import NotFound, check_access
from ckanext.harvest import model as harvest_model
@ -18,6 +19,7 @@ from ckanext.harvest.logic.dictization import (harvest_source_dictize,
from ckanext.harvest.logic.schema import harvest_source_db_to_form_schema
log = logging.getLogger(__name__)
def harvest_source_show(context,data_dict):
'''
Returns the metadata of a harvest source
@ -41,8 +43,7 @@ def harvest_source_show(context,data_dict):
return source_dict
def harvest_source_show_status(context,data_dict):
def harvest_source_show_status(context, data_dict):
'''
Returns a status report for a harvest source
@ -58,87 +59,63 @@ def harvest_source_show_status(context,data_dict):
'''
model = context.get('model')
detailed = context.get('detailed',True)
source = harvest_model.HarvestSource.get(data_dict['id'])
if not source:
raise logic.NotFound('Harvest source {0} does not exist'.format(data_dict['id']))
out = {}
jobs = harvest_model.HarvestJob.filter(source=source).all()
raise p.toolkit.NotFound('Harvest source {0} does not exist'.format(data_dict['id']))
out = {
'job_count': 0,
'next_harvest':'',
'last_harvest_request':'',
'last_harvest_statistics':{'added':0,'updated':0,'errors':0},
'overall_statistics':{'added':0, 'errors':0},
'next_harvest': p.toolkit._('Not yet scheduled'),
'last_harvest_request': '',
'last_harvest_statistics': {'new': 0, 'updated': 0, 'deleted': 0,'errored': 0},
'total_datasets': 0,
}
jobs = harvest_model.HarvestJob.filter(source=source).all()
job_count = len(jobs)
if job_count == 0:
out['msg'] = 'No jobs yet'
return out
else:
out['job_count'] = job_count
# Get next scheduled job
next_job = harvest_model.HarvestJob.filter(source=source,status=u'New').first()
if next_job:
out['next_harvest'] = 'Scheduled'
else:
out['next_harvest'] = 'Not yet scheduled'
out['next_harvest'] = p.toolkit._('Scheduled')
# Get the last finished job
last_job = harvest_model.HarvestJob.filter(source=source,status=u'Finished') \
.order_by(harvest_model.HarvestJob.created.desc()).first()
if last_job:
if not last_job:
out['last_harvest_request'] = p.toolkit._('Not yet harvested')
return out
out['last_job_id'] = last_job.id
out['last_harvest_request'] = str(last_job.gather_finished)
#Get HarvestObjects from last job with links to packages
if detailed:
last_objects = [obj for obj in last_job.objects if obj.package is not None]
last_job_report = model.Session.query(
harvest_model.HarvestObject.report_status,
func.count(harvest_model.HarvestObject.report_status)) \
.filter(harvest_model.HarvestObject.harvest_job_id==last_job.id) \
.group_by(harvest_model.HarvestObject.report_status)
if len(last_objects) == 0:
# No packages added or updated
out['last_harvest_statistics']['added'] = 0
out['last_harvest_statistics']['updated'] = 0
else:
# Check wether packages were added or updated
for last_object in last_objects:
# Check if the same package had been linked before
previous_objects = model.Session.query(harvest_model.HarvestObject) \
.filter(harvest_model.HarvestObject.package==last_object.package) \
.count()
for row in last_job_report:
if row[0]:
out['last_harvest_statistics'][row[0]] = row[1]
if previous_objects == 1:
# It didn't previously exist, it has been added
out['last_harvest_statistics']['added'] += 1
else:
# Pacakge already existed, but it has been updated
out['last_harvest_statistics']['updated'] += 1
# Add the gather stage errors
out['last_harvest_statistics']['errored'] += len(last_job.gather_errors)
# Last harvest errors
# We have the gathering errors in last_job.gather_errors, so let's also
# get also the object errors.
object_errors = model.Session.query(harvest_model.HarvestObjectError).join(harvest_model.HarvestObject) \
.filter(harvest_model.HarvestObject.job==last_job)
out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \
+ object_errors.count()
# Overall statistics
packages = model.Session.query(distinct(harvest_model.HarvestObject.package_id), model.Package.name) \
.join(model.Package).join(HarvestSource) \
.filter(HarvestObject.source==source) \
.filter(HarvestObject.current==True) \
packages = model.Session.query(model.Package) \
.join(harvest_model.HarvestObject) \
.filter(harvest_model.HarvestObject.harvest_source_id==source.id) \
.filter(harvest_model.HarvestObject.current==True) \
.filter(model.Package.state==u'active')
out['overall_statistics']['added'] = packages.count()
else:
out['last_harvest_request'] = 'Not yet harvested'
out['total_datasets'] = packages.count()
return out