Improve harvest source status creation

Use report_status field to improve speed, remove unnecessary fields.
This commit is contained in:
amercader 2013-01-17 15:43:45 +00:00
parent bfce5185f0
commit 30c9eedf5f
1 changed file with 36 additions and 59 deletions

View File

@ -1,5 +1,5 @@
import logging import logging
from sqlalchemy import or_, distinct from sqlalchemy import or_, func
from ckan.model import User from ckan.model import User
import datetime import datetime
@ -7,6 +7,7 @@ from ckan import logic
from ckan.plugins import PluginImplementations from ckan.plugins import PluginImplementations
from ckanext.harvest.interfaces import IHarvester from ckanext.harvest.interfaces import IHarvester
import ckan.plugins as p
from ckan.logic import NotFound, check_access from ckan.logic import NotFound, check_access
from ckanext.harvest import model as harvest_model from ckanext.harvest import model as harvest_model
@ -18,6 +19,7 @@ from ckanext.harvest.logic.dictization import (harvest_source_dictize,
from ckanext.harvest.logic.schema import harvest_source_db_to_form_schema from ckanext.harvest.logic.schema import harvest_source_db_to_form_schema
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def harvest_source_show(context,data_dict): def harvest_source_show(context,data_dict):
''' '''
Returns the metadata of a harvest source Returns the metadata of a harvest source
@ -41,7 +43,6 @@ def harvest_source_show(context,data_dict):
return source_dict return source_dict
def harvest_source_show_status(context, data_dict): def harvest_source_show_status(context, data_dict):
''' '''
Returns a status report for a harvest source Returns a status report for a harvest source
@ -58,87 +59,63 @@ def harvest_source_show_status(context,data_dict):
''' '''
model = context.get('model') model = context.get('model')
detailed = context.get('detailed',True)
source = harvest_model.HarvestSource.get(data_dict['id']) source = harvest_model.HarvestSource.get(data_dict['id'])
if not source: if not source:
raise logic.NotFound('Harvest source {0} does not exist'.format(data_dict['id'])) raise p.toolkit.NotFound('Harvest source {0} does not exist'.format(data_dict['id']))
out = {}
jobs = harvest_model.HarvestJob.filter(source=source).all()
out = { out = {
'job_count': 0, 'job_count': 0,
'next_harvest':'', 'next_harvest': p.toolkit._('Not yet scheduled'),
'last_harvest_request': '', 'last_harvest_request': '',
'last_harvest_statistics':{'added':0,'updated':0,'errors':0}, 'last_harvest_statistics': {'new': 0, 'updated': 0, 'deleted': 0,'errored': 0},
'overall_statistics':{'added':0, 'errors':0}, 'total_datasets': 0,
} }
jobs = harvest_model.HarvestJob.filter(source=source).all()
job_count = len(jobs) job_count = len(jobs)
if job_count == 0: if job_count == 0:
out['msg'] = 'No jobs yet'
return out return out
else:
out['job_count'] = job_count out['job_count'] = job_count
# Get next scheduled job # Get next scheduled job
next_job = harvest_model.HarvestJob.filter(source=source,status=u'New').first() next_job = harvest_model.HarvestJob.filter(source=source,status=u'New').first()
if next_job: if next_job:
out['next_harvest'] = 'Scheduled' out['next_harvest'] = p.toolkit._('Scheduled')
else:
out['next_harvest'] = 'Not yet scheduled'
# Get the last finished job # Get the last finished job
last_job = harvest_model.HarvestJob.filter(source=source,status=u'Finished') \ last_job = harvest_model.HarvestJob.filter(source=source,status=u'Finished') \
.order_by(harvest_model.HarvestJob.created.desc()).first() .order_by(harvest_model.HarvestJob.created.desc()).first()
if last_job: if not last_job:
out['last_harvest_request'] = p.toolkit._('Not yet harvested')
return out
out['last_job_id'] = last_job.id out['last_job_id'] = last_job.id
out['last_harvest_request'] = str(last_job.gather_finished) out['last_harvest_request'] = str(last_job.gather_finished)
#Get HarvestObjects from last job with links to packages last_job_report = model.Session.query(
if detailed: harvest_model.HarvestObject.report_status,
last_objects = [obj for obj in last_job.objects if obj.package is not None] func.count(harvest_model.HarvestObject.report_status)) \
.filter(harvest_model.HarvestObject.harvest_job_id==last_job.id) \
.group_by(harvest_model.HarvestObject.report_status)
if len(last_objects) == 0: for row in last_job_report:
# No packages added or updated if row[0]:
out['last_harvest_statistics']['added'] = 0 out['last_harvest_statistics'][row[0]] = row[1]
out['last_harvest_statistics']['updated'] = 0
else:
# Check whether packages were added or updated
for last_object in last_objects:
# Check if the same package had been linked before
previous_objects = model.Session.query(harvest_model.HarvestObject) \
.filter(harvest_model.HarvestObject.package==last_object.package) \
.count()
if previous_objects == 1: # Add the gather stage errors
# It didn't previously exist, it has been added out['last_harvest_statistics']['errored'] += len(last_job.gather_errors)
out['last_harvest_statistics']['added'] += 1
else:
# Package already existed, but it has been updated
out['last_harvest_statistics']['updated'] += 1
# Last harvest errors
# We have the gathering errors in last_job.gather_errors, so let's also
# get the object errors.
object_errors = model.Session.query(harvest_model.HarvestObjectError).join(harvest_model.HarvestObject) \
.filter(harvest_model.HarvestObject.job==last_job)
out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \
+ object_errors.count()
# Overall statistics # Overall statistics
packages = model.Session.query(distinct(harvest_model.HarvestObject.package_id), model.Package.name) \ packages = model.Session.query(model.Package) \
.join(model.Package).join(HarvestSource) \ .join(harvest_model.HarvestObject) \
.filter(HarvestObject.source==source) \ .filter(harvest_model.HarvestObject.harvest_source_id==source.id) \
.filter(HarvestObject.current==True) \ .filter(harvest_model.HarvestObject.current==True) \
.filter(model.Package.state==u'active') .filter(model.Package.state==u'active')
out['overall_statistics']['added'] = packages.count() out['total_datasets'] = packages.count()
else:
out['last_harvest_request'] = 'Not yet harvested'
return out return out