Reduce the number of queries for the harvest index to a less insane number. Still heavy.

This commit is contained in:
Friedrich Lindenberg 2011-06-13 17:36:35 +02:00
parent ef04ce1774
commit 0d9d1f8096
2 changed files with 46 additions and 46 deletions

View File

@ -19,12 +19,10 @@ import logging
log = logging.getLogger('ckanext') log = logging.getLogger('ckanext')
def _get_source_status(source): def _get_source_status(source, detailed=True):
out = dict() out = dict()
job_count = HarvestJob.filter(source=source).count()
jobs = get_harvest_jobs(source=source) if not job_count:
if not len(jobs):
out['msg'] = 'No jobs yet' out['msg'] = 'No jobs yet'
return out return out
out = {'next_harvest':'', out = {'next_harvest':'',
@ -33,7 +31,6 @@ def _get_source_status(source):
'last_harvest_errors':[], 'last_harvest_errors':[],
'overall_statistics':{'added':0, 'errors':0}, 'overall_statistics':{'added':0, 'errors':0},
'packages':[]} 'packages':[]}
# Get next scheduled job # Get next scheduled job
next_job = HarvestJob.filter(source=source,status=u'New').first() next_job = HarvestJob.filter(source=source,status=u'New').first()
if next_job: if next_job:
@ -43,14 +40,14 @@ def _get_source_status(source):
# Get the last finished job # Get the last finished job
last_job = HarvestJob.filter(source=source,status=u'Finished') \ last_job = HarvestJob.filter(source=source,status=u'Finished') \
.order_by(HarvestJob.created.desc()).limit(1).first() .order_by(HarvestJob.created.desc()).first()
if last_job: if last_job:
#TODO: Should we encode the dates as strings? #TODO: Should we encode the dates as strings?
out['last_harvest_request'] = str(last_job.gather_finished) out['last_harvest_request'] = str(last_job.gather_finished)
#Get HarvestObjects from last job with links to packages #Get HarvestObjects from last job with links to packages
if detailed:
last_objects = [obj for obj in last_job.objects if obj.package is not None] last_objects = [obj for obj in last_job.objects if obj.package is not None]
if len(last_objects) == 0: if len(last_objects) == 0:
@ -63,9 +60,9 @@ def _get_source_status(source):
# Check if the same package had been linked before # Check if the same package had been linked before
previous_objects = Session.query(HarvestObject) \ previous_objects = Session.query(HarvestObject) \
.filter(HarvestObject.package==last_object.package) \ .filter(HarvestObject.package==last_object.package) \
.all() .count()
if len(previous_objects) == 1: if previous_objects == 1:
# It didn't previously exist, it has been added # It didn't previously exist, it has been added
out['last_harvest_statistics']['added'] += 1 out['last_harvest_statistics']['added'] += 1
else: else:
@ -76,36 +73,36 @@ def _get_source_status(source):
# We have the gathering errors in last_job.gather_errors, so let's also # We have the gathering errors in last_job.gather_errors, so let's also
# get the object errors. # get the object errors.
object_errors = Session.query(HarvestObjectError).join(HarvestObject) \ object_errors = Session.query(HarvestObjectError).join(HarvestObject) \
.filter(HarvestObject.job==last_job).all() .filter(HarvestObject.job==last_job)
out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \ out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \
+ len(object_errors) + object_errors.count()
if detailed:
for gather_error in last_job.gather_errors: for gather_error in last_job.gather_errors:
out['last_harvest_errors'].append(gather_error.message) out['last_harvest_errors'].append(gather_error.message)
for object_error in object_errors: for object_error in object_errors:
msg = 'GUID %s: %s' % (object_error.object.guid,object_error.message) msg = 'GUID %s: %s' % (object_error.object.guid, object_error.message)
out['last_harvest_errors'].append(msg) out['last_harvest_errors'].append(msg)
# Overall statistics # Overall statistics
packages = Session.query(distinct(HarvestObject.package_id),Package.name) \ packages = Session.query(distinct(HarvestObject.package_id),Package.name) \
.join(Package).join(HarvestJob).join(HarvestSource) \ .join(Package).join(HarvestJob).join(HarvestSource) \
.filter(HarvestJob.source==source).all() .filter(HarvestJob.source==source)
out['overall_statistics']['added'] = len(packages) out['overall_statistics']['added'] = packages.count()
if detailed:
for package in packages: for package in packages:
out['packages'].append(package.name) out['packages'].append(package.name)
gather_errors = Session.query(HarvestGatherError) \ gather_errors = Session.query(HarvestGatherError) \
.join(HarvestJob).join(HarvestSource) \ .join(HarvestJob).join(HarvestSource) \
.filter(HarvestJob.source==source).all() .filter(HarvestJob.source==source).count()
object_errors = Session.query(HarvestObjectError) \ object_errors = Session.query(HarvestObjectError) \
.join(HarvestObject).join(HarvestJob).join(HarvestSource) \ .join(HarvestObject).join(HarvestJob).join(HarvestSource) \
.filter(HarvestJob.source==source).all() .filter(HarvestJob.source==source).count()
out['overall_statistics']['errors'] = len(gather_errors) + len(object_errors) out['overall_statistics']['errors'] = gather_errors + object_errors
else: else:
out['last_harvest_request'] = 'Not yet harvested' out['last_harvest_request'] = 'Not yet harvested'
@ -114,14 +111,14 @@ def _get_source_status(source):
def _source_as_dict(source): def _source_as_dict(source, detailed=True):
out = source.as_dict() out = source.as_dict()
out['jobs'] = [] out['jobs'] = []
for job in source.jobs: for job in source.jobs:
out['jobs'].append(job.as_dict()) out['jobs'].append(job.as_dict())
out['status'] = _get_source_status(source) out['status'] = _get_source_status(source, detailed=detailed)
return out return out
@ -213,7 +210,7 @@ def get_harvest_sources(**kwds):
sources = HarvestSource.filter(**kwds) \ sources = HarvestSource.filter(**kwds) \
.order_by(HarvestSource.created.desc()) \ .order_by(HarvestSource.created.desc()) \
.all() .all()
return [_source_as_dict(source) for source in sources] return [_source_as_dict(source, detailed=False) for source in sources]
def create_harvest_source(data_dict): def create_harvest_source(data_dict):

View File

@ -164,6 +164,7 @@ def create_harvester_tables():
properties={ properties={
'jobs': relation( 'jobs': relation(
HarvestJob, HarvestJob,
lazy=True,
backref=u'source', backref=u'source',
order_by=harvest_job_table.c.created, order_by=harvest_job_table.c.created,
), ),
@ -181,10 +182,12 @@ def create_harvester_tables():
properties={ properties={
'package':relation( 'package':relation(
Package, Package,
lazy=True,
backref='harvest_objects', backref='harvest_objects',
), ),
'job': relation( 'job': relation(
HarvestJob, HarvestJob,
lazy=True,
backref=u'objects', backref=u'objects',
), ),
}, },