reduce number of queries for harvest index to a less insane number. still heavy.
This commit is contained in:
parent
ef04ce1774
commit
0d9d1f8096
|
@ -19,12 +19,10 @@ import logging
|
||||||
log = logging.getLogger('ckanext')
|
log = logging.getLogger('ckanext')
|
||||||
|
|
||||||
|
|
||||||
def _get_source_status(source):
|
def _get_source_status(source, detailed=True):
|
||||||
out = dict()
|
out = dict()
|
||||||
|
job_count = HarvestJob.filter(source=source).count()
|
||||||
jobs = get_harvest_jobs(source=source)
|
if not job_count:
|
||||||
|
|
||||||
if not len(jobs):
|
|
||||||
out['msg'] = 'No jobs yet'
|
out['msg'] = 'No jobs yet'
|
||||||
return out
|
return out
|
||||||
out = {'next_harvest':'',
|
out = {'next_harvest':'',
|
||||||
|
@ -33,7 +31,6 @@ def _get_source_status(source):
|
||||||
'last_harvest_errors':[],
|
'last_harvest_errors':[],
|
||||||
'overall_statistics':{'added':0, 'errors':0},
|
'overall_statistics':{'added':0, 'errors':0},
|
||||||
'packages':[]}
|
'packages':[]}
|
||||||
|
|
||||||
# Get next scheduled job
|
# Get next scheduled job
|
||||||
next_job = HarvestJob.filter(source=source,status=u'New').first()
|
next_job = HarvestJob.filter(source=source,status=u'New').first()
|
||||||
if next_job:
|
if next_job:
|
||||||
|
@ -43,14 +40,14 @@ def _get_source_status(source):
|
||||||
|
|
||||||
# Get the last finished job
|
# Get the last finished job
|
||||||
last_job = HarvestJob.filter(source=source,status=u'Finished') \
|
last_job = HarvestJob.filter(source=source,status=u'Finished') \
|
||||||
.order_by(HarvestJob.created.desc()).limit(1).first()
|
.order_by(HarvestJob.created.desc()).first()
|
||||||
|
|
||||||
if last_job:
|
if last_job:
|
||||||
#TODO: Should we encode the dates as strings?
|
#TODO: Should we encode the dates as strings?
|
||||||
out['last_harvest_request'] = str(last_job.gather_finished)
|
out['last_harvest_request'] = str(last_job.gather_finished)
|
||||||
|
|
||||||
|
|
||||||
#Get HarvestObjects from last job with links to packages
|
#Get HarvestObjects from last job with links to packages
|
||||||
|
if detailed:
|
||||||
last_objects = [obj for obj in last_job.objects if obj.package is not None]
|
last_objects = [obj for obj in last_job.objects if obj.package is not None]
|
||||||
|
|
||||||
if len(last_objects) == 0:
|
if len(last_objects) == 0:
|
||||||
|
@ -63,9 +60,9 @@ def _get_source_status(source):
|
||||||
# Check if the same package had been linked before
|
# Check if the same package had been linked before
|
||||||
previous_objects = Session.query(HarvestObject) \
|
previous_objects = Session.query(HarvestObject) \
|
||||||
.filter(HarvestObject.package==last_object.package) \
|
.filter(HarvestObject.package==last_object.package) \
|
||||||
.all()
|
.count()
|
||||||
|
|
||||||
if len(previous_objects) == 1:
|
if previous_objects == 1:
|
||||||
# It didn't previously exist, it has been added
|
# It didn't previously exist, it has been added
|
||||||
out['last_harvest_statistics']['added'] += 1
|
out['last_harvest_statistics']['added'] += 1
|
||||||
else:
|
else:
|
||||||
|
@ -76,36 +73,36 @@ def _get_source_status(source):
|
||||||
# We have the gathering errors in last_job.gather_errors, so let's also
|
# We have the gathering errors in last_job.gather_errors, so let's also
|
||||||
# get the object errors.
|
# get the object errors.
|
||||||
object_errors = Session.query(HarvestObjectError).join(HarvestObject) \
|
object_errors = Session.query(HarvestObjectError).join(HarvestObject) \
|
||||||
.filter(HarvestObject.job==last_job).all()
|
.filter(HarvestObject.job==last_job)
|
||||||
|
|
||||||
out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \
|
out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \
|
||||||
+ len(object_errors)
|
+ object_errors.count()
|
||||||
|
if detailed:
|
||||||
for gather_error in last_job.gather_errors:
|
for gather_error in last_job.gather_errors:
|
||||||
out['last_harvest_errors'].append(gather_error.message)
|
out['last_harvest_errors'].append(gather_error.message)
|
||||||
|
|
||||||
for object_error in object_errors:
|
for object_error in object_errors:
|
||||||
msg = 'GUID %s: %s' % (object_error.object.guid,object_error.message)
|
msg = 'GUID %s: %s' % (object_error.object.guid, object_error.message)
|
||||||
out['last_harvest_errors'].append(msg)
|
out['last_harvest_errors'].append(msg)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# Overall statistics
|
# Overall statistics
|
||||||
packages = Session.query(distinct(HarvestObject.package_id),Package.name) \
|
packages = Session.query(distinct(HarvestObject.package_id),Package.name) \
|
||||||
.join(Package).join(HarvestJob).join(HarvestSource) \
|
.join(Package).join(HarvestJob).join(HarvestSource) \
|
||||||
.filter(HarvestJob.source==source).all()
|
.filter(HarvestJob.source==source)
|
||||||
|
|
||||||
out['overall_statistics']['added'] = len(packages)
|
out['overall_statistics']['added'] = packages.count()
|
||||||
|
if detailed:
|
||||||
for package in packages:
|
for package in packages:
|
||||||
out['packages'].append(package.name)
|
out['packages'].append(package.name)
|
||||||
|
|
||||||
gather_errors = Session.query(HarvestGatherError) \
|
gather_errors = Session.query(HarvestGatherError) \
|
||||||
.join(HarvestJob).join(HarvestSource) \
|
.join(HarvestJob).join(HarvestSource) \
|
||||||
.filter(HarvestJob.source==source).all()
|
.filter(HarvestJob.source==source).count()
|
||||||
|
|
||||||
object_errors = Session.query(HarvestObjectError) \
|
object_errors = Session.query(HarvestObjectError) \
|
||||||
.join(HarvestObject).join(HarvestJob).join(HarvestSource) \
|
.join(HarvestObject).join(HarvestJob).join(HarvestSource) \
|
||||||
.filter(HarvestJob.source==source).all()
|
.filter(HarvestJob.source==source).count()
|
||||||
out['overall_statistics']['errors'] = len(gather_errors) + len(object_errors)
|
out['overall_statistics']['errors'] = gather_errors + object_errors
|
||||||
else:
|
else:
|
||||||
out['last_harvest_request'] = 'Not yet harvested'
|
out['last_harvest_request'] = 'Not yet harvested'
|
||||||
|
|
||||||
|
@ -114,14 +111,14 @@ def _get_source_status(source):
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def _source_as_dict(source):
|
def _source_as_dict(source, detailed=True):
|
||||||
out = source.as_dict()
|
out = source.as_dict()
|
||||||
out['jobs'] = []
|
out['jobs'] = []
|
||||||
|
|
||||||
for job in source.jobs:
|
for job in source.jobs:
|
||||||
out['jobs'].append(job.as_dict())
|
out['jobs'].append(job.as_dict())
|
||||||
|
|
||||||
out['status'] = _get_source_status(source)
|
out['status'] = _get_source_status(source, detailed=detailed)
|
||||||
|
|
||||||
|
|
||||||
return out
|
return out
|
||||||
|
@ -213,7 +210,7 @@ def get_harvest_sources(**kwds):
|
||||||
sources = HarvestSource.filter(**kwds) \
|
sources = HarvestSource.filter(**kwds) \
|
||||||
.order_by(HarvestSource.created.desc()) \
|
.order_by(HarvestSource.created.desc()) \
|
||||||
.all()
|
.all()
|
||||||
return [_source_as_dict(source) for source in sources]
|
return [_source_as_dict(source, detailed=False) for source in sources]
|
||||||
|
|
||||||
def create_harvest_source(data_dict):
|
def create_harvest_source(data_dict):
|
||||||
|
|
||||||
|
|
|
@ -164,6 +164,7 @@ def create_harvester_tables():
|
||||||
properties={
|
properties={
|
||||||
'jobs': relation(
|
'jobs': relation(
|
||||||
HarvestJob,
|
HarvestJob,
|
||||||
|
lazy=True,
|
||||||
backref=u'source',
|
backref=u'source',
|
||||||
order_by=harvest_job_table.c.created,
|
order_by=harvest_job_table.c.created,
|
||||||
),
|
),
|
||||||
|
@ -181,10 +182,12 @@ def create_harvester_tables():
|
||||||
properties={
|
properties={
|
||||||
'package':relation(
|
'package':relation(
|
||||||
Package,
|
Package,
|
||||||
|
lazy=True,
|
||||||
backref='harvest_objects',
|
backref='harvest_objects',
|
||||||
),
|
),
|
||||||
'job': relation(
|
'job': relation(
|
||||||
HarvestJob,
|
HarvestJob,
|
||||||
|
lazy=True,
|
||||||
backref=u'objects',
|
backref=u'objects',
|
||||||
),
|
),
|
||||||
},
|
},
|
||||||
|
|
Loading…
Reference in New Issue