[#21] Improve gather stage error handling

See the issue for full details. Basically, we don't want to catch any
exceptions at the queue.py level, as doing so prevents debugging. Harvesters
should deal with them and return a list of ids, or an empty list if no
objects need to be fetched.
Also improved the debug messages.
This commit is contained in:
amercader 2013-03-14 17:31:07 +00:00
parent 91f18bffab
commit d77f16aba9
1 changed files with 55 additions and 45 deletions

View File

@ -97,15 +97,21 @@ def gather_callback(channel, method, header, body):
try: try:
id = json.loads(body)['harvest_job_id'] id = json.loads(body)['harvest_job_id']
log.debug('Received harvest job id: %s' % id) log.debug('Received harvest job id: %s' % id)
except KeyError:
log.error('No harvest job id received')
channel.basic_ack(method.delivery_tag)
return False
# Get a publisher for the fetch queue # Get a publisher for the fetch queue
publisher = get_fetch_publisher() publisher = get_fetch_publisher()
try:
job = HarvestJob.get(id) job = HarvestJob.get(id)
except:
if not job:
log.error('Harvest job does not exist: %s' % id) log.error('Harvest job does not exist: %s' % id)
else: channel.basic_ack(method.delivery_tag)
return False
# Send the harvest job to the plugins that implement # Send the harvest job to the plugins that implement
# the Harvester interface, only if the source type # the Harvester interface, only if the source type
# matches # matches
@ -115,21 +121,30 @@ def gather_callback(channel, method, header, body):
harvester_found = True harvester_found = True
# Get a list of harvest object ids from the plugin # Get a list of harvest object ids from the plugin
job.gather_started = datetime.datetime.now() job.gather_started = datetime.datetime.now()
try:
harvest_object_ids = harvester.gather_stage(job) harvest_object_ids = harvester.gather_stage(job)
except Exception, e:
log.error('Gather stage failed unexpectedly: %s' % e)
job.status = 'Errored'
job.save()
continue
job.gather_finished = datetime.datetime.now() job.gather_finished = datetime.datetime.now()
job.save() job.save()
log.debug('Received from plugin''s gather_stage: %r' % harvest_object_ids)
if harvest_object_ids and len(harvest_object_ids) > 0: if not isinstance(harvest_object_ids, list):
log.error('Gather stage failed')
publisher.close()
channel.basic_ack(method.delivery_tag)
return False
if len(harvest_object_ids) == 0:
log.info('No harvest objects to fetch')
publisher.close()
channel.basic_ack(method.delivery_tag)
return False
log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
for id in harvest_object_ids: for id in harvest_object_ids:
# Send the id to the fetch queue # Send the id to the fetch queue
publisher.send({'harvest_object_id':id}) publisher.send({'harvest_object_id':id})
log.debug('Sent object %s to the fetch queue' % id) log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))
if not harvester_found: if not harvester_found:
msg = 'No harvester could be found for source type %s' % job.source.type msg = 'No harvester could be found for source type %s' % job.source.type
@ -137,13 +152,8 @@ def gather_callback(channel, method, header, body):
err.save() err.save()
log.error(msg) log.error(msg)
finally:
publisher.close()
except KeyError:
log.error('No harvest job id received')
finally:
model.Session.remove() model.Session.remove()
publisher.close()
channel.basic_ack(method.delivery_tag) channel.basic_ack(method.delivery_tag)