Fix migration for old harvests so that ones that errored are correctly marked. Add helpful comments in model.

This commit is contained in:
David Read 2015-07-22 10:13:02 +01:00
parent 3e21ea4f82
commit 2da918c2e4
2 changed files with 21 additions and 4 deletions

View File

@ -196,12 +196,21 @@ def define_harvester_tables():
Column('gather_finished', types.DateTime), Column('gather_finished', types.DateTime),
Column('finished', types.DateTime), Column('finished', types.DateTime),
Column('source_id', types.UnicodeText, ForeignKey('harvest_source.id')), Column('source_id', types.UnicodeText, ForeignKey('harvest_source.id')),
# status: New, Running, Finished
Column('status', types.UnicodeText, default=u'New', nullable=False), Column('status', types.UnicodeText, default=u'New', nullable=False),
) )
# Was harvested_document # A harvest_object contains a representation of one dataset during a
# particular harvest
harvest_object_table = Table('harvest_object', metadata, harvest_object_table = Table('harvest_object', metadata,
Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
# The guid is the 'identity' of the dataset, according to the source.
# So if you reharvest it, the harvester uses this identity to know
# which dataset to update. The identity needs to be unique within
# this CKAN instance.
Column('guid', types.UnicodeText, default=u''), Column('guid', types.UnicodeText, default=u''),
# When you harvest a dataset multiple times, only the latest
# successfully imported harvest_object should be flagged 'current'.
# The import_stage reads and writes it.
Column('current',types.Boolean,default=False), Column('current',types.Boolean,default=False),
Column('gathered', types.DateTime, default=datetime.datetime.utcnow), Column('gathered', types.DateTime, default=datetime.datetime.utcnow),
Column('fetch_started', types.DateTime), Column('fetch_started', types.DateTime),
@ -209,6 +218,7 @@ def define_harvester_tables():
Column('fetch_finished', types.DateTime), Column('fetch_finished', types.DateTime),
Column('import_started', types.DateTime), Column('import_started', types.DateTime),
Column('import_finished', types.DateTime), Column('import_finished', types.DateTime),
# state: WAITING, FETCH, IMPORT, COMPLETE, ERROR
Column('state', types.UnicodeText, default=u'WAITING'), Column('state', types.UnicodeText, default=u'WAITING'),
Column('metadata_modified_date', types.DateTime), Column('metadata_modified_date', types.DateTime),
Column('retry_times',types.Integer, default=0), Column('retry_times',types.Integer, default=0),
@ -391,9 +401,11 @@ ALTER TABLE harvest_object_extra
ALTER TABLE harvest_object_extra ALTER TABLE harvest_object_extra
ADD CONSTRAINT harvest_object_extra_harvest_object_id_fkey FOREIGN KEY (harvest_object_id) REFERENCES harvest_object(id); ADD CONSTRAINT harvest_object_extra_harvest_object_id_fkey FOREIGN KEY (harvest_object_id) REFERENCES harvest_object(id);
UPDATE harvest_object set state = 'COMPLETE'; UPDATE harvest_object set state = 'COMPLETE' where package_id is not null;
UPDATE harvest_object set state = 'ERROR' where package_id is null;
UPDATE harvest_object set retry_times = 0; UPDATE harvest_object set retry_times = 0;
UPDATE harvest_object set report_status = 'new'; UPDATE harvest_object set report_status = 'updated' where package_id is not null;
UPDATE harvest_object set report_status = 'errored' where package_id is null;
UPDATE harvest_source set frequency = 'MANUAL'; UPDATE harvest_source set frequency = 'MANUAL';
ALTER TABLE harvest_object DROP CONSTRAINT harvest_object_package_id_fkey; ALTER TABLE harvest_object DROP CONSTRAINT harvest_object_package_id_fkey;

View File

@ -250,7 +250,12 @@ def gather_callback(channel, method, header, body):
log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids))) log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))
if not harvester_found: if not harvester_found:
msg = 'No harvester could be found for source type %s' % job.source.type # This can occur if you:
# * remove a harvester and it still has sources that are then
# refreshed
# * add a new harvester and restart CKAN but not the gather
# queue.
msg = 'System error - No harvester could be found for source type %s' % job.source.type
err = HarvestGatherError(message=msg,job=job) err = HarvestGatherError(message=msg,job=job)
err.save() err.save()
log.error(msg) log.error(msg)