harvester-d4science/ckanext/harvest/tests/test_queue.py

280 lines
9.6 KiB
Python
Raw Normal View History

import ckanext.harvest.model as harvest_model
2012-10-25 20:01:54 +02:00
from ckanext.harvest.model import HarvestObject, HarvestObjectExtra
from ckanext.harvest.interfaces import IHarvester
import ckanext.harvest.queue as queue
from ckan.plugins.core import SingletonPlugin, implements
import json
import ckan.logic as logic
from ckan import model
class TestHarvester(SingletonPlugin):
    """Minimal IHarvester implementation used to drive the queue tests.

    The gather stage creates three harvest objects for sources whose URL
    starts with ``basic_test``; the fetch stage stores a tiny JSON document
    as content; the import stage creates or updates a package from it.
    """
    implements(IHarvester)

    def info(self):
        """Return the static description dict for this harvester."""
        return {'name': 'test', 'title': 'test', 'description': 'test'}

    def gather_stage(self, harvest_job):
        """Create the harvest objects for *harvest_job*.

        Returns the ids of three new HarvestObjects when the job's source
        URL starts with ``basic_test``, otherwise an empty list.
        """
        if harvest_job.source.url.startswith('basic_test'):
            obj = HarvestObject(guid='test1', job=harvest_job)
            obj.extras.append(HarvestObjectExtra(key='key', value='value'))
            obj2 = HarvestObject(guid='test2', job=harvest_job)
            obj3 = HarvestObject(guid='test_to_delete', job=harvest_job)
            obj.add()
            obj2.add()
            obj3.save()  # this will commit both
            return [obj.id, obj2.id, obj3.id]

        return []

    def fetch_stage(self, harvest_object):
        """Store ``{"name": <guid>}`` as the JSON content of the object.

        Asserts that the object is in the FETCH state with ``fetch_started``
        set, saves the object and returns True.
        """
        assert harvest_object.state == "FETCH"
        assert harvest_object.fetch_started is not None
        harvest_object.content = json.dumps({'name': harvest_object.guid})
        harvest_object.save()
        return True

    def import_stage(self, harvest_object):
        """Create or update a package from the content of *harvest_object*.

        Asserts that the object is in the IMPORT state with the fetch/import
        timestamps set. Any earlier object with the same guid that is still
        flagged ``current`` is unflagged. When the ``test_to_delete`` package
        already exists, it is marked as deleted and the harvest object is
        left non-current. Returns True.
        """
        assert harvest_object.state == "IMPORT"
        assert harvest_object.fetch_finished is not None
        assert harvest_object.import_started is not None

        user = logic.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        package = json.loads(harvest_object.content)
        name = package['name']

        # An existing package with that name means a later run: update it.
        package_object = model.Package.get(name)
        if package_object:
            logic_function = 'package_update'
        else:
            logic_function = 'package_create'

        package_dict = logic.get_action(logic_function)(
            {'model': model, 'session': model.Session,
             'user': user, 'api_version': 3, 'ignore_auth': True},
            package
        )

        # set previous objects to not current
        # NOTE: ``== True`` is a SQLAlchemy column expression here and must
        # not be rewritten as ``is True``.
        previous_object = model.Session.query(HarvestObject) \
            .filter(HarvestObject.guid == harvest_object.guid) \
            .filter(HarvestObject.current == True) \
            .first()
        if previous_object:
            previous_object.current = False
            previous_object.save()

        # delete test_to_delete package on second run
        harvest_object.package_id = package_dict['id']
        harvest_object.current = True
        if package_dict['name'] == 'test_to_delete' and package_object:
            harvest_object.current = False
            package_object.state = 'deleted'
            package_object.save()

        harvest_object.save()
        return True
class TestHarvestQueue(object):
    """Drives two harvest runs through the gather and fetch queues."""

    @classmethod
    def setup_class(cls):
        """Run the harvest model setup before the tests of this class."""
        harvest_model.setup()

    @classmethod
    def teardown_class(cls):
        """Rebuild the database after the tests of this class."""
        model.repo.rebuild_db()

    def test_01_basic_harvester(self):
        """Run the 'test' harvester twice and check objects, jobs and stats.

        The first run is expected to report three added datasets; the second
        run two updated and one deleted.
        """
        # make sure queues/exchanges are created first and are empty
        consumer = queue.get_consumer('ckan.harvest.gather', 'harvest_job_id')
        consumer_fetch = queue.get_consumer('ckan.harvest.fetch',
                                            'harvest_object_id')
        consumer.queue_purge(queue='ckan.harvest.gather')
        consumer_fetch.queue_purge(queue='ckan.harvest.fetch')

        user = logic.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {}
        )['name']

        context = {'model': model, 'session': model.Session,
                   'user': user, 'api_version': 3, 'ignore_auth': True}

        source_dict = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': 'basic_test',
            'source_type': 'test',
        }

        harvest_source = logic.get_action('harvest_source_create')(
            context,
            source_dict
        )

        assert harvest_source['source_type'] == 'test', harvest_source
        assert harvest_source['url'] == 'basic_test', harvest_source

        harvest_job = logic.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id']}
        )

        job_id = harvest_job['id']

        assert harvest_job['source_id'] == harvest_source['id'], harvest_job
        assert harvest_job['status'] == u'New'

        logic.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        assert logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        # pop one item off the gather queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 3
        for harvest_object in all_objects:
            assert harvest_object.state == 'WAITING'

        assert len(model.Session.query(HarvestObject).all()) == 3
        assert len(model.Session.query(HarvestObjectExtra).all()) == 1

        # do three times as three harvest objects
        for _ in range(3):
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject) \
            .filter_by(current=True).all()

        assert len(all_objects) == 3
        for harvest_object in all_objects:
            assert harvest_object.state == 'COMPLETE'
            assert harvest_object.report_status == 'added'

        # fire run again to check if job is set to Finished
        try:
            logic.get_action('harvest_jobs_run')(
                context,
                {'source_id': harvest_source['id']}
            )
        except Exception as e:
            # The only failure tolerated here is the "nothing to run" error.
            assert 'There are no new harvesting jobs' in str(e)

        harvest_job = logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )

        assert harvest_job['status'] == u'Finished'
        assert harvest_job['stats'] == {'added': 3}

        harvest_source_dict = logic.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {'added': 3}
        assert harvest_source_dict['status']['total_datasets'] == 3
        assert harvest_source_dict['status']['job_count'] == 1

        # ---------------------- Second run ----------------------
        harvest_job = logic.get_action('harvest_job_create')(
            context,
            {'source_id': harvest_source['id']}
        )

        logic.get_action('harvest_jobs_run')(
            context,
            {'source_id': harvest_source['id']}
        )

        job_id = harvest_job['id']
        assert logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )['status'] == u'Running'

        # pop one item off the gather queue and run the callback
        reply = consumer.basic_get(queue='ckan.harvest.gather')
        queue.gather_callback(consumer, *reply)

        # three objects from the first run plus three new ones
        all_objects = model.Session.query(HarvestObject).all()

        assert len(all_objects) == 6

        for _ in range(3):
            reply = consumer_fetch.basic_get(queue='ckan.harvest.fetch')
            queue.fetch_callback(consumer_fetch, *reply)

        count = model.Session.query(model.Package) \
            .filter(model.Package.type == 'dataset') \
            .count()
        assert count == 3

        all_objects = model.Session.query(HarvestObject) \
            .filter_by(report_status='added').all()
        assert len(all_objects) == 3, len(all_objects)

        all_objects = model.Session.query(HarvestObject) \
            .filter_by(report_status='updated').all()
        assert len(all_objects) == 2, len(all_objects)

        all_objects = model.Session.query(HarvestObject) \
            .filter_by(report_status='deleted').all()
        assert len(all_objects) == 1, len(all_objects)

        # run to make sure job is marked as finished
        try:
            logic.get_action('harvest_jobs_run')(
                context,
                {'source_id': harvest_source['id']}
            )
        except Exception as e:
            # The only failure tolerated here is the "nothing to run" error.
            assert 'There are no new harvesting jobs' in str(e)

        harvest_job = logic.get_action('harvest_job_show')(
            context,
            {'id': job_id}
        )
        assert harvest_job['stats'] == {'updated': 2, 'deleted': 1}

        context['detailed'] = True
        harvest_source_dict = logic.get_action('harvest_source_show')(
            context,
            {'id': harvest_source['id']}
        )

        assert harvest_source_dict['status']['last_job']['stats'] == {'updated': 2, 'deleted': 1}
        assert harvest_source_dict['status']['total_datasets'] == 2
        assert harvest_source_dict['status']['job_count'] == 2