harvester-d4science/ckanext/harvest/tests/lib.py

from ckanext.harvest.tests.factories import HarvestSourceObj, HarvestJobObj
import ckanext.harvest.model as harvest_model
from ckanext.harvest import queue
from ckan.plugins import toolkit


def run_harvest(url, harvester, config=''):
    '''Create a harvest source and job for ``url`` and run the job inline.

    Intended for exercising a harvester from tests: the job is handed
    directly to ``run_harvest_job`` rather than being placed on a queue.

    :param url: URL given to the new harvest source.
    :param harvester: harvester passed through to ``run_harvest_job``.
    :param config: config given to the new harvest source
        (defaults to an empty string).
    :returns: whatever ``run_harvest_job`` returns for the created job.
    '''
    # Stand in for the user registering a harvest source.
    harvest_source = HarvestSourceObj(url=url, config=config)

    # Stand in for the user triggering a harvest, i.e. creating a job.
    # run=False keeps the new job off the gather queue; it is run by hand
    # below instead.
    pending_job = HarvestJobObj(source=harvest_source, run=False)

    return run_harvest_job(pending_job, harvester)


def run_harvest_job(job, harvester):
    '''Run the gather, fetch and import stages of ``job`` without queues.

    :param job: harvest job to run; its status is set to 'Running' and
        saved before the gather stage is called.
    :param harvester: harvester handed to ``queue.gather_stage`` and
        ``queue.fetch_and_import_stages``.
    :returns: ``None`` if the gather stage did not return a list.
        Otherwise a dict keyed by harvest object guid, where each value is
        a dict with the keys 'obj_id', 'state', 'report_status' and
        'errors', plus 'dataset' (the ``package_show`` result) when the
        object's state is 'COMPLETE' and it has a package_id.
    '''
    # 'harvest_job_create' would normally call
    # 'harvest_send_job_to_gather_queue', which 'runs' the job in two steps.
    # Step 1: mark the job as Running.
    job.status = 'Running'
    job.save()
    # Step 2: the job would go onto the gather queue, whose consumer
    # (queue.gather_callback) works out the harvester and calls
    # gather_stage. Here gather_stage is simply called directly.
    gathered_ids = queue.gather_stage(harvester, job)
    if not isinstance(gathered_ids, list):
        # Anything other than a list means the gather stage failed.
        return None

    # Normally the object ids go onto the fetch queue, whose consumer
    # (queue.fetch_callback) calls queue.fetch_and_import_stages. Here
    # that is called directly for each object in turn.
    results_by_guid = {}
    for harvest_object_id in gathered_ids:
        harvest_object = harvest_model.HarvestObject.get(harvest_object_id)
        guid = harvest_object.guid
        # Register the entry before fetch/import and fill in the outcome
        # fields afterwards.
        entry = {'obj_id': harvest_object_id}
        results_by_guid[guid] = entry

        queue.fetch_and_import_stages(harvester, harvest_object)
        entry['state'] = harvest_object.state
        entry['report_status'] = harvest_object.report_status
        if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
            show_package = toolkit.get_action('package_show')
            entry['dataset'] = show_package(
                {'ignore_auth': True},
                {'id': harvest_object.package_id})
        entry['errors'] = harvest_object.errors

    # Call 'harvest_jobs_run' so the job status is changed to 'finished'.
    jobs_run = toolkit.get_action('harvest_jobs_run')
    jobs_run({'ignore_auth': True}, {})

    return results_by_guid
Add framework for testing harvesters. Modernize existing tests. 2015-10-21 18:26:57 +02:00			`from ckanext.harvest.tests.factories import HarvestSourceObj, HarvestJobObj`
			`import ckanext.harvest.model as harvest_model`
			`from ckanext.harvest import queue`
			`from ckan.plugins import toolkit`


			`def run_harvest(url, harvester, config=''):`
			`'''Runs a harvest and returns the results.`
			`This allows you to test a harvester.`
			`Queues are avoided as they are a pain in tests.`
			`'''`
			`# User creates a harvest source`
			`source = HarvestSourceObj(url=url, config=config)`

[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`# User triggers a harvest, which is the creation of a harvest job.`
			`# We set run=False so that it doesn't put it on the gather queue.`
			`job = HarvestJobObj(source=source, run=False)`
Add framework for testing harvesters. Modernize existing tests. 2015-10-21 18:26:57 +02:00
			`return run_harvest_job(job, harvester)`


			`def run_harvest_job(job, harvester):`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`# In 'harvest_job_create' it would call 'harvest_send_job_to_gather_queue'`
			`# which would do 2 things to 'run' the job:`
Add framework for testing harvesters. Modernize existing tests. 2015-10-21 18:26:57 +02:00			`# 1. change the job status to Running`
			`job.status = 'Running'`
			`job.save()`
			`# 2. put the job on the gather queue which is consumed by`
			`# queue.gather_callback, which determines the harvester and then calls`
			`# gather_stage. We simply call the gather_stage.`
			`obj_ids = queue.gather_stage(harvester, job)`
Improved error handling. e.g. if the site it harvests just returns errors. 2016-02-15 13:10:44 +01:00			`if not isinstance(obj_ids, list):`
			`# gather failed`
			`return None`
Add framework for testing harvesters. Modernize existing tests. 2015-10-21 18:26:57 +02:00
			`# The object ids are put onto the fetch queue, consumed by`
			`# queue.fetch_callback which calls queue.fetch_and_import_stages`
			`results_by_guid = {}`
			`for obj_id in obj_ids:`
			`harvest_object = harvest_model.HarvestObject.get(obj_id)`
			`guid = harvest_object.guid`
			`results_by_guid[guid] = {'obj_id': obj_id}`

			`queue.fetch_and_import_stages(harvester, harvest_object)`
			`results_by_guid[guid]['state'] = harvest_object.state`
			`results_by_guid[guid]['report_status'] = harvest_object.report_status`
			`if harvest_object.state == 'COMPLETE' and harvest_object.package_id:`
			`results_by_guid[guid]['dataset'] = \`
			`toolkit.get_action('package_show')(`
[#107] "unchanged" response tested and related fixes * fix "existing_package_dict" which wasn't containing metadata_modified (because of the schema in the context) so you never skipped an object. * fix IntegrityError due to resource revision_id being harvested. No idea why this hasn't caused errors before now. * "unchanged" is now checked in base instead of ckanharvester - makes sense. Looking at other harvesters, it's normal to return from the import_stage with the value returned from base._create_or_update_package so I've continued with that. * "unchanged" response is now documented * better report_status tests in test_queue2. 2015-11-03 01:22:53 +01:00			`{'ignore_auth': True},`
			`dict(id=harvest_object.package_id))`
Add framework for testing harvesters. Modernize existing tests. 2015-10-21 18:26:57 +02:00			`results_by_guid[guid]['errors'] = harvest_object.errors`

			`# Do 'harvest_jobs_run' to change the job status to 'finished'`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`toolkit.get_action('harvest_jobs_run')({'ignore_auth': True}, {})`
Add framework for testing harvesters. Modernize existing tests. 2015-10-21 18:26:57 +02:00
			`return results_by_guid`