harvester-d4science/ckanext/harvest/logic/action/create.py

import logging

import ckan

from ckan.plugins import toolkit

from ckanext.harvest.logic import HarvestJobExists
from ckanext.harvest.plugin import DATASET_TYPE_NAME
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject,
    HarvestObjectExtra)
from ckanext.harvest.logic.dictization import (harvest_job_dictize,
    harvest_object_dictize)
from ckanext.harvest.logic.schema import (harvest_source_show_package_schema,
    harvest_object_create_schema)
from ckanext.harvest.logic.action.get import harvest_source_list,harvest_job_list

log = logging.getLogger(__name__)

_validate = ckan.lib.navl.dictization_functions.validate
check_access = toolkit.check_access


class InactiveSource(Exception):
    '''Exception type for operations involving an inactive harvest source.'''

def harvest_source_create(context, data_dict):
    '''
    Create a new harvest source.

    This action is a thin proxy around ``package_create``: it forces the
    dataset type to the harvest source type and hands everything else over.
    According to the original notes, ``package_create`` creates both the
    harvest_source dataset and the HarvestSource object and performs all
    auth checks and validation.

    The harvester type (ckan, waf, csw, etc) is set via the ``source_type``
    field.

    Note that both ``data_dict`` (``type`` key) and ``context``
    (``extras_as_string`` key) are modified in place.

    :param url: the URL for the harvest source
    :type url: string
    :param name: the name of the new harvest source, must be between 2 and
        100 characters long and contain only lowercase alphanumeric
        characters
    :type name: string
    :param title: the title of the dataset (optional, default: same as
        ``name``)
    :type title: string
    :param notes: a description of the harvest source (optional)
    :type notes: string
    :param source_type: the harvester type for this source. This must be one
        of the registered harvesters, eg 'ckan', 'csw', etc.
    :type source_type: string
    :param frequency: the frequency in which this harvester should run. See
        ``ckanext.harvest.model`` source for possible values. Default is
        'MANUAL'
    :type frequency: string
    :param config: extra configuration options for the particular harvester
        type. Should be serialized as JSON. (optional)
    :type config: string

    :returns: the newly created harvest source
    :rtype: dictionary
    '''
    log.info('Creating harvest source: %r', data_dict)

    # The only things set here are the dataset type and the context flag;
    # everything else is delegated to package_create.
    data_dict['type'] = DATASET_TYPE_NAME
    context['extras_as_string'] = True

    package_create = toolkit.get_action('package_create')
    return package_create(context, data_dict)


def harvest_job_create(context, data_dict):
    '''
    Creates a Harvest Job for a Harvest Source and, unless ``run`` is false,
    runs it (by putting it on the gather queue)

    :param source_id: identifier of the harvest source, as accepted by
        ``HarvestSource.get`` (required)
    :type source_id: string
    :param run: whether to also run it or not (default: True)
    :type run: bool

    :returns: the newly created harvest job, as produced by
        ``harvest_job_dictize``
    :raises toolkit.NotFound: if no harvest source matches ``source_id``
    :raises InactiveSource: if the harvest source is not active
    :raises HarvestJobExists: if the source already has a job with status
        'New' or 'Running'
    '''
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create', context, data_dict)

    source_id = data_dict['source_id']
    run_it = data_dict.get('run', True)

    # Check if source exists
    source = HarvestSource.get(source_id)
    if not source:
        log.warning('Harvest source %s does not exist', source_id)
        raise toolkit.NotFound('Harvest source %s does not exist' % source_id)

    # Check if the source is active
    if not source.active:
        log.warning('Harvest job cannot be created for inactive source %s',
                    source_id)
        # InactiveSource subclasses Exception, so callers that catch the
        # generic Exception keep working while new callers can be specific.
        raise InactiveSource('Can not create jobs on inactive sources')

    # Check if there already is an unrun or currently running job for this
    # source
    exists = _check_for_existing_jobs(context, source_id)
    if exists:
        log.warning('There is already an unrun job %r for this source %s',
                    exists, source_id)
        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
    job.save()
    log.info('Harvest job saved %s', job.id)

    if run_it:
        toolkit.get_action('harvest_send_job_to_gather_queue')(
            context, {'id': job.id})

    return harvest_job_dictize(job, context)


def harvest_job_create_all(context, data_dict):
    '''
    Creates a Harvest Job for all active Harvest Sources that do not already
    have a pending job and, unless ``run`` is false, runs them (by putting
    them on the gather queue)

    The contents of ``data_dict`` are also forwarded to
    ``harvest_source_list`` (with ``only_active`` forced to True) to select
    the sources. The caller's ``data_dict`` itself is left unmodified.

    :param run: whether to also run the jobs or not (default: True)
    :type run: bool

    :returns: the jobs created, one ``harvest_job_create`` result per source
    :rtype: list
    '''

    log.info('Harvest job create all: %r', data_dict)
    check_access('harvest_job_create_all', context, data_dict)

    run = data_dict.get('run', True)

    # Build the source query on a copy so the caller's dict is not mutated
    source_query = dict(data_dict, only_active=True)

    # Get all active sources
    sources = harvest_source_list(context, source_query)
    jobs = []
    # Create a new job for each, if there isn't already one
    for source in sources:
        # Pre-check here so a source with a pending job is skipped instead of
        # harvest_job_create raising HarvestJobExists and aborting the loop
        if _check_for_existing_jobs(context, source['id']):
            log.info('Skipping source %s as it already has a pending job', source['id'])
            continue

        job = harvest_job_create(
            context, {'source_id': source['id'], 'run': run})
        jobs.append(job)

    log.info('Created jobs for %s%i harvest sources',
             'and run ' if run else '', len(jobs))
    return jobs

def _check_for_existing_jobs(context, source_id):
    '''
    Tell whether the given source has at least one job whose status is
    'New' or 'Running'

    Both statuses are always queried through ``harvest_job_list``.

    rtype: boolean
    '''
    pending_jobs = []
    for job_status in (u'New', u'Running'):
        query = {
            'source_id': source_id,
            'status': job_status,
        }
        pending_jobs.extend(harvest_job_list(context, query))

    return len(pending_jobs) > 0

def harvest_object_create(context, data_dict):
    ''' Create a new harvest object

    The input is validated against ``harvest_object_create_schema`` and the
    validated values are used to build and save a ``HarvestObject``.

    :type guid: string (optional)
    :type content: string (optional)
    :type job_id: string
    :type source_id: string (optional)
    :type package_id: string (optional)
    :type extras: dict (optional)

    :returns: the saved object, as produced by ``harvest_object_dictize``
    :raises toolkit.ValidationError: if validation reports any errors
    '''
    check_access('harvest_object_create', context, data_dict)

    schema = harvest_object_create_schema()
    validated, validation_errors = _validate(data_dict, schema, context)
    if validation_errors:
        raise toolkit.ValidationError(validation_errors)

    # NOTE(review): the validated 'job_id' value is passed as ``job``;
    # presumably the schema validators turn the id into a job model object
    # -- confirm against the schema.
    object_fields = {
        'guid': validated.get('guid'),
        'content': validated.get('content'),
        'job': validated['job_id'],
        'harvest_source_id': validated.get('source_id'),
        'package_id': validated.get('package_id'),
    }
    # Each key/value pair of the optional extras dict becomes its own
    # HarvestObjectExtra.
    object_fields['extras'] = [
        HarvestObjectExtra(key=extra_key, value=extra_value)
        for extra_key, extra_value in validated.get('extras', {}).items()
    ]

    harvest_object = HarvestObject(**object_fields)
    harvest_object.save()

    return harvest_object_dictize(harvest_object, context)
Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`import logging`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`import ckan`
Update harvest source create and update logic functions `harvest_source_create` and `harvest_source_update` now call `package_create` and `package_update` respectively, making sure to define a 'harvest_source' type. The returned dict uses the db_to_form schema. 2012-11-30 15:03:04 +01:00
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`from ckan.plugins import toolkit`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`from ckanext.harvest.logic import HarvestJobExists`
Update harvest source create and update logic functions `harvest_source_create` and `harvest_source_update` now call `package_create` and `package_update` respectively, making sure to define a 'harvest_source' type. The returned dict uses the db_to_form schema. 2012-11-30 15:03:04 +01:00			`from ckanext.harvest.plugin import DATASET_TYPE_NAME`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject,`
			`HarvestObjectExtra)`
add harvest_object_create action 2013-09-04 15:17:01 +02:00			`from ckanext.harvest.logic.dictization import (harvest_job_dictize,`
			`harvest_object_dictize)`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`from ckanext.harvest.logic.schema import (harvest_source_show_package_schema,`
			`harvest_object_create_schema)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`from ckanext.harvest.logic.action.get import harvest_source_list,harvest_job_list`

Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`log = logging.getLogger(__name__)`

[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`_validate = ckan.lib.navl.dictization_functions.validate`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`check_access = toolkit.check_access`

[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00
			`class InactiveSource(Exception):`
			`pass`

[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`def harvest_source_create(context,data_dict):`
Update harvest source create and update logic functions `harvest_source_create` and `harvest_source_update` now call `package_create` and `package_update` respectively, making sure to define a 'harvest_source' type. The returned dict uses the db_to_form schema. 2012-11-30 15:03:04 +01:00			`'''`
			`Creates a new harvest source`

			`This method just proxies the request to package_create,`
			`which will create a harvest_source dataset type and the`
			`HarvestSource object. All auth checks and validation will`
			`be done there .We only make sure to set the dataset type.`

			`Note that the harvest source type (ckan, waf, csw, etc)`
			`is now set via the source_type field.`

			`:param url: the URL for the harvest source`
			`:type url: string`
			`:param name: the name of the new harvest source, must be between 2 and 100`
			`characters long and contain only lowercase alphanumeric characters`
			`:type name: string`
			`:param title: the title of the dataset (optional, default: same as`
			``name``)
			`:type title: string`
			`:param notes: a description of the harvest source (optional)`
			`:type notes: string`
			`:param source_type: the harvester type for this source. This must be one`
			`of the registerd harvesters, eg 'ckan', 'csw', etc.`
			`:type source_type: string`
			`:param frequency: the frequency in wich this harvester should run. See`
			``ckanext.harvest.model`` source for possible values. Default is
			`'MANUAL'`
			`:type frequency: string`
			`:param config: extra configuration options for the particular harvester`
			`type. Should be a serialized as JSON. (optional)`
			`:type config: string`


			`:returns: the newly created harvest source`
			`:rtype: dictionary`
			`'''`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`log.info('Creating harvest source: %r', data_dict)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
Update harvest source create and update logic functions `harvest_source_create` and `harvest_source_update` now call `package_create` and `package_update` respectively, making sure to define a 'harvest_source' type. The returned dict uses the db_to_form schema. 2012-11-30 15:03:04 +01:00			`data_dict['type'] = DATASET_TYPE_NAME`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
Update harvest source create and update logic functions `harvest_source_create` and `harvest_source_update` now call `package_create` and `package_update` respectively, making sure to define a 'harvest_source' type. The returned dict uses the db_to_form schema. 2012-11-30 15:03:04 +01:00			`context['extras_as_string'] = True`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`source = toolkit.get_action('package_create')(context, data_dict)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
Update harvest source create and update logic functions `harvest_source_create` and `harvest_source_update` now call `package_create` and `package_update` respectively, making sure to define a 'harvest_source' type. The returned dict uses the db_to_form schema. 2012-11-30 15:03:04 +01:00			`return source`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
add jobs at certain frequencies 2012-10-29 18:15:02 +01:00
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`def harvest_job_create(context, data_dict):`
			`'''`
			`Creates a Harvest Job for a Harvest Source and runs it (by putting it on`
			`the gather queue)`

			`:param source_id:`
			`:type param: string`
			`:param run: whether to also run it or not (default: True)`
			`:type run: bool`
			`'''`
Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`log.info('Harvest job create: %r', data_dict)`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`check_access('harvest_job_create', context, data_dict)`
[logic,auth] Add auth logic layer The first version of the auth layer is based on the current policy, i.e. you need to be sysadmin to perform any action. TODO: the CLI is still not working. 2012-03-01 13:02:16 +01:00
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`source_id = data_dict['source_id']`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`run_it = data_dict.get('run', True)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
			`# Check if source exists`
			`source = HarvestSource.get(source_id)`
			`if not source:`
Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`log.warn('Harvest source %s does not exist', source_id)`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`raise toolkit.NotFound('Harvest source %s does not exist' % source_id)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
			`# Check if the source is active`
			`if not source.active:`
Add run_test, job_abort, source commands * run_test - for running a whole harvest on the command-line * job_abort - for aborting a limbo job * source - for showing a single harvest source * allowing a source to be specified by name in several commands 2015-10-28 18:51:58 +01:00			`log.warn('Harvest job cannot be created for inactive source %s',`
			`source_id)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`raise Exception('Can not create jobs on inactive sources')`

Add run_test, job_abort, source commands * run_test - for running a whole harvest on the command-line * job_abort - for aborting a limbo job * source - for showing a single harvest source * allowing a source to be specified by name in several commands 2015-10-28 18:51:58 +01:00			`# Check if there already is an unrun or currently running job for this`
			`# source`
Set job status to Finished when actually finishing it Until now, harvest jobs were set to Finished just after sending all objects to the fetch stage. Now every time the run command is run, jobs are set to Running, and all previous Running jobs are checked to see if all harvest objects have a state of Complete or Error. Only then the job is flagged as Finished. 2012-12-13 17:33:44 +01:00			`exists = _check_for_existing_jobs(context, source_id)`
			`if exists:`
Add run_test, job_abort, source commands * run_test - for running a whole harvest on the command-line * job_abort - for aborting a limbo job * source - for showing a single harvest source * allowing a source to be specified by name in several commands 2015-10-28 18:51:58 +01:00			`log.warn('There is already an unrun job %r for this source %s',`
			`exists, source_id)`
add jobs at certain frequencies 2012-10-29 18:15:02 +01:00			`raise HarvestJobExists('There already is an unrun job for this source')`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
			`job = HarvestJob()`
			`job.source = source`
			`job.save()`
Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`log.info('Harvest job saved %s', job.id)`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`if run_it:`
			`toolkit.get_action('harvest_send_job_to_gather_queue')(`
			`context, {'id': job.id})`

			`return harvest_job_dictize(job, context)`


			`def harvest_job_create_all(context, data_dict):`
			`'''`
			`Creates a Harvest Job for all Harvest Sources and runs them (by`
			`putting them on the gather queue)`

			`:param source_id:`
			`:type param: string`
			`:param run: whether to also run the jobs or not (default: True)`
			`:type run: bool`
			`'''`

Add copious logging to record what happens in harvesting. 2012-06-08 18:09:22 +02:00			`log.info('Harvest job create all: %r', data_dict)`
[logic,auth] Add auth logic layer The first version of the auth layer is based on the current policy, i.e. you need to be sysadmin to perform any action. TODO: the CLI is still not working. 2012-03-01 13:02:16 +01:00			`check_access('harvest_job_create_all',context,data_dict)`

[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`run = data_dict.get('run', True)`

[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`data_dict.update({'only_active':True})`

			`# Get all active sources`
			`sources = harvest_source_list(context,data_dict)`
			`jobs = []`
			`# Create a new job for each, if there isn't already one`
			`for source in sources:`
Fix bug where source was being treated as an object, when it's a dict 2013-07-29 12:06:58 +02:00			`exists = _check_for_existing_jobs(context, source['id'])`
Set job status to Finished when actually finishing it Until now, harvest jobs were set to Finished just after sending all objects to the fetch stage. Now every time the run command is run, jobs are set to Running, and all previous Running jobs are checked to see if all harvest objects have a state of Complete or Error. Only then the job is flagged as Finished. 2012-12-13 17:33:44 +01:00			`if exists:`
Fix bug where source was being treated as an object, when it's a dict 2013-07-29 12:06:58 +02:00			`log.info('Skipping source %s as it already has a pending job', source['id'])`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`continue`

[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`job = harvest_job_create(`
			`context, {'source_id': source['id'], 'run': run})`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`jobs.append(job)`

[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`log.info('Created jobs for %s%i harvest sources',`
			`'and run ' if run else '', len(jobs))`
[logic] Refactor the rest of the logic functions (create,update,delete) 2012-02-29 16:20:35 +01:00			`return jobs`

Set job status to Finished when actually finishing it Until now, harvest jobs were set to Finished just after sending all objects to the fetch stage. Now every time the run command is run, jobs are set to Running, and all previous Running jobs are checked to see if all harvest objects have a state of Complete or Error. Only then the job is flagged as Finished. 2012-12-13 17:33:44 +01:00			`def _check_for_existing_jobs(context, source_id):`
			`'''`
			`Given a source id, checks if there are jobs for this source`
			`with status 'New' or 'Running'`

			`rtype: boolean`
			`'''`
			`data_dict ={`
			`'source_id':source_id,`
			`'status':u'New'`
			`}`
			`exist_new = harvest_job_list(context,data_dict)`
			`data_dict ={`
			`'source_id':source_id,`
			`'status':u'Running'`
			`}`
			`exist_running = harvest_job_list(context,data_dict)`
			`exist = len(exist_new + exist_running) > 0`

			`return exist`
add harvest_object_create action 2013-09-04 15:17:01 +02:00
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`def harvest_object_create(context, data_dict):`
PEP8 based on #174 2015-11-03 21:30:11 +01:00			`''' Create a new harvest object`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00
			`:type guid: string (optional)`
			`:type content: string (optional)`
PEP8 based on #174 2015-11-03 21:30:11 +01:00			`:type job_id: string`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`:type source_id: string (optional)`
			`:type package_id: string (optional)`
			`:type extras: dict (optional)`
PEP8 based on #174 2015-11-03 21:30:11 +01:00			`'''`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`check_access('harvest_object_create', context, data_dict)`
			`data, errors = _validate(data_dict, harvest_object_create_schema(), context)`

			`if errors:`
[#111] Run jobs straight away. 2015-10-28 22:58:36 +01:00			`raise toolkit.ValidationError(errors)`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00
			`obj = HarvestObject(`
			`guid=data.get('guid'),`
			`content=data.get('content'),`
[#65] make harvest_job_exists validator return model object return the model in the validator instead of checking that it exists in the validator, returning the id and then fetching it again in the action function 2013-10-03 16:51:37 +02:00			`job=data['job_id'],`
[#65] harvest_object_create action update to use schema and validators. Also accept more parameters to data_dict. 2013-09-17 17:49:19 +02:00			`harvest_source_id=data.get('source_id'),`
			`package_id=data.get('package_id'),`
			`extras=[ HarvestObjectExtra(key=k, value=v)`
			`for k, v in data.get('extras', {}).items() ]`
			`)`

			`obj.save()`
add harvest_object_create action 2013-09-04 15:17:01 +02:00			`return harvest_object_dictize(obj, context)`