import logging
import urlparse
import json

from ckan.lib.navl.dictization_functions import Invalid, validate
from ckan.lib.navl.validators import keep_extras
from ckan import model
from ckan.plugins import PluginImplementations

from ckanext.harvest.plugin import DATASET_TYPE_NAME
from ckanext.harvest.model import (HarvestSource, UPDATE_FREQUENCIES,
                                   HarvestJob)
from ckanext.harvest.interfaces import IHarvester

log = logging.getLogger(__name__)


def harvest_source_id_exists(value, context):
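    '''Check that a harvest source with the given id exists.

    Raises Invalid if it does not; otherwise returns the id unchanged.
    '''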
    result = HarvestSource.get(value)

    if not result:
        raise Invalid('Harvest Source with id %r does not exist.'
                      % str(value))
    return value


def harvest_job_exists(value, context):
    '''Check that a harvest job exists and return the model if it does.'''
    result = HarvestJob.get(value)

    if not result:
        raise Invalid('Harvest Job with id %r does not exist.'
                      % str(value))
    return result


def _normalize_url(url):
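    '''Normalize a URL so that equivalent source URLs compare equal.

    The default port for the scheme (80 for http, 443 for https), any
    trailing slash and the parameters, query and fragment parts are all
    dropped:

        >>> _normalize_url('http://example.com:80/path/?q=1')
        'http://example.com/path'
    '''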
    o = urlparse.urlparse(url)

    # Normalize port
    if ':' in o.netloc:
        parts = o.netloc.split(':')
        if (o.scheme == 'http' and parts[1] == '80') or \
           (o.scheme == 'https' and parts[1] == '443'):
            netloc = parts[0]
        else:
            netloc = ':'.join(parts)
    else:
        netloc = o.netloc

    # Remove trailing slash
    path = o.path.rstrip('/')

    check_url = urlparse.urlunparse((
        o.scheme,
        netloc,
        path,
        None, None, None))

    return check_url


def harvest_source_url_validator(key, data, errors, context):
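    '''Validate that no other harvest source exists for the same URL.

    URLs are compared in normalized form (see _normalize_url), and when
    an existing source is being edited its own URL is excluded from the
    check.
    '''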
    package = context.get("package")

    if package:
        package_id = package.id
    else:
        package_id = data.get(key[:-1] + ("id",))

    new_url = _normalize_url(data[key])

    q = model.Session.query(model.Package.url, model.Package.state) \
        .filter(model.Package.type == DATASET_TYPE_NAME)

    if package_id:
        # When editing a source we need to avoid its own URL
        q = q.filter(model.Package.id != package_id)

    existing_sources = q.all()

    for url, state in existing_sources:
        url = _normalize_url(url)
        if url == new_url:
            raise Invalid('There already is a Harvest Source for this URL: %s'
                          % data[key])

    return data[key]


def harvest_source_type_exists(value, context):
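    '''Check that the source type matches a registered harvester.'''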
    # TODO: use new description interface

    # Get all the registered harvester types
    available_types = []
    for harvester in PluginImplementations(IHarvester):
        info = harvester.info()
        if not info or 'name' not in info:
            log.error('Harvester %s does not provide the harvester name in '
                      'the info response' % harvester)
            continue
        available_types.append(info['name'])

    if value not in available_types:
        raise Invalid('Unknown harvester type: %s. Have you registered a '
                      'harvester for this type?' % value)

    return value


def harvest_source_config_validator(key, data, errors, context):
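    '''Delegate validation of the config field to the harvester.

    If the harvester for the selected source type implements
    validate_config, its return value is used; otherwise the config is
    passed through unchanged.
    '''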
    harvester_type = data.get(('source_type',), '')
    for harvester in PluginImplementations(IHarvester):
        info = harvester.info()
        if info['name'] == harvester_type:
            if hasattr(harvester, 'validate_config'):
                try:
                    return harvester.validate_config(data[key])
                except Exception as e:
                    raise Invalid('Error parsing the configuration options: %s'
                                  % e)
            else:
                return data[key]


def keep_not_empty_extras(key, data, errors, context):
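    '''Copy the non-empty values from the extras dict up to the top
    level of the data dict, discarding the empty ones.
    '''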
    extras = data.pop(key, {})
    for extras_key, value in extras.iteritems():
        if value:
            data[key[:-1] + (extras_key,)] = value


def harvest_source_extra_validator(key, data, errors, context):
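    '''Validate the harvester-specific extra fields.

    The extras are validated against the extra_schema of the harvester
    matching the source type, whitelisted against the extra fields of
    all registered harvesters, and finally merged into the JSON blob
    stored in the 'config' package extra.
    '''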
    harvester_type = data.get(('source_type',), '')

    # Gather all extra fields to use as a whitelist of what
    # can be added to the top-level data_dict
    all_extra_fields = set()
    for harvester in PluginImplementations(IHarvester):
        if not hasattr(harvester, 'extra_schema'):
            continue
        all_extra_fields.update(harvester.extra_schema().keys())

    extra_schema = {'__extras': [keep_not_empty_extras]}
    for harvester in PluginImplementations(IHarvester):
        if not hasattr(harvester, 'extra_schema'):
            continue
        info = harvester.info()
        if info['name'] != harvester_type:
            continue
        extra_schema.update(harvester.extra_schema())
        break

    extra_data, extra_errors = validate(data.get(key, {}), extra_schema)
    for extra_key in extra_data.keys():
        # Only allow keys that appear in at least one harvester
        if extra_key not in all_extra_fields:
            extra_data.pop(extra_key)

    for extra_key, value in extra_data.iteritems():
        data[(extra_key,)] = value

    for extra_key, value in extra_errors.iteritems():
        errors[(extra_key,)] = value

    # Need to get the config out of extras, as __extras runs
    # after the rest of the validation
    package_extras = data.get(('extras',), [])

    for num, extra in enumerate(list(package_extras)):
        if extra['key'] == 'config':
            # Remove the config extra so we can add it back cleanly later
            package_extras.pop(num)
            try:
                config_dict = json.loads(extra.get('value') or '{}')
            except ValueError:
                log.error('Wrong JSON provided in config, skipping')
                config_dict = {}
            break
    else:
        config_dict = {}
    config_dict.update(extra_data)
    if config_dict and not extra_errors:
        config = json.dumps(config_dict)
        package_extras.append(dict(key='config',
                                   value=config))
        data[('config',)] = config
    if package_extras:
        data[('extras',)] = package_extras


def harvest_source_convert_from_config(key, data, errors, context):
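    '''Expand the config JSON string into individual top-level keys.'''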
    config = data[key]
    if config:
        config_dict = json.loads(config)
        for config_key, value in config_dict.iteritems():
            data[(config_key,)] = value


def harvest_source_active_validator(value, context):
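    '''Convert the value to a boolean, treating only the string 'true'
    (in any case) as True:

        >>> harvest_source_active_validator('True', {})
        True
        >>> harvest_source_active_validator('paused', {})
        False
    '''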
    if isinstance(value, basestring):
        return value.lower() == 'true'
    return bool(value)


def harvest_source_frequency_exists(value):
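    '''Check the frequency against UPDATE_FREQUENCIES, defaulting empty
    values to 'MANUAL'. Returns the frequency upper-cased.
    '''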
    if value == '':
        value = 'MANUAL'
    if value.upper() not in UPDATE_FREQUENCIES:
        raise Invalid('Frequency %s not recognised' % value)
    return value.upper()


def dataset_type_exists(value):
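    '''Force the dataset type to DATASET_TYPE_NAME, whatever the input.'''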
    if value != DATASET_TYPE_NAME:
        value = DATASET_TYPE_NAME
    return value


def harvest_object_extras_validator(value, context):
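    '''Check that the extras are a dict whose values are all strings.'''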
    if not isinstance(value, dict):
        raise Invalid('extras must be a dict')
    for v in value.values():
        if not isinstance(v, basestring):
            raise Invalid('extras must be a dict of strings')
    return value