2013-03-28 17:19:16 +01:00
|
|
|
import logging
|
2011-05-13 15:17:58 +02:00
|
|
|
import urlparse
|
2013-02-25 19:07:34 +01:00
|
|
|
import json
|
2011-05-13 15:17:58 +02:00
|
|
|
|
2013-02-25 19:07:34 +01:00
|
|
|
from ckan.lib.navl.dictization_functions import Invalid, validate
|
2012-11-29 17:52:10 +01:00
|
|
|
from ckan import model
|
2011-05-13 15:17:58 +02:00
|
|
|
from ckan.plugins import PluginImplementations
|
|
|
|
|
2012-11-29 12:48:36 +01:00
|
|
|
from ckanext.harvest.plugin import DATASET_TYPE_NAME
|
2013-09-17 17:49:19 +02:00
|
|
|
from ckanext.harvest.model import HarvestSource, UPDATE_FREQUENCIES, HarvestJob
|
2011-05-13 15:17:58 +02:00
|
|
|
from ckanext.harvest.interfaces import IHarvester
|
|
|
|
|
2013-02-25 19:07:34 +01:00
|
|
|
from ckan.lib.navl.validators import keep_extras
|
2011-05-13 15:17:58 +02:00
|
|
|
|
2013-03-28 17:19:16 +01:00
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
2011-05-13 15:17:58 +02:00
|
|
|
def harvest_source_id_exists(value, context):
    """Validate that ``value`` identifies an existing harvest source.

    The source is looked up with ``HarvestSource.get(value, None)``. When the
    lookup yields a truthy result ``value`` is returned unchanged, otherwise
    ``Invalid`` is raised. ``context`` is accepted but not used.
    """
    if HarvestSource.get(value, None):
        return value
    raise Invalid('Harvest Source with id %r does not exist.' % str(value))
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
2013-10-03 16:51:37 +02:00
|
|
|
def harvest_job_exists(value, context):
    """Check that a harvest job exists and return its model object.

    The job is looked up with ``HarvestJob.get(value, None)``. Unlike
    ``harvest_source_id_exists`` this returns the looked-up object rather
    than the id that was passed in. ``Invalid`` is raised when the lookup
    result is falsy. ``context`` is accepted but not used.
    """
    job = HarvestJob.get(value, None)
    if job:
        return job
    raise Invalid('Harvest Job with id %r does not exist.' % str(value))
|
2013-09-17 17:49:19 +02:00
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
2011-05-13 15:17:58 +02:00
|
|
|
def _normalize_url(url):
|
|
|
|
o = urlparse.urlparse(url)
|
|
|
|
|
|
|
|
# Normalize port
|
|
|
|
if ':' in o.netloc:
|
|
|
|
parts = o.netloc.split(':')
|
|
|
|
if (o.scheme == 'http' and parts[1] == '80') or \
|
|
|
|
(o.scheme == 'https' and parts[1] == '443'):
|
|
|
|
netloc = parts[0]
|
|
|
|
else:
|
|
|
|
netloc = ':'.join(parts)
|
|
|
|
else:
|
|
|
|
netloc = o.netloc
|
2012-11-29 17:52:10 +01:00
|
|
|
|
2011-05-13 15:17:58 +02:00
|
|
|
# Remove trailing slash
|
|
|
|
path = o.path.rstrip('/')
|
|
|
|
|
|
|
|
check_url = urlparse.urlunparse((
|
|
|
|
o.scheme,
|
|
|
|
netloc,
|
|
|
|
path,
|
2015-10-29 18:18:51 +01:00
|
|
|
None, None, None))
|
2011-05-13 15:17:58 +02:00
|
|
|
|
|
|
|
return check_url
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
|
|
|
def _harvest_config_set(config):
    """Return the ``set`` option of a JSON config string, or None.

    None is also returned when the config is missing, is not valid JSON, or
    does not decode to an object with a ``get`` method.
    """
    try:
        return json.loads(config).get('set', None)
    except (TypeError, ValueError, AttributeError):
        # TypeError: config is None / not a string
        # ValueError: config is not valid JSON
        # AttributeError: the JSON document is not an object
        return None


def harvest_source_url_validator(key, data, errors, context):
    """Validate the provided harvest source URL.

    Checks that the URL is not already existing with the same config.

    Two sources are considered duplicates when their normalised URLs are
    equal and the ``set`` option of their JSON configs is equal (a missing
    or unparsable config counts as ``set`` being None).

    :param key: flattened key (tuple) of the URL field being validated
    :param data: flattened data dict; the sibling ``id`` and ``config``
        fields are read from it
    :param errors: flattened errors dict (not used)
    :param context: validation context; when it holds a ``package`` that
        package's id identifies the source being edited
    :returns: the URL as provided (``data[key]``)
    :raises Invalid: if another source already uses this URL and set
    """
    package = context.get("package")

    if package:
        package_id = package.id
    else:
        package_id = data.get(key[:-1] + ("id",))

    new_config_set = _harvest_config_set(data.get(key[:-1] + ('config',)))
    new_url = _normalize_url(data[key])

    # Without a join condition the query pairs every harvest source with
    # every source package, so the "exclude own URL" filter below would not
    # exclude anything as soon as more than one source exists.
    # NOTE(review): the join assumes a harvest source shares its id with its
    # source package -- confirm against the HarvestSource model.
    q = model.Session.query(HarvestSource.url, HarvestSource.config) \
        .filter(HarvestSource.id == model.Package.id) \
        .filter(model.Package.type == DATASET_TYPE_NAME)

    if package_id:
        # When editing a source we need to avoid its own URL.
        q = q.filter(model.Package.id != package_id)

    for url, conf in q.all():
        # You can have a duplicate URL if it's pointing to a unique
        # set as it will be harvesting unique datasets.
        if (_normalize_url(url) == new_url
                and _harvest_config_set(conf) == new_config_set):
            raise Invalid(
                'There already is a Harvest Source for this URL: %s'
                % data[key]
            )

    return data[key]
|
2011-05-13 15:17:58 +02:00
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
|
|
|
def harvest_source_type_exists(value, context):
    """Validate that ``value`` is the name of a registered harvester.

    The names are collected from the ``info()`` response of every
    ``IHarvester`` plugin implementation. A harvester whose info is empty or
    has no ``name`` entry is logged as an error and ignored. ``context`` is
    accepted but not used.

    :returns: ``value`` unchanged when it matches a collected name
    :raises Invalid: when no harvester reported that name
    """
    # TODO: use new description interface

    # Collect the names of all the registered harvester types
    known_types = []
    for plugin in PluginImplementations(IHarvester):
        details = plugin.info()
        if details and 'name' in details:
            known_types.append(details['name'])
        else:
            log.error(
                'Harvester %r does not provide the harvester name in the info '
                'response' % str(plugin)
            )

    if value in known_types:
        return value

    raise Invalid(
        'Unknown harvester type: %s. Have you registered a harvester for '
        'this type?' % value
    )
|
2011-06-07 13:07:53 +02:00
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
|
|
|
def harvest_source_config_validator(key, data, errors, context):
    """Validate the source config with the harvester of the source's type.

    The harvester is the first ``IHarvester`` implementation whose
    ``info()['name']`` equals the ``source_type`` field of ``data``. If it
    exposes ``validate_config`` the result of calling it with the config is
    returned; if it does not, the config is returned untouched. When no
    harvester matches, nothing is returned.

    :param key: flattened key (tuple) of the config field
    :param data: flattened data dict
    :param errors: flattened errors dict (not used)
    :param context: validation context (not used)
    :raises Invalid: if the harvester's ``validate_config`` raises
    """
    harvester_type = data.get(('source_type',), '')
    for harvester in PluginImplementations(IHarvester):
        info = harvester.info()
        # Ignore harvesters that do not report a name instead of failing
        # with a KeyError/TypeError; harvest_source_type_exists tolerates
        # them in the same way.
        if not info or 'name' not in info:
            continue
        if info['name'] != harvester_type:
            continue

        if not hasattr(harvester, 'validate_config'):
            return data[key]

        try:
            return harvester.validate_config(data[key])
        except Exception as e:
            # Surface any harvester-side failure as a validation error
            raise Invalid(
                'Error parsing the configuration options: %s' % str(e))
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
2013-02-25 19:07:34 +01:00
|
|
|
def keep_not_empty_extras(key, data, errors, context):
    """Move the truthy entries stored under ``key`` up beside it.

    The mapping found at ``data[key]`` is removed from ``data`` (nothing
    happens if it is absent). Every entry of it with a truthy value is
    written back to ``data`` under ``key`` with its last element replaced by
    the entry's name; entries with falsy values are dropped. ``errors`` and
    ``context`` are not used.
    """
    popped = data.pop(key, {})
    non_empty = ((name, val) for name, val in popped.items() if val)
    for name, val in non_empty:
        data[key[:-1] + (name,)] = val
|
|
|
|
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
def harvest_source_extra_validator(key, data, errors, context):
    """Validate harvester-specific extra fields and fold them into config.

    Steps, in order:

    1. Collect the field names of every harvester's ``extra_schema()``;
       only these may be promoted to the top level of ``data``.
    2. Validate ``data[key]`` against the ``extra_schema()`` of the
       harvester whose name equals the ``source_type`` field (plus a
       catch-all ``__extras`` entry that keeps non-empty values).
    3. Copy the accepted values into ``data`` and any validation errors
       into ``errors``, each keyed by a one-element tuple.
    4. Merge the accepted values into the JSON ``config`` package extra and,
       if the result is non-empty and there were no errors, store it both
       as a ``config`` extra and as ``data[('config',)]``.

    :param key: flattened key (tuple) holding the extra fields to validate
    :param data: flattened data dict, modified in place
    :param errors: flattened errors dict, modified in place
    :param context: validation context (not used)
    """
    harvester_type = data.get(('source_type',), '')

    # gather all extra fields to use as whitelist of what
    # can be added to top level data_dict
    all_extra_fields = set()
    for harvester in PluginImplementations(IHarvester):
        if not hasattr(harvester, 'extra_schema'):
            continue
        all_extra_fields.update(harvester.extra_schema().keys())

    extra_schema = {'__extras': [keep_not_empty_extras]}
    for harvester in PluginImplementations(IHarvester):
        if not hasattr(harvester, 'extra_schema'):
            continue
        info = harvester.info()
        # A harvester that reports no name can never match; skip it rather
        # than fail with a KeyError/TypeError (harvest_source_type_exists
        # tolerates such harvesters too).
        if not info or 'name' not in info:
            continue
        if info['name'] != harvester_type:
            continue
        extra_schema.update(harvester.extra_schema())
        break

    extra_data, extra_errors = validate(data.get(key, {}), extra_schema)

    # only allow keys that appear in at least one harvester
    # (iterate over a snapshot, entries are removed while looping)
    for field in list(extra_data.keys()):
        if field not in all_extra_fields:
            extra_data.pop(field)

    for field, value in extra_data.items():
        data[(field,)] = value

    for field, value in extra_errors.items():
        errors[(field,)] = value

    # need to get config out of extras as __extra runs
    # after rest of validation
    package_extras = data.get(('extras',), [])

    for num, extra in enumerate(list(package_extras)):
        if extra['key'] == 'config':
            # remove config extra so we can add back cleanly later
            package_extras.pop(num)
            try:
                config_dict = json.loads(extra.get('value') or '{}')
                if not isinstance(config_dict, dict):
                    # e.g. "[]" or "1": valid JSON, but there is nothing
                    # the extra fields could be merged into
                    raise ValueError('config is not a JSON object')
            except ValueError:
                log.error('Wrong JSON provided in config, skipping')
                config_dict = {}
            break
    else:
        # no config extra present
        config_dict = {}

    config_dict.update(extra_data)
    if config_dict and not extra_errors:
        config = json.dumps(config_dict)
        package_extras.append(dict(key='config',
                                   value=config))
        data[('config',)] = config
    if package_extras:
        data[('extras',)] = package_extras
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
|
|
|
def harvest_source_convert_from_config(key, data, errors, context):
    """Expand the JSON config stored at ``data[key]`` into ``data``.

    When the config is truthy it is decoded with ``json.loads`` and every
    entry of the decoded object is written to ``data`` under a one-element
    tuple key. A falsy config leaves ``data`` untouched. ``errors`` and
    ``context`` are not used.
    """
    raw_config = data[key]
    if not raw_config:
        return
    for option, setting in json.loads(raw_config).items():
        data[(option,)] = setting
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
|
|
|
def harvest_source_active_validator(value, context):
    """Convert ``value`` to a boolean "active" flag.

    A string is True only when it equals ``'true'`` ignoring case; every
    other string is False. Any non-string value is converted with
    ``bool()``. ``context`` is accepted but not used.
    """
    if not isinstance(value, basestring):
        return bool(value)
    return value.lower() == 'true'
|
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
2012-10-29 18:15:02 +01:00
|
|
|
def harvest_source_frequency_exists(value):
    """Validate an update frequency and return it upper-cased.

    An empty string is treated as ``'MANUAL'``. ``Invalid`` is raised when
    the upper-cased frequency is not in ``UPDATE_FREQUENCIES``.
    """
    frequency = 'MANUAL' if value == '' else value
    normalized = frequency.upper()
    if normalized in UPDATE_FREQUENCIES:
        return normalized
    raise Invalid('Frequency %s not recognised' % frequency)
|
2012-11-29 12:48:36 +01:00
|
|
|
|
|
|
|
|
|
|
|
def dataset_type_exists(value):
    """Force the dataset type to ``DATASET_TYPE_NAME``.

    Any value that differs from ``DATASET_TYPE_NAME`` is replaced by it; a
    value that already compares equal is returned as is.
    """
    return DATASET_TYPE_NAME if value != DATASET_TYPE_NAME else value
|
2013-09-17 17:49:19 +02:00
|
|
|
|
2015-10-29 18:18:51 +01:00
|
|
|
|
2013-09-17 17:49:19 +02:00
|
|
|
def harvest_object_extras_validator(value, context):
    """Validate that ``value`` is a dict whose values are all strings.

    Returns ``value`` unchanged on success. ``Invalid`` is raised when
    ``value`` is not a dict or when any of its values is not a string.
    ``context`` is accepted but not used.
    """
    if not isinstance(value, dict):
        raise Invalid('extras must be a dict')
    if not all(isinstance(item, basestring) for item in value.values()):
        raise Invalid('extras must be a dict of strings')
    return value
|