diff --git a/README.rst b/README.rst index c4449d2..13b3a7d 100644 --- a/README.rst +++ b/README.rst @@ -110,6 +110,61 @@ to add the `ckan_harvester` plugin to your options file:: After adding it, a 'CKAN' option should appear in the 'New harvest source' form. +The CKAN harvesters support a number of configuration options to control their +behaviour. These need to be defined as a JSON object in the configuration form +field. The currently supported configuration options are: + +* api_version: You can force the harvester to use either version '1' or '2' of + the CKAN API. Default is '2'. + +* default_tags: A list of tags that will be added to all harvested datasets. + Tags don't need to previously exist. + +* default_groups: A list of groups to which the harvested datasets will be + added. The groups must exist. Note that you must use ids or names to + define the groups according to the API version you defined (names for version + '1', ids for version '2'). + +* default_extras: A dictionary of key value pairs that will be added to extras + of the harvested datasets. You can use the following replacement strings, + that will be replaced before creating or updating the datasets: + + * {dataset_id} + * {harvest_source_id} + * {harvest_source_url} # Will be stripped of trailing forward slashes (/) + * {harvest_job_id} + * {harvest_object_id} + +* override_extras: Assign default extras even if they already exist in the + remote dataset. Default is False (only non-existing extras are added). + +* user: User who will run the harvesting process. Please note that this user + needs to have permission for creating packages, and if default groups were + defined, the user must have permission to assign packages to these groups. + +* api_key: If the remote CKAN instance has restricted access to the API you + can provide a CKAN API key, which will be sent in any request. + +* read_only: Create harvested packages in read-only mode. 
Only the user who + performed the harvest (the one defined in the previous setting or the + 'harvest' sysadmin) will be able to edit and administer the packages + created from this harvesting source. Logged in users and visitors will + only be able to read them. + +Here is an example of a configuration object (the one that must be entered in +the configuration field):: + + { + "api_version":"1", + "default_tags":["new-tag-1","new-tag-2"], + "default_groups":["my-own-group"], + "default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"}, + "override_extras": true, + "user":"harvester-user", + "api_key":"", + "read_only": true + } + The harvesting interface ======================== @@ -256,7 +311,7 @@ Finally, on a third console, run the following command to start any pending harvesting jobs:: paster harvester run --config=../ckan/development.ini - + After packages have been imported, the search index will have to be updated before the packages appear in search results (from the ckan directory): diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 965b541..3f01f6b 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -1,13 +1,13 @@ import logging - +import re from ckan import model from ckan.model import Session, Package from ckan.logic import ValidationError, NotFound, get_action from ckan.logic.schema import default_package_schema -from ckan.lib.navl.validators import ignore_missing -from ckan.lib.munge import munge_title_to_name, munge_tag +from ckan.lib.navl.validators import ignore_missing,ignore +from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ HarvestObjectError @@ -17,6 +17,11 @@ from ckanext.harvest.interfaces import IHarvester log = logging.getLogger(__name__) +def munge_tag(tag): + tag = substitute_ascii_equivalents(tag) + tag = 
tag.lower().strip() + return re.sub(r'[^a-zA-Z0-9 -]', '', tag).replace(' ', '-') + class HarvesterBase(SingletonPlugin): ''' Generic class for harvesters with helper functions @@ -104,13 +109,23 @@ class HarvesterBase(SingletonPlugin): try: # Change default schema schema = default_package_schema() - schema["id"] = [ignore_missing, unicode] + schema['id'] = [ignore_missing, unicode] + schema['__junk'] = [ignore] + + # Check API version + if self.config: + api_version = self.config.get('api_version','2') + #TODO: use site user when available + user_name = self.config.get('user',u'harvest') + else: + api_version = '2' + user_name = u'harvest' context = { 'model': model, 'session': Session, - 'user': u'harvest', - 'api_version':'2', + 'user': user_name, + 'api_version': api_version, 'schema': schema, } diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 0354bf5..7e293e4 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -1,7 +1,9 @@ import urllib2 +from ckan.lib.base import c +from ckan import model from ckan.model import Session, Package -from ckan.logic import ValidationError, NotFound +from ckan.logic import ValidationError, NotFound, get_action from ckan.lib.helpers import json from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ @@ -34,6 +36,9 @@ class CKANHarvester(HarvesterBase): ) try: + api_key = self.config.get('api_key',None) + if api_key: + http_request.add_header('Authorization',api_key) http_response = urllib2.urlopen(http_request) return http_response.read() @@ -65,6 +70,35 @@ class CKANHarvester(HarvesterBase): try: config_obj = json.loads(config) + + if 'default_tags' in config_obj: + if not isinstance(config_obj['default_tags'],list): + raise ValueError('default_tags must be a list') + + if 'default_groups' in config_obj: + if not isinstance(config_obj['default_groups'],list): + raise 
ValueError('default_groups must be a list') + + # Check if default groups exist + context = {'model':model,'user':c.user} + for group_name in config_obj['default_groups']: + try: + group = get_action('group_show')(context,{'id':group_name}) + except NotFound,e: + raise ValueError('Default group not found') + + if 'default_extras' in config_obj: + if not isinstance(config_obj['default_extras'],dict): + raise ValueError('default_extras must be a dictionary') + + if 'user' in config_obj: + # Check if user exists + context = {'model':model,'user':c.user} + try: + user = get_action('user_show')(context,{'id':config_obj.get('user')}) + except NotFound,e: + raise ValueError('User not found') + except ValueError,e: raise e @@ -196,7 +230,61 @@ class CKANHarvester(HarvesterBase): try: package_dict = json.loads(harvest_object.content) - return self._create_or_update_package(package_dict,harvest_object) + + # Set default tags if needed + default_tags = self.config.get('default_tags',[]) + if default_tags: + if not 'tags' in package_dict: + package_dict['tags'] = [] + package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']]) + + # Ignore remote groups for the time being + del package_dict['groups'] + + # Set default groups if needed + default_groups = self.config.get('default_groups',[]) + if default_groups: + if not 'groups' in package_dict: + package_dict['groups'] = [] + package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']]) + + # Set default extras if needed + default_extras = self.config.get('default_extras',{}) + if default_extras: + override_extras = self.config.get('override_extras',False) + if not 'extras' in package_dict: + package_dict['extras'] = {} + for key,value in default_extras.iteritems(): + if not key in package_dict['extras'] or override_extras: + # Look for replacement strings + if isinstance(value,basestring): + value = value.format(harvest_source_id=harvest_object.job.source.id, + 
harvest_source_url=harvest_object.job.source.url.strip('/'), + harvest_job_id=harvest_object.job.id, + harvest_object_id=harvest_object.id, + dataset_id=package_dict['id']) + package_dict['extras'][key] = value + + result = self._create_or_update_package(package_dict,harvest_object) + + if result and self.config.get('read_only',False) == True: + + package = model.Package.get(package_dict['id']) + + # Clear default permissions + model.clear_user_roles(package) + + # Setup harvest user as admin + user_name = self.config.get('user',u'harvest') + user = model.User.get(user_name) + pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) + + # Other users can only read + for user_name in (u'visitor',u'logged_in'): + user = model.User.get(user_name) + pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) + + except ValidationError,e: self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import') diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index cd3ef06..c1c5d81 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -249,11 +249,13 @@ def edit_harvest_source(source_id,data_dict): Session.rollback() raise ValidationError(errors,_error_summary(errors)) - fields = ['url','type','active','description','user_id','publisher_id','config'] + fields = ['url','type','active','description','user_id','publisher_id'] for f in fields: if f in data_dict and data_dict[f] is not None and data_dict[f] != '': source.__setattr__(f,data_dict[f]) + source.config = data_dict['config'] + source.save() return _source_as_dict(source) diff --git a/ckanext/harvest/public/ckanext/harvest/style.css b/ckanext/harvest/public/ckanext/harvest/style.css index de04d84..9391159 100644 --- a/ckanext/harvest/public/ckanext/harvest/style.css +++ b/ckanext/harvest/public/ckanext/harvest/style.css @@ -13,3 +13,12 @@ .harvester-title{ 
font-weight: bold; } + +#harvest-source-actions { + margin-bottom: 10px; +} + +#harvest-source-actions img{ + vertical-align: middle; + margin: 0 5px; +} diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index d60b986..43fd8a9 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -13,6 +13,9 @@

Harvest Source Details

+ @@ -34,6 +37,15 @@ + + + + + + + + +
IDDescription ${c.source.description}
Configuration${c.source.config}-
User ${c.source.user_id}