diff --git a/README.rst b/README.rst index 42fbb0e..373052d 100644 --- a/README.rst +++ b/README.rst @@ -117,25 +117,38 @@ The CKAN harvesters support a number of configuration options to control their behaviour. Those need to defined as a JSON object in the configuration form field. The currently supported configuration options are: -* api_version: You can force the harvester to use either version '1' or - '2' of the CKAN API. Default is '2'. +* api_version: You can force the harvester to use either version '1' or '2' of + the CKAN API. Default is '2'. -* default_tags: A list of tags that will be added to all harvested datasets. +* default_tags: A list of tags that will be added to all harvested datasets. Tags don't need to previously exist. -* default_groups: A list of groups to which the harvested datasets will be +* default_groups: A list of groups to which the harvested datasets will be added to. The groups must exist. Note that you must use ids or names to - define the groups according to the API version you defined (names for - version '1', ids for version '2') + define the groups according to the API version you defined (names for version + '1', ids for version '2'). -* user: User who will run the harvesting process. Please note that this user +* default_extras: A dictionary of key value pairs that will be added to extras + of the harvested datasets. You can use the following replacement strings, + that will be replaced before creating or updating the datasets: + + * {dataset_id} + * {harvest_source_id} + * {harvest_source_url} # Will be stripped of trailing forward slashes (/) + * {harvest_job_id} + * {harvest_object_id} + +* override_extras: Assign default extras even if they already exist in the + remote dataset. Default is False (only non existing extras are added). + +* user: User who will run the harvesting process. 
Please note that this user needs to have permission for creating packages, and if default groups were defined, the user must have permission to assign packages to these groups. -* api_key: If the remote CKAN instance has restricted access to the API you +* api_key: If the remote CKAN instance has restricted access to the API you can provide a CKAN API key, which will be sent in any request. -* read_only: Create harvested packages in read-only mode. Only the user who +* read_only: Create harvested packages in read-only mode. Only the user who performed the harvest (the one defined in the previous setting or the 'harvest' sysadmin) will be able to edit and administer the packages created from this harvesting source. Logged in users and visitors will be @@ -148,6 +161,8 @@ the configuration field):: "api_version":"1", "default_tags":["new-tag-1","new-tag-2"], "default_groups":["my-own-group"], + "default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"}, + "override_extras": true, "user":"harverster-user", "api_key":"", "read_only": true diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 50202d7..7e293e4 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -71,7 +71,14 @@ class CKANHarvester(HarvesterBase): try: config_obj = json.loads(config) + if 'default_tags' in config_obj: + if not isinstance(config_obj['default_tags'],list): + raise ValueError('default_tags must be a list') + if 'default_groups' in config_obj: + if not isinstance(config_obj['default_groups'],list): + raise ValueError('default_groups must be a list') + # Check if default groups exist context = {'model':model,'user':c.user} for group_name in config_obj['default_groups']: @@ -80,6 +87,10 @@ class CKANHarvester(HarvesterBase): except NotFound,e: raise ValueError('Default group not found') + if 'default_extras' in config_obj: + if not 
isinstance(config_obj['default_extras'],dict): + raise ValueError('default_extras must be a dictionary') + if 'user' in config_obj: # Check if user exists context = {'model':model,'user':c.user} @@ -237,6 +248,23 @@ class CKANHarvester(HarvesterBase): package_dict['groups'] = [] package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']]) + # Set default extras if needed + default_extras = self.config.get('default_extras',{}) + if default_extras: + override_extras = self.config.get('override_extras',False) + if not 'extras' in package_dict: + package_dict['extras'] = {} + for key,value in default_extras.iteritems(): + if not key in package_dict['extras'] or override_extras: + # Look for replacement strings + if isinstance(value,basestring): + value = value.format(harvest_source_id=harvest_object.job.source.id, + harvest_source_url=harvest_object.job.source.url.strip('/'), + harvest_job_id=harvest_object.job.id, + harvest_object_id=harvest_object.id, + dataset_id=package_dict['id']) + package_dict['extras'][key] = value + result = self._create_or_update_package(package_dict,harvest_object) if result and self.config.get('read_only',False) == True: