[ckan harvester] Add support for defining default extras

2012-01-10 17:07:19 +00:00 · 2012-01-10 17:07:19 +00:00 · eb646b3385
parent ae51093213
commit eb646b3385
2 changed files with 52 additions and 9 deletions
--- a/README.rst
+++ b/README.rst
@ -117,25 +117,38 @@ The CKAN harvesters support a number of configuration options to control their
 behaviour. Those need to be defined as a JSON object in the configuration form
 field. The currently supported configuration options are:

-* api_version: You can force the harvester to use either version '1' or
-    '2' of the CKAN API. Default is '2'.
+*   api_version: You can force the harvester to use either version '1' or '2' of
+    the CKAN API. Default is '2'.

-* default_tags: A list of tags that will be added to all harvested datasets.
+*   default_tags: A list of tags that will be added to all harvested datasets.
    Tags don't need to previously exist.

-* default_groups: A list of groups to which the harvested datasets will be
+*   default_groups: A list of groups to which the harvested datasets will be
    added. The groups must exist. Note that you must use ids or names to
-    define the groups according to the API version you defined (names for
-    version '1', ids for version '2')
+    define the groups according to the API version you defined (names for version
+    '1', ids for version '2').

-* user: User who will run the harvesting process. Please note that this user
+*   default_extras: A dictionary of key-value pairs that will be added to the
+    extras of the harvested datasets. You can use the following replacement
+    strings, which will be replaced before creating or updating the datasets:
+
+    * {dataset_id}
+    * {harvest_source_id}
+    * {harvest_source_url}   # Will be stripped of trailing forward slashes (/)
+    * {harvest_job_id}
+    * {harvest_object_id}
+
+*   override_extras: Assign default extras even if they already exist in the
+    remote dataset. Default is False (only non-existing extras are added).
+
+*   user: User who will run the harvesting process. Please note that this user
    needs to have permission for creating packages, and if default groups were
    defined, the user must have permission to assign packages to these groups.

-* api_key: If the remote CKAN instance has restricted access to the API you
+*   api_key: If the remote CKAN instance has restricted access to the API you
    can provide a CKAN API key, which will be sent in any request.

-* read_only: Create harvested packages in read-only mode. Only the user who
+*   read_only: Create harvested packages in read-only mode. Only the user who
    performed the harvest (the one defined in the previous setting or the
    'harvest' sysadmin) will be able to edit and administer the packages
    created from this harvesting source. Logged in users and visitors will be
@ -148,6 +161,8 @@ the configuration field)::
     "api_version":"1",
     "default_tags":["new-tag-1","new-tag-2"],
     "default_groups":["my-own-group"],
+     "default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"},
+     "override_extras": true,
     "user":"harvester-user",
     "api_key":"<REMOTE_API_KEY>",
     "read_only": true
--- a/ckanext/harvest/harvesters/ckanharvester.py
+++ b/ckanext/harvest/harvesters/ckanharvester.py
@ -71,7 +71,14 @@ class CKANHarvester(HarvesterBase):
        try:
            config_obj = json.loads(config)

+            if 'default_tags' in config_obj:
+                if not isinstance(config_obj['default_tags'],list):
+                    raise ValueError('default_tags must be a list')
+
            if 'default_groups' in config_obj:
+                if not isinstance(config_obj['default_groups'],list):
+                    raise ValueError('default_groups must be a list')
+
                # Check if default groups exist
                context = {'model':model,'user':c.user}
                for group_name in config_obj['default_groups']:
@ -80,6 +87,10 @@ class CKANHarvester(HarvesterBase):
                    except NotFound,e:
                        raise ValueError('Default group not found')

+            if 'default_extras' in config_obj:
+                if not isinstance(config_obj['default_extras'],dict):
+                    raise ValueError('default_extras must be a dictionary')
+
            if 'user' in config_obj:
                # Check if user exists
                context = {'model':model,'user':c.user}
@ -237,6 +248,23 @@ class CKANHarvester(HarvesterBase):
                    package_dict['groups'] = []
                package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])

+            # Set default extras if needed
+            default_extras = self.config.get('default_extras',{})
+            if default_extras:
+                override_extras = self.config.get('override_extras',False)
+                if not 'extras' in package_dict:
+                    package_dict['extras'] = {}
+                for key,value in default_extras.iteritems():
+                    if not key in package_dict['extras'] or override_extras:
+                        # Look for replacement strings
+                        if isinstance(value,basestring):
+                            value = value.format(harvest_source_id=harvest_object.job.source.id,
+                                     harvest_source_url=harvest_object.job.source.url.strip('/'),
+                                     harvest_job_id=harvest_object.job.id,
+                                     harvest_object_id=harvest_object.id,
+                                     dataset_id=package_dict['id'])
+                        package_dict['extras'][key] = value
+
            result = self._create_or_update_package(package_dict,harvest_object)

            if result and self.config.get('read_only',False) == True: