diff --git a/README.rst b/README.rst
index 1660254..0a6466b 100644
--- a/README.rst
+++ b/README.rst
@@ -151,6 +151,11 @@ field. The currently supported configuration options are:
     created from this harvesting source. Logged in users and visitors will be
     only able to read them.
 
+*   force_all: By default, after the first harvesting, the harvester will gather
+    only the modified packages from the remote site since the last harvesting.
+    Setting this property to true will force the harvester to gather all remote
+    packages regardless of the modification date. Default is False.
+
 Here is an example of a configuration object (the one that must be entered in
 the configuration field)::
 
diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py
index 7e293e4..8a3c5fc 100644
--- a/ckanext/harvest/harvesters/ckanharvester.py
+++ b/ckanext/harvest/harvesters/ckanharvester.py
@@ -99,6 +99,11 @@ class CKANHarvester(HarvesterBase):
                 except NotFound,e:
                     raise ValueError('User not found')
 
+            for key in ('read_only','force_all'):
+                if key in config_obj:
+                    if not isinstance(config_obj[key],bool):
+                        raise ValueError('%s must be boolean' % key)
+
         except ValueError,e:
             raise e
 
@@ -125,7 +130,8 @@ class CKANHarvester(HarvesterBase):
         base_rest_url = base_url + self._get_rest_api_offset()
         base_search_url = base_url + self._get_search_api_offset()
 
-        if previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0:
+        if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0) \
+                and not self.config.get('force_all',False):
             get_all_packages = False
 
             # Request only the packages modified since last harvest job