From 4d2fdeac57dfcf888d481007ae8300415012fdf0 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 2 Aug 2012 18:41:59 +0100 Subject: [PATCH] Allow defining segments of harvest objects to import Useful when importing large number of objects, as it allows parallelization --- ckanext/harvest/commands/harvester.py | 14 ++++++++++++-- ckanext/harvest/logic/action/update.py | 9 +++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 8e3bfbe..e9b44f3 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -40,7 +40,7 @@ class Harvester(CkanCommand): harvester fetch_consumer - starts the consumer for the fetching queue - harvester [-j] import [{source-id}] + harvester [-j] [--segments={segments}] import [{source-id}] - perform the import stage with the last fetched objects, optionally belonging to a certain source. Please note that no objects will be fetched from the remote server. It will only affect the last fetched objects already present in the database. @@ -48,6 +48,9 @@ class Harvester(CkanCommand): If the -j flag is provided, the objects are not joined to existing datasets. This may be useful when importing objects for the first time. + The --segments flag allows to define a string containing hex digits that represent which of + the 16 harvest object segments to import. e.g. 15af will run segments 1,5,a,f + harvester job-all - create new harvest jobs for all active sources. @@ -71,6 +74,11 @@ class Harvester(CkanCommand): self.parser.add_option('-j', '--no-join-datasets', dest='no_join_datasets', action='store_true', default=False, help='Do not join harvest objects to existing datasets') + self.parser.add_option('--segments', dest='segments', + default=False, help= +'''A string containing hex digits that represent which of + the 16 harvest object segments to import. e.g. 15af will run segments 1,5,a,f''') + def command(self): self._load_config() @@ -237,13 +245,15 @@ class Harvester(CkanCommand): #print 'Sent %s jobs to the gather queue' % len(jobs) def import_stage(self): + if len(self.args) >= 2: source_id = unicode(self.args[1]) else: source_id = None context = {'model': model, 'session':model.Session, 'user': self.admin_user['name'], - 'join_datasets': not self.options.no_join_datasets} + 'join_datasets': not self.options.no_join_datasets, + 'segments': self.options.segments} objs = get_action('harvest_objects_import')(context,{'source_id':source_id}) diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 8db7162..c99c5cc 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -1,3 +1,5 @@ +import hashlib + import logging from ckan.plugins import PluginImplementations @@ -81,6 +83,8 @@ def harvest_objects_import(context,data_dict): session = context['session'] source_id = data_dict.get('source_id',None) + segments = context.get('segments',None) + join_datasets = context.get('join_datasets',True) if source_id: @@ -109,8 +113,13 @@ def harvest_objects_import(context,data_dict): last_objects_ids = last_objects_ids.all() last_objects = [] + for obj_id in last_objects_ids: + if segments and str(hashlib.md5(obj_id[0]).hexdigest())[0] not in segments: + continue + obj = session.query(HarvestObject).get(obj_id) + for harvester in PluginImplementations(IHarvester): if harvester.info()['name'] == obj.source.type: if hasattr(harvester,'force_import'):