From 7011efe5dc42ea0b9f1d078953c2fc3e43c91260 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 30 Jul 2012 12:11:55 +0100 Subject: [PATCH] Allow not linking to datasets when importing records With the -j flag, the query that selects the harvest objects to import is not joined to existing active datasets, so objects that do not have a dataset yet are imported too. This is sometimes useful when importing records for the first time. --- ckanext/harvest/commands/harvester.py | 18 ++++++++++++++++-- ckanext/harvest/logic/action/update.py | 20 ++++++++++++-------- 2 files changed, 28 insertions(+), 10 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 77830bc..8e3bfbe 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -40,11 +40,14 @@ class Harvester(CkanCommand): harvester fetch_consumer - starts the consumer for the fetching queue - harvester import [{source-id}] + harvester [-j] import [{source-id}] - perform the import stage with the last fetched objects, optionally belonging to a certain source. Please note that no objects will be fetched from the remote server. It will only affect the last fetched objects already present in the database. + If the -j flag is provided, the objects are not joined to existing datasets. This may be useful + when importing objects for the first time. + harvester job-all - create new harvest jobs for all active sources. 
@@ -61,6 +64,13 @@ class Harvester(CkanCommand): max_args = 6 min_args = 0 + def __init__(self,name): + + super(Harvester,self).__init__(name) + + self.parser.add_option('-j', '--no-join-datasets', dest='no_join_datasets', + action='store_true', default=False, help='Do not join harvest objects to existing datasets') + def command(self): self._load_config() @@ -231,7 +241,11 @@ class Harvester(CkanCommand): source_id = unicode(self.args[1]) else: source_id = None - context = {'model': model, 'session':model.Session, 'user': self.admin_user['name']} + + context = {'model': model, 'session':model.Session, 'user': self.admin_user['name'], + 'join_datasets': not self.options.no_join_datasets} + + objs = get_action('harvest_objects_import')(context,{'source_id':source_id}) print '%s objects reimported' % len(objs) diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 49536cb..8db7162 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -81,6 +81,8 @@ def harvest_objects_import(context,data_dict): session = context['session'] source_id = data_dict.get('source_id',None) + join_datasets = context.get('join_datasets',True) + if source_id: source = HarvestSource.get(source_id) if not source: @@ -92,17 +94,19 @@ def harvest_objects_import(context,data_dict): raise Exception('This harvest source is not active') last_objects_ids = session.query(HarvestObject.id) \ - .join(HarvestSource).join(Package) \ + .join(HarvestSource) \ .filter(HarvestObject.source==source) \ - .filter(HarvestObject.current==True) \ - .filter(Package.state==u'active') \ - .all() + .filter(HarvestObject.current==True) + else: last_objects_ids = session.query(HarvestObject.id) \ - .join(Package) \ .filter(HarvestObject.current==True) \ - .filter(Package.state==u'active') \ - .all() + + if join_datasets: + last_objects_ids = last_objects_ids.join(Package) \ + .filter(Package.state==u'active') + + last_objects_ids = 
last_objects_ids.all() last_objects = [] for obj_id in last_objects_ids: @@ -114,7 +118,7 @@ def harvest_objects_import(context,data_dict): harvester.import_stage(obj) break last_objects.append(harvest_object_dictize(obj,context)) - log.info('Harvest objects imported: %r', last_objects) + log.info('Harvest objects imported: %s', len(last_objects)) return last_objects def harvest_jobs_run(context,data_dict):