From 7011efe5dc42ea0b9f1d078953c2fc3e43c91260 Mon Sep 17 00:00:00 2001
From: amercader <amercadero@gmail.com>
Date: Mon, 30 Jul 2012 12:11:55 +0100
Subject: [PATCH] Allow not linking to datasets when importing records

With the -j (--no-join-datasets) flag, harvest objects are not linked
to datasets when importing. This is sometimes useful when importing
records for the first time.
---
 ckanext/harvest/commands/harvester.py  | 18 ++++++++++++++++--
 ckanext/harvest/logic/action/update.py | 20 ++++++++++++--------
 2 files changed, 28 insertions(+), 10 deletions(-)

diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py
index 77830bc..8e3bfbe 100644
--- a/ckanext/harvest/commands/harvester.py
+++ b/ckanext/harvest/commands/harvester.py
@@ -40,11 +40,14 @@ class Harvester(CkanCommand):
       harvester fetch_consumer
         - starts the consumer for the fetching queue
 
-      harvester import [{source-id}]
+      harvester [-j] import [{source-id}]
         - perform the import stage with the last fetched objects, optionally belonging to a certain source.
           Please note that no objects will be fetched from the remote server. It will only affect
           the last fetched objects already present in the database.
 
+          If the -j flag is provided, the objects are not joined to existing datasets. This may be useful
+          when importing objects for the first time.
+
       harvester job-all
         - create new harvest jobs for all active sources.
 
@@ -61,6 +64,13 @@ class Harvester(CkanCommand):
     max_args = 6
     min_args = 0
 
+    def __init__(self,name):
+
+        super(Harvester,self).__init__(name)
+
+        self.parser.add_option('-j', '--no-join-datasets', dest='no_join_datasets',
+            action='store_true', default=False, help='Do not join harvest objects to existing datasets')
+
     def command(self):
         self._load_config()
 
@@ -231,7 +241,11 @@ class Harvester(CkanCommand):
             source_id = unicode(self.args[1])
         else:
             source_id = None
-        context = {'model': model, 'session':model.Session, 'user': self.admin_user['name']}
+
+        context = {'model': model, 'session':model.Session, 'user': self.admin_user['name'],
+                   'join_datasets': not self.options.no_join_datasets}
+
+
         objs = get_action('harvest_objects_import')(context,{'source_id':source_id})
 
         print '%s objects reimported' % len(objs)
diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py
index 49536cb..8db7162 100644
--- a/ckanext/harvest/logic/action/update.py
+++ b/ckanext/harvest/logic/action/update.py
@@ -81,6 +81,8 @@ def harvest_objects_import(context,data_dict):
     session = context['session']
     source_id = data_dict.get('source_id',None)
 
+    join_datasets = context.get('join_datasets',True)
+
     if source_id:
         source = HarvestSource.get(source_id)
         if not source:
@@ -92,17 +94,19 @@ def harvest_objects_import(context,data_dict):
             raise Exception('This harvest source is not active')
 
         last_objects_ids = session.query(HarvestObject.id) \
-                .join(HarvestSource).join(Package) \
+                .join(HarvestSource) \
                 .filter(HarvestObject.source==source) \
-                .filter(HarvestObject.current==True) \
-                .filter(Package.state==u'active') \
-                .all()
+                .filter(HarvestObject.current==True)
+
     else:
         last_objects_ids = session.query(HarvestObject.id) \
-                .join(Package) \
                 .filter(HarvestObject.current==True) \
-                .filter(Package.state==u'active') \
-                .all()
+
+    if join_datasets:
+        last_objects_ids = last_objects_ids.join(Package) \
+            .filter(Package.state==u'active')
+
+    last_objects_ids = last_objects_ids.all()
 
     last_objects = []
     for obj_id in last_objects_ids:
@@ -114,7 +118,7 @@ def harvest_objects_import(context,data_dict):
                 harvester.import_stage(obj)
                 break
         last_objects.append(harvest_object_dictize(obj,context))
-    log.info('Harvest objects imported: %r', last_objects)
+    log.info('Harvest objects imported: %s', len(last_objects))
     return last_objects
 
 def harvest_jobs_run(context,data_dict):