Allow not linking to datasets when importing records

With the -j flag, harvest objects are not linked to datasets during import. This is sometimes useful when importing records for the first time.
2012-07-30 12:11:55 +01:00 · 2012-07-30 12:11:55 +01:00 · 7011efe5dc
parent 203bcb053b
commit 7011efe5dc
2 changed files with 28 additions and 10 deletions
--- a/ckanext/harvest/commands/harvester.py
+++ b/ckanext/harvest/commands/harvester.py
@ -40,11 +40,14 @@ class Harvester(CkanCommand):
      harvester fetch_consumer
        - starts the consumer for the fetching queue
-      harvester import [{source-id}]
+      harvester [-j] import [{source-id}]
        - perform the import stage with the last fetched objects, optionally belonging to a certain source.
          Please note that no objects will be fetched from the remote server. It will only affect
          the last fetched objects already present in the database.
          If the -j flag is provided, the objects are not joined to existing datasets. This may be useful
          when importing objects for the first time.
      harvester job-all
        - create new harvest jobs for all active sources.
@ -61,6 +64,13 @@ class Harvester(CkanCommand):
    max_args = 6
    min_args = 0
    def __init__(self, name):
        """Initialise the command and register the ``-j`` command line flag.

        :param name: command name, passed unchanged to the parent constructor.

        The flag is stored on the parsed options as ``no_join_datasets`` and
        is ``False`` unless ``-j`` / ``--no-join-datasets`` is given.
        """
        super(Harvester, self).__init__(name)

        # Keep the flag spellings and their settings apart so the option
        # definition stays readable.
        flag_names = ('-j', '--no-join-datasets')
        flag_settings = {
            'dest': 'no_join_datasets',
            'action': 'store_true',
            'default': False,
            'help': 'Do not join harvest objects to existing datasets',
        }
        self.parser.add_option(*flag_names, **flag_settings)
    def command(self):
        self._load_config()
@ -231,7 +241,11 @@ class Harvester(CkanCommand):
            source_id = unicode(self.args[1])
        else:
            source_id = None
-        context = {'model': model, 'session':model.Session, 'user': self.admin_user['name']}
+
        context = {'model': model, 'session':model.Session, 'user': self.admin_user['name'],
                   'join_datasets': not self.options.no_join_datasets}
        objs = get_action('harvest_objects_import')(context,{'source_id':source_id})
        print '%s objects reimported' % len(objs)
--- a/ckanext/harvest/logic/action/update.py
+++ b/ckanext/harvest/logic/action/update.py
@ -81,6 +81,8 @@ def harvest_objects_import(context,data_dict):
    session = context['session']
    source_id = data_dict.get('source_id',None)
    join_datasets = context.get('join_datasets',True)
    if source_id:
        source = HarvestSource.get(source_id)
        if not source:
@ -92,17 +94,19 @@ def harvest_objects_import(context,data_dict):
            raise Exception('This harvest source is not active')
        last_objects_ids = session.query(HarvestObject.id) \
-                .join(HarvestSource).join(Package) \
+                .join(HarvestSource) \
                .filter(HarvestObject.source==source) \
-                .filter(HarvestObject.current==True) \
+                .filter(HarvestObject.current==True)
-                .filter(Package.state==u'active') \
+
                .all()
    else:
        last_objects_ids = session.query(HarvestObject.id) \
                .join(Package) \
                .filter(HarvestObject.current==True) \
-                .filter(Package.state==u'active') \
+
-                .all()
+    if join_datasets:
        last_objects_ids = last_objects_ids.join(Package) \
            .filter(Package.state==u'active')
    last_objects_ids = last_objects_ids.all()
    last_objects = []
    for obj_id in last_objects_ids:
@ -114,7 +118,7 @@ def harvest_objects_import(context,data_dict):
                harvester.import_stage(obj)
                break
        last_objects.append(harvest_object_dictize(obj,context))
-    log.info('Harvest objects imported: %r', last_objects)
+    log.info('Harvest objects imported: %s', len(last_objects))
    return last_objects
 def harvest_jobs_run(context,data_dict):