Allow importing records without linking them to datasets

With the -j flag, harvest objects are not linked to existing datasets
during import. This is sometimes useful when importing records for the
first time.
This commit is contained in:
amercader 2012-07-30 12:11:55 +01:00
parent 203bcb053b
commit 7011efe5dc
2 changed files with 28 additions and 10 deletions

View File

@ -40,11 +40,14 @@ class Harvester(CkanCommand):
harvester fetch_consumer harvester fetch_consumer
- starts the consumer for the fetching queue - starts the consumer for the fetching queue
harvester import [{source-id}] harvester [-j] import [{source-id}]
- perform the import stage with the last fetched objects, optionally belonging to a certain source. - perform the import stage with the last fetched objects, optionally belonging to a certain source.
Please note that no objects will be fetched from the remote server. It will only affect Please note that no objects will be fetched from the remote server. It will only affect
the last fetched objects already present in the database. the last fetched objects already present in the database.
If the -j flag is provided, the objects are not joined to existing datasets. This may be useful
when importing objects for the first time.
harvester job-all harvester job-all
- create new harvest jobs for all active sources. - create new harvest jobs for all active sources.
@ -61,6 +64,13 @@ class Harvester(CkanCommand):
max_args = 6 max_args = 6
min_args = 0 min_args = 0
# Constructor: runs the parent class initialiser with the command name, then
# registers the -j/--no-join-datasets switch on this command's option parser.
def __init__(self,name):
super(Harvester,self).__init__(name)
# Boolean store_true flag, False unless given on the command line; the parsed
# value is stored under the 'no_join_datasets' destination.
self.parser.add_option('-j', '--no-join-datasets', dest='no_join_datasets',
action='store_true', default=False, help='Do not join harvest objects to existing datasets')
def command(self): def command(self):
self._load_config() self._load_config()
@ -231,7 +241,11 @@ class Harvester(CkanCommand):
source_id = unicode(self.args[1]) source_id = unicode(self.args[1])
else: else:
source_id = None source_id = None
context = {'model': model, 'session':model.Session, 'user': self.admin_user['name']}
context = {'model': model, 'session':model.Session, 'user': self.admin_user['name'],
'join_datasets': not self.options.no_join_datasets}
objs = get_action('harvest_objects_import')(context,{'source_id':source_id}) objs = get_action('harvest_objects_import')(context,{'source_id':source_id})
print '%s objects reimported' % len(objs) print '%s objects reimported' % len(objs)

View File

@ -81,6 +81,8 @@ def harvest_objects_import(context,data_dict):
session = context['session'] session = context['session']
source_id = data_dict.get('source_id',None) source_id = data_dict.get('source_id',None)
join_datasets = context.get('join_datasets',True)
if source_id: if source_id:
source = HarvestSource.get(source_id) source = HarvestSource.get(source_id)
if not source: if not source:
@ -92,17 +94,19 @@ def harvest_objects_import(context,data_dict):
raise Exception('This harvest source is not active') raise Exception('This harvest source is not active')
last_objects_ids = session.query(HarvestObject.id) \ last_objects_ids = session.query(HarvestObject.id) \
.join(HarvestSource).join(Package) \ .join(HarvestSource) \
.filter(HarvestObject.source==source) \ .filter(HarvestObject.source==source) \
.filter(HarvestObject.current==True) \ .filter(HarvestObject.current==True)
.filter(Package.state==u'active') \
.all()
else: else:
last_objects_ids = session.query(HarvestObject.id) \ last_objects_ids = session.query(HarvestObject.id) \
.join(Package) \
.filter(HarvestObject.current==True) \ .filter(HarvestObject.current==True) \
.filter(Package.state==u'active') \
.all() if join_datasets:
last_objects_ids = last_objects_ids.join(Package) \
.filter(Package.state==u'active')
last_objects_ids = last_objects_ids.all()
last_objects = [] last_objects = []
for obj_id in last_objects_ids: for obj_id in last_objects_ids:
@ -114,7 +118,7 @@ def harvest_objects_import(context,data_dict):
harvester.import_stage(obj) harvester.import_stage(obj)
break break
last_objects.append(harvest_object_dictize(obj,context)) last_objects.append(harvest_object_dictize(obj,context))
log.info('Harvest objects imported: %r', last_objects) log.info('Harvest objects imported: %s', len(last_objects))
return last_objects return last_objects
def harvest_jobs_run(context,data_dict): def harvest_jobs_run(context,data_dict):