import sys
import re
from pprint import pprint

from ckan.lib.cli import CkanCommand
from ckanext.harvest.lib import *
from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer

class Harvester(CkanCommand):
    '''Harvests remotely mastered metadata

    Usage:

      harvester initdb
        - Creates the necessary tables in the database

      harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}]
        - create new harvest source

      harvester rmsource {id}
        - remove (inactivate) a harvest source

      harvester sources [all]
        - lists harvest sources
          If 'all' is defined, it also shows the Inactive sources

      harvester job {source-id}
        - create new harvest job

      harvester jobs
        - lists harvest jobs

      harvester run
        - runs harvest jobs

      harvester gather_consumer
        - starts the consumer for the gathering queue

      harvester fetch_consumer
        - starts the consumer for the fetching queue

      harvester import [{source-id}]
        - perform the import stage with the last fetched objects, optionally
          belonging to a certain source. Please note that no objects will be
          fetched from the remote server. It will only affect the last fetched
          objects already present in the database.

    The commands should be run from the ckanext-harvest directory and expect
    a development.ini file to be present. Most of the time you will specify
    the config explicitly though::

      paster harvester sources --config=../ckan/development.ini
    '''

    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 6
    min_args = 0

    def command(self):
        self._load_config()
        print ''

        if len(self.args) == 0:
            self.parser.print_usage()
            sys.exit(1)

        cmd = self.args[0]
        if cmd == 'source':
            self.create_harvest_source()
        elif cmd == 'rmsource':
            self.remove_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'gather_consumer':
            # Silence amqplib's debug chatter before entering the consumer loop
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_gather_consumer()
            consumer.wait()
        elif cmd == 'fetch_consumer':
            import logging
            logging.getLogger('amqplib').setLevel(logging.INFO)
            consumer = get_fetch_consumer()
            consumer.wait()
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            self.import_stage()
        else:
            print 'Command %s not recognized' % cmd

    def _load_config(self):
        super(Harvester, self)._load_config()

    def initdb(self):
        from ckanext.harvest.model import setup as db_setup
        db_setup()
        print 'DB tables created'

    def create_harvest_source(self):
        if len(self.args) >= 2:
            url = unicode(self.args[1])
        else:
            print 'Please provide a source URL'
            sys.exit(1)

        if len(self.args) >= 3:
            type = unicode(self.args[2])
        else:
            print 'Please provide a source type'
            sys.exit(1)

        if len(self.args) >= 4:
            active = not (self.args[3].lower() == 'false' or
                          self.args[3] == '0')
        else:
            active = True

        if len(self.args) >= 5:
            user_id = unicode(self.args[4])
        else:
            user_id = u''

        if len(self.args) >= 6:
            publisher_id = unicode(self.args[5])
        else:
            publisher_id = u''

        source = create_harvest_source({
            'url': url,
            'type': type,
            'active': active,
            'user_id': user_id,
            'publisher_id': publisher_id})

        print 'Created new harvest source:'
        self.print_harvest_source(source)

        sources = get_harvest_sources()
        self.print_there_are('harvest source', sources)

        # Create a Harvest Job for the new Source
        create_harvest_job(source['id'])
        print 'A new Harvest Job for this source has also been created'

    def remove_harvest_source(self):
        if len(self.args) >= 2:
            source_id = unicode(self.args[1])
        else:
            print 'Please provide a source id'
            sys.exit(1)

        remove_harvest_source(source_id)
        print 'Removed harvest source: %s' % source_id

    def list_harvest_sources(self):
        if len(self.args) >= 2 and self.args[1] == 'all':
            sources = get_harvest_sources()
            what = 'harvest source'
        else:
            sources = get_harvest_sources(active=True)
            what = 'active harvest source'

        self.print_harvest_sources(sources)
        self.print_there_are(what=what, sequence=sources)

    def create_harvest_job(self):
        if len(self.args) >= 2:
            source_id = unicode(self.args[1])
        else:
            print 'Please provide a source id'
            sys.exit(1)

        job = create_harvest_job(source_id)
        self.print_harvest_job(job)

        status = u'New'
        jobs = get_harvest_jobs(status=status)
        self.print_there_are('harvest job', jobs, condition=status)

    def list_harvest_jobs(self):
        jobs = get_harvest_jobs()
        self.print_harvest_jobs(jobs)
        self.print_there_are(what='harvest job', sequence=jobs)

    def run_harvester(self):
        # Any error raised while sending the jobs to the gather queue is
        # silently ignored
        try:
            jobs = run_harvest_jobs()
        except:
            pass
        sys.exit(0)
        #print 'Sent %s jobs to the gather queue' % len(jobs)

    def import_stage(self):
        # Re-import the last fetched objects, optionally restricted to a
        # single source
        if len(self.args) >= 2:
            source_id = unicode(self.args[1])
        else:
            source_id = None
        import_last_objects(source_id)

    def print_harvest_sources(self, sources):
        if sources:
            print ''
        for source in sources:
            self.print_harvest_source(source)

    def print_harvest_source(self, source):
        print 'Source id: %s' % source['id']
        print '      url: %s' % source['url']
        print '     type: %s' % source['type']
        print '   active: %s' % source['active']
        print '     user: %s' % source['user_id']
        print 'publisher: %s' % source['publisher_id']
        print '     jobs: %s' % len(source['jobs'])
        print ''

    def print_harvest_jobs(self, jobs):
        if jobs:
            print ''
        for job in jobs:
            self.print_harvest_job(job)

    def print_harvest_job(self, job):
        print '       Job id: %s' % job['id']
        print '       status: %s' % job['status']
        print '       source: %s' % job['source']['id']
        print '          url: %s' % job['source']['url']
        print '      objects: %s' % len(job['objects'])
        print 'gather_errors: %s' % len(job['gather_errors'])
        if len(job['gather_errors']) > 0:
            for error in job['gather_errors']:
                print '               %s' % error['message']
        print ''

    def print_there_are(self, what, sequence, condition=''):
        is_singular = self.is_singular(sequence)
        print 'There %s %s %s%s%s' % (
            is_singular and 'is' or 'are',
            len(sequence),
            condition and ('%s ' % condition.lower()) or '',
            what,
            not is_singular and 's' or '',
        )

    def is_singular(self, sequence):
        return len(sequence) == 1
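
# Illustrative workflow sketch only: the URL and the 'csw' source type below
# are placeholders (a harvester for whatever type you pass must be registered),
# and {source-id} stands for the id printed when the source is created.
#
#   paster harvester source http://catalogue.example.com/csw csw --config=../ckan/development.ini
#   paster harvester job {source-id} --config=../ckan/development.ini
#   paster harvester run --config=../ckan/development.ini
#
# The queued jobs are processed by the gather and fetch consumers, typically
# started in separate terminals:
#
#   paster harvester gather_consumer --config=../ckan/development.ini
#   paster harvester fetch_consumer --config=../ckan/development.ini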