harvester-d4science/ckanext/harvest/commands/harvester.py

import sys
import re
from pprint import pprint

from ckan.lib.cli import CkanCommand
from ckanext.harvest.lib import *
from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer

class Harvester(CkanCommand):
    '''Harvests remotely mastered metadata

    Usage:

      harvester initdb
        - Creates the necessary tables in the database

      harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}] 
        - create new harvest source

      harvester rmsource {id}
        - remove (inactivate) a harvester source

      harvester sources [all]        
        - lists harvest sources
          If 'all' is defined, it also shows the Inactive sources

      harvester job {source-id}
        - create new harvest job
  
      harvester jobs
        - lists harvest jobs

      harvester run
        - runs harvest jobs

      harvester gather_consumer
        - starts the consumer for the gathering queue

      harvester fetch_consumer
        - starts the consumer for the fetching queue

      harvester import [{source-id}]
        - perform the import stage with the last fetched objects, optionally belonging to a certain source.
          Please note that no objects will be fetched from the remote server. It will only affect
          the last fetched objects already present in the database.

    The commands should be run from the ckanext-harvest directory and expect
    a development.ini file to be present. Most of the time you will
    specify the config explicitly though::

        paster harvester sources --config=../ckan/development.ini

    '''

    # The class docstring above is runtime text, not just documentation:
    # its first line becomes ``summary`` and the whole of it becomes
    # ``usage``. Edit it with the same care as any user-facing string.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    # One sub-command name plus at most five positional values
    # (see the ``source`` sub-command).
    max_args = 6
    min_args = 0

    def command(self):
        '''Load the configuration and run the requested sub-command.

        The sub-command name is read from ``self.args[0]``. The process
        exits with status 1 when no sub-command is given (after printing
        the parser usage) or when the sub-command is not recognised.
        '''
        self._load_config()
        print('')

        if not self.args:
            self.parser.print_usage()
            sys.exit(1)

        # NOTE: several methods share their name with a helper imported at
        # module level (e.g. create_harvest_job). Inside a method the bare
        # name resolves to the module-level helper; the handlers below are
        # the bound methods of this class.
        handlers = {
            'source': self.create_harvest_source,
            'rmsource': self.remove_harvest_source,
            'sources': self.list_harvest_sources,
            'job': self.create_harvest_job,
            'jobs': self.list_harvest_jobs,
            'run': self.run_harvester,
            'gather_consumer':
                lambda: self._run_consumer(get_gather_consumer),
            'fetch_consumer':
                lambda: self._run_consumer(get_fetch_consumer),
            'initdb': self.initdb,
            'import': self.import_stage,
        }

        cmd = self.args[0]
        handler = handlers.get(cmd)
        if handler is None:
            print('Command %s not recognized' % cmd)
            # Exit non-zero like every other usage error in this command.
            sys.exit(1)
        handler()

    def _load_config(self):
        '''Load the CKAN configuration by delegating to the parent class.'''
        super(Harvester, self)._load_config()

    def _run_consumer(self, get_consumer):
        '''Build a queue consumer with ``get_consumer()`` and wait on it.

        The 'amqplib' logger is set to INFO before the consumer is created.
        '''
        import logging
        logging.getLogger('amqplib').setLevel(logging.INFO)
        consumer = get_consumer()
        consumer.wait()

    def _required_arg(self, index, description):
        '''Return positional argument ``index`` as unicode.

        When the argument is missing, print
        'Please provide a <description>' and exit with status 1.
        '''
        if len(self.args) > index:
            return unicode(self.args[index])
        print('Please provide a %s' % description)
        sys.exit(1)

    def _optional_arg(self, index, default):
        '''Return positional argument ``index`` as unicode, or ``default``
        when it was not given on the command line.
        '''
        if len(self.args) > index:
            return unicode(self.args[index])
        return default

    def initdb(self):
        '''Run the harvest model ``setup`` function and report success.'''
        from ckanext.harvest.model import setup as db_setup
        db_setup()

        print('DB tables created')

    def create_harvest_source(self):
        '''Create a harvest source from the command line arguments, print
        it, and create a first harvest job for it.

        Arguments: url, type, and optionally active, user-id and
        publisher-id. ``active`` defaults to True and is only False when
        given as 'false' (any case) or '0'. Exits with status 1 when the
        url or the type is missing.
        '''
        url = self._required_arg(1, 'source URL')
        # Named source_type so the builtin ``type`` is not shadowed.
        source_type = self._required_arg(2, 'source type')
        if len(self.args) >= 4:
            active = self.args[3].lower() not in ('false', '0')
        else:
            active = True
        user_id = self._optional_arg(4, u'')
        publisher_id = self._optional_arg(5, u'')

        source = create_harvest_source({
            'url': url,
            'type': source_type,
            'active': active,
            'user_id': user_id,
            'publisher_id': publisher_id,
        })

        print('Created new harvest source:')
        self.print_harvest_source(source)

        sources = get_harvest_sources()
        self.print_there_are('harvest source', sources)

        # Create a Harvest Job for the new Source
        create_harvest_job(source['id'])
        print('A new Harvest Job for this source has also been created')

    def remove_harvest_source(self):
        '''Remove the harvest source whose id is the first argument.

        Exits with status 1 when no id is given.
        '''
        source_id = self._required_arg(1, 'source id')

        remove_harvest_source(source_id)
        print('Removed harvest source: %s' % source_id)

    def list_harvest_sources(self):
        '''Print the active harvest sources, or every source when the
        first argument is 'all', followed by a count line.
        '''
        if len(self.args) >= 2 and self.args[1] == 'all':
            sources = get_harvest_sources()
            what = 'harvest source'
        else:
            sources = get_harvest_sources(active=True)
            what = 'active harvest source'

        self.print_harvest_sources(sources)
        self.print_there_are(what=what, sequence=sources)

    def create_harvest_job(self):
        '''Create a harvest job for the source id given as first argument,
        print it, and print how many jobs currently have status 'New'.

        Exits with status 1 when no source id is given.
        '''
        source_id = self._required_arg(1, 'source id')

        job = create_harvest_job(source_id)

        self.print_harvest_job(job)
        status = u'New'
        jobs = get_harvest_jobs(status=status)
        # Pass the singular noun: print_there_are() adds the plural 's'
        # itself (passing 'harvest jobs' used to print 'harvest jobss').
        self.print_there_are('harvest job', jobs, condition=status)

    def list_harvest_jobs(self):
        '''Print every harvest job followed by a count line.'''
        jobs = get_harvest_jobs()
        self.print_harvest_jobs(jobs)
        self.print_there_are(what='harvest job', sequence=jobs)

    def run_harvester(self):
        '''Run the pending harvest jobs without printing anything.

        Always exits with status 0: a failure in ``run_harvest_jobs()`` is
        deliberately not fatal, it is only recorded at debug level.
        '''
        import logging
        try:
            run_harvest_jobs()
        except Exception:
            # Only Exception is caught so Ctrl-C and sys.exit() still work.
            # NOTE(review): presumably run_harvest_jobs() also raises when
            # there is simply nothing to run, hence the quiet log level --
            # confirm against ckanext.harvest.lib.
            logging.getLogger(__name__).debug(
                'run_harvest_jobs() raised an exception', exc_info=True)
        sys.exit(0)

    def import_stage(self):
        '''Call ``import_last_objects`` with the source id given as first
        argument, or with None when no source id is given.
        '''
        source_id = self._optional_arg(1, None)
        import_last_objects(source_id)

    def print_harvest_sources(self, sources):
        '''Print each source dict, preceded by a blank line when there is
        at least one.
        '''
        if sources:
            print('')
        for source in sources:
            self.print_harvest_source(source)

    def print_harvest_source(self, source):
        '''Print the fields of one harvest source dict.

        Reads the keys 'id', 'url', 'type', 'active', 'user_id',
        'publisher_id' and 'jobs' (only its length is printed).
        '''
        print('Source id: %s' % source['id'])
        print('      url: %s' % source['url'])
        print('     type: %s' % source['type'])
        print('   active: %s' % source['active'])
        print('     user: %s' % source['user_id'])
        print('publisher: %s' % source['publisher_id'])
        print('     jobs: %s' % len(source['jobs']))
        print('')

    def print_harvest_jobs(self, jobs):
        '''Print each job dict, preceded by a blank line when there is at
        least one.
        '''
        if jobs:
            print('')
        for job in jobs:
            self.print_harvest_job(job)

    def print_harvest_job(self, job):
        '''Print the fields of one harvest job dict.

        Reads the keys 'id', 'status', 'source' (a dict with 'id' and
        'url'), 'objects' (only its length is printed) and
        'gather_errors' (a sequence of dicts with a 'message' key).
        '''
        print('       Job id: %s' % job['id'])
        print('       status: %s' % job['status'])
        print('       source: %s' % job['source']['id'])
        print('          url: %s' % job['source']['url'])
        print('      objects: %s' % len(job['objects']))

        print('gather_errors: %s' % len(job['gather_errors']))
        for error in job['gather_errors']:
            print('               %s' % error['message'])

        print('')

    def print_there_are(self, what, sequence, condition=''):
        '''Print a count line such as 'There are 3 new harvest jobs'.

        ``what`` is the singular noun; an 's' is appended unless
        ``sequence`` has exactly one item. ``condition``, when not empty,
        is lower-cased and placed before the noun.
        '''
        singular = self.is_singular(sequence)
        print('There %s %s %s%s%s' % (
            'is' if singular else 'are',
            len(sequence),
            ('%s ' % condition.lower()) if condition else '',
            what,
            '' if singular else 's',
        ))

    def is_singular(self, sequence):
        '''Return True when ``sequence`` holds exactly one item.'''
        return len(sequence) == 1
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`import sys`
			`import re`
			`from pprint import pprint`

			`from ckan.lib.cli import CkanCommand`
[refactoring] Simplify model relations 2011-04-08 17:48:29 +02:00			`from ckanext.harvest.lib import *`
			`from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer`
Add a CLI command to create or update the geometries for package extents 2011-03-18 16:44:40 +01:00
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`class Harvester(CkanCommand):`
			`'''Harvests remotely mastered metadata`

			`Usage:`
[refactoring] Add a command to create the necessary tables in the database 2011-04-13 13:39:53 +02:00
			`harvester initdb`
			`- Creates the necessary tables in the database`

[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}]`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`- create new harvest source`

[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`harvester rmsource {id}`
[refactoring] Do not delete sources, just inactivate them. Also don't delete jobs. 2011-04-08 18:07:19 +02:00			`- remove (inactivate) a harvester source`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
[refactoring] Do not delete sources, just inactivate them. Also don't delete jobs. 2011-04-08 18:07:19 +02:00			`harvester sources [all]`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`- lists harvest sources`
[refactoring] Do not delete sources, just inactivate them. Also don't delete jobs. 2011-04-08 18:07:19 +02:00			`If 'all' is defined, it also shows the Inactive sources`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`harvester job {source-id}`
			`- create new harvest job`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
			`harvester jobs`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`- lists harvest jobs`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
			`harvester run`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`- runs harvest jobs`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00
			`harvester gather_consumer`
			`- starts the consumer for the gathering queue`

			`harvester fetch_consumer`
			`- starts the consumer for the fetching queue`
Add command to reimport existing harvest objects 2011-05-10 17:06:57 +02:00
			`harvester import [{source-id}]`
			`- perform the import stage with the last fetched objects, optionally belonging to a certain source.`
			`Please note that no objects will be fetched from the remote server. It will only affect`
			`the last fetched objects already present in the database.`

#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`The commands should be run from the ckanext-harvest directory and expect`
			`a development.ini file to be present. Most of the time you will`
			`specify the config explicitly though::`

			`paster harvester sources --config=../ckan/development.ini`

			`'''`

			`summary = __doc__.split('\n')[0]`
			`usage = __doc__`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`max_args = 6`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`min_args = 0`

			`def command(self):`
			`self._load_config()`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00			`print ''`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
			`if len(self.args) == 0:`
			`self.parser.print_usage()`
			`sys.exit(1)`
			`cmd = self.args[0]`
			`if cmd == 'source':`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.create_harvest_source()`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`elif cmd == "rmsource":`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.remove_harvest_source()`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`elif cmd == 'sources':`
			`self.list_harvest_sources()`
			`elif cmd == 'job':`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.create_harvest_job()`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`elif cmd == 'jobs':`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.list_harvest_jobs()`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`elif cmd == 'run':`
			`self.run_harvester()`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00			`elif cmd == 'gather_consumer':`
[refactoring] Remove old harvesting controller. This functionality now lives on ckanext-inspire 2011-04-08 16:54:33 +02:00			`import logging`
			`logging.getLogger('amqplib').setLevel(logging.INFO)`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00			`consumer = get_gather_consumer()`
			`consumer.wait()`
			`elif cmd == 'fetch_consumer':`
[refactoring] Remove old harvesting controller. This functionality now lives on ckanext-inspire 2011-04-08 16:54:33 +02:00			`import logging`
			`logging.getLogger('amqplib').setLevel(logging.INFO)`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00			`consumer = get_fetch_consumer()`
			`consumer.wait()`
Add command to reimport existing harvest objects 2011-05-10 17:06:57 +02:00			`elif cmd == 'initdb':`
[refactoring] Add a command to create the necessary tables in the database 2011-04-13 13:39:53 +02:00			`self.initdb()`
Add command to reimport existing harvest objects 2011-05-10 17:06:57 +02:00			`elif cmd == 'import':`
			`self.import_stage()`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`else:`
			`print 'Command %s not recognized' % cmd`

			`def _load_config(self):`
			`super(Harvester, self)._load_config()`
[refactoring] Add a command to create the necessary tables in the database 2011-04-13 13:39:53 +02:00
			`def initdb(self):`
			`from ckanext.harvest.model import setup as db_setup`
			`db_setup()`

			`print 'DB tables created'`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`def create_harvest_source(self):`
Stop harvesting job if the package extent could not be saved. Still shows an ugly SA exception 2011-03-29 18:23:49 +02:00
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`if len(self.args) >= 2:`
			`url = unicode(self.args[1])`
Stop harvesting job if the package extent could not be saved. Still shows an ugly SA exception 2011-03-29 18:23:49 +02:00			`else:`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print 'Please provide a source URL'`
			`sys.exit(1)`
			`if len(self.args) >= 3:`
			`type = unicode(self.args[2])`
			`else:`
			`print 'Please provide a source type'`
			`sys.exit(1)`
			`if len(self.args) >= 4:`
			`active = not(self.args[3].lower() == 'false' or \`
			`self.args[3] == '0')`
			`else:`
			`active = True`
			`if len(self.args) >= 5:`
			`user_id = unicode(self.args[4])`
			`else:`
			`user_id = u''`
			`if len(self.args) >= 6:`
			`publisher_id = unicode(self.args[5])`
			`else:`
			`publisher_id = u''`

			`source = create_harvest_source({`
			`'url':url,`
			`'type':type,`
			`'active':active,`
			`'user_id':user_id,`
			`'publisher_id':publisher_id})`

			`print 'Created new harvest source:'`
			`self.print_harvest_source(source)`

			`sources = get_harvest_sources()`
			`self.print_there_are('harvest source', sources)`

			`# Create a Harvest Job for the new Source`
[refactoring] Return dictionaries from the common functions, and use them in the CLI" 2011-04-05 13:55:58 +02:00			`create_harvest_job(source['id'])`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print 'A new Harvest Job for this source has also been created'`

			`def remove_harvest_source(self):`
			`if len(self.args) >= 2:`
			`source_id = unicode(self.args[1])`
			`else:`
			`print 'Please provide a source id'`
			`sys.exit(1)`
Add a CLI command to create or update the geometries for package extents 2011-03-18 16:44:40 +01:00
[refactoring] Do not delete sources, just inactivate them. Also don't delete jobs. 2011-04-08 18:07:19 +02:00			`remove_harvest_source(source_id)`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print 'Removed harvest source: %s' % source_id`

			`def list_harvest_sources(self):`
[refactoring] Do not delete sources, just inactivate them. Also don't delete jobs. 2011-04-08 18:07:19 +02:00			`if len(self.args) >= 2 and self.args[1] == 'all':`
			`sources = get_harvest_sources()`
			`what = 'harvest source'`
			`else:`
			`sources = get_harvest_sources(active=True)`
			`what = 'active harvest source'`

[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.print_harvest_sources(sources)`
[refactoring] Do not delete sources, just inactivate them. Also don't delete jobs. 2011-04-08 18:07:19 +02:00			`self.print_there_are(what=what, sequence=sources)`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00
			`def create_harvest_job(self):`
			`if len(self.args) >= 2:`
			`source_id = unicode(self.args[1])`
			`else:`
			`print 'Please provide a source id'`
			`sys.exit(1)`

			`job = create_harvest_job(source_id)`
Add a CLI command to create or update the geometries for package extents 2011-03-18 16:44:40 +01:00
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.print_harvest_job(job)`
			`status = u'New'`
			`jobs = get_harvest_jobs(status=status)`
			`self.print_there_are('harvest jobs', jobs, condition=status)`

			`def list_harvest_jobs(self):`
			`jobs = get_harvest_jobs()`
			`self.print_harvest_jobs(jobs)`
			`self.print_there_are(what='harvest job', sequence=jobs)`

[refactoring] Remove old harvesting controller. This functionality now lives on ckanext-inspire 2011-04-08 16:54:33 +02:00			`def run_harvester(self):`
Do not output messages when running the 'run' command 2011-04-15 16:35:19 +02:00			`try:`
			`jobs = run_harvest_jobs()`
			`except:`
			`pass`
Add command to reimport existing harvest objects 2011-05-10 17:06:57 +02:00			`sys.exit(0)`
Do not output messages when running the 'run' command 2011-04-15 16:35:19 +02:00			`#print 'Sent %s jobs to the gather queue' % len(jobs)`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00
Add command to reimport existing harvest objects 2011-05-10 17:06:57 +02:00			`def import_stage(self):`
			`if len(self.args) >= 2:`
			`source_id = unicode(self.args[1])`
			`else:`
			`source_id = None`
			`import_last_objects(source_id)`

#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`def print_harvest_sources(self, sources):`
			`if sources:`
[refactoring] Add code to handle queuing and the new IHarvester interface. Add new commands in the CLI to start the queue consumers and fire the harvesting process. 2011-04-06 13:45:00 +02:00			`print ''`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`for source in sources:`
			`self.print_harvest_source(source)`

			`def print_harvest_source(self, source):`
[refactoring] Return dictionaries from the common functions, and use them in the CLI" 2011-04-05 13:55:58 +02:00			`print 'Source id: %s' % source['id']`
			`print ' url: %s' % source['url']`
			`print ' type: %s' % source['type']`
			`print ' active: %s' % source['active']`
			`print ' user: %s' % source['user_id']`
			`print 'publisher: %s' % source['publisher_id']`
			`print ' jobs: %s' % len(source['jobs'])`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print ''`

			`def print_harvest_jobs(self, jobs):`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`if jobs:`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print ''`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`for job in jobs:`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`self.print_harvest_job(job)`

			`def print_harvest_job(self, job):`
[refactoring] Return dictionaries from the common functions, and use them in the CLI" 2011-04-05 13:55:58 +02:00			`print ' Job id: %s' % job['id']`
			`print ' status: %s' % job['status']`
			`print ' source: %s' % job['source']['id']`
			`print ' url: %s' % job['source']['url']`
			`print ' objects: %s' % len(job['objects'])`

			`print 'gather_errors: %s' % len(job['gather_errors'])`
			`if (len(job['gather_errors']) > 0):`
			`for error in job['gather_errors']:`
			`print ' %s' % error['message']`

[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print ''`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`def print_there_are(self, what, sequence, condition=''):`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`is_singular = self.is_singular(sequence)`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`print 'There %s %s %s%s%s' % (`
			`is_singular and 'is' or 'are',`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`len(sequence),`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`condition and ('%s ' % condition.lower()) or '',`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`what,`
[refactoring] Move common functions to lib. Adapt the CLI to use these common functions. 2011-04-05 12:53:39 +02:00			`not is_singular and 's' or '',`
#1030 Move paster harvester CLI command to ckanext-harvest 2011-03-14 14:34:48 +01:00			`)`

			`def is_singular(self, sequence):`
			`return len(sequence) == 1`