# harvester-d4science/ckanext/harvest/commands/harvester.py
#
# 253 lines
# 7.6 KiB
# Python
# Raw Normal View History

import sys
import re
from pprint import pprint
from ckan.lib.cli import CkanCommand
# 2011-04-08 17:48:29 +02:00
from ckanext.harvest.lib import *
from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer
class Harvester(CkanCommand):
    '''Harvests remotely mastered metadata

    Usage:

      harvester initdb
        - Creates the necessary tables in the database

      harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}]
        - create new harvest source

      harvester rmsource {id}
        - remove (inactivate) a harvester source

      harvester sources [all]
        - lists harvest sources
          If 'all' is defined, it also shows the Inactive sources

      harvester job {source-id}
        - create new harvest job

      harvester jobs
        - lists harvest jobs

      harvester run
        - runs harvest jobs

      harvester gather_consumer
        - starts the consumer for the gathering queue

      harvester fetch_consumer
        - starts the consumer for the fetching queue

      harvester import [{source-id}]
        - perform the import stage with the last fetched objects, optionally belonging to a certain source.
          Please note that no objects will be fetched from the remote server. It will only affect
          the last fetched objects already present in the database.

    The commands should be run from the ckanext-harvest directory and expect
    a development.ini file to be present. Most of the time you will
    specify the config explicitly though::

        paster harvester sources --config=../ckan/development.ini

    '''

    # The first docstring line is reused as the one-line summary and the
    # whole docstring as the usage text, so keep the summary on the opening
    # line of the docstring.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    max_args = 6
    min_args = 0

    # NOTE: output uses the single-argument ``print(...)`` form, which the
    # Python 2 print statement treats as a parenthesised expression, so the
    # emitted text is identical to ``print ...``.

    # NOTE: several methods below share their name with a helper that is
    # star-imported from ``ckanext.harvest.lib`` (``create_harvest_source``,
    # ``remove_harvest_source``, ``create_harvest_job``). Inside a method the
    # bare name resolves to the module-level helper, not to the method.

    def command(self):
        '''Load the configuration and dispatch on the first argument.

        Prints the usage and exits with status 1 when no sub-command is
        given or when the sub-command is not recognised.
        '''
        self._load_config()
        print('')

        if not self.args:
            self.parser.print_usage()
            sys.exit(1)

        cmd = self.args[0]
        handlers = {
            'source': self.create_harvest_source,
            'rmsource': self.remove_harvest_source,
            'sources': self.list_harvest_sources,
            'job': self.create_harvest_job,
            'jobs': self.list_harvest_jobs,
            'run': self.run_harvester,
            'gather_consumer':
                lambda: self._run_consumer(get_gather_consumer),
            'fetch_consumer':
                lambda: self._run_consumer(get_fetch_consumer),
            'initdb': self.initdb,
            'import': self.import_stage,
        }

        handler = handlers.get(cmd)
        if handler is None:
            # An unknown sub-command is a usage error: report it the same
            # way as the "no arguments" case instead of exiting with 0.
            print('Command %s not recognized' % cmd)
            self.parser.print_usage()
            sys.exit(1)
        handler()

    def _load_config(self):
        '''Delegate configuration loading to the parent command class.'''
        super(Harvester, self)._load_config()

    def _required_arg(self, index, missing_message):
        '''Return positional argument ``index`` as unicode.

        If the argument was not supplied, print ``missing_message`` and
        exit with status 1.
        '''
        if len(self.args) > index:
            return unicode(self.args[index])
        print(missing_message)
        sys.exit(1)

    def _optional_arg(self, index, default):
        '''Return positional argument ``index`` as unicode, or ``default``
        when it was not supplied.'''
        if len(self.args) > index:
            return unicode(self.args[index])
        return default

    def _run_consumer(self, get_consumer):
        '''Build a queue consumer with ``get_consumer()`` and wait on it.

        The ``amqplib`` logger is set to INFO first (presumably to silence
        its debug output -- confirm against the queue implementation).
        '''
        import logging
        logging.getLogger('amqplib').setLevel(logging.INFO)
        consumer = get_consumer()
        consumer.wait()

    def initdb(self):
        '''Create the harvest tables via ``ckanext.harvest.model.setup``.'''
        from ckanext.harvest.model import setup as db_setup
        db_setup()
        print('DB tables created')

    def create_harvest_source(self):
        '''Create a harvest source from the arguments, then a job for it.

        Arguments (after the sub-command): url, type, and optionally
        active, user-id and publisher-id. ``active`` defaults to True and
        is only False for the values 'false' (any case) and '0'.
        Exits with status 1 when url or type is missing.
        '''
        url = self._required_arg(1, 'Please provide a source URL')
        # Named ``source_type`` so the ``type`` builtin is not shadowed.
        source_type = self._required_arg(2, 'Please provide a source type')
        if len(self.args) >= 4:
            active = self.args[3].lower() not in ('false', '0')
        else:
            active = True
        user_id = self._optional_arg(4, u'')
        publisher_id = self._optional_arg(5, u'')

        source = create_harvest_source({
            'url': url,
            'type': source_type,
            'active': active,
            'user_id': user_id,
            'publisher_id': publisher_id,
        })
        print('Created new harvest source:')
        self.print_harvest_source(source)

        sources = get_harvest_sources()
        self.print_there_are('harvest source', sources)

        # Create a Harvest Job for the new Source
        create_harvest_job(source['id'])
        print('A new Harvest Job for this source has also been created')

    def remove_harvest_source(self):
        '''Remove the harvest source whose id is the first argument.

        Exits with status 1 when no id is given.
        '''
        source_id = self._required_arg(1, 'Please provide a source id')
        remove_harvest_source(source_id)
        print('Removed harvest source: %s' % source_id)

    def list_harvest_sources(self):
        '''Print the active harvest sources, or every source when the
        first argument is 'all', followed by a count line.'''
        if len(self.args) >= 2 and self.args[1] == 'all':
            sources = get_harvest_sources()
            what = 'harvest source'
        else:
            sources = get_harvest_sources(active=True)
            what = 'active harvest source'
        self.print_harvest_sources(sources)
        self.print_there_are(what=what, sequence=sources)

    def create_harvest_job(self):
        '''Create a harvest job for the source id given as first argument,
        print it, and report how many jobs have status 'New'.

        Exits with status 1 when no source id is given.
        '''
        source_id = self._required_arg(1, 'Please provide a source id')
        job = create_harvest_job(source_id)
        self.print_harvest_job(job)

        status = u'New'
        jobs = get_harvest_jobs(status=status)
        # Pass the singular noun: print_there_are() appends the plural 's'
        # itself (passing 'harvest jobs' used to print "harvest jobss").
        self.print_there_are('harvest job', jobs, condition=status)

    def list_harvest_jobs(self):
        '''Print every harvest job followed by a count line.'''
        jobs = get_harvest_jobs()
        self.print_harvest_jobs(jobs)
        self.print_there_are(what='harvest job', sequence=jobs)

    def run_harvester(self):
        '''Run the pending harvest jobs, then exit with status 0.

        A failure in ``run_harvest_jobs`` is reported on stderr instead of
        being silently discarded. The exit status stays 0, as before, so
        existing schedulers are unaffected. Only ``Exception`` is caught,
        so Ctrl-C and ``SystemExit`` are no longer swallowed.
        '''
        try:
            run_harvest_jobs()
        except Exception as e:
            # %r keeps the report safe for non-ASCII exception messages.
            sys.stderr.write('Harvest run did not complete: %r\n' % (e,))
        sys.exit(0)

    def import_stage(self):
        '''Run ``import_last_objects``, restricted to the source id given
        as first argument when there is one (``None`` otherwise).'''
        source_id = self._optional_arg(1, None)
        import_last_objects(source_id)

    def print_harvest_sources(self, sources):
        '''Print each source in ``sources``, preceded by a blank line when
        the collection is non-empty.'''
        if sources:
            print('')
        for source in sources:
            self.print_harvest_source(source)

    def print_harvest_source(self, source):
        '''Print the fields of one harvest source dict.

        Reads the keys 'id', 'url', 'type', 'active', 'user_id',
        'publisher_id' and 'jobs' (only the length of 'jobs' is shown).
        '''
        print('Source id: %s' % source['id'])
        print(' url: %s' % source['url'])
        print(' type: %s' % source['type'])
        print(' active: %s' % source['active'])
        print(' user: %s' % source['user_id'])
        print('publisher: %s' % source['publisher_id'])
        print(' jobs: %s' % len(source['jobs']))
        print('')

    def print_harvest_jobs(self, jobs):
        '''Print each job in ``jobs``, preceded by a blank line when the
        collection is non-empty.'''
        if jobs:
            print('')
        for job in jobs:
            self.print_harvest_job(job)

    def print_harvest_job(self, job):
        '''Print the fields of one harvest job dict.

        Reads the keys 'id', 'status', 'source' (with 'id' and 'url'),
        'objects' and 'gather_errors'; each gather error's 'message' is
        listed on its own line.
        '''
        print(' Job id: %s' % job['id'])
        print(' status: %s' % job['status'])
        print(' source: %s' % job['source']['id'])
        print(' url: %s' % job['source']['url'])
        print(' objects: %s' % len(job['objects']))
        print('gather_errors: %s' % len(job['gather_errors']))
        for error in job['gather_errors']:
            print(' %s' % error['message'])
        print('')

    def print_there_are(self, what, sequence, condition=''):
        '''Print a count line such as "There are 3 new harvest jobs".

        ``what`` must be the singular noun; an 's' is appended unless
        ``sequence`` holds exactly one item. ``condition``, when given, is
        lower-cased and placed before the noun.
        '''
        is_singular = self.is_singular(sequence)
        print('There %s %s %s%s%s' % (
            'is' if is_singular else 'are',
            len(sequence),
            ('%s ' % condition.lower()) if condition else '',
            what,
            '' if is_singular else 's',
        ))

    def is_singular(self, sequence):
        '''Return True when ``sequence`` holds exactly one item.'''
        return len(sequence) == 1