2011-03-14 14:34:48 +01:00
|
|
|
import sys
|
|
|
|
import re
|
|
|
|
from pprint import pprint
|
|
|
|
|
|
|
|
from ckan.lib.cli import CkanCommand
|
2011-04-08 17:48:29 +02:00
|
|
|
from ckanext.harvest.lib import *
|
|
|
|
from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer
|
2011-03-18 16:44:40 +01:00
|
|
|
|
2011-03-14 14:34:48 +01:00
|
|
|
class Harvester(CkanCommand):
    '''Harvests remotely mastered metadata

    Usage:

      harvester initdb
        - Creates the necessary tables in the database

      harvester source {url} {type} [{config}] [{active}] [{user-id}] [{publisher-id}]
        - create new harvest source
          The optional arguments are positional, so every argument before
          the one you want to set must also be given. {active} is taken as
          true unless it is 'false' (any case) or '0'.

      harvester rmsource {id}
        - remove (inactivate) a harvester source

      harvester sources [all]
        - lists harvest sources
          If 'all' is defined, it also shows the Inactive sources

      harvester job {source-id}
        - create new harvest job

      harvester jobs
        - lists harvest jobs

      harvester run
        - runs harvest jobs

      harvester gather_consumer
        - starts the consumer for the gathering queue

      harvester fetch_consumer
        - starts the consumer for the fetching queue

      harvester import [{source-id}]
        - perform the import stage with the last fetched objects, optionally belonging to a certain source.
          Please note that no objects will be fetched from the remote server. It will only affect
          the last fetched objects already present in the database.

      harvester job-all
        - create new harvest jobs for all active sources.

    The commands should be run from the ckanext-harvest directory and expect
    a development.ini file to be present. Most of the time you will
    specify the config explicitly though::

        paster harvester sources --config=../ckan/development.ini

    '''

    # The first docstring line doubles as the one-line command summary and
    # the whole docstring is shown as the usage text.
    summary = __doc__.split('\n')[0]
    usage = __doc__
    # Sub-command name plus the six arguments accepted by ``source``
    # (url, type, config, active, user-id, publisher-id).  The previous value
    # of 6 made the publisher-id argument impossible to pass.
    max_args = 7
    min_args = 0

    def command(self):
        '''Entry point: load the config and dispatch on the sub-command.

        The sub-command is ``self.args[0]``.  With no arguments the usage
        text is printed and the process exits with status 1.  An unknown
        sub-command only prints a message.
        '''
        self._load_config()
        print('')

        if not self.args:
            self.parser.print_usage()
            sys.exit(1)

        cmd = self.args[0]
        if cmd == 'source':
            self.create_harvest_source()
        elif cmd == 'rmsource':
            self.remove_harvest_source()
        elif cmd == 'sources':
            self.list_harvest_sources()
        elif cmd == 'job':
            self.create_harvest_job()
        elif cmd == 'jobs':
            self.list_harvest_jobs()
        elif cmd == 'run':
            self.run_harvester()
        elif cmd == 'gather_consumer':
            self._run_consumer(get_gather_consumer)
        elif cmd == 'fetch_consumer':
            self._run_consumer(get_fetch_consumer)
        elif cmd == 'initdb':
            self.initdb()
        elif cmd == 'import':
            # NOTE(review): initdb() runs before every import, presumably to
            # make sure the harvest tables exist -- confirm that the
            # 'DB tables created' message it prints is wanted here.
            self.initdb()
            self.import_stage()
        elif cmd == 'job-all':
            self.create_harvest_job_all()
        else:
            print('Command %s not recognized' % cmd)

    def _run_consumer(self, get_consumer):
        '''Build a queue consumer with ``get_consumer()`` and wait on it.

        The ``amqplib`` logger is set to INFO before the consumer is created.
        '''
        import logging
        logging.getLogger('amqplib').setLevel(logging.INFO)
        consumer = get_consumer()
        consumer.wait()

    def _load_config(self):
        '''Load the configuration by delegating to the parent command.'''
        super(Harvester, self)._load_config()

    def initdb(self):
        '''Run the harvest model ``setup`` function and report completion.'''
        # Imported here rather than at module level, as in the original code.
        from ckanext.harvest.model import setup as db_setup
        db_setup()

        print('DB tables created')

    def create_harvest_source(self):
        '''Create a harvest source, and a first job for it, from the CLI args.

        Positional arguments after the sub-command: url (required),
        type (required), config, active, user-id, publisher-id.
        Exits with status 1 when the url or the type is missing.

        Raises:
            ValidationError: re-raised after its ``error_dict`` is printed.
        '''
        args = self.args
        if len(args) >= 2:
            url = unicode(args[1])
        else:
            print('Please provide a source URL')
            sys.exit(1)

        if len(args) >= 3:
            # Named ``source_type`` so the ``type`` builtin is not shadowed.
            source_type = unicode(args[2])
        else:
            print('Please provide a source type')
            sys.exit(1)

        config = unicode(args[3]) if len(args) >= 4 else None

        if len(args) >= 5:
            # Only 'false' (any case) and '0' switch the source off.
            active = not (args[4].lower() == 'false' or args[4] == '0')
        else:
            active = True

        user_id = unicode(args[5]) if len(args) >= 6 else u''
        publisher_id = unicode(args[6]) if len(args) >= 7 else u''

        try:
            source = create_harvest_source({
                'url': url,
                'type': source_type,
                'config': config,
                'active': active,
                'user_id': user_id,
                'publisher_id': publisher_id,
            })

            print('Created new harvest source:')
            self.print_harvest_source(source)

            sources = get_harvest_sources()
            self.print_there_are('harvest source', sources)

            # Create a Harvest Job for the new Source
            create_harvest_job(source['id'])
            print('A new Harvest Job for this source has also been created')

        except ValidationError:
            # sys.exc_info() is used instead of an ``except ... , e`` binding
            # so the clause parses on every Python version.
            error = sys.exc_info()[1]
            print('An error occurred:')
            print(str(error.error_dict))
            # A bare raise keeps the original traceback.
            raise

    def remove_harvest_source(self):
        '''Remove the harvest source whose id is the first CLI argument.

        Exits with status 1 when no id is given.
        '''
        if len(self.args) < 2:
            print('Please provide a source id')
            sys.exit(1)

        source_id = unicode(self.args[1])
        remove_harvest_source(source_id)
        print('Removed harvest source: %s' % source_id)

    def list_harvest_sources(self):
        '''Print the harvest sources followed by a count line.

        Only active sources are listed unless the first CLI argument
        is ``all``.
        '''
        if len(self.args) >= 2 and self.args[1] == 'all':
            sources = get_harvest_sources()
            what = 'harvest source'
        else:
            sources = get_harvest_sources(active=True)
            what = 'active harvest source'

        self.print_harvest_sources(sources)
        self.print_there_are(what=what, sequence=sources)

    def create_harvest_job(self):
        '''Create a job for the source id given as the first CLI argument.

        Prints the new job and then the number of jobs with status ``New``.
        Exits with status 1 when no id is given.
        '''
        if len(self.args) < 2:
            print('Please provide a source id')
            sys.exit(1)

        source_id = unicode(self.args[1])
        job = create_harvest_job(source_id)
        self.print_harvest_job(job)

        status = u'New'
        jobs = get_harvest_jobs(status=status)
        # Singular noun: print_there_are() appends the plural 's' itself.
        self.print_there_are('harvest job', jobs, condition=status)

    def list_harvest_jobs(self):
        '''Print every harvest job followed by a count line.'''
        jobs = get_harvest_jobs()
        self.print_harvest_jobs(jobs)
        self.print_there_are(what='harvest job', sequence=jobs)

    def run_harvester(self):
        '''Call ``run_harvest_jobs()`` and always exit with status 0.

        A failure is reported on stderr instead of being silently dropped.
        '''
        try:
            run_harvest_jobs()
        except Exception:
            # The exit status stays 0, as before, so a failed run is still
            # not an error for the caller; the traceback is printed so the
            # failure is at least visible.
            import traceback
            sys.stderr.write('Error while running the harvest jobs:\n')
            traceback.print_exc()
        sys.exit(0)

    def import_stage(self):
        '''Re-import the last fetched objects and print how many there were.

        The optional first CLI argument is passed to ``import_last_objects``
        as the source id; ``None`` is passed when it is absent.
        '''
        source_id = unicode(self.args[1]) if len(self.args) >= 2 else None
        objs = import_last_objects(source_id)
        print('%s objects reimported' % len(objs))

    def create_harvest_job_all(self):
        '''Call the library ``create_harvest_job_all`` and print the count.'''
        jobs = create_harvest_job_all()
        print("Created %s new harvest jobs" % len(jobs))

    def print_harvest_sources(self, sources):
        '''Print each source, preceded by a blank line if there are any.'''
        if sources:
            print('')
        for source in sources:
            self.print_harvest_source(source)

    def print_harvest_source(self, source):
        '''Print the fields of one harvest source dict, then a blank line.'''
        print('Source id: %s' % source['id'])
        print(' url: %s' % source['url'])
        print(' type: %s' % source['type'])
        print(' active: %s' % source['active'])
        print(' user: %s' % source['user_id'])
        print('publisher: %s' % source['publisher_id'])
        print(' jobs: %s' % len(source['jobs']))
        print('')

    def print_harvest_jobs(self, jobs):
        '''Print each job, preceded by a blank line if there are any.'''
        if jobs:
            print('')
        for job in jobs:
            self.print_harvest_job(job)

    def print_harvest_job(self, job):
        '''Print one harvest job dict, its gather errors, then a blank line.'''
        print(' Job id: %s' % job['id'])
        print(' status: %s' % job['status'])
        print(' source: %s' % job['source']['id'])
        print(' url: %s' % job['source']['url'])
        print(' objects: %s' % len(job['objects']))

        gather_errors = job['gather_errors']
        print('gather_errors: %s' % len(gather_errors))
        for error in gather_errors:
            print(' %s' % error['message'])

        print('')

    def print_there_are(self, what, sequence, condition=''):
        '''Print a count line such as "There are 2 new harvest jobs".

        Args:
            what: singular noun for the items; an 's' is appended unless
                ``sequence`` holds exactly one item.
            sequence: the items being counted.
            condition: optional qualifier, lower-cased and placed before
                the noun.
        '''
        is_singular = self.is_singular(sequence)
        print('There %s %s %s%s%s' % (
            'is' if is_singular else 'are',
            len(sequence),
            ('%s ' % condition.lower()) if condition else '',
            what,
            '' if is_singular else 's',
        ))

    def is_singular(self, sequence):
        '''Return True when ``sequence`` holds exactly one item.'''
        return len(sequence) == 1
|
|
|
|
|