harvester-d4science/ckanext/harvest/commands/harvester.py

281 lines
8.7 KiB
Python
Raw Normal View History

import sys
import re
from pprint import pprint
from ckan.lib.cli import CkanCommand
from ckan.model import repo
#from ckanext.harvest.model import HarvestSource, HarvestingJob, HarvestedDocument
#from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
from ckanext.harvest.lib import *
class Harvester(CkanCommand):
'''Harvests remotely mastered metadata
Usage:
harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}]
- create new harvest source
harvester rmsource {id}
- remove a harvester source (and associated jobs)
harvester sources
- lists harvest sources
harvester job {source-id}
- create new harvest job
harvester rmjob {job-id}
- remove a harvest job
harvester jobs
- lists harvest jobs
harvester run
- runs harvest jobs
2011-03-18 16:48:08 +01:00
harvester extents
- creates or updates the extent geometry column for packages with
a bounding box defined in extras
The commands should be run from the ckanext-harvest directory and expect
a development.ini file to be present. Most of the time you will
specify the config explicitly though::
paster harvester sources --config=../ckan/development.ini
'''
summary = __doc__.split('\n')[0]
usage = __doc__
max_args = 6
min_args = 0
def command(self):
self._load_config()
print ""
if len(self.args) == 0:
self.parser.print_usage()
sys.exit(1)
cmd = self.args[0]
if cmd == 'source':
self.create_harvest_source()
elif cmd == "rmsource":
self.remove_harvest_source()
elif cmd == 'sources':
self.list_harvest_sources()
elif cmd == 'job':
self.create_harvest_job()
elif cmd == "rmjob":
self.remove_harvest_job()
elif cmd == 'jobs':
self.list_harvest_jobs()
elif cmd == 'run':
self.run_harvester()
elif cmd == 'extents':
self.update_extents()
else:
print 'Command %s not recognized' % cmd
def _load_config(self):
super(Harvester, self)._load_config()
def create_harvest_source(self):
if len(self.args) >= 2:
url = unicode(self.args[1])
else:
print 'Please provide a source URL'
sys.exit(1)
if len(self.args) >= 3:
type = unicode(self.args[2])
else:
print 'Please provide a source type'
sys.exit(1)
if len(self.args) >= 4:
active = not(self.args[3].lower() == 'false' or \
self.args[3] == '0')
else:
active = True
if len(self.args) >= 5:
user_id = unicode(self.args[4])
else:
user_id = u''
if len(self.args) >= 6:
publisher_id = unicode(self.args[5])
else:
publisher_id = u''
source = create_harvest_source({
'url':url,
'type':type,
'active':active,
'user_id':user_id,
'publisher_id':publisher_id})
print 'Created new harvest source:'
self.print_harvest_source(source)
sources = get_harvest_sources()
self.print_there_are('harvest source', sources)
# Create a Harvest Job for the new Source
create_harvest_job(source.id)
print 'A new Harvest Job for this source has also been created'
def remove_harvest_source(self):
if len(self.args) >= 2:
source_id = unicode(self.args[1])
else:
print 'Please provide a source id'
sys.exit(1)
delete_harvest_source(source_id)
print 'Removed harvest source: %s' % source_id
def list_harvest_sources(self):
sources = get_harvest_sources()
self.print_harvest_sources(sources)
self.print_there_are(what="harvest source", sequence=sources)
def create_harvest_job(self):
if len(self.args) >= 2:
source_id = unicode(self.args[1])
else:
print 'Please provide a source id'
sys.exit(1)
job = create_harvest_job(source_id)
self.print_harvest_job(job)
status = u'New'
jobs = get_harvest_jobs(status=status)
self.print_there_are('harvest jobs', jobs, condition=status)
def remove_harvest_job(self):
if len(self.args) >= 2:
job_id = unicode(self.args[1])
else:
print 'Please provide a job id'
sys.exit(1)
delete_harvest_job(job_id)
print 'Removed harvest job: %s' % job_id
def list_harvest_jobs(self):
jobs = get_harvest_jobs()
self.print_harvest_jobs(jobs)
self.print_there_are(what='harvest job', sequence=jobs)
#TODO: Move to lib and implement the queue system
def run_harvester(self, *args, **kwds):
from pylons.i18n.translation import _get_translator
import pylons
pylons.translator._push_object(_get_translator(pylons.config.get('lang')))
from ckanext.harvest.controllers.harvesting import HarvestingJobController
from ckanext.csw.validation import Validator
jobs = HarvestingJob.filter(status=u"New").all()
jobs_len = len(jobs)
jobs_count = 0
if jobs_len:
print "Running %s harvesting jobs..." % jobs_len
profiles = [
x.strip() for x in
pylons.config.get(
"ckan.harvestor.validator.profiles",
"iso19139,gemini2",
).split(",")
]
validator = Validator(profiles=profiles)
print ""
for job in jobs:
jobs_count += 1
if job.source is None:
print 'ERRROR: no source associated with this job'
else:
print "Running job %s/%s: %s" % (jobs_count, jobs_len, job.id)
self.print_harvesting_job(job)
job_controller = HarvestingJobController(job, validator)
job_controller.harvest_documents()
pprint (job.report)
else:
print "There are no new harvesting jobs."
#TODO: move to ckanext-?? for geo stuff
def update_extents(self):
from ckan.model import PackageExtra, Package, Session
conn = Session.connection()
packages = [extra.package \
for extra in \
Session.query(PackageExtra).filter(PackageExtra.key == 'bbox-east-long').all()]
error = False
for package in packages:
try:
save_extent(package)
except:
errors = True
if error:
msg = "There was an error saving the package extent. Have you set up the package_extent table in the DB?"
else:
msg = "Done. Extents generated for %i packages" % len(packages)
print msg
def print_harvest_sources(self, sources):
if sources:
print ""
for source in sources:
self.print_harvest_source(source)
def print_harvest_source(self, source):
print 'Source id: %s' % source.id
print ' url: %s' % source.url
print ' type: %s' % source.type
print ' active: %s' % source.active
print ' user: %s' % source.user_id
print 'publisher: %s' % source.publisher_id
print ' objects: %s' % len(source.objects)
print ''
def print_harvest_jobs(self, jobs):
if jobs:
print ''
for job in jobs:
self.print_harvest_job(job)
def print_harvest_job(self, job):
print 'Job id: %s' % job.id
print 'status: %s' % job.status
print 'source: %s' % job.source.id
print ' url: %s' % job.source.url
#print "report: %s" % job.report
#TODO: print errors
'''
if job.report and job.report['added']:
for package_id in job.report['added']:
print " doc: %s" % package_id
if job.report and job.report['errors']:
for msg in job.report['errors']:
print " error: %s" % msg
'''
print ''
def print_there_are(self, what, sequence, condition=''):
is_singular = self.is_singular(sequence)
print 'There %s %s %s%s%s' % (
is_singular and 'is' or 'are',
len(sequence),
condition and ('%s ' % condition.lower()) or '',
what,
not is_singular and 's' or '',
)
def is_singular(self, sequence):
return len(sequence) == 1