[merge]

2011-03-14 16:06:30 +00:00 · 2011-03-14 16:06:30 +00:00 · 898149bb8f
parent 52b17ffceb 544a88d18d
commit 898149bb8f
4 changed files with 294 additions and 3 deletions
--- a/ckanext/harvest/commands/init.py
+++ b/ckanext/harvest/commands/init.py
@ -0,0 +1,6 @@
+try:
+    import pkg_resources
+    pkg_resources.declare_namespace(__name__)
+except ImportError:
+    import pkgutil
+    __path__ = pkgutil.extend_path(__path__, __name__)
--- a/ckanext/harvest/commands/harvester.py
+++ b/ckanext/harvest/commands/harvester.py
@ -0,0 +1,287 @@
+import sys
+import re
+from pprint import pprint
+
+from ckan.lib.cli import CkanCommand
+
+class Harvester(CkanCommand):
+    '''Harvests remotely mastered metadata
+
+    Usage:
+      harvester source {url} [{user-ref} [{publisher-ref}]]     
+        - create new harvest source
+
+      harvester rmsource {url}
+        - remove a harvester source (and associated jobs)
+
+      harvester sources                                 
+        - lists harvest sources
+
+      harvester job {source-id} [{user-ref}]
+        - create new harvesting job
+
+      harvester rmjob {job-id}
+        - remove a harvesting job
+  
+      harvester jobs
+        - lists harvesting jobs
+
+      harvester run
+        - runs harvesting jobs
+        
+    The commands should be run from the ckanext-harvest directory and expect
+    a development.ini file to be present. Most of the time you will
+    specify the config explicitly though::
+
+        paster harvester sources --config=../ckan/development.ini
+
+    '''
+
+    summary = __doc__.split('\n')[0]
+    usage = __doc__
+    max_args = 4
+    min_args = 0
+
+    def command(self):
+        self._load_config()
+        # Clear the 'No handlers could be found for logger "vdm"' warning message.
+        print ""
+
+        if len(self.args) == 0:
+            self.parser.print_usage()
+            sys.exit(1)
+        cmd = self.args[0]
+        if cmd == 'source':
+            if len(self.args) >= 2:
+                url = unicode(self.args[1])
+            else:
+                print self.usage
+                print 'Error, source url is not given.'
+                sys.exit(1)
+            if len(self.args) >= 3:
+                user_ref = unicode(self.args[2])
+            else:
+                user_ref = u''
+            if len(self.args) >= 4:
+                publisher_ref = unicode(self.args[3])
+            else:
+                publisher_ref = u''
+            self.register_harvest_source(url, user_ref, publisher_ref)
+        elif cmd == "rmsource":
+            url = unicode(self.args[1])
+            self.remove_harvest_source(url)
+        elif cmd == 'sources':
+            self.list_harvest_sources()
+        elif cmd == 'job':
+            if len(self.args) >= 2:
+                source_id = unicode(self.args[1])
+            else:
+                print self.usage
+                print 'Error, job source is not given.'
+                sys.exit(1)
+            if len(self.args) >= 3:
+                user_ref = unicode(self.args[2])
+            else:
+                user_ref = u''
+            self.register_harvesting_job(source_id, user_ref)
+        elif cmd == "rmjob":
+            job_id = unicode(self.args[1])
+            self.remove_harvesting_job(job_id)
+        elif cmd == 'jobs':
+            self.list_harvesting_jobs()
+        elif cmd == 'run':
+            self.run_harvester()
+        else:
+            print 'Command %s not recognized' % cmd
+
+    def _load_config(self):
+        super(Harvester, self)._load_config()
+        import logging
+        logging.basicConfig()
+        logger_vdm = logging.getLogger('vdm')
+        logger_vdm.setLevel(logging.ERROR)
+
+    def run_harvester(self, *args, **kwds):
+        from pylons.i18n.translation import _get_translator
+        import pylons
+        pylons.translator._push_object(_get_translator(pylons.config.get('lang')))
+
+        from ckan.model import HarvestingJob
+        from ckan.controllers.harvesting import HarvestingJobController
+        from ckanext.csw.validation import Validator
+
+        jobs = HarvestingJob.filter(status=u"New").all()
+        jobs_len = len(jobs)
+        jobs_count = 0
+        if jobs_len:
+            print "Running %s harvesting jobs..." % jobs_len
+            profiles = [
+                x.strip() for x in
+                pylons.config.get(
+                    "ckan.harvestor.validator.profiles", 
+                    "iso19139,gemini2",
+                ).split(",")
+            ]
+            validator = Validator(profiles=profiles)
+            print ""
+            for job in jobs:
+                jobs_count += 1
+                if job.source is None:
+                    print 'ERRROR: no source associated with this job'
+                else:
+                    print "Running job %s/%s: %s" % (jobs_count, jobs_len, job.id)
+                    self.print_harvesting_job(job)
+                    job_controller = HarvestingJobController(job, validator)
+                    job_controller.harvest_documents()
+                    pprint (job.report)
+        else:
+            print "There are no new harvesting jobs."
+
+    def remove_harvesting_job(self, job_id):
+        from ckan import model
+        try:
+            job = model.HarvestingJob.get(job_id)
+            job.delete()
+            model.repo.commit_and_remove()
+            print "Removed job: %s" % job_id
+        except model.HarvestingObjectNotFound:
+            print "No such job"
+
+    def register_harvesting_job(self, source_id, user_ref):
+        from ckan.model import HarvestSource
+        from ckan.model import HarvestingJob
+        if re.match('(http|file)://', source_id):
+            source_url = unicode(source_id)
+            source_id = None
+            sources = HarvestSource.filter(url=source_url).all()
+            if sources:
+                source = sources[0]
+            else:
+                source = self.create_harvest_source(url=source_url, user_ref=user_ref, publisher_ref=u'')
+        else:
+            source = HarvestSource.get(source_id)
+        objects = HarvestingJob.filter(status='New', source=source)
+        if objects.count():
+            raise Exception('There is already an unrun job for the harvest source %r'%source.id)
+        job = HarvestingJob(
+            source=source,
+            user_ref=user_ref,
+            status=u"New",
+        )
+        job.save()
+        print "Created new harvesting job:"
+        self.print_harvesting_job(job)
+        status = u"New"
+        jobs = HarvestingJob.filter(status=status).all()
+        self.print_there_are("harvesting job", jobs, condition=status)
+
+    def register_harvest_source(self, url, user_ref, publisher_ref):
+        from ckan.model import HarvestSource
+        existing = self.get_harvest_sources(url=url)
+        if existing:
+            print "Error, there is already a harvesting source for that URL"
+            self.print_harvest_sources(existing)
+            sys.exit(1)
+        else:
+            source = self.create_harvest_source(url=url, user_ref=user_ref, publisher_ref=publisher_ref)
+            self.register_harvesting_job(source.id, user_ref)
+            print "Created new harvest source:"
+            self.print_harvest_source(source)
+            sources = self.get_harvest_sources()
+            self.print_there_are("harvest source", sources)
+
+    def remove_harvest_source(self, url):
+        from ckan import model
+        model.repo.new_revision()
+        sources = model.HarvestSource.filter(url=url)
+        if sources.count() == 0:
+            print "No such source"
+        else:
+            source = sources[0]
+            jobs = model.HarvestingJob.filter(source=source)
+            print "Removing %d jobs" % jobs.count()
+            for job in jobs:
+                job.delete()
+            source.delete()
+            model.repo.commit_and_remove()
+            print "Removed harvest source: %s" % url
+
+    def list_harvest_sources(self):
+        sources = self.get_harvest_sources()
+        self.print_harvest_sources(sources)
+        self.print_there_are(what="harvest source", sequence=sources)
+       
+    def list_harvesting_jobs(self):
+        jobs = self.get_harvesting_jobs()
+        self.print_harvesting_jobs(jobs)
+        self.print_there_are(what="harvesting job", sequence=jobs)
+
+    def get_harvest_sources(self, **kwds):
+        from ckan.model import HarvestSource
+        return HarvestSource.filter(**kwds).all()
+
+    def get_harvesting_jobs(self, **kwds):
+        from ckan.model import HarvestingJob
+        return HarvestingJob.filter(**kwds).all()
+
+    def create_harvest_source(self, **kwds):
+        from ckan.model import HarvestSource
+        source = HarvestSource(**kwds)
+        source.save()
+        return source
+
+    def create_harvesting_job(self, **kwds):
+        from ckan.model import HarvestingJob
+        job = HarvestingJob(**kwds)
+        job.save()
+        return job
+
+    def print_harvest_sources(self, sources):
+        if sources:
+            print ""
+        for source in sources:
+            self.print_harvest_source(source)
+
+    def print_harvest_source(self, source):
+        print "Source id: %s" % source.id
+        print "      url: %s" % source.url
+        print "     user: %s" % source.user_ref
+        print "publisher: %s" % source.publisher_ref
+        print "     docs: %s" % len(source.documents)
+        print ""
+
+    def print_harvesting_jobs(self, jobs):
+        if jobs:
+            print ""
+        for job in jobs:
+            self.print_harvesting_job(job)
+
+    def print_harvesting_job(self, job):
+        print "Job id: %s" % job.id
+        if job.user_ref:
+            print "  user: %s" % job.user_ref
+        print "status: %s" % job.status
+        print "source: %s" % job.source.id
+        print "   url: %s" % job.source.url
+        #print "report: %s" % job.report
+        if job.report and job.report['added']:
+            for package_id in job.report['added']:
+                print "   doc: %s" % package_id
+        if job.report and job.report['errors']:
+            for msg in job.report['errors']:
+                print " error: %s" % msg
+        print ""
+
+    def print_there_are(self, what, sequence, condition=""):
+        is_singular = self.is_singular(sequence)
+        print "There %s %s %s%s%s" % (
+            is_singular and "is" or "are",
+            len(sequence),
+            condition and ("%s " % condition.lower()) or "",
+            what,
+            not is_singular and "s" or "",
+        )
+
+    def is_singular(self, sequence):
+        return len(sequence) == 1
+
--- a/setup.py
+++ b/setup.py
@ -33,6 +33,6 @@ setup(
 	# Add plugins here, eg
 	harvest=ckanext.harvest:Harvest
 	[paste.paster_command]
-	package-scores = ckanext.harvest.commands.package_score:PackageScore
+	harvester = ckanext.harvest.commands.harvester:Harvester
 	""",
 )
--- a/templates/ckanext/harvest/index.html
+++ b/templates/ckanext/harvest/index.html
@ -15,7 +15,6 @@
    <a id="new-harvest-source" href="harvest/create">Add a harvesting source</a>
    <table id="harvest-sources">
        <tr>
-            <th></th>
            <th></th>
            <th></th>
            <th></th>
@ -29,7 +28,6 @@
        <tr py:for="source in c.sources">
            <td>${h.link_to('view', 'harvest/' + source.id)}</td>
            <td>${h.link_to('edit', 'harvest/' + source.id + '/edit')}</td> 
-            <td>${h.link_to('delete', 'harvest/' + source.id + '/delete')}</td> 
            <td>${h.link_to('refresh', 'harvest/' + source.id + '/refresh')}</td>            
            <td>${source.url}</td>
            <td>${source.status.last_harvest_status}</td>