diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index 023ec0e..717c7b3 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -85,10 +85,13 @@ class ViewController(BaseController): redirect(h.url_for(controller='harvest', action='index')) def show(self,id): - c.source = get_harvest_source(id) + try: + c.source = get_harvest_source(id) + + return render('ckanext/harvest/show.html') + except: + abort(404,'Harvest source not found') - #TODO: show source reports - return render('ckanext/harvest/show.html') def delete(self,id): try: diff --git a/ckanext/harvest/interfaces.py b/ckanext/harvest/interfaces.py index da647fb..7fc892a 100644 --- a/ckanext/harvest/interfaces.py +++ b/ckanext/harvest/interfaces.py @@ -54,6 +54,8 @@ class IHarvester(Interface): responsible for: - performing any necessary action with the fetched object (e.g create a CKAN package). + Note: if this stage creates or updates a package, a reference + to the package should be added to the HarvestObject. - creating the HarvestObject - Package relation (if necessary) - creating and storing any suitable HarvestObjectErrors that may occur. 
diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index d7804d8..6c1a09f 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -1,22 +1,117 @@ -from ckan.model import Session -from ckan.model import repo +from sqlalchemy import distinct,func +from ckan.model import Session, repo +from ckan.model import Package from ckan.lib.base import config -from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject +from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject, \ + HarvestGatherError, HarvestObjectError from ckanext.harvest.queue import get_gather_publisher log = __import__("logging").getLogger(__name__) + +def _get_source_status(source): + out = dict() + + jobs = get_harvest_jobs(source=source) + + if not len(jobs): + out['msg'] = 'No jobs yet' + return out + out = {'next_harvest':'', + 'last_harvest_request':'', + 'last_harvest_statistics':{'added':0,'updated':0,'errors':0}, + 'last_harvest_errors':[], + 'overall_statistics':{'added':0, 'errors':0}, + 'packages':[]} + + # Get next scheduled job + next_job = HarvestJob.filter(source=source,status=u'New').first() + if next_job: + out['next_harvest'] = 'Within 15 minutes' + else: + out['next_harvest'] = 'Not yet scheduled' + + # Get the last finished job + last_job = HarvestJob.filter(source=source,status=u'Finished') \ + .order_by(HarvestJob.created.desc()).limit(1).first() + + if last_job: + out['last_harvest_request'] = last_job.gather_finished + + + #Get HarvestObjects from last job with links to packages + last_objects = [obj for obj in last_job.objects if obj.package is not None] + + if len(last_objects) == 0: + # No packages added or updated + out['last_harvest_statistics']['added'] = 0 + out['last_harvest_statistics']['updated'] = 0 + else: + # Check whether packages were added or updated + for last_object in last_objects: + # Check if the same package had been linked before + previous_objects = 
Session.query(HarvestObject) \ + .filter(HarvestObject.package==last_object.package) \ + .all() + + if len(previous_objects) == 1: + # It didn't previously exist, it has been added + out['last_harvest_statistics']['added'] += 1 + else: + # Package already existed, but it has been updated + out['last_harvest_statistics']['updated'] += 1 + + # Last harvest errors + # We have the gathering errors in last_job.gather_errors, so let's also + # get the object errors. + object_errors = Session.query(HarvestObjectError).join(HarvestObject) \ + .filter(HarvestObject.job==last_job).all() + + out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \ + + len(object_errors) + for gather_error in last_job.gather_errors: + out['last_harvest_errors'].append(gather_error.message) + + for object_error in object_errors: + out['last_harvest_errors'].append(object_error.message) + + + # Overall statistics + packages = Session.query(distinct(HarvestObject.package_id),Package.name) \ + .join(Package).join(HarvestJob).join(HarvestSource) \ + .filter(HarvestJob.source==source).all() + + out['overall_statistics']['added'] = len(packages) + for package in packages: + out['packages'].append(package.name) + + gather_errors = Session.query(HarvestGatherError) \ + .join(HarvestJob).join(HarvestSource) \ + .filter(HarvestJob.source==source).all() + + object_errors = Session.query(HarvestObjectError) \ + .join(HarvestObject).join(HarvestJob).join(HarvestSource) \ + .filter(HarvestJob.source==source).all() + out['overall_statistics']['errors'] = len(gather_errors) + len(object_errors) + else: + out['last_harvest_request'] = 'Not yet harvested' + + return out + + + + def _source_as_dict(source): out = source.as_dict() out['jobs'] = [] for job in source.jobs: out['jobs'].append(job.as_dict()) + + out['status'] = _get_source_status(source) - #TODO: Get some report data - return out def _job_as_dict(job): diff --git a/templates/ckanext/harvest/index.html 
b/templates/ckanext/harvest/index.html index 4b1a559..23c01bd 100644 --- a/templates/ckanext/harvest/index.html +++ b/templates/ckanext/harvest/index.html @@ -11,10 +11,13 @@
-

Harvesting Sources

Add a harvesting source - + + + + +
@@ -22,9 +25,8 @@ - + @@ -35,13 +37,17 @@ - +
URL Type ActiveNext Harvest Created
${source.url} ${source.type} ${source.active}${source.status.next_harvest} ${source.created}
-
+ + +
No harvest sources defined yet.
+
+ +
diff --git a/templates/ckanext/harvest/show.html b/templates/ckanext/harvest/show.html index 4ac883a..76aefcd 100644 --- a/templates/ckanext/harvest/show.html +++ b/templates/ckanext/harvest/show.html @@ -50,22 +50,20 @@ Total jobs ${len(c.source.jobs)} -