From a997e45470b879373dae5648c4b5162dbb118326 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 25 Jan 2012 17:47:35 +0000 Subject: [PATCH 01/43] [lib] Ignore deleted packages in source stats --- ckanext/harvest/lib/__init__.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index 4b7b6d5..d1da001 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -47,7 +47,7 @@ def _get_source_status(source, detailed=True): out['last_harvest_request'] = str(last_job.gather_finished) #Get HarvestObjects from last job whit links to packages - if detailed: + if detailed: last_objects = [obj for obj in last_job.objects if obj.package is not None] if len(last_objects) == 0: @@ -77,7 +77,7 @@ def _get_source_status(source, detailed=True): out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \ + object_errors.count() - if detailed: + if detailed: for gather_error in last_job.gather_errors: out['last_harvest_errors']['gather'].append(gather_error.message) @@ -88,7 +88,8 @@ def _get_source_status(source, detailed=True): # Overall statistics packages = Session.query(distinct(HarvestObject.package_id),Package.name) \ .join(Package).join(HarvestJob).join(HarvestSource) \ - .filter(HarvestJob.source==source) + .filter(HarvestJob.source==source) \ + .filter(Package.state==u'active') out['overall_statistics']['added'] = packages.count() if detailed: @@ -348,17 +349,23 @@ def import_last_objects(source_id=None): if not source: raise NotFound('Harvest source %s does not exist' % source_id) + if not source.active: + raise Exception('This harvest source is not active') + last_objects_ids = Session.query(HarvestObject.id) \ - .join(HarvestJob) \ + .join(HarvestJob).join(Package) \ .filter(HarvestJob.source==source) \ .filter(HarvestObject.package!=None) \ + .filter(Package.state==u'active') \ .order_by(HarvestObject.guid) \ 
.order_by(HarvestObject.metadata_modified_date.desc()) \ .order_by(HarvestObject.gathered.desc()) \ .all() else: last_objects_ids = Session.query(HarvestObject.id) \ + .join(Package) \ .filter(HarvestObject.package!=None) \ + .filter(Package.state==u'active') \ .order_by(HarvestObject.guid) \ .order_by(HarvestObject.metadata_modified_date.desc()) \ .order_by(HarvestObject.gathered.desc()) \ @@ -382,7 +389,7 @@ def import_last_objects(source_id=None): return imported_objects def create_harvest_job_all(): - + # Get all active sources sources = get_harvest_sources(active=True) jobs = [] From f086e908bce24f0f3534bd357423554d7be81d7b Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 30 Jan 2012 17:09:28 +0000 Subject: [PATCH 02/43] [model] Clearer table initialization --- ckanext/harvest/model/__init__.py | 36 +++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index 7ad8921..4512349 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -27,15 +27,29 @@ harvest_gather_error_table = None harvest_object_error_table = None def setup(): + + if harvest_source_table is None: - create_harvester_tables() + define_harvester_tables() log.debug('Harvest tables defined in memory') + if model.repo.are_tables_created(): - metadata.create_all() - log.debug('Harvest tables created') + if not harvest_source_table.exists(): + + # Create each table individually rahter than + # using metadata.create_all() + harvest_source_table.create() + harvest_job_table.create() + harvest_object_table.create() + harvest_gather_error_table.create() + harvest_object_error_table.create() + + log.debug('Harvest tables created') + else: + log.debug('Harvest tables already exist') else: log.debug('Harvest table creation deferred') - + class HarvestError(Exception): pass @@ -46,20 +60,20 @@ class HarvestDomainObject(DomainObject): key_attr = 'id' @classmethod - 
def get(self, key, default=None, attr=None): + def get(cls, key, default=None, attr=None): '''Finds a single entity in the register.''' if attr == None: - attr = self.key_attr + attr = cls.key_attr kwds = {attr: key} - o = self.filter(**kwds).first() + o = cls.filter(**kwds).first() if o: return o else: return default @classmethod - def filter(self, **kwds): - query = Session.query(self).autoflush(False) + def filter(cls, **kwds): + query = Session.query(cls).autoflush(False) return query.filter_by(**kwds) @@ -107,7 +121,7 @@ class HarvestObjectError(HarvestDomainObject): ''' pass -def create_harvester_tables(): +def define_harvester_tables(): global harvest_source_table global harvest_job_table @@ -161,7 +175,7 @@ def create_harvester_tables(): Column('harvest_object_id', types.UnicodeText, ForeignKey('harvest_object.id')), Column('message',types.UnicodeText), Column('stage', types.UnicodeText), - Column('created', DateTime, default=datetime.datetime.utcnow), + Column('created', DateTime, default=datetime.datetime.utcnow), ) mapper( From d1783f5415bcc7ba6c03b08fb1de60c7a6f789f2 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 30 Jan 2012 17:37:05 +0000 Subject: [PATCH 03/43] [model] Changes in harvest model Added three changes to the harvest model: * 'title' column in harvest_source table * 'current' column in harvest_job table * foreign key from harvest_object to harvest_source Tables are checked on startup to see if they need to be updated. 
TODO: populate current and harvest_source_id fields --- ckanext/harvest/model/__init__.py | 61 ++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 13 deletions(-) diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index 4512349..e38220b 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -1,14 +1,20 @@ import logging import datetime +from sqlalchemy.engine.reflection import Inspector +from sqlalchemy.orm import backref, relation + from ckan import model -from ckan.model.meta import * +from ckan.model.meta import (metadata, mapper, Session, + Table, Column, ForeignKey, types) from ckan.model.types import make_uuid -from ckan.model.core import * from ckan.model.domain_object import DomainObject from ckan.model.package import Package -from sqlalchemy.orm import backref, relation + + + + log = logging.getLogger(__name__) __all__ = [ @@ -46,7 +52,15 @@ def setup(): log.debug('Harvest tables created') else: + from ckan.model.meta import engine log.debug('Harvest tables already exist') + # Check if existing tables need to be updated + inspector = Inspector.from_engine(engine) + columns = inspector.get_columns('harvest_source') + if not 'title' in [column['name'] for column in columns]: + log.debug('Harvest tables need to be updated') + migrate_v2() + else: log.debug('Harvest table creation deferred') @@ -132,9 +146,10 @@ def define_harvester_tables(): harvest_source_table = Table('harvest_source', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('url', types.UnicodeText, nullable=False), + Column('title', types.UnicodeText, default=u''), Column('description', types.UnicodeText, default=u''), Column('config', types.UnicodeText, default=u''), - Column('created', DateTime, default=datetime.datetime.utcnow), + Column('created', types.DateTime, default=datetime.datetime.utcnow), Column('type',types.UnicodeText,nullable=False), 
Column('active',types.Boolean,default=True), Column('user_id', types.UnicodeText, default=u''), @@ -143,9 +158,9 @@ def define_harvester_tables(): # Was harvesting_job harvest_job_table = Table('harvest_job', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), - Column('created', DateTime, default=datetime.datetime.utcnow), - Column('gather_started', DateTime), - Column('gather_finished', DateTime), + Column('created', types.DateTime, default=datetime.datetime.utcnow), + Column('gather_started', types.DateTime), + Column('gather_finished', types.DateTime), Column('source_id', types.UnicodeText, ForeignKey('harvest_source.id')), Column('status', types.UnicodeText, default=u'New', nullable=False), ) @@ -153,13 +168,15 @@ def define_harvester_tables(): harvest_object_table = Table('harvest_object', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('guid', types.UnicodeText, default=''), - Column('gathered', DateTime, default=datetime.datetime.utcnow), - Column('fetch_started', DateTime), + Column('current',types.Boolean,default=False), + Column('gathered', types.DateTime, default=datetime.datetime.utcnow), + Column('fetch_started', types.DateTime), Column('content', types.UnicodeText, nullable=True), - Column('fetch_finished', DateTime), - Column('metadata_modified_date', DateTime), + Column('fetch_finished', types.DateTime), + Column('metadata_modified_date', types.DateTime), Column('retry_times',types.Integer), Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')), + Column('harvest_source_id', types.UnicodeText, ForeignKey('harvest_source.id')), Column('package_id', types.UnicodeText, ForeignKey('package.id'), nullable=True), ) # New table @@ -167,7 +184,7 @@ def define_harvester_tables(): Column('id', types.UnicodeText, primary_key=True, default=make_uuid), Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')), Column('message', types.UnicodeText), - 
Column('created', DateTime, default=datetime.datetime.utcnow), + Column('created', types.DateTime, default=datetime.datetime.utcnow), ) # New table harvest_object_error_table = Table('harvest_object_error',metadata, @@ -175,7 +192,7 @@ def define_harvester_tables(): Column('harvest_object_id', types.UnicodeText, ForeignKey('harvest_object.id')), Column('message',types.UnicodeText), Column('stage', types.UnicodeText), - Column('created', DateTime, default=datetime.datetime.utcnow), + Column('created', types.DateTime, default=datetime.datetime.utcnow), ) mapper( @@ -234,3 +251,21 @@ def define_harvester_tables(): ), }, ) + + +def migrate_v2(): + conn = Session.connection() + + command = ''' + ALTER TABLE harvest_source ADD COLUMN title text; + + ALTER TABLE harvest_object ADD COLUMN current boolean; + ALTER TABLE harvest_object ADD COLUMN harvest_source_id text; + ALTER TABLE harvest_object ADD CONSTRAINT harvest_object_harvest_source_id_fkey FOREIGN KEY (harvest_source_id) REFERENCES harvest_source(id); + ''' + conn.execute(command) + + # TODO: scripts for populating current and harvest_source_id + + Session.commit() + log.debug('Harvest tables migrated to v2') From b64d97118cd5493fba6e60f1f8a7c7588c17a9a0 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 1 Feb 2012 11:08:41 +0000 Subject: [PATCH 04/43] [#1726][model] Add scripts for populating source_id and current fields --- ckanext/harvest/model/__init__.py | 37 ++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index e38220b..8c4705f 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -1,6 +1,7 @@ import logging import datetime +from sqlalchemy import distinct from sqlalchemy.engine.reflection import Inspector from sqlalchemy.orm import backref, relation @@ -42,7 +43,7 @@ def setup(): if model.repo.are_tables_created(): if not harvest_source_table.exists(): - # 
Create each table individually rahter than + # Create each table individually rather than # using metadata.create_all() harvest_source_table.create() harvest_job_table.create() @@ -254,18 +255,42 @@ def define_harvester_tables(): def migrate_v2(): + log.debug('Migrating harvest tables to v2. This may take a while...') conn = Session.connection() - command = ''' + statements = ''' ALTER TABLE harvest_source ADD COLUMN title text; ALTER TABLE harvest_object ADD COLUMN current boolean; ALTER TABLE harvest_object ADD COLUMN harvest_source_id text; ALTER TABLE harvest_object ADD CONSTRAINT harvest_object_harvest_source_id_fkey FOREIGN KEY (harvest_source_id) REFERENCES harvest_source(id); - ''' - conn.execute(command) - # TODO: scripts for populating current and harvest_source_id + UPDATE harvest_object o SET harvest_source_id = j.source_id FROM harvest_job j WHERE o.harvest_job_id = j.id; + ''' + conn.execute(statements) + + # Flag current harvest_objects + guids = Session.query(distinct(HarvestObject.guid)) \ + .join(Package) \ + .filter(HarvestObject.package!=None) \ + .filter(Package.state==u'active') + + update_statement = ''' + UPDATE harvest_object + SET current = TRUE + WHERE id = ( + SELECT o.id + FROM harvest_object o JOIN package p ON p.id = o.package_id + WHERE o.package_id IS NOT null AND p.state = 'active' + AND o.guid = '%s' + ORDER BY metadata_modified_date DESC, fetch_finished DESC, gathered DESC + LIMIT 1) + ''' + + for guid in guids: + conn.execute(update_statement % guid) + + conn.execute('UPDATE harvest_object SET current = FALSE WHERE current IS NOT TRUE') Session.commit() - log.debug('Harvest tables migrated to v2') + log.info('Harvest tables migrated to v2') From 004210935ab68cfe4c0a9dd74cd7684e6661cdbe Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 1 Feb 2012 11:10:44 +0000 Subject: [PATCH 05/43] [model] Avoid unicode warning --- ckanext/harvest/model/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index 8c4705f..a372de4 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -168,7 +168,7 @@ def define_harvester_tables(): # Was harvested_document harvest_object_table = Table('harvest_object', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), - Column('guid', types.UnicodeText, default=''), + Column('guid', types.UnicodeText, default=u''), Column('current',types.Boolean,default=False), Column('gathered', types.DateTime, default=datetime.datetime.utcnow), Column('fetch_started', types.DateTime), From 4c81c7c3a7591f5076a93c7c1e2466b55a58d9ba Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 1 Feb 2012 12:52:52 +0000 Subject: [PATCH 06/43] [#1726][model] Harvest source reference compatibility The 'source' property of harvest objects now comes from the actual foreign key. For compatibility with old harvesters, a before-insert event listener has been added to check if the source id has been set, and set it automatically from the job if not. Note that this requires SQLAlchemy 0.7 (ie CKAN 1.5.1) --- ckanext/harvest/model/__init__.py | 24 ++++++++++++++++++++---- 1 file changed, 20 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index a372de4..205873e 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -1,6 +1,7 @@ import logging import datetime +from sqlalchemy import event from sqlalchemy import distinct from sqlalchemy.engine.reflection import Inspector from sqlalchemy.orm import backref, relation @@ -120,10 +121,6 @@ class HarvestObject(HarvestDomainObject): ''' - @property - def source(self): - return self.job.source - class HarvestGatherError(HarvestDomainObject): '''Gather errors are raised during the **gather** stage of a harvesting job. 
@@ -136,6 +133,18 @@ class HarvestObjectError(HarvestDomainObject): ''' pass +def harvest_object_before_insert_listener(mapper,connection,target): + ''' + For compatibility with old harvesters, check if the source id has + been set, and set it automatically from the job if not. + ''' + if not target.harvest_source_id or not target.source: + if not target.job: + raise Exception('You must define a Harvest Job for each Harvest Object') + target.source = target.job.source + target.harvest_source_id = target.job.source.id + + def define_harvester_tables(): global harvest_source_table @@ -228,6 +237,12 @@ def define_harvester_tables(): lazy=True, backref=u'objects', ), + 'source': relation( + HarvestSource, + lazy=True, + backref=u'objects', + ), + }, ) @@ -253,6 +268,7 @@ def define_harvester_tables(): }, ) + event.listen(HarvestObject, 'before_insert', harvest_object_before_insert_listener) def migrate_v2(): log.debug('Migrating harvest tables to v2. This may take a while...') From 479750da0928c4837fffa149e6ba5ac4a69210a3 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 1 Feb 2012 15:50:41 +0000 Subject: [PATCH 07/43] [#1726][base harvester] Set current field when importing --- ckanext/harvest/harvesters/base.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 3f01f6b..865a06f 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -1,6 +1,8 @@ import logging import re +from sqlalchemy.sql import update,and_, bindparam + from ckan import model from ckan.model import Session, Package from ckan.logic import ValidationError, NotFound, get_action @@ -145,10 +147,8 @@ class HarvesterBase(SingletonPlugin): log.info('Package with GUID %s exists and needs to be updated' % harvest_object.guid) # Update package context.update({'id':package_dict['id']}) - updated_package = get_action('package_update_rest')(context, 
package_dict) + new_package = get_action('package_update_rest')(context, package_dict) - harvest_object.package_id = updated_package['id'] - harvest_object.save() else: log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid) @@ -161,7 +161,21 @@ class HarvesterBase(SingletonPlugin): log.info('Package with GUID %s does not exist, let\'s create it' % harvest_object.guid) new_package = get_action('package_create_rest')(context, package_dict) harvest_object.package_id = new_package['id'] - harvest_object.save() + + # Flag the other objects linking to this package as not current anymore + from ckanext.harvest.model import harvest_object_table + conn = Session.connection() + u = update(harvest_object_table) \ + .where(harvest_object_table.c.package_id==bindparam('b_package_id')) \ + .values(current=False) + conn.execute(u, b_package_id=new_package['id']) + Session.commit() + + # Flag this as the current harvest object + + harvest_object.package_id = new_package['id'] + harvest_object.current = True + harvest_object.save() return True From a5cf445fa615a4b935e876f7a2ae26fd15cd3382 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 2 Feb 2012 13:20:03 +0000 Subject: [PATCH 08/43] [#1727][lib] Use 'current' field in queries returning harvest objects --- ckanext/harvest/commands/harvester.py | 3 +- ckanext/harvest/lib/__init__.py | 42 ++++++++++----------------- 2 files changed, 18 insertions(+), 27 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 1c9c673..46fd36e 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -216,7 +216,8 @@ class Harvester(CkanCommand): source_id = unicode(self.args[1]) else: source_id = None - import_last_objects(source_id) + objs = import_last_objects(source_id) + print '%s objects reimported' % len(objs) def create_harvest_job_all(self): jobs = create_harvest_job_all() diff --git a/ckanext/harvest/lib/__init__.py 
b/ckanext/harvest/lib/__init__.py index d1da001..7fcf192 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -87,8 +87,9 @@ def _get_source_status(source, detailed=True): # Overall statistics packages = Session.query(distinct(HarvestObject.package_id),Package.name) \ - .join(Package).join(HarvestJob).join(HarvestSource) \ - .filter(HarvestJob.source==source) \ + .join(Package).join(HarvestSource) \ + .filter(HarvestObject.source==source) \ + .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') out['overall_statistics']['added'] = packages.count() @@ -353,40 +354,29 @@ def import_last_objects(source_id=None): raise Exception('This harvest source is not active') last_objects_ids = Session.query(HarvestObject.id) \ - .join(HarvestJob).join(Package) \ - .filter(HarvestJob.source==source) \ - .filter(HarvestObject.package!=None) \ + .join(HarvestSource).join(Package) \ + .filter(HarvestObject.source==source) \ + .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') \ - .order_by(HarvestObject.guid) \ - .order_by(HarvestObject.metadata_modified_date.desc()) \ - .order_by(HarvestObject.gathered.desc()) \ .all() else: last_objects_ids = Session.query(HarvestObject.id) \ .join(Package) \ - .filter(HarvestObject.package!=None) \ + .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') \ - .order_by(HarvestObject.guid) \ - .order_by(HarvestObject.metadata_modified_date.desc()) \ - .order_by(HarvestObject.gathered.desc()) \ .all() - - last_obj_guid = '' - imported_objects = [] + last_objects = [] for obj_id in last_objects_ids: obj = Session.query(HarvestObject).get(obj_id) - if obj.guid != last_obj_guid: - imported_objects.append(obj) - for harvester in PluginImplementations(IHarvester): - if harvester.info()['name'] == obj.job.source.type: - if hasattr(harvester,'force_import'): - harvester.force_import = True - harvester.import_stage(obj) - break - last_obj_guid = obj.guid - - return 
imported_objects + for harvester in PluginImplementations(IHarvester): + if harvester.info()['name'] == obj.source.type: + if hasattr(harvester,'force_import'): + harvester.force_import = True + harvester.import_stage(obj) + break + last_objects.append(obj) + return last_objects def create_harvest_job_all(): From 9ed152cbea7754941dc99af425d9fbf1c359b77f Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 3 Feb 2012 17:54:34 +0000 Subject: [PATCH 09/43] [ckan harvester] Add support for forcing gathering of all remote packages --- README.rst | 5 +++++ ckanext/harvest/harvesters/ckanharvester.py | 8 +++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 3641e55..0f91ab4 100644 --- a/README.rst +++ b/README.rst @@ -151,6 +151,11 @@ field. The currently supported configuration options are: created from this harvesting source. Logged in users and visitors will be only able to read them. +* force_all: By default, after the first harvesting, the harvester will gather + only the modified packages from the remote site since the last harvesting. + Setting this property to true will force the harvester to gather all remote + packages regardless of the modification date. Default is False. 
+ Here is an example of a configuration object (the one that must be entered in the configuration field):: diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 7e293e4..8a3c5fc 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -99,6 +99,11 @@ class CKANHarvester(HarvesterBase): except NotFound,e: raise ValueError('User not found') + for key in ('read_only','force_all'): + if key in config_obj: + if not isinstance(config_obj[key],bool): + raise ValueError('%s must be boolean' % key) + except ValueError,e: raise e @@ -125,7 +130,8 @@ class CKANHarvester(HarvesterBase): base_rest_url = base_url + self._get_rest_api_offset() base_search_url = base_url + self._get_search_api_offset() - if previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0: + if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0) \ + or not self.config.get('force_all',False): get_all_packages = False # Request only the packages modified since last harvest job From 78b61a09e9de7b503093d0e5fba8bbdc7c1e18ed Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 6 Feb 2012 10:17:27 +0000 Subject: [PATCH 10/43] [doc] Quick note about the current field --- README.rst | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 3641e55..1660254 100644 --- a/README.rst +++ b/README.rst @@ -273,7 +273,8 @@ following methods:: - performing any necessary action with the fetched object (e.g create a CKAN package). Note: if this stage creates or updates a package, a reference - to the package should be added to the HarvestObject. + to the package must be added to the HarvestObject. + Additionally, the HarvestObject must be flagged as current. - creating the HarvestObject - Package relation (if necessary) - creating and storing any suitable HarvestObjectErrors that may occur. 
From 4d7b8143b9fd9c863cfac34a38a52da566fc231f Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 14 Feb 2012 11:28:11 +0000 Subject: [PATCH 11/43] [lib] Re-enable unique constraint in url for inactive sources --- ckanext/harvest/lib/__init__.py | 38 ----------------------------- ckanext/harvest/logic/validators.py | 2 +- 2 files changed, 1 insertion(+), 39 deletions(-) diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index 7fcf192..364c64d 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -111,8 +111,6 @@ def _get_source_status(source, detailed=True): return out - - def _source_as_dict(source, detailed=True): out = source.as_dict() out['jobs'] = [] @@ -154,42 +152,6 @@ def _object_as_dict(obj): return out -def _url_exists(url): - new_url = _normalize_url(url) - - existing_sources = get_harvest_sources() - - for existing_source in existing_sources: - existing_url = _normalize_url(existing_source['url']) - if existing_url == new_url and existing_source['active'] == True: - return existing_source - return False - -def _normalize_url(url): - o = urlparse.urlparse(url) - - # Normalize port - if ':' in o.netloc: - parts = o.netloc.split(':') - if (o.scheme == 'http' and parts[1] == '80') or \ - (o.scheme == 'https' and parts[1] == '443'): - netloc = parts[0] - else: - netloc = ':'.join(parts) - else: - netloc = o.netloc - - # Remove trailing slash - path = o.path.rstrip('/') - - check_url = urlparse.urlunparse(( - o.scheme, - netloc, - path, - None,None,None)) - - return check_url - def _prettify(field_name): field_name = re.sub('(? 
Date: Tue, 14 Feb 2012 14:24:32 +0000 Subject: [PATCH 12/43] [ui,logic] Expose source state (active/inactive) in the source form --- ckanext/harvest/controllers/view.py | 6 ++--- ckanext/harvest/lib/__init__.py | 22 +++++++++++++++---- ckanext/harvest/logic/schema.py | 11 +++++----- ckanext/harvest/logic/validators.py | 10 ++++++++- .../harvest/public/ckanext/harvest/style.css | 9 ++++++++ .../templates/source/new_source_form.html | 15 +++++++++++++ 6 files changed, 60 insertions(+), 13 deletions(-) diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index b146c88..2589159 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -42,7 +42,7 @@ class ViewController(BaseController): errors = errors or {} error_summary = error_summary or {} vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': get_registered_harvesters_info()} - + c.form = render('source/new_source_form.html', extra_vars=vars) return render('source/new.html') @@ -80,9 +80,9 @@ class ViewController(BaseController): data = data or old_data errors = errors or {} error_summary = error_summary or {} - #TODO: Use new description interface to build the types select and descriptions + vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': get_registered_harvesters_info()} - + c.form = render('source/new_source_form.html', extra_vars=vars) return render('source/edit.html') diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index 364c64d..78a1926 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -194,6 +194,9 @@ def create_harvest_source(data_dict): if o in data and data[o] is not None: source.__setattr__(o,data[o]) + if 'active' in data_dict: + source.active = data['active'] + source.save() return _source_as_dict(source) @@ -213,14 +216,25 @@ def edit_harvest_source(source_id,data_dict): Session.rollback() raise 
ValidationError(errors,_error_summary(errors)) - fields = ['url','type','active','description','user_id','publisher_id'] + fields = ['url','type','description','user_id','publisher_id'] for f in fields: - if f in data_dict and data_dict[f] is not None and data_dict[f] != '': - source.__setattr__(f,data_dict[f]) + if f in data and data[f] is not None and data[f] != '': + source.__setattr__(f,data[f]) - source.config = data_dict['config'] + if 'active' in data_dict: + source.active = data['active'] + + if 'config' in data_dict: + source.config = data['config'] source.save() + # Abort any pending jobs + if not source.active: + jobs = HarvestJob.filter(source=source,status=u'New') + if jobs: + for job in jobs: + job.status = u'Aborted' + job.save() return _source_as_dict(source) diff --git a/ckanext/harvest/logic/schema.py b/ckanext/harvest/logic/schema.py index 37c804a..0145c7a 100644 --- a/ckanext/harvest/logic/schema.py +++ b/ckanext/harvest/logic/schema.py @@ -5,10 +5,11 @@ from ckan.lib.navl.validators import (ignore_missing, not_missing ) -from ckanext.harvest.logic.validators import harvest_source_id_exists, \ - harvest_source_url_validator, \ - harvest_source_type_exists, \ - harvest_source_config_validator +from ckanext.harvest.logic.validators import (harvest_source_id_exists, + harvest_source_url_validator, + harvest_source_type_exists, + harvest_source_config_validator, + harvest_source_active_validator,) def default_harvest_source_schema(): @@ -17,7 +18,7 @@ def default_harvest_source_schema(): 'url': [not_empty, unicode, harvest_source_url_validator], 'type': [not_empty, unicode, harvest_source_type_exists], 'description': [ignore_missing], - 'active': [ignore_missing], + 'active': [ignore_missing,harvest_source_active_validator], 'user_id': [ignore_missing], 'publisher_id': [ignore_missing], 'config': [ignore_missing,harvest_source_config_validator] diff --git a/ckanext/harvest/logic/validators.py b/ckanext/harvest/logic/validators.py index 
87959a7..e851649 100644 --- a/ckanext/harvest/logic/validators.py +++ b/ckanext/harvest/logic/validators.py @@ -56,7 +56,7 @@ def harvest_source_url_validator(key,data,errors,context): for url,active in existing_sources: url = _normalize_url(url) if url == new_url: - raise Invalid('There already is an active Harvest Source for this URL: %s' % data[key]) + raise Invalid('There already is a Harvest Source for this URL: %s' % data[key]) return data[key] @@ -91,3 +91,11 @@ def harvest_source_config_validator(key,data,errors,context): else: return data[key] +def harvest_source_active_validator(value,context): + if isinstance(value,basestring): + if value.lower() == 'true': + return True + else: + return False + return bool(value) + diff --git a/ckanext/harvest/public/ckanext/harvest/style.css b/ckanext/harvest/public/ckanext/harvest/style.css index 163e43e..9f5aaca 100644 --- a/ckanext/harvest/public/ckanext/harvest/style.css +++ b/ckanext/harvest/public/ckanext/harvest/style.css @@ -58,3 +58,12 @@ body.index.ViewController #content { vertical-align: middle; margin: 0 5px; } + +.source-state-active{ + font-weight:bold; +} + +.source-state-inactive{ + font-weight:bold; + color: red; +} diff --git a/ckanext/harvest/templates/source/new_source_form.html b/ckanext/harvest/templates/source/new_source_form.html index 72c94e8..a455789 100644 --- a/ckanext/harvest/templates/source/new_source_form.html +++ b/ckanext/harvest/templates/source/new_source_form.html @@ -40,6 +40,21 @@
You can add your own notes here about what the URL above represents to remind you later.
+ +
+
+ + +
This harvest source is Active
+
+ +
This harvest source is Inactive
+
+ +
or Return to the harvest sources list From 3489a004ad356f4880833c2285dde0a918017339 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 14 Feb 2012 17:23:17 +0000 Subject: [PATCH 13/43] [ui] Minor tweak to support older themes --- ckanext/harvest/templates/index.html | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ckanext/harvest/templates/index.html b/ckanext/harvest/templates/index.html index ee05623..9c03476 100644 --- a/ckanext/harvest/templates/index.html +++ b/ckanext/harvest/templates/index.html @@ -5,6 +5,8 @@ Harvesting Sources + harvest + From e03c2545ca588cf016e59006d8726cad54ff7217 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 15 Feb 2012 11:49:59 +0000 Subject: [PATCH 14/43] [ui,logic] Expose source title in the source form --- ckanext/harvest/lib/__init__.py | 6 +++--- ckanext/harvest/logic/schema.py | 3 ++- ckanext/harvest/templates/source/new_source_form.html | 6 ++++++ ckanext/harvest/templates/source/read.html | 5 +++++ 4 files changed, 16 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index 78a1926..aafe09f 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -189,7 +189,7 @@ def create_harvest_source(data_dict): source.url = data['url'] source.type = data['type'] - opt = ['active','description','user_id','publisher_id','config'] + opt = ['active','title','description','user_id','publisher_id','config'] for o in opt: if o in data and data[o] is not None: source.__setattr__(o,data[o]) @@ -216,9 +216,9 @@ def edit_harvest_source(source_id,data_dict): Session.rollback() raise ValidationError(errors,_error_summary(errors)) - fields = ['url','type','description','user_id','publisher_id'] + fields = ['url','title','type','description','user_id','publisher_id'] for f in fields: - if f in data and data[f] is not None and data[f] != '': + if f in data and data[f] is not None: source.__setattr__(f,data[f]) if 'active' in data_dict: diff --git 
a/ckanext/harvest/logic/schema.py b/ckanext/harvest/logic/schema.py index 0145c7a..231a530 100644 --- a/ckanext/harvest/logic/schema.py +++ b/ckanext/harvest/logic/schema.py @@ -17,7 +17,8 @@ def default_harvest_source_schema(): 'id': [ignore_missing, unicode, harvest_source_id_exists], 'url': [not_empty, unicode, harvest_source_url_validator], 'type': [not_empty, unicode, harvest_source_type_exists], - 'description': [ignore_missing], + 'title': [ignore_missing,unicode], + 'description': [ignore_missing,unicode], 'active': [ignore_missing,harvest_source_active_validator], 'user_id': [ignore_missing], 'publisher_id': [ignore_missing], diff --git a/ckanext/harvest/templates/source/new_source_form.html b/ckanext/harvest/templates/source/new_source_form.html index a455789..d3c5adb 100644 --- a/ckanext/harvest/templates/source/new_source_form.html +++ b/ckanext/harvest/templates/source/new_source_form.html @@ -35,6 +35,12 @@ + +
+
+
${errors.get('title', '')}
+
This will be shown as the datasets source.
+
You can add your own notes here about what the URL above represents to remind you later.
diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index dc01327..3ca8348 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -33,6 +33,11 @@ Active ${c.source.active} + + Title + ${c.source.title} + + Description ${c.source.description} From 33aa6f9356c7423e55fff5ec153042e0d0c1fe87 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 16 Feb 2012 12:52:43 +0000 Subject: [PATCH 15/43] [doc] Added documentation regarding production setup --- README.rst | 150 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 146 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 0a6466b..b924d38 100644 --- a/README.rst +++ b/README.rst @@ -101,7 +101,7 @@ the config explicitly though:: paster harvester sources --config=../ckan/development.ini The CKAN harverster -================== +=================== The plugin includes a harvester for remote CKAN instances. To use it, you need to add the `ckan_harvester` plugin to your options file:: @@ -318,7 +318,149 @@ pending harvesting jobs:: paster harvester run --config=../ckan/development.ini -After packages have been imported, the search index will have to be updated -before the packages appear in search results (from the ckan directory): +Note: If you don't have the `synchronous_search` plugin loaded, you will need +to update the search index after the harvesting in order for the packages to +appear in search results (from the ckan directory): + + paster search-index rebuild + + +Setting up the harvesters on a production server +================================================ + +The previous approach works fine during development or debugging, but it is +not recommended for production servers. There are several possible ways of +setting up the harvesters, which will depend on your particular infrastructure +and needs. 
The bottom line is that the gather and fetch process should be kept +running somehow and then the run command should be run periodically to start +any pending jobs. + +The following approach is the one generally used on CKAN deployments, and it +will probably suit most of the users. It uses Supervisor_, a tool to monitor +processes, and a cron job to run the harvest jobs, and it assumes that you +have already installed and configured the harvesting extension (See +`Installation` if not). + +Note: It is recommended to run the harvest process from a non-root user +(generally the one you are running CKAN with). Replace the user `okfn` in the +following steps with the one you are using. + +1. Install Supervisor:: + + sudo apt-get install supervisor + + You can check if it is running with this command:: + + ps aux | grep supervisord + + You should see a line similar to this one:: + + root 9224 0.0 0.3 56420 12204 ? Ss 15:52 0:00 /usr/bin/python /usr/bin/supervisord + +2. Supervisor needs to have programs added to its configuration, which will + describe the tasks that need to be monitored. These configuration files are + stored in `/etc/supervisor/conf.d`. + + Create a file named `/etc/supervisor/conf.d/ckan_harvesting.conf`, and copy the following contents:: + + + ; =============================== + ; ckan harvester + ; =============================== + + [program:ckan_gather_consumer] + + command=/var/lib/ckan/std/pyenv/bin/paster --plugin=ckanext-harvest harvester gather_consumer --config=/etc/ckan/std/std.ini + + ; user that owns virtual environment. 
+ user=okfn + + numprocs=1 + stdout_logfile=/var/log/ckan/std/fetch_consumer.log + stderr_logfile=/var/log/ckan/std/fetch_consumer.log + autostart=true + autorestart=true + startsecs=10 + + + There are a number of things that you will need to replace with your + specific installation settings (the example above shows paths from a + ckan instance installed via Debian packages): + + * command: The absolute path to the paster command located in the + python virtual environment and the absolute path to the config + ini file. + + * user: The unix user you are running CKAN with + + * stdout_logfile and stderr_logfile: All output coming from the + harvest consumers will be written to this file. Ensure that the + necessary permissions are set up. + + The rest of the configuration options are pretty self explanatory. Refer + to the `Supervisor documentation <http://supervisord.org/configuration.html#program-x-section-settings>`_ + to know more about these and other options available. + +3. Start the supervisor tasks with the following commands:: + + sudo supervisorctl start ckan_gather_consumer + sudo supervisorctl start ckan_fetch_consumer + + To check that the processes are running, you can run:: + + sudo supervisorctl status + + ckan_fetch_consumer RUNNING pid 6983, uptime 0:22:06 + ckan_gather_consumer RUNNING pid 6968, uptime 0:22:45 + + Some problems you may encounter when starting the processes: + + * `ckan_gather_consumer: ERROR (no such process)` + Double-check your supervisor configuration file and stop and restart the supervisor daemon:: + + sudo service supervisor stop; sudo service supervisor start + + * `ckan_gather_consumer: ERROR (abnormal termination)` + Something prevented the command from running properly. Have a look at the log file that + you defined in the `stdout_logfile` section to see what happened. Common errors include: + + * `socket.error: [Errno 111] Connection refused` + RabbitMQ is not running:: + + sudo service rabbitmq-server start + +4.
Once we have the two consumers running and monitored, we just need to create a cron job + that will run the `run` harvester command periodically. To do so, edit the cron table with + the following command (it may ask you to choose an editor):: + + sudo crontab -e -u okfn + + Note that we are running this command as the same user we configured the processes to be run with + (`okfn` in our example). + + Paste this line into your crontab, again replacing the paths to paster and the ini file with yours:: + + # m h dom mon dow command + */15 * * * * /var/lib/ckan/std/pyenv/bin/paster --plugin=ckanext-harvest harvester run --config=/etc/ckan/std/std.ini + + This particular example will check for pending jobs every fifteen minutes. + You can of course modify this periodicity; this `Wikipedia page <http://en.wikipedia.org/wiki/Cron>`_ + has a good overview of the crontab syntax. + + +.. _Supervisor: http://supervisord.org - paster search-index From 1027d777e5220cdf5e672a0e631fa636f909e00e Mon Sep 17 00:00:00 2001 From: Ian Murray Date: Thu, 16 Feb 2012 16:03:15 +0000 Subject: [PATCH 16/43] [master][docs] Updated docs a little --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index b924d38..b6143e1 100644 --- a/README.rst +++ b/README.rst @@ -417,6 +417,9 @@ following steps with the one you are using. 3. Start the supervisor tasks with the following commands:: + sudo supervisorctl reread + sudo supervisorctl add ckan_gather_consumer + sudo supervisorctl add ckan_fetch_consumer sudo supervisorctl start ckan_gather_consumer sudo supervisorctl start ckan_fetch_consumer From 0aa7b7d1996683f47de060a6247128c91428a2a0 Mon Sep 17 00:00:00 2001 From: Ian Murray Date: Thu, 16 Feb 2012 20:27:51 +0000 Subject: [PATCH 17/43] [master][doc] Provided an example supervisor config file And reference to it from the README. This allows the example conf file to be used as a base in an automated deployment. 
--- README.rst | 34 +--------------------- config/supervisor/ckan_harvesting.conf | 40 ++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 33 deletions(-) create mode 100644 config/supervisor/ckan_harvesting.conf diff --git a/README.rst b/README.rst index b6143e1..77493db 100644 --- a/README.rst +++ b/README.rst @@ -363,39 +363,7 @@ following steps with the one you are using. Create a file named `/etc/supervisor/conf.d/ckan_harvesting.conf`, and copy the following contents:: - - ; =============================== - ; ckan harvester - ; =============================== - - [program:ckan_gather_consumer] - - command=/var/lib/ckan/std/pyenv/bin/paster --plugin=ckanext-harvest harvester gather_consumer --config=/etc/ckan/std/std.ini - - ; user that owns virtual environment. - user=okfn - - numprocs=1 - stdout_logfile=/var/log/ckan/std/gather_consumer.log - stderr_logfile=/var/log/ckan/std/gather_consumer.log - autostart=true - autorestart=true - startsecs=10 - - [program:ckan_fetch_consumer] - - command=/var/lib/ckan/std/pyenv/bin/paster --plugin=ckanext-harvest harvester fetch_consumer --config=/etc/ckan/std/std.ini - - ; user that owns virtual environment. - user=okfn - - numprocs=1 - stdout_logfile=/var/log/ckan/std/fetch_consumer.log - stderr_logfile=/var/log/ckan/std/fetch_consumer.log - autostart=true - autorestart=true - startsecs=10 - + .. include:: config/supervisor/ckan_harvesting.conf There are a number of things that you will need to replace with your specific installation settings (the example above shows paths from a diff --git a/config/supervisor/ckan_harvesting.conf b/config/supervisor/ckan_harvesting.conf new file mode 100644 index 0000000..4274a54 --- /dev/null +++ b/config/supervisor/ckan_harvesting.conf @@ -0,0 +1,40 @@ +; =============================== +; ckan harvester example +; =============================== + +; symlink or copy this file to /etc/supervisor/conf.d +; change the path/to/virtualenv below to the virtualenv ckan is in. 
+ +[program:ckan_gather_consumer] + +; Full Path to executable, should be path to virtual environment, +; Full path to config file too. + +command=/path/to/pyenv/bin/paster --plugin=ckanext-harvest harvester gather_consumer --config=/path/to/config/std.ini + +; user that owns virtual environment. +user=ckan + +numprocs=1 +stdout_logfile=/var/log/ckan/std/gather_consumer.log +stderr_logfile=/var/log/ckan/std/gather_consumer.log +autostart=true +autorestart=true +startsecs=10 + +[program:ckan_fetch_consumer] + +; Full Path to executable, should be path to virtual environment, +; Full path to config file too. + +command=/path/to/pyenv/bin/paster --plugin=ckanext-harvest harvester fetch_consumer --config=/path/to/config/std.ini + +; user that owns virtual environment. +user=ckan + +numprocs=1 +stdout_logfile=/var/log/ckan/std/fetch_consumer.log +stderr_logfile=/var/log/ckan/std/fetch_consumer.log +autostart=true +autorestart=true +startsecs=10 From f482eb607c51b9f15ce03a43ff5414fb32b090c0 Mon Sep 17 00:00:00 2001 From: Ian Murray Date: Thu, 16 Feb 2012 21:08:00 +0000 Subject: [PATCH 18/43] [master][doc] Revert inclusion of external file as github doesn't support it. This partially reverts commit 0aa7b7d1996683f47de060a6247128c91428a2a0. --- README.rst | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 77493db..b6143e1 100644 --- a/README.rst +++ b/README.rst @@ -363,7 +363,39 @@ following steps with the one you are using. Create a file named `/etc/supervisor/conf.d/ckan_harvesting.conf`, and copy the following contents:: - .. include:: config/supervisor/ckan_harvesting.conf + + ; =============================== + ; ckan harvester + ; =============================== + + [program:ckan_gather_consumer] + + command=/var/lib/ckan/std/pyenv/bin/paster --plugin=ckanext-harvest harvester gather_consumer --config=/etc/ckan/std/std.ini + + ; user that owns virtual environment. 
+ user=okfn + + numprocs=1 + stdout_logfile=/var/log/ckan/std/gather_consumer.log + stderr_logfile=/var/log/ckan/std/gather_consumer.log + autostart=true + autorestart=true + startsecs=10 + + [program:ckan_fetch_consumer] + + command=/var/lib/ckan/std/pyenv/bin/paster --plugin=ckanext-harvest harvester fetch_consumer --config=/etc/ckan/std/std.ini + + ; user that owns virtual environment. + user=okfn + + numprocs=1 + stdout_logfile=/var/log/ckan/std/fetch_consumer.log + stderr_logfile=/var/log/ckan/std/fetch_consumer.log + autostart=true + autorestart=true + startsecs=10 + There are a number of things that you will need to replace with your specific installation settings (the example above shows paths from a From 651474e9f1f58e36c68f7b43073f242545eb72d1 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 29 Feb 2012 10:59:02 +0000 Subject: [PATCH 19/43] [logic] Refactor logic layer to follow CKAN core conventions To make maintenance easier and better support the upcoming auth checks, the logic layer has been refactored to mimic the structure of the one on CKAN core: separate actions and dictize functions and logic functions receive a context. Only get functions are included in this commit. 
--- ckanext/harvest/commands/harvester.py | 23 ++- ckanext/harvest/controllers/view.py | 30 ++-- ckanext/harvest/lib/__init__.py | 198 ++--------------------- ckanext/harvest/logic/action/__init__.py | 7 + ckanext/harvest/logic/action/get.py | 89 ++++++++++ ckanext/harvest/logic/dictization.py | 156 ++++++++++++++++++ ckanext/harvest/plugin.py | 18 ++- 7 files changed, 314 insertions(+), 207 deletions(-) create mode 100644 ckanext/harvest/logic/action/__init__.py create mode 100644 ckanext/harvest/logic/action/get.py create mode 100644 ckanext/harvest/logic/dictization.py diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 46fd36e..e98bf35 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -2,6 +2,9 @@ import sys import re from pprint import pprint +from ckan import model +from ckan.logic import get_action + from ckan.lib.cli import CkanCommand from ckanext.harvest.lib import * from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer @@ -150,7 +153,8 @@ class Harvester(CkanCommand): print 'Created new harvest source:' self.print_harvest_source(source) - sources = get_harvest_sources() + context = {'model': model} + sources = get_action('harvest_source_list')(context,{}) self.print_there_are('harvest source', sources) # Create a Harvest Job for the new Source @@ -175,12 +179,14 @@ class Harvester(CkanCommand): def list_harvest_sources(self): if len(self.args) >= 2 and self.args[1] == 'all': - sources = get_harvest_sources() + data_dict = {} what = 'harvest source' else: - sources = get_harvest_sources(active=True) + data_dict = {'only_active':True} what = 'active harvest source' + context = {'model': model} + sources = get_action('harvest_source_list')(context,data_dict) self.print_harvest_sources(sources) self.print_there_are(what=what, sequence=sources) @@ -194,12 +200,14 @@ class Harvester(CkanCommand): job = create_harvest_job(source_id) 
self.print_harvest_job(job) - status = u'New' - jobs = get_harvest_jobs(status=status) + context = {'model': model} + jobs = get_action('harvest_job_list')(context,{'status':u'New'}) self.print_there_are('harvest jobs', jobs, condition=status) def list_harvest_jobs(self): - jobs = get_harvest_jobs() + context = {'model': model} + jobs = get_action('harvest_job_list')(context,{}) + self.print_harvest_jobs(jobs) self.print_there_are(what='harvest job', sequence=jobs) @@ -248,8 +256,7 @@ class Harvester(CkanCommand): def print_harvest_job(self, job): print ' Job id: %s' % job['id'] print ' status: %s' % job['status'] - print ' source: %s' % job['source']['id'] - print ' url: %s' % job['source']['url'] + print ' source: %s' % job['source'] print ' objects: %s' % len(job['objects']) print 'gather_errors: %s' % len(job['gather_errors']) diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index 2589159..6150df4 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -2,15 +2,16 @@ from lxml import etree from lxml.etree import XMLSyntaxError from pylons.i18n import _ +from ckan import model + import ckan.lib.helpers as h, json from ckan.lib.base import BaseController, c, g, request, \ response, session, render, config, abort, redirect from ckan.lib.navl.dictization_functions import DataError -from ckan.logic import NotFound, ValidationError +from ckan.logic import NotFound, ValidationError, get_action from ckanext.harvest.logic.schema import harvest_source_form_schema from ckanext.harvest.lib import create_harvest_source, edit_harvest_source, \ - get_harvest_source, get_harvest_sources, \ create_harvest_job, get_registered_harvesters_info, \ get_harvest_object from ckan.lib.helpers import Page @@ -29,7 +30,9 @@ class ViewController(BaseController): def index(self): # Request all harvest sources - c.sources = get_harvest_sources() + context = {'model':model} + + c.sources = 
get_action('harvest_source_list')(context,{}) return render('index.html') @@ -71,9 +74,12 @@ class ViewController(BaseController): if ('save' in request.params) and not data: return self._save_edit(id) + if not data: try: - old_data = get_harvest_source(id) + context = {'model':model} + + old_data = get_action('harvest_source_show')(context, {'id':id}) except NotFound: abort(404, _('Harvest Source not found')) @@ -117,7 +123,9 @@ class ViewController(BaseController): def read(self,id): try: - c.source = get_harvest_source(id) + context = {'model':model} + c.source = get_action('harvest_source_show')(context, {'id':id}) + c.page = Page( collection=c.source['status']['packages'], page=request.params.get('page', 1), @@ -153,20 +161,22 @@ class ViewController(BaseController): def show_object(self,id): try: - object = get_harvest_object(id) + context = {'model':model} + obj = get_action('harvest_object_show')(context, {'id':id}) + # Check content type. It will probably be either XML or JSON try: - etree.fromstring(object['content']) + etree.fromstring(obj['content']) response.content_type = 'application/xml' except XMLSyntaxError: try: - json.loads(object['content']) + json.loads(obj['content']) response.content_type = 'application/json' except ValueError: pass - response.headers["Content-Length"] = len(object['content']) - return object['content'] + response.headers['Content-Length'] = len(obj['content']) + return obj['content'] except NotFound: abort(404,_('Harvest object not found')) except Exception, e: diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index aafe09f..1c22ec9 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -1,14 +1,14 @@ import urlparse import re -from sqlalchemy import distinct,func +from ckan import model from ckan.model import Session, repo from ckan.model import Package from ckan.lib.navl.dictization_functions import validate from ckan.logic import NotFound, ValidationError from 
ckanext.harvest.logic.schema import harvest_source_form_schema - +from ckanext.harvest.logic.dictization import (harvest_source_dictize, harvest_job_dictize, harvest_object_dictize) from ckan.plugins import PluginImplementations from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject, \ HarvestGatherError, HarvestObjectError @@ -18,163 +18,8 @@ from ckanext.harvest.interfaces import IHarvester import logging log = logging.getLogger('ckanext') - -def _get_source_status(source, detailed=True): - out = dict() - job_count = HarvestJob.filter(source=source).count() - if not job_count: - out['msg'] = 'No jobs yet' - return out - out = {'next_harvest':'', - 'last_harvest_request':'', - 'last_harvest_statistics':{'added':0,'updated':0,'errors':0}, - 'last_harvest_errors':{'gather':[],'object':[]}, - 'overall_statistics':{'added':0, 'errors':0}, - 'packages':[]} - # Get next scheduled job - next_job = HarvestJob.filter(source=source,status=u'New').first() - if next_job: - out['next_harvest'] = 'Scheduled' - else: - out['next_harvest'] = 'Not yet scheduled' - - # Get the last finished job - last_job = HarvestJob.filter(source=source,status=u'Finished') \ - .order_by(HarvestJob.created.desc()).first() - - if last_job: - #TODO: Should we encode the dates as strings? 
- out['last_harvest_request'] = str(last_job.gather_finished) - - #Get HarvestObjects from last job whit links to packages - if detailed: - last_objects = [obj for obj in last_job.objects if obj.package is not None] - - if len(last_objects) == 0: - # No packages added or updated - out['last_harvest_statistics']['added'] = 0 - out['last_harvest_statistics']['updated'] = 0 - else: - # Check wether packages were added or updated - for last_object in last_objects: - # Check if the same package had been linked before - previous_objects = Session.query(HarvestObject) \ - .filter(HarvestObject.package==last_object.package) \ - .count() - - if previous_objects == 1: - # It didn't previously exist, it has been added - out['last_harvest_statistics']['added'] += 1 - else: - # Pacakge already existed, but it has been updated - out['last_harvest_statistics']['updated'] += 1 - - # Last harvest errors - # We have the gathering errors in last_job.gather_errors, so let's also - # get also the object errors. 
- object_errors = Session.query(HarvestObjectError).join(HarvestObject) \ - .filter(HarvestObject.job==last_job) - - out['last_harvest_statistics']['errors'] = len(last_job.gather_errors) \ - + object_errors.count() - if detailed: - for gather_error in last_job.gather_errors: - out['last_harvest_errors']['gather'].append(gather_error.message) - - for object_error in object_errors: - err = {'object_id':object_error.object.id,'object_guid':object_error.object.guid,'message': object_error.message} - out['last_harvest_errors']['object'].append(err) - - # Overall statistics - packages = Session.query(distinct(HarvestObject.package_id),Package.name) \ - .join(Package).join(HarvestSource) \ - .filter(HarvestObject.source==source) \ - .filter(HarvestObject.current==True) \ - .filter(Package.state==u'active') - - out['overall_statistics']['added'] = packages.count() - if detailed: - for package in packages: - out['packages'].append(package.name) - - gather_errors = Session.query(HarvestGatherError) \ - .join(HarvestJob).join(HarvestSource) \ - .filter(HarvestJob.source==source).count() - - object_errors = Session.query(HarvestObjectError) \ - .join(HarvestObject).join(HarvestJob).join(HarvestSource) \ - .filter(HarvestJob.source==source).count() - out['overall_statistics']['errors'] = gather_errors + object_errors - else: - out['last_harvest_request'] = 'Not yet harvested' - - return out - - -def _source_as_dict(source, detailed=True): - out = source.as_dict() - out['jobs'] = [] - - for job in source.jobs: - out['jobs'].append(job.as_dict()) - - out['status'] = _get_source_status(source, detailed=detailed) - - - return out - -def _job_as_dict(job): - out = job.as_dict() - out['source'] = job.source.as_dict() - out['objects'] = [] - out['gather_errors'] = [] - - for obj in job.objects: - out['objects'].append(obj.as_dict()) - - for error in job.gather_errors: - out['gather_errors'].append(error.as_dict()) - - return out - -def _object_as_dict(obj): - out = obj.as_dict() - 
out['source'] = obj.source.as_dict() - out['job'] = obj.job.as_dict() - - if obj.package: - out['package'] = obj.package.as_dict() - - out['errors'] = [] - - for error in obj.errors: - out['errors'].append(error.as_dict()) - - return out - -def _prettify(field_name): - field_name = re.sub('(? Date: Wed, 29 Feb 2012 15:20:35 +0000 Subject: [PATCH 20/43] [logic] Refactor the rest of the logic functions (create,update,delete) --- ckanext/harvest/commands/harvester.py | 35 +++-- ckanext/harvest/controllers/view.py | 26 ++-- ckanext/harvest/lib/__init__.py | 201 ------------------------- ckanext/harvest/logic/action/create.py | 99 ++++++++++++ ckanext/harvest/logic/action/delete.py | 24 +++ ckanext/harvest/logic/action/get.py | 32 +++- ckanext/harvest/logic/action/update.py | 125 +++++++++++++++ ckanext/harvest/logic/dictization.py | 10 -- ckanext/harvest/plugin.py | 19 ++- 9 files changed, 326 insertions(+), 245 deletions(-) delete mode 100644 ckanext/harvest/lib/__init__.py create mode 100644 ckanext/harvest/logic/action/create.py create mode 100644 ckanext/harvest/logic/action/delete.py create mode 100644 ckanext/harvest/logic/action/update.py diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index e98bf35..338bb99 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -6,7 +6,6 @@ from ckan import model from ckan.logic import get_action from ckan.lib.cli import CkanCommand -from ckanext.harvest.lib import * from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer class Harvester(CkanCommand): @@ -100,6 +99,9 @@ class Harvester(CkanCommand): self.import_stage() elif cmd == 'job-all': self.create_harvest_job_all() + elif cmd == 'harvesters-info': + harvesters_info = get_action('harvesters_info_show')() + pprint(harvesters_info) else: print 'Command %s not recognized' % cmd @@ -142,14 +144,16 @@ class Harvester(CkanCommand): else: publisher_id = u'' try: - source = 
create_harvest_source({ + data_dict = { 'url':url, 'type':type, 'config':config, 'active':active, 'user_id':user_id, - 'publisher_id':publisher_id}) + 'publisher_id':publisher_id} + context = {'model':model} + source = get_action('harvest_source_create')(context,data_dict) print 'Created new harvest source:' self.print_harvest_source(source) @@ -157,8 +161,8 @@ class Harvester(CkanCommand): sources = get_action('harvest_source_list')(context,{}) self.print_there_are('harvest source', sources) - # Create a Harvest Job for the new Source - create_harvest_job(source['id']) + # Create a harvest job for the new source + get_action('harvest_job_create')(context,{'source_id':source['id']}) print 'A new Harvest Job for this source has also been created' except ValidationError,e: @@ -173,8 +177,8 @@ class Harvester(CkanCommand): else: print 'Please provide a source id' sys.exit(1) - - remove_harvest_source(source_id) + context = {'model': model} + get_action('harvest_source_delete')(context,{'id':source_id}) print 'Removed harvest source: %s' % source_id def list_harvest_sources(self): @@ -212,11 +216,9 @@ class Harvester(CkanCommand): self.print_there_are(what='harvest job', sequence=jobs) def run_harvester(self): - try: - jobs = run_harvest_jobs() - except: - pass - sys.exit(0) + context = {'model': model} + jobs = get_action('harvest_jobs_run')(context,{}) + #print 'Sent %s jobs to the gather queue' % len(jobs) def import_stage(self): @@ -224,12 +226,15 @@ class Harvester(CkanCommand): source_id = unicode(self.args[1]) else: source_id = None - objs = import_last_objects(source_id) + context = {'model': model} + objs = get_action('harvest_objects_import')(context,{'source_id':source_id}) + print '%s objects reimported' % len(objs) def create_harvest_job_all(self): - jobs = create_harvest_job_all() - print "Created %s new harvest jobs" % len(jobs) + context = {'model': model} + jobs = get_action('harvest_job_create_all')(context,{}) + print 'Created %s new harvest jobs' % 
len(jobs) def print_harvest_sources(self, sources): if sources: diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index 6150df4..4c560ae 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -11,9 +11,6 @@ from ckan.lib.base import BaseController, c, g, request, \ from ckan.lib.navl.dictization_functions import DataError from ckan.logic import NotFound, ValidationError, get_action from ckanext.harvest.logic.schema import harvest_source_form_schema -from ckanext.harvest.lib import create_harvest_source, edit_harvest_source, \ - create_harvest_job, get_registered_harvesters_info, \ - get_harvest_object from ckan.lib.helpers import Page import logging log = logging.getLogger(__name__) @@ -44,7 +41,8 @@ class ViewController(BaseController): data = data or {} errors = errors or {} error_summary = error_summary or {} - vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': get_registered_harvesters_info()} + harvesters_info = get_action('harvesters_info_show')() + vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': harvesters_info} c.form = render('source/new_source_form.html', extra_vars=vars) return render('source/new.html') @@ -54,10 +52,11 @@ class ViewController(BaseController): data_dict = dict(request.params) self._check_data_dict(data_dict) - source = create_harvest_source(data_dict) + context = {'model':model} + source = get_action('harvest_source_create')(context,data_dict) # Create a harvest job for the new source - create_harvest_job(source['id']) + get_action('harvest_job_create')(context,{'source_id':source['id']}) h.flash_success(_('New harvest source added successfully.' 
'A new harvest job for the source has also been created.')) @@ -87,7 +86,8 @@ class ViewController(BaseController): errors = errors or {} error_summary = error_summary or {} - vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': get_registered_harvesters_info()} + harvesters_info = get_action('harvesters_info_show')() + vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': harvesters_info} c.form = render('source/new_source_form.html', extra_vars=vars) return render('source/edit.html') @@ -95,9 +95,11 @@ class ViewController(BaseController): def _save_edit(self,id): try: data_dict = dict(request.params) + data_dict['id'] = id self._check_data_dict(data_dict) + context = {'model':model} - source = edit_harvest_source(id,data_dict) + source = get_action('harvest_source_update')(context,data_dict) h.flash_success(_('Harvest source edited successfully.')) redirect(h.url_for('harvest')) @@ -139,9 +141,10 @@ class ViewController(BaseController): def delete(self,id): try: - delete_harvest_source(id) + context = {'model':model} + get_action('harvest_source_delete')(context, {'id':id}) - h.flash_success(_('Harvesting source deleted successfully')) + h.flash_success(_('Harvesting source successfully inactivated')) redirect(h.url_for('harvest')) except NotFound: abort(404,_('Harvest source not found')) @@ -149,7 +152,8 @@ class ViewController(BaseController): def create_harvesting_job(self,id): try: - create_harvest_job(id) + context = {'model':model} + get_action('harvest_job_create')(context,{'source_id':id}) h.flash_success(_('Refresh requested, harvesting will take place within 15 minutes.')) except NotFound: abort(404,_('Harvest source not found')) diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py deleted file mode 100644 index 1c22ec9..0000000 --- a/ckanext/harvest/lib/__init__.py +++ /dev/null @@ -1,201 +0,0 @@ -import urlparse -import re - -from ckan import model -from 
ckan.model import Session, repo -from ckan.model import Package -from ckan.lib.navl.dictization_functions import validate -from ckan.logic import NotFound, ValidationError - -from ckanext.harvest.logic.schema import harvest_source_form_schema -from ckanext.harvest.logic.dictization import (harvest_source_dictize, harvest_job_dictize, harvest_object_dictize) -from ckan.plugins import PluginImplementations -from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject, \ - HarvestGatherError, HarvestObjectError -from ckanext.harvest.queue import get_gather_publisher -from ckanext.harvest.interfaces import IHarvester - -import logging -log = logging.getLogger('ckanext') - -#TODO: remove! -context = {'model':model} - -def create_harvest_source(data_dict): - - schema = harvest_source_form_schema() - data, errors = validate(data_dict, schema) - - if errors: - Session.rollback() - raise ValidationError(errors,_error_summary(errors)) - - source = HarvestSource() - source.url = data['url'] - source.type = data['type'] - - opt = ['active','title','description','user_id','publisher_id','config'] - for o in opt: - if o in data and data[o] is not None: - source.__setattr__(o,data[o]) - - if 'active' in data_dict: - source.active = data['active'] - - source.save() - - return harvest_source_dictize(source,context) - -def edit_harvest_source(source_id,data_dict): - schema = harvest_source_form_schema() - - source = HarvestSource.get(source_id) - if not source: - raise NotFound('Harvest source %s does not exist' % source_id) - - # Add source id to the dict, as some validators will need it - data_dict['id'] = source.id - - data, errors = validate(data_dict, schema) - if errors: - Session.rollback() - raise ValidationError(errors,_error_summary(errors)) - - fields = ['url','title','type','description','user_id','publisher_id'] - for f in fields: - if f in data and data[f] is not None: - source.__setattr__(f,data[f]) - - if 'active' in data_dict: - source.active = 
data['active'] - - if 'config' in data_dict: - source.config = data['config'] - - source.save() - # Abort any pending jobs - if not source.active: - jobs = HarvestJob.filter(source=source,status=u'New') - if jobs: - for job in jobs: - job.status = u'Aborted' - job.save() - - return harvest_source_dictize(source,context) - - -def remove_harvest_source(source_id): - - source = HarvestSource.get(source_id) - if not source: - raise NotFound('Harvest source %s does not exist' % source_id) - - # Don't actually delete the record, just flag it as inactive - source.active = False - source.save() - - # Abort any pending jobs - jobs = HarvestJob.filter(source=source,status=u'New') - if jobs: - for job in jobs: - job.status = u'Aborted' - job.save() - - return True - -def create_harvest_job(source_id): - # Check if source exists - source = HarvestSource.get(source_id) - if not source: - raise NotFound('Harvest source %s does not exist' % source_id) - - # Check if the source is active - if not source.active: - raise Exception('Can not create jobs on inactive sources') - - # Check if there already is an unrun job for this source - exists = get_action('harvest_job_list')(context,{'status':u'New'}) - if len(exists): - raise Exception('There already is an unrun job for this source') - - job = HarvestJob() - job.source = source - - job.save() - return harvest_job_dictize(job,context) - -def run_harvest_jobs(): - # Check if there are pending harvest jobs - jobs = get_action('harvest_job_list')(context,{'status':u'New'}) - if len(jobs) == 0: - raise Exception('There are no new harvesting jobs') - - # Send each job to the gather queue - publisher = get_gather_publisher() - sent_jobs = [] - for job in jobs: - if job['source']['active']: - publisher.send({'harvest_job_id': job['id']}) - log.info('Sent job %s to the gather queue' % job['id']) - sent_jobs.append(job) - - publisher.close() - return sent_jobs - -def import_last_objects(source_id=None): - if source_id: - source = 
HarvestSource.get(source_id) - if not source: - raise NotFound('Harvest source %s does not exist' % source_id) - - if not source.active: - raise Exception('This harvest source is not active') - - last_objects_ids = Session.query(HarvestObject.id) \ - .join(HarvestSource).join(Package) \ - .filter(HarvestObject.source==source) \ - .filter(HarvestObject.current==True) \ - .filter(Package.state==u'active') \ - .all() - else: - last_objects_ids = Session.query(HarvestObject.id) \ - .join(Package) \ - .filter(HarvestObject.current==True) \ - .filter(Package.state==u'active') \ - .all() - - last_objects = [] - for obj_id in last_objects_ids: - obj = Session.query(HarvestObject).get(obj_id) - for harvester in PluginImplementations(IHarvester): - if harvester.info()['name'] == obj.source.type: - if hasattr(harvester,'force_import'): - harvester.force_import = True - harvester.import_stage(obj) - break - last_objects.append(obj) - return last_objects - -def create_harvest_job_all(): - - # Get all active sources - sources = harvest_sources_list(active=True) - jobs = [] - # Create a new job for each - for source in sources: - job = create_harvest_job(source['id']) - jobs.append(job) - - return jobs - -def get_registered_harvesters_info(): - available_harvesters = [] - for harvester in PluginImplementations(IHarvester): - info = harvester.info() - if not info or 'name' not in info: - log.error('Harvester %r does not provide the harvester name in the info response' % str(harvester)) - continue - info['show_config'] = (info.get('form_config_interface','') == 'Text') - available_harvesters.append(info) - - return available_harvesters diff --git a/ckanext/harvest/logic/action/create.py b/ckanext/harvest/logic/action/create.py new file mode 100644 index 0000000..a1a92d9 --- /dev/null +++ b/ckanext/harvest/logic/action/create.py @@ -0,0 +1,99 @@ +import re + +from ckan.logic import NotFound, ValidationError +from ckan.lib.navl.dictization_functions import validate + +from 
ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) +from ckanext.harvest.logic.schema import harvest_source_form_schema +from ckanext.harvest.logic.dictization import (harvest_source_dictize, + harvest_job_dictize) +from ckanext.harvest.logic.action.get import harvest_source_list,harvest_job_list + +def harvest_source_create(context,data_dict): + + model = context['model'] + + schema = harvest_source_form_schema() + data, errors = validate(data_dict, schema) + + if errors: + model.Session.rollback() + raise ValidationError(errors,_error_summary(errors)) + + source = HarvestSource() + source.url = data['url'] + source.type = data['type'] + + opt = ['active','title','description','user_id','publisher_id','config'] + for o in opt: + if o in data and data[o] is not None: + source.__setattr__(o,data[o]) + + if 'active' in data_dict: + source.active = data['active'] + + source.save() + + return harvest_source_dictize(source,context) + +def harvest_job_create(context,data_dict): + + source_id = data_dict['source_id'] + + # Check if source exists + source = HarvestSource.get(source_id) + if not source: + raise NotFound('Harvest source %s does not exist' % source_id) + + # Check if the source is active + if not source.active: + raise Exception('Can not create jobs on inactive sources') + + # Check if there already is an unrun job for this source + data_dict ={ + 'source_id':source_id, + 'status':u'New' + } + exists = harvest_job_list(context,data_dict) + if len(exists): + raise Exception('There already is an unrun job for this source') + + job = HarvestJob() + job.source = source + + job.save() + return harvest_job_dictize(job,context) + +def harvest_job_create_all(context,data_dict): + + data_dict.update({'only_active':True}) + + # Get all active sources + sources = harvest_source_list(context,data_dict) + jobs = [] + # Create a new job for each, if there isn't already one + for source in sources: + data_dict ={ + 'source_id':source['id'], + 
'status':u'New' + } + + exists = harvest_job_list(context,data_dict) + if len(exists): + continue + + job = harvest_job_create(context,{'source_id':source['id']}) + jobs.append(job) + + return jobs + +def _error_summary(error_dict): + error_summary = {} + for key, error in error_dict.iteritems(): + error_summary[_prettify(key)] = error[0] + return error_summary + +def _prettify(field_name): + field_name = re.sub('(? Date: Thu, 1 Mar 2012 12:02:16 +0000 Subject: [PATCH 21/43] [logic,auth] Add auth logic layer The first version of the auth layer is based on the current policy, i.e. you need to be sysadmin to perform any action. TODO: the CLI is still not working. --- ckanext/harvest/commands/harvester.py | 9 ++-- ckanext/harvest/controllers/view.py | 65 ++++++++++++++++--------- ckanext/harvest/logic/action/create.py | 11 ++++- ckanext/harvest/logic/action/delete.py | 4 +- ckanext/harvest/logic/action/get.py | 33 ++++++++++--- ckanext/harvest/logic/action/update.py | 19 ++++++-- ckanext/harvest/logic/auth/__init__.py | 7 +++ ckanext/harvest/logic/auth/create.py | 30 ++++++++++++ ckanext/harvest/logic/auth/delete.py | 13 +++++ ckanext/harvest/logic/auth/get.py | 67 ++++++++++++++++++++++++++ ckanext/harvest/logic/auth/update.py | 30 ++++++++++++ ckanext/harvest/plugin.py | 50 ++++++++++++++++--- 12 files changed, 289 insertions(+), 49 deletions(-) create mode 100644 ckanext/harvest/logic/auth/__init__.py create mode 100644 ckanext/harvest/logic/auth/create.py create mode 100644 ckanext/harvest/logic/auth/delete.py create mode 100644 ckanext/harvest/logic/auth/get.py create mode 100644 ckanext/harvest/logic/auth/update.py diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 338bb99..9f69978 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -152,12 +152,11 @@ class Harvester(CkanCommand): 'user_id':user_id, 'publisher_id':publisher_id} - context = {'model':model} + context = 
{'model':model, 'session':model.Session} source = get_action('harvest_source_create')(context,data_dict) print 'Created new harvest source:' self.print_harvest_source(source) - context = {'model': model} sources = get_action('harvest_source_list')(context,{}) self.print_there_are('harvest source', sources) @@ -189,7 +188,7 @@ class Harvester(CkanCommand): data_dict = {'only_active':True} what = 'active harvest source' - context = {'model': model} + context = {'model': model,'session':model.Session} sources = get_action('harvest_source_list')(context,data_dict) self.print_harvest_sources(sources) self.print_there_are(what=what, sequence=sources) @@ -204,7 +203,7 @@ class Harvester(CkanCommand): job = create_harvest_job(source_id) self.print_harvest_job(job) - context = {'model': model} + context = {'model': model,'session':model.Session} jobs = get_action('harvest_job_list')(context,{'status':u'New'}) self.print_there_are('harvest jobs', jobs, condition=status) @@ -226,7 +225,7 @@ class Harvester(CkanCommand): source_id = unicode(self.args[1]) else: source_id = None - context = {'model': model} + context = {'model': model, 'session':model.Session} objs = get_action('harvest_objects_import')(context,{'source_id':source_id}) print '%s objects reimported' % len(objs) diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index 4c560ae..899358a 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -9,7 +9,7 @@ from ckan.lib.base import BaseController, c, g, request, \ response, session, render, config, abort, redirect from ckan.lib.navl.dictization_functions import DataError -from ckan.logic import NotFound, ValidationError, get_action +from ckan.logic import NotFound, ValidationError, get_action, NotAuthorized from ckanext.harvest.logic.schema import harvest_source_form_schema from ckan.lib.helpers import Page import logging @@ -17,19 +17,15 @@ log = logging.getLogger(__name__) class 
ViewController(BaseController): - def __before__(self, action, **env): - super(ViewController, self).__before__(action, **env) - # All calls to this controller must be with a sysadmin key - if not self.authorizer.is_sysadmin(c.user): - response_msg = _('Not authorized to see this page') - status = 401 - abort(status, response_msg) + not_auth_message = _('Not authorized to see this page') def index(self): - # Request all harvest sources - context = {'model':model} - - c.sources = get_action('harvest_source_list')(context,{}) + context = {'model':model, 'user':c.user,'session':model.Session} + try: + # Request all harvest sources + c.sources = get_action('harvest_source_list')(context,{}) + except NotAuthorized,e: + abort(401,self.not_auth_message) return render('index.html') @@ -41,7 +37,13 @@ class ViewController(BaseController): data = data or {} errors = errors or {} error_summary = error_summary or {} - harvesters_info = get_action('harvesters_info_show')() + + try: + context = {'model':model, 'user':c.user} + harvesters_info = get_action('harvesters_info_show')(context,{}) + except NotAuthorized,e: + abort(401,self.not_auth_message) + vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': harvesters_info} c.form = render('source/new_source_form.html', extra_vars=vars) @@ -51,8 +53,7 @@ class ViewController(BaseController): try: data_dict = dict(request.params) self._check_data_dict(data_dict) - - context = {'model':model} + context = {'model':model, 'user':c.user, 'session':model.Session} source = get_action('harvest_source_create')(context,data_dict) # Create a harvest job for the new source @@ -61,6 +62,8 @@ class ViewController(BaseController): h.flash_success(_('New harvest source added successfully.' 
'A new harvest job for the source has also been created.')) redirect(h.url_for('harvest')) + except NotAuthorized,e: + abort(401,self.not_auth_message) except DataError,e: abort(400, 'Integrity Error') except ValidationError,e: @@ -76,17 +79,23 @@ class ViewController(BaseController): if not data: try: - context = {'model':model} + context = {'model':model, 'user':c.user} old_data = get_action('harvest_source_show')(context, {'id':id}) except NotFound: abort(404, _('Harvest Source not found')) + except NotAuthorized,e: + abort(401,self.not_auth_message) data = data or old_data errors = errors or {} error_summary = error_summary or {} + try: + context = {'model':model, 'user':c.user} + harvesters_info = get_action('harvesters_info_show')(context,{}) + except NotAuthorized,e: + abort(401,self.not_auth_message) - harvesters_info = get_action('harvesters_info_show')() vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': harvesters_info} c.form = render('source/new_source_form.html', extra_vars=vars) @@ -97,12 +106,14 @@ class ViewController(BaseController): data_dict = dict(request.params) data_dict['id'] = id self._check_data_dict(data_dict) - context = {'model':model} + context = {'model':model, 'user':c.user, 'session':model.Session} source = get_action('harvest_source_update')(context,data_dict) h.flash_success(_('Harvest source edited successfully.')) redirect(h.url_for('harvest')) + except NotAuthorized,e: + abort(401,self.not_auth_message) except DataError,e: abort(400, _('Integrity Error')) except NotFound, e: @@ -125,7 +136,7 @@ class ViewController(BaseController): def read(self,id): try: - context = {'model':model} + context = {'model':model, 'user':c.user} c.source = get_action('harvest_source_show')(context, {'id':id}) c.page = Page( @@ -137,26 +148,33 @@ class ViewController(BaseController): return render('source/read.html') except NotFound: abort(404,_('Harvest source not found')) + except NotAuthorized,e: + 
abort(401,self.not_auth_message) + def delete(self,id): try: - context = {'model':model} + context = {'model':model, 'user':c.user} get_action('harvest_source_delete')(context, {'id':id}) h.flash_success(_('Harvesting source successfully inactivated')) redirect(h.url_for('harvest')) except NotFound: abort(404,_('Harvest source not found')) + except NotAuthorized,e: + abort(401,self.not_auth_message) def create_harvesting_job(self,id): try: - context = {'model':model} + context = {'model':model, 'user':c.user, 'session':model.Session} get_action('harvest_job_create')(context,{'source_id':id}) h.flash_success(_('Refresh requested, harvesting will take place within 15 minutes.')) except NotFound: abort(404,_('Harvest source not found')) + except NotAuthorized,e: + abort(401,self.not_auth_message) except Exception, e: msg = 'An error occurred: [%s]' % e.message h.flash_error(msg) @@ -164,8 +182,9 @@ class ViewController(BaseController): redirect(h.url_for('harvest')) def show_object(self,id): + try: - context = {'model':model} + context = {'model':model, 'user':c.user} obj = get_action('harvest_object_show')(context, {'id':id}) # Check content type. 
It will probably be either XML or JSON @@ -183,6 +202,8 @@ class ViewController(BaseController): return obj['content'] except NotFound: abort(404,_('Harvest object not found')) + except NotAuthorized,e: + abort(401,self.not_auth_message) except Exception, e: msg = 'An error occurred: [%s]' % e.message h.flash_error(msg) diff --git a/ckanext/harvest/logic/action/create.py b/ckanext/harvest/logic/action/create.py index a1a92d9..6b0907b 100644 --- a/ckanext/harvest/logic/action/create.py +++ b/ckanext/harvest/logic/action/create.py @@ -1,6 +1,6 @@ import re -from ckan.logic import NotFound, ValidationError +from ckan.logic import NotFound, ValidationError, check_access from ckan.lib.navl.dictization_functions import validate from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) @@ -11,13 +11,16 @@ from ckanext.harvest.logic.action.get import harvest_source_list,harvest_job_lis def harvest_source_create(context,data_dict): + check_access('harvest_source_create',context,data_dict) + model = context['model'] + session = context['session'] schema = harvest_source_form_schema() data, errors = validate(data_dict, schema) if errors: - model.Session.rollback() + session.rollback() raise ValidationError(errors,_error_summary(errors)) source = HarvestSource() @@ -38,6 +41,8 @@ def harvest_source_create(context,data_dict): def harvest_job_create(context,data_dict): + check_access('harvest_job_create',context,data_dict) + source_id = data_dict['source_id'] # Check if source exists @@ -66,6 +71,8 @@ def harvest_job_create(context,data_dict): def harvest_job_create_all(context,data_dict): + check_access('harvest_job_create_all',context,data_dict) + data_dict.update({'only_active':True}) # Get all active sources diff --git a/ckanext/harvest/logic/action/delete.py b/ckanext/harvest/logic/action/delete.py index ecf5cbd..a69d3aa 100644 --- a/ckanext/harvest/logic/action/delete.py +++ b/ckanext/harvest/logic/action/delete.py @@ -1,10 +1,12 @@ -from ckan.logic 
import NotFound +from ckan.logic import NotFound, check_access from ckanext.harvest.model import (HarvestSource, HarvestJob) def harvest_source_delete(context,data_dict): + check_access('harvest_source_delete',context,data_dict) + source_id = data_dict.get('id') source = HarvestSource.get(source_id) if not source: diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py index cc280ac..9654bf9 100644 --- a/ckanext/harvest/logic/action/get.py +++ b/ckanext/harvest/logic/action/get.py @@ -2,7 +2,7 @@ from ckan.plugins import PluginImplementations from ckanext.harvest.interfaces import IHarvester -from ckan.logic import NotFound +from ckan.logic import NotFound, check_access from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) from ckanext.harvest.logic.dictization import (harvest_source_dictize, @@ -11,6 +11,8 @@ from ckanext.harvest.logic.dictization import (harvest_source_dictize, def harvest_source_show(context,data_dict): + check_access('harvest_source_show',context,data_dict) + id = data_dict.get('id') attr = data_dict.get('attr',None) @@ -23,17 +25,20 @@ def harvest_source_show(context,data_dict): def harvest_source_list(context, data_dict): + check_access('harvest_source_list',context,data_dict) + model = context['model'] + session = context['session'] only_active = data_dict.get('only_active',False) if only_active: - sources = model.Session.query(HarvestSource) \ + sources = session.query(HarvestSource) \ .filter(HarvestSource.active==True) \ .order_by(HarvestSource.created.desc()) \ .all() else: - sources = model.Session.query(HarvestSource) \ + sources = session.query(HarvestSource) \ .order_by(HarvestSource.created.desc()) \ .all() @@ -42,6 +47,8 @@ def harvest_source_list(context, data_dict): def harvest_job_show(context,data_dict): + check_access('harvest_job_show',context,data_dict) + id = data_dict.get('id') attr = data_dict.get('attr',None) @@ -53,12 +60,15 @@ def 
harvest_job_show(context,data_dict): def harvest_job_list(context,data_dict): + check_access('harvest_job_list',context,data_dict) + model = context['model'] + session = context['session'] source_id = data_dict.get('source_id',False) status = data_dict.get('status',False) - query = model.Session.query(HarvestJob) + query = session.query(HarvestJob) if source_id: query = query.filter(HarvestJob.source_id==source_id) @@ -72,9 +82,10 @@ def harvest_job_list(context,data_dict): def harvest_object_show(context,data_dict): + check_access('harvest_object_show',context,data_dict) + id = data_dict.get('id') attr = data_dict.get('attr',None) - obj = HarvestObject.get(id,attr=attr) if not obj: raise NotFound @@ -83,20 +94,26 @@ def harvest_object_show(context,data_dict): def harvest_object_list(context,data_dict): + check_access('harvest_object_list',context,data_dict) + model = context['model'] + session = context['session'] only_current = data_dict.get('only_current',True) if only_current: - objects = model.Session.query(HarvestObject) \ + objects = session.query(HarvestObject) \ .filter(HarvestObject.current==True) \ .all() else: - objects = model.Session.query(HarvestObject).all() + objects = session.query(HarvestObject).all() return [getattr(obj,'id') for obj in objects] -def harvesters_info_show(context = {},data_dict = {}): +def harvesters_info_show(context,data_dict): + + check_access('harvesters_info_show',context,data_dict) + available_harvesters = [] for harvester in PluginImplementations(IHarvester): info = harvester.info() diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 86f73b9..0515208 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -5,7 +5,7 @@ from ckanext.harvest.interfaces import IHarvester from ckan.model import Package -from ckan.logic import NotFound, ValidationError +from ckan.logic import NotFound, ValidationError, check_access from 
ckan.lib.navl.dictization_functions import validate from ckanext.harvest.queue import get_gather_publisher @@ -22,7 +22,11 @@ log = logging.getLogger(__name__) def harvest_source_update(context,data_dict): + check_access('harvest_source_update',context,data_dict) + model = context['model'] + session = context['session'] + source_id = data_dict.get('id') schema = harvest_source_form_schema() @@ -33,7 +37,7 @@ def harvest_source_update(context,data_dict): data, errors = validate(data_dict, schema) if errors: - model.Session.rollback() + session.rollback() raise ValidationError(errors,_error_summary(errors)) fields = ['url','title','type','description','user_id','publisher_id'] @@ -67,7 +71,10 @@ def harvest_objects_import(context,data_dict): It will only affect the last fetched objects already present in the database. ''' + check_access('harvest_objects_import',context,data_dict) + model = context['model'] + session = context['session'] source_id = data_dict.get('source_id',None) if source_id: @@ -78,14 +85,14 @@ def harvest_objects_import(context,data_dict): if not source.active: raise Exception('This harvest source is not active') - last_objects_ids = model.Session.query(HarvestObject.id) \ + last_objects_ids = session.query(HarvestObject.id) \ .join(HarvestSource).join(Package) \ .filter(HarvestObject.source==source) \ .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') \ .all() else: - last_objects_ids = model.Session.query(HarvestObject.id) \ + last_objects_ids = session.query(HarvestObject.id) \ .join(Package) \ .filter(HarvestObject.current==True) \ .filter(Package.state==u'active') \ @@ -93,7 +100,7 @@ def harvest_objects_import(context,data_dict): last_objects = [] for obj_id in last_objects_ids: - obj = model.Session.query(HarvestObject).get(obj_id) + obj = session.query(HarvestObject).get(obj_id) for harvester in PluginImplementations(IHarvester): if harvester.info()['name'] == obj.source.type: if hasattr(harvester,'force_import'): @@ 
-105,6 +112,8 @@ def harvest_objects_import(context,data_dict): def harvest_jobs_run(context,data_dict): + check_access('harvest_jobs_run',context,data_dict) + # Check if there are pending harvest jobs jobs = harvest_job_list(context,{'status':u'New'}) if len(jobs) == 0: diff --git a/ckanext/harvest/logic/auth/__init__.py b/ckanext/harvest/logic/auth/__init__.py new file mode 100644 index 0000000..d0ed2fc --- /dev/null +++ b/ckanext/harvest/logic/auth/__init__.py @@ -0,0 +1,7 @@ +try: + import pkg_resources + pkg_resources.declare_namespace(__name__) +except ImportError: + import pkgutil + __path__ = pkgutil.extend_path(__path__, __name__) + diff --git a/ckanext/harvest/logic/auth/create.py b/ckanext/harvest/logic/auth/create.py new file mode 100644 index 0000000..2173263 --- /dev/null +++ b/ckanext/harvest/logic/auth/create.py @@ -0,0 +1,30 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer + +def harvest_source_create(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to create harvest sources') % str(user)} + else: + return {'success': True} + +def harvest_job_create(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to create harvest jobs') % str(user)} + else: + return {'success': True} + +def harvest_job_create_all(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to create harvest jobs for all sources') % str(user)} + else: + return {'success': True} + diff --git a/ckanext/harvest/logic/auth/delete.py b/ckanext/harvest/logic/auth/delete.py new file mode 100644 index 0000000..f527aea --- /dev/null +++ b/ckanext/harvest/logic/auth/delete.py @@ -0,0 +1,13 @@ +from 
ckan.lib.base import _ +from ckan.authz import Authorizer + +def harvest_source_delete(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to delete harvest sources') % str(user)} + else: + return {'success': True} + + diff --git a/ckanext/harvest/logic/auth/get.py b/ckanext/harvest/logic/auth/get.py new file mode 100644 index 0000000..2581818 --- /dev/null +++ b/ckanext/harvest/logic/auth/get.py @@ -0,0 +1,67 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer + +def harvest_source_show(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to read this harvest source') % str(user)} + else: + return {'success': True} + +def harvest_source_list(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to see the harvest sources') % str(user)} + else: + return {'success': True} + + +def harvest_job_show(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to read this harvest job') % str(user)} + else: + return {'success': True} + +def harvest_job_list(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to see the harvest jobs') % str(user)} + else: + return {'success': True} + +def harvest_object_show(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to read this harvest object') % str(user)} + else: + 
return {'success': True} + +def harvest_object_list(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to see the harvest objects') % str(user)} + else: + return {'success': True} + +def harvesters_info_show(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to see the harvesters information') % str(user)} + else: + return {'success': True} + diff --git a/ckanext/harvest/logic/auth/update.py b/ckanext/harvest/logic/auth/update.py new file mode 100644 index 0000000..efe84c7 --- /dev/null +++ b/ckanext/harvest/logic/auth/update.py @@ -0,0 +1,30 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer + +def harvest_source_update(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to update harvest sources') % str(user)} + else: + return {'success': True} + +def harvest_objects_import(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to reimport harvest objects') % str(user)} + else: + return {'success': True} + +def harvest_jobs_run(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('User %s not authorized to run the pending harvest jobs') % str(user)} + else: + return {'success': True} + diff --git a/ckanext/harvest/plugin.py b/ckanext/harvest/plugin.py index 1d6decd..13fbedb 100644 --- a/ckanext/harvest/plugin.py +++ b/ckanext/harvest/plugin.py @@ -8,18 +8,18 @@ import ckan.lib.helpers as h from ckan.plugins import implements, SingletonPlugin 
from ckan.plugins import IRoutes, IConfigurer -from ckan.plugins import IConfigurable, IActions +from ckan.plugins import IConfigurable, IActions, IAuthFunctions from ckanext.harvest.model import setup log = getLogger(__name__) class Harvest(SingletonPlugin): - + implements(IConfigurable) implements(IRoutes, inherit=True) implements(IConfigurer, inherit=True) implements(IActions) - + implements(IAuthFunctions) def configure(self, config): setup() @@ -30,11 +30,11 @@ class Harvest(SingletonPlugin): map.connect('harvest', '/harvest',controller=controller,action='index') map.connect('/harvest/new', controller=controller, action='new') - map.connect('/harvest/edit/:id', controller=controller, action='edit') + map.connect('/harvest/edit/:id', controller=controller, action='edit') map.connect('/harvest/delete/:id',controller=controller, action='delete') map.connect('/harvest/:id', controller=controller, action='read') - map.connect('harvesting_job_create', '/harvest/refresh/:id',controller=controller, + map.connect('harvesting_job_create', '/harvest/refresh/:id',controller=controller, action='create_harvesting_job') map.connect('/harvest/object/:id', controller=controller, action='show_object') @@ -59,6 +59,8 @@ class Harvest(SingletonPlugin): harvest_source_list, harvest_job_show, harvest_job_list, + harvest_object_show, + harvest_object_list, harvesters_info_show,) from ckanext.harvest.logic.action.create import (harvest_source_create, harvest_job_create, @@ -73,12 +75,48 @@ class Harvest(SingletonPlugin): 'harvest_source_list': harvest_source_list, 'harvest_job_show': harvest_job_show, 'harvest_job_list': harvest_job_list, + 'harvest_object_show': harvest_object_show, + 'harvest_object_list': harvest_object_list, + 'harvesters_info_show': harvesters_info_show, 'harvest_source_create': harvest_source_create, 'harvest_job_create': harvest_job_create, 'harvest_job_create_all': harvest_job_create_all, 'harvest_source_update': harvest_source_update, 
'harvest_source_delete': harvest_source_delete, - 'harvesters_info_show': harvesters_info_show, 'harvest_objects_import': harvest_objects_import, 'harvest_jobs_run':harvest_jobs_run } + + def get_auth_functions(self): + from ckanext.harvest.logic.auth.get import (harvest_source_show, + harvest_source_list, + harvest_job_show, + harvest_job_list, + harvest_object_show, + harvest_object_list, + harvesters_info_show,) + from ckanext.harvest.logic.auth.create import (harvest_source_create, + harvest_job_create, + harvest_job_create_all,) + from ckanext.harvest.logic.auth.update import (harvest_source_update, + harvest_objects_import, + harvest_jobs_run) + from ckanext.harvest.logic.auth.delete import (harvest_source_delete,) + + return { + 'harvest_source_show': harvest_source_show, + 'harvest_source_list': harvest_source_list, + 'harvest_job_show': harvest_job_show, + 'harvest_job_list': harvest_job_list, + 'harvest_object_show': harvest_object_show, + 'harvest_object_list': harvest_object_list, + 'harvesters_info_show': harvesters_info_show, + 'harvest_source_create': harvest_source_create, + 'harvest_job_create': harvest_job_create, + 'harvest_job_create_all': harvest_job_create_all, + 'harvest_source_update': harvest_source_update, + 'harvest_source_delete': harvest_source_delete, + 'harvest_objects_import': harvest_objects_import, + 'harvest_jobs_run':harvest_jobs_run + } + From 3b68298bbaa9cb959f70ba1b5cdf759e4561fb94 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 1 Mar 2012 12:46:42 +0000 Subject: [PATCH 22/43] [logic,auth] Use the site user for CLI commands auth checks --- ckanext/harvest/commands/harvester.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 9f69978..01e91cc 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -3,7 +3,7 @@ import re from pprint import pprint from 
ckan import model -from ckan.logic import get_action +from ckan.logic import get_action, ValidationError from ckan.lib.cli import CkanCommand from ckanext.harvest.queue import get_gather_consumer, get_fetch_consumer @@ -64,6 +64,13 @@ class Harvester(CkanCommand): def command(self): self._load_config() + + # We'll need a sysadmin user to perform most of the actions + # We will use the sysadmin site user (named as the site_id) + context = {'model':model,'session':model.Session,'ignore_auth':True} + self.admin_user = get_action('get_site_user')(context,{}) + + print '' if len(self.args) == 0: @@ -152,7 +159,7 @@ class Harvester(CkanCommand): 'user_id':user_id, 'publisher_id':publisher_id} - context = {'model':model, 'session':model.Session} + context = {'model':model, 'session':model.Session, 'user': self.admin_user['name']} source = get_action('harvest_source_create')(context,data_dict) print 'Created new harvest source:' self.print_harvest_source(source) @@ -163,20 +170,18 @@ class Harvester(CkanCommand): # Create a harvest job for the new source get_action('harvest_job_create')(context,{'source_id':source['id']}) print 'A new Harvest Job for this source has also been created' - except ValidationError,e: print 'An error occurred:' print str(e.error_dict) raise e - def remove_harvest_source(self): if len(self.args) >= 2: source_id = unicode(self.args[1]) else: print 'Please provide a source id' sys.exit(1) - context = {'model': model} + context = {'model': model, 'user': self.admin_user['name']} get_action('harvest_source_delete')(context,{'id':source_id}) print 'Removed harvest source: %s' % source_id @@ -188,7 +193,7 @@ class Harvester(CkanCommand): data_dict = {'only_active':True} what = 'active harvest source' - context = {'model': model,'session':model.Session} + context = {'model': model,'session':model.Session, 'user': self.admin_user['name']} sources = get_action('harvest_source_list')(context,data_dict) self.print_harvest_sources(sources) 
self.print_there_are(what=what, sequence=sources) @@ -203,19 +208,19 @@ class Harvester(CkanCommand): job = create_harvest_job(source_id) self.print_harvest_job(job) - context = {'model': model,'session':model.Session} + context = {'model': model,'session':model.Session, 'user': self.admin_user['name']} jobs = get_action('harvest_job_list')(context,{'status':u'New'}) self.print_there_are('harvest jobs', jobs, condition=status) def list_harvest_jobs(self): - context = {'model': model} + context = {'model': model, 'user': self.admin_user['name']} jobs = get_action('harvest_job_list')(context,{}) self.print_harvest_jobs(jobs) self.print_there_are(what='harvest job', sequence=jobs) def run_harvester(self): - context = {'model': model} + context = {'model': model, 'user': self.admin_user['name']} jobs = get_action('harvest_jobs_run')(context,{}) #print 'Sent %s jobs to the gather queue' % len(jobs) @@ -225,13 +230,13 @@ class Harvester(CkanCommand): source_id = unicode(self.args[1]) else: source_id = None - context = {'model': model, 'session':model.Session} + context = {'model': model, 'session':model.Session, 'user': self.admin_user['name']} objs = get_action('harvest_objects_import')(context,{'source_id':source_id}) print '%s objects reimported' % len(objs) def create_harvest_job_all(self): - context = {'model': model} + context = {'model': model, 'user': self.admin_user['name']} jobs = get_action('harvest_job_create_all')(context,{}) print 'Created %s new harvest jobs' % len(jobs) From 2a2397c0ed66be728d021d8a12456a5ccc9c515a Mon Sep 17 00:00:00 2001 From: amercader Date: Fri, 2 Mar 2012 16:49:39 +0000 Subject: [PATCH 23/43] [logic,auth] Implement publisher auth profile The publisher profile allows general users to handle harvest sources based on membership to a certain group (publisher), as opposed to the default auth profile where only sysadmins can perform any harvesting task. 
To enable it, put this directive in your ini file: ckan.harvest.auth.profile = publisher TODO: * Save publisher id / user id when creating sources * Show publisher in form and index page --- ckanext/harvest/logic/action/get.py | 68 ++++++-- ckanext/harvest/logic/action/update.py | 4 +- ckanext/harvest/logic/auth/__init__.py | 32 +++- .../harvest/logic/auth/publisher/__init__.py | 7 + .../harvest/logic/auth/publisher/create.py | 53 ++++++ .../harvest/logic/auth/publisher/delete.py | 27 +++ ckanext/harvest/logic/auth/publisher/get.py | 156 ++++++++++++++++++ .../harvest/logic/auth/publisher/update.py | 83 ++++++++++ ckanext/harvest/plugin.py | 62 +++---- 9 files changed, 439 insertions(+), 53 deletions(-) create mode 100644 ckanext/harvest/logic/auth/publisher/__init__.py create mode 100644 ckanext/harvest/logic/auth/publisher/create.py create mode 100644 ckanext/harvest/logic/auth/publisher/delete.py create mode 100644 ckanext/harvest/logic/auth/publisher/get.py create mode 100644 ckanext/harvest/logic/auth/publisher/update.py diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py index 9654bf9..0a4058f 100644 --- a/ckanext/harvest/logic/action/get.py +++ b/ckanext/harvest/logic/action/get.py @@ -1,3 +1,7 @@ +from sqlalchemy import or_ +from ckan.authz import Authorizer +from ckan.model import User + from ckan.plugins import PluginImplementations from ckanext.harvest.interfaces import IHarvester @@ -29,18 +33,9 @@ def harvest_source_list(context, data_dict): model = context['model'] session = context['session'] + user = context.get('user','') - only_active = data_dict.get('only_active',False) - - if only_active: - sources = session.query(HarvestSource) \ - .filter(HarvestSource.active==True) \ - .order_by(HarvestSource.created.desc()) \ - .all() - else: - sources = session.query(HarvestSource) \ - .order_by(HarvestSource.created.desc()) \ - .all() + sources = _get_sources_for_user(context, data_dict) context.update({'detailed':False}) 
return [harvest_source_dictize(source, context) for source in sources] @@ -100,13 +95,17 @@ def harvest_object_list(context,data_dict): session = context['session'] only_current = data_dict.get('only_current',True) + source_id = data_dict.get('source_id',False) + + query = session.query(HarvestObject) + + if source_id: + query = query.filter(HarvestObject.source_id==source_id) if only_current: - objects = session.query(HarvestObject) \ - .filter(HarvestObject.current==True) \ - .all() - else: - objects = session.query(HarvestObject).all() + query = query.filter(HarvestObject.current==True) + + objects = query.all() return [getattr(obj,'id') for obj in objects] @@ -124,3 +123,40 @@ def harvesters_info_show(context,data_dict): available_harvesters.append(info) return available_harvesters + +def _get_sources_for_user(context,data_dict): + + model = context['model'] + session = context['session'] + user = context.get('user','') + + only_active = data_dict.get('only_active',False) + + query = session.query(HarvestSource) \ + .order_by(HarvestSource.created.desc()) + + if only_active: + query = query.filter(HarvestSource.active==True) \ + + # Sysadmins will get all sources + if not Authorizer().is_sysadmin(user): + # This only applies to a non sysadmin user when using the + # publisher auth profile. When using the default profile, + # normal users will never arrive at this point, but even if they + # do, they will get an empty list. 
+ user_obj = User.get(user) + + publisher_filters = [] + + for publisher_id in [g.id for g in user_obj.get_groups()]: + publisher_filters.append(HarvestSource.publisher_id==publisher_id) + + if len(publisher_filters): + query = query.filter(or_(*publisher_filters)) + else: + # This user does not belong to a publisher yet, no sources for him/her + return [] + + sources = query.all() + + return sources diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 0515208..0aacf39 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -114,8 +114,10 @@ def harvest_jobs_run(context,data_dict): check_access('harvest_jobs_run',context,data_dict) + source_id = data_dict.get('source_id',None) + # Check if there are pending harvest jobs - jobs = harvest_job_list(context,{'status':u'New'}) + jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'}) if len(jobs) == 0: raise Exception('There are no new harvesting jobs') diff --git a/ckanext/harvest/logic/auth/__init__.py b/ckanext/harvest/logic/auth/__init__.py index d0ed2fc..b015aed 100644 --- a/ckanext/harvest/logic/auth/__init__.py +++ b/ckanext/harvest/logic/auth/__init__.py @@ -1,7 +1,27 @@ -try: - import pkg_resources - pkg_resources.declare_namespace(__name__) -except ImportError: - import pkgutil - __path__ = pkgutil.extend_path(__path__, __name__) +from ckan.logic import NotFound +from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject + +def get_source_object(context, data_dict = {}): + if not 'source' in context: + model = context['model'] + id = data_dict.get('id',None) + source = HarvestSource.get(id) + if not source: + raise NotFound + else: + source = context['source'] + + return source + +def get_job_object(context, data_dict = {}): + if not 'job' in context: + model = context['model'] + id = data_dict.get('id',None) + job = HarvestJob.get(id) + if not job: + raise NotFound + else: + job = 
context['job'] + + return job diff --git a/ckanext/harvest/logic/auth/publisher/__init__.py b/ckanext/harvest/logic/auth/publisher/__init__.py new file mode 100644 index 0000000..d0ed2fc --- /dev/null +++ b/ckanext/harvest/logic/auth/publisher/__init__.py @@ -0,0 +1,7 @@ +try: + import pkg_resources + pkg_resources.declare_namespace(__name__) +except ImportError: + import pkgutil + __path__ = pkgutil.extend_path(__path__, __name__) + diff --git a/ckanext/harvest/logic/auth/publisher/create.py b/ckanext/harvest/logic/auth/publisher/create.py new file mode 100644 index 0000000..b8beee5 --- /dev/null +++ b/ckanext/harvest/logic/auth/publisher/create.py @@ -0,0 +1,53 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer +from ckan.model import User + +from ckanext.harvest.model import HarvestSource + +def harvest_source_create(context,data_dict): + model = context['model'] + user = context.get('user','') + + # Non-logged users can not create sources + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to create harvest sources')} + + # Sysadmins and the rest of logged users can create sources, + # as long as they belong to a publisher + user_obj = User.get(user) + if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to create harvest sources') % str(user)} + else: + return {'success': True} + +def harvest_job_create(context,data_dict): + model = context['model'] + user = context.get('user') + + source_id = data_dict['source_id'] + + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to create harvest jobs')} + + if Authorizer().is_sysadmin(user): + return {'success': True} + + user_obj = User.get(user) + source = HarvestSource.get(source_id) + if not source: + raise NotFound + + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': 
_('User %s not authorized to create a job for source %s') % (str(user),source.id)} + else: + return {'success': True} + +def harvest_job_create_all(context,data_dict): + model = context['model'] + user = context.get('user') + + if not Authorizer().is_sysadmin(user): + return {'success': False, 'msg': _('Only sysadmins can create harvest jobs for all sources') % str(user)} + else: + return {'success': True} + diff --git a/ckanext/harvest/logic/auth/publisher/delete.py b/ckanext/harvest/logic/auth/publisher/delete.py new file mode 100644 index 0000000..b19ec16 --- /dev/null +++ b/ckanext/harvest/logic/auth/publisher/delete.py @@ -0,0 +1,27 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer +from ckan.model import User + +from ckanext.harvest.logic.auth import get_source_object + +def harvest_source_delete(context,data_dict): + model = context['model'] + user = context.get('user','') + + source = get_source_object(context,data_dict) + + # Non-logged users can not delete this source + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to delete harvest sources')} + + # Sysadmins can delete the source + if Authorizer().is_sysadmin(user): + return {'success': True} + + # Check if the source publisher id exists on the user's groups + user_obj = User.get(user) + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to delete harvest source %s') % (str(user),source.id)} + else: + return {'success': True} + diff --git a/ckanext/harvest/logic/auth/publisher/get.py b/ckanext/harvest/logic/auth/publisher/get.py new file mode 100644 index 0000000..568511a --- /dev/null +++ b/ckanext/harvest/logic/auth/publisher/get.py @@ -0,0 +1,156 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer +from ckan.model import User + +from ckanext.harvest.model import HarvestSource +from ckanext.harvest.logic.auth import get_source_object, get_job_object 
+ +def harvest_source_show(context,data_dict): + model = context['model'] + user = context.get('user','') + + source = get_source_object(context,data_dict) + + # Non-logged users can not read the source + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to see harvest sources')} + + # Sysadmins can read the source + if Authorizer().is_sysadmin(user): + return {'success': True} + + # Check if the source publisher id exists on the user's groups + user_obj = User.get(user) + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to read harvest source %s') % (str(user),source.id)} + else: + return {'success': True} + +def harvest_source_list(context,data_dict): + + model = context['model'] + user = context.get('user') + + # Here we will just check that the user is logged in. + # The logic action will return an empty list if the user does not + # have permissons on any source. 
+ if not user: + return {'success': False, 'msg': _('Only logged users are authorized to see their sources')} + else: + user_obj = User.get(user) + # Only users belonging to a publisher can list sources, + # unless they are sysadmins + if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest sources') % str(user)} + else: + return {'success': True} + + +def harvest_job_show(context,data_dict): + model = context['model'] + user = context.get('user') + + job = get_job_object(context,data_dict) + + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to see harvest jobs')} + + if Authorizer().is_sysadmin(user): + return {'success': True} + + user_obj = User.get(user) + if not job.source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to read harvest job %s') % (str(user),job.id)} + else: + return {'success': True} + +def harvest_job_list(context,data_dict): + model = context['model'] + user = context.get('user') + + # Check user is logged in + if not user: + return {'success': False, 'msg': _('Only logged users are authorized to see their sources')} + + user_obj = User.get(user) + + # Checks for non sysadmin users + if not Authorizer().is_sysadmin(user): + if len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest jobs') % str(user)} + + source_id = data_dict.get('source_id',False) + if not source_id: + return {'success': False, 'msg': _('Only sysadmins can list all harvest jobs') % str(user)} + + source = HarvestSource.get(source_id) + if not source: + raise NotFound + + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to list jobs from source %s') % (str(user),source.id)} + + return {'success': True} + +def 
harvest_object_show(context,data_dict): + model = context['model'] + user = context.get('user') + + obj = get_obj_object(context,data_dict) + + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to see harvest objects')} + + if Authorizer().is_sysadmin(user): + return {'success': True} + + user_obj = User.get(user) + if not obj.source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to read harvest object %s') % (str(user),obj.id)} + else: + return {'success': True} + +def harvest_object_list(context,data_dict): + model = context['model'] + user = context.get('user') + + # Check user is logged in + if not user: + return {'success': False, 'msg': _('Only logged users are authorized to see their sources')} + + user_obj = User.get(user) + + # Checks for non sysadmin users + if not Authorizer().is_sysadmin(user): + if len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest objects') % str(user)} + + source_id = data_dict.get('source_id',False) + if not source_id: + return {'success': False, 'msg': _('Only sysadmins can list all harvest objects') % str(user)} + + source = HarvestSource.get(source_id) + if not source: + raise NotFound + + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to list objects from source %s') % (str(user),source.id)} + + return {'success': True} + +def harvesters_info_show(context,data_dict): + model = context['model'] + user = context.get('user','') + + # Non-logged users can not create sources + if not user: + return {'success': False, 'msg': _('Non-logged in users can not see the harvesters info')} + + # Sysadmins and the rest of logged users can see the harvesters info, + # as long as they belong to a publisher + user_obj = User.get(user) + if not Authorizer().is_sysadmin(user) and 
len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to see the harvesters info') % str(user)} + else: + return {'success': True} + diff --git a/ckanext/harvest/logic/auth/publisher/update.py b/ckanext/harvest/logic/auth/publisher/update.py new file mode 100644 index 0000000..b05b795 --- /dev/null +++ b/ckanext/harvest/logic/auth/publisher/update.py @@ -0,0 +1,83 @@ +from ckan.lib.base import _ +from ckan.authz import Authorizer +from ckan.model import User + +from ckanext.harvest.logic.auth import get_source_object + +def harvest_source_update(context,data_dict): + model = context['model'] + user = context.get('user','') + + source = get_source_object(context,data_dict) + + # Non-logged users can not update this source + if not user: + return {'success': False, 'msg': _('Non-logged in users are not authorized to update harvest sources')} + + # Sysadmins can update the source + if Authorizer().is_sysadmin(user): + return {'success': True} + + # Check if the source publisher id exists on the user's groups + user_obj = User.get(user) + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to update harvest source %s') % (str(user),source.id)} + else: + return {'success': True} + +def harvest_objects_import(context,data_dict): + model = context['model'] + user = context.get('user') + + # Check user is logged in + if not user: + return {'success': False, 'msg': _('Only logged users are authorized to reimport harvest objects')} + + user_obj = User.get(user) + + # Checks for non sysadmin users + if not Authorizer().is_sysadmin(user): + if len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to reimport harvest objects') % str(user)} + + source_id = data_dict.get('source_id',False) + if not source_id: + return {'success': False, 'msg': _('Only sysadmins can reimport all harvest objects') % 
str(user)} + + source = HarvestSource.get(source_id) + if not source: + raise NotFound + + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to reimport objects from source %s') % (str(user),source.id)} + + return {'success': True} + +def harvest_jobs_run(context,data_dict): + model = context['model'] + user = context.get('user') + + # Check user is logged in + if not user: + return {'success': False, 'msg': _('Only logged users are authorized to run harvest jobs')} + + user_obj = User.get(user) + + # Checks for non sysadmin users + if not Authorizer().is_sysadmin(user): + if len(user_obj.get_groups()) == 0: + return {'success': False, 'msg': _('User %s must belong to a publisher to run harvest jobs') % str(user)} + + source_id = data_dict.get('source_id',False) + if not source_id: + return {'success': False, 'msg': _('Only sysadmins can run all harvest jobs') % str(user)} + + source = HarvestSource.get(source_id) + if not source: + raise NotFound + + if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + return {'success': False, 'msg': _('User %s not authorized to run jobs from source %s') % (str(user),source.id)} + + return {'success': True} + diff --git a/ckanext/harvest/plugin.py b/ckanext/harvest/plugin.py index 13fbedb..5c54666 100644 --- a/ckanext/harvest/plugin.py +++ b/ckanext/harvest/plugin.py @@ -1,6 +1,7 @@ import os from logging import getLogger +from pylons import config from genshi.input import HTML from genshi.filters import Transformer @@ -88,35 +89,36 @@ class Harvest(SingletonPlugin): } def get_auth_functions(self): - from ckanext.harvest.logic.auth.get import (harvest_source_show, - harvest_source_list, - harvest_job_show, - harvest_job_list, - harvest_object_show, - harvest_object_list, - harvesters_info_show,) - from ckanext.harvest.logic.auth.create import (harvest_source_create, - harvest_job_create, - harvest_job_create_all,) - from 
ckanext.harvest.logic.auth.update import (harvest_source_update, - harvest_objects_import, - harvest_jobs_run) - from ckanext.harvest.logic.auth.delete import (harvest_source_delete,) - return { - 'harvest_source_show': harvest_source_show, - 'harvest_source_list': harvest_source_list, - 'harvest_job_show': harvest_job_show, - 'harvest_job_list': harvest_job_list, - 'harvest_object_show': harvest_object_show, - 'harvest_object_list': harvest_object_list, - 'harvesters_info_show': harvesters_info_show, - 'harvest_source_create': harvest_source_create, - 'harvest_job_create': harvest_job_create, - 'harvest_job_create_all': harvest_job_create_all, - 'harvest_source_update': harvest_source_update, - 'harvest_source_delete': harvest_source_delete, - 'harvest_objects_import': harvest_objects_import, - 'harvest_jobs_run':harvest_jobs_run - } + module_root = 'ckanext.harvest.logic.auth' + auth_profile = config.get('ckan.harvest.auth.profile', '') + + auth_functions = _get_auth_functions(module_root) + if auth_profile: + module_root = '%s.%s' % (module_root, auth_profile) + auth_functions = _get_auth_functions(module_root,auth_functions) + + log.info('Using auth profile at %s' % module_root) + + return auth_functions + +def _get_auth_functions(module_root, auth_functions = {}): + + for auth_module_name in ['get', 'create', 'update','delete']: + module_path = '%s.%s' % (module_root, auth_module_name,) + try: + module = __import__(module_path) + except ImportError,e: + log.debug('No auth module for action "%s"' % auth_module_name) + continue + + for part in module_path.split('.')[1:]: + module = getattr(module, part) + + for key, value in module.__dict__.items(): + if not key.startswith('_'): + auth_functions[key] = value + + + return auth_functions From d98206858de99e12719a69d12cd3dda77e7f0653 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 5 Mar 2012 17:10:02 +0000 Subject: [PATCH 24/43] [plugin,auth] Check on startup if ckan is also using the publisher profile --- 
ckanext/harvest/plugin.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/plugin.py b/ckanext/harvest/plugin.py index 5c54666..951c78b 100644 --- a/ckanext/harvest/plugin.py +++ b/ckanext/harvest/plugin.py @@ -10,7 +10,7 @@ import ckan.lib.helpers as h from ckan.plugins import implements, SingletonPlugin from ckan.plugins import IRoutes, IConfigurer from ckan.plugins import IConfigurable, IActions, IAuthFunctions -from ckanext.harvest.model import setup +from ckanext.harvest.model import setup as model_setup log = getLogger(__name__) @@ -23,7 +23,28 @@ class Harvest(SingletonPlugin): implements(IAuthFunctions) def configure(self, config): - setup() + + auth_profile = config.get('ckan.harvest.auth.profile',None) + + if auth_profile: + # Check if auth profile exists + module_root = 'ckanext.harvest.logic.auth' + module_path = '%s.%s' % (module_root, auth_profile) + try: + module = __import__(module_path) + except ImportError,e: + raise ImportError('Unknown auth profile: %s' % auth_profile) + + # If we are using the publisher auth profile, make sure CKAN core + # also uses it. 
+ if auth_profile == 'publisher' and \ + not config.get('ckan.auth.profile','') == 'publisher': + raise Exception('You must enable the "publisher" auth profile' + +' in CKAN in order to use it on the harvest extension' + +' (adding "ckan.auth.profile=publisher" to your ini file)') + + # Setup harvest model + model_setup() def before_map(self, map): From f0e2521d9b890347833e5ca4d3c60e985787cf29 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 6 Mar 2012 10:16:27 +0000 Subject: [PATCH 25/43] [logic,auth] Modify checks to ensure users are admins of their publishers --- ckanext/harvest/logic/action/get.py | 2 +- ckanext/harvest/logic/auth/publisher/create.py | 4 ++-- ckanext/harvest/logic/auth/publisher/delete.py | 2 +- ckanext/harvest/logic/auth/publisher/get.py | 18 +++++++++--------- ckanext/harvest/logic/auth/publisher/update.py | 10 +++++----- 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py index 0a4058f..1d4fcd9 100644 --- a/ckanext/harvest/logic/action/get.py +++ b/ckanext/harvest/logic/action/get.py @@ -148,7 +148,7 @@ def _get_sources_for_user(context,data_dict): publisher_filters = [] - for publisher_id in [g.id for g in user_obj.get_groups()]: + for publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: publisher_filters.append(HarvestSource.publisher_id==publisher_id) if len(publisher_filters): diff --git a/ckanext/harvest/logic/auth/publisher/create.py b/ckanext/harvest/logic/auth/publisher/create.py index b8beee5..08f987b 100644 --- a/ckanext/harvest/logic/auth/publisher/create.py +++ b/ckanext/harvest/logic/auth/publisher/create.py @@ -15,7 +15,7 @@ def harvest_source_create(context,data_dict): # Sysadmins and the rest of logged users can create sources, # as long as they belong to a publisher user_obj = User.get(user) - if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups()) == 0: + if not Authorizer().is_sysadmin(user) and 
len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to create harvest sources') % str(user)} else: return {'success': True} @@ -37,7 +37,7 @@ def harvest_job_create(context,data_dict): if not source: raise NotFound - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to create a job for source %s') % (str(user),source.id)} else: return {'success': True} diff --git a/ckanext/harvest/logic/auth/publisher/delete.py b/ckanext/harvest/logic/auth/publisher/delete.py index b19ec16..21cb02d 100644 --- a/ckanext/harvest/logic/auth/publisher/delete.py +++ b/ckanext/harvest/logic/auth/publisher/delete.py @@ -20,7 +20,7 @@ def harvest_source_delete(context,data_dict): # Check if the source publisher id exists on the user's groups user_obj = User.get(user) - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to delete harvest source %s') % (str(user),source.id)} else: return {'success': True} diff --git a/ckanext/harvest/logic/auth/publisher/get.py b/ckanext/harvest/logic/auth/publisher/get.py index 568511a..a839383 100644 --- a/ckanext/harvest/logic/auth/publisher/get.py +++ b/ckanext/harvest/logic/auth/publisher/get.py @@ -21,7 +21,7 @@ def harvest_source_show(context,data_dict): # Check if the source publisher id exists on the user's groups user_obj = User.get(user) - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to read harvest source %s') % (str(user),source.id)} else: return {'success': True} @@ 
-40,7 +40,7 @@ def harvest_source_list(context,data_dict): user_obj = User.get(user) # Only users belonging to a publisher can list sources, # unless they are sysadmins - if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups()) == 0: + if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest sources') % str(user)} else: return {'success': True} @@ -59,7 +59,7 @@ def harvest_job_show(context,data_dict): return {'success': True} user_obj = User.get(user) - if not job.source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not job.source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to read harvest job %s') % (str(user),job.id)} else: return {'success': True} @@ -76,7 +76,7 @@ def harvest_job_list(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups()) == 0: + if len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest jobs') % str(user)} source_id = data_dict.get('source_id',False) @@ -87,7 +87,7 @@ def harvest_job_list(context,data_dict): if not source: raise NotFound - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to list jobs from source %s') % (str(user),source.id)} return {'success': True} @@ -105,7 +105,7 @@ def harvest_object_show(context,data_dict): return {'success': True} user_obj = User.get(user) - if not obj.source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not obj.source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User 
%s not authorized to read harvest object %s') % (str(user),obj.id)} else: return {'success': True} @@ -122,7 +122,7 @@ def harvest_object_list(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups()) == 0: + if len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest objects') % str(user)} source_id = data_dict.get('source_id',False) @@ -133,7 +133,7 @@ def harvest_object_list(context,data_dict): if not source: raise NotFound - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to list objects from source %s') % (str(user),source.id)} return {'success': True} @@ -149,7 +149,7 @@ def harvesters_info_show(context,data_dict): # Sysadmins and the rest of logged users can see the harvesters info, # as long as they belong to a publisher user_obj = User.get(user) - if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups()) == 0: + if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to see the harvesters info') % str(user)} else: return {'success': True} diff --git a/ckanext/harvest/logic/auth/publisher/update.py b/ckanext/harvest/logic/auth/publisher/update.py index b05b795..a128760 100644 --- a/ckanext/harvest/logic/auth/publisher/update.py +++ b/ckanext/harvest/logic/auth/publisher/update.py @@ -20,7 +20,7 @@ def harvest_source_update(context,data_dict): # Check if the source publisher id exists on the user's groups user_obj = User.get(user) - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': 
_('User %s not authorized to update harvest source %s') % (str(user),source.id)} else: return {'success': True} @@ -37,7 +37,7 @@ def harvest_objects_import(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups()) == 0: + if len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to reimport harvest objects') % str(user)} source_id = data_dict.get('source_id',False) @@ -48,7 +48,7 @@ def harvest_objects_import(context,data_dict): if not source: raise NotFound - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to reimport objects from source %s') % (str(user),source.id)} return {'success': True} @@ -65,7 +65,7 @@ def harvest_jobs_run(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups()) == 0: + if len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to run harvest jobs') % str(user)} source_id = data_dict.get('source_id',False) @@ -76,7 +76,7 @@ def harvest_jobs_run(context,data_dict): if not source: raise NotFound - if not source.publisher_id in [g.id for g in user_obj.get_groups()]: + if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to run jobs from source %s') % (str(user),source.id)} return {'success': True} From aea785701f248055d19c625bff2fefdcd999e2ed Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 6 Mar 2012 10:37:31 +0000 Subject: [PATCH 26/43] [logic,auth] Check that users actually exist --- ckanext/harvest/logic/auth/publisher/create.py | 4 ++-- ckanext/harvest/logic/auth/publisher/delete.py | 2 +- 
ckanext/harvest/logic/auth/publisher/get.py | 14 +++++++------- ckanext/harvest/logic/auth/publisher/update.py | 6 +++--- 4 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ckanext/harvest/logic/auth/publisher/create.py b/ckanext/harvest/logic/auth/publisher/create.py index 08f987b..8a0b272 100644 --- a/ckanext/harvest/logic/auth/publisher/create.py +++ b/ckanext/harvest/logic/auth/publisher/create.py @@ -15,7 +15,7 @@ def harvest_source_create(context,data_dict): # Sysadmins and the rest of logged users can create sources, # as long as they belong to a publisher user_obj = User.get(user) - if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to create harvest sources') % str(user)} else: return {'success': True} @@ -37,7 +37,7 @@ def harvest_job_create(context,data_dict): if not source: raise NotFound - if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: + if not user_obj or not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to create a job for source %s') % (str(user),source.id)} else: return {'success': True} diff --git a/ckanext/harvest/logic/auth/publisher/delete.py b/ckanext/harvest/logic/auth/publisher/delete.py index 21cb02d..89324f9 100644 --- a/ckanext/harvest/logic/auth/publisher/delete.py +++ b/ckanext/harvest/logic/auth/publisher/delete.py @@ -20,7 +20,7 @@ def harvest_source_delete(context,data_dict): # Check if the source publisher id exists on the user's groups user_obj = User.get(user) - if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: + if not user_obj or not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: 
return {'success': False, 'msg': _('User %s not authorized to delete harvest source %s') % (str(user),source.id)} else: return {'success': True} diff --git a/ckanext/harvest/logic/auth/publisher/get.py b/ckanext/harvest/logic/auth/publisher/get.py index a839383..d78f336 100644 --- a/ckanext/harvest/logic/auth/publisher/get.py +++ b/ckanext/harvest/logic/auth/publisher/get.py @@ -21,7 +21,7 @@ def harvest_source_show(context,data_dict): # Check if the source publisher id exists on the user's groups user_obj = User.get(user) - if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: + if not user_obj or not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to read harvest source %s') % (str(user),source.id)} else: return {'success': True} @@ -40,7 +40,7 @@ def harvest_source_list(context,data_dict): user_obj = User.get(user) # Only users belonging to a publisher can list sources, # unless they are sysadmins - if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest sources') % str(user)} else: return {'success': True} @@ -59,7 +59,7 @@ def harvest_job_show(context,data_dict): return {'success': True} user_obj = User.get(user) - if not job.source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: + if not user_obj or not job.source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to read harvest job %s') % (str(user),job.id)} else: return {'success': True} @@ -76,7 +76,7 @@ def harvest_job_list(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if 
len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest jobs') % str(user)} source_id = data_dict.get('source_id',False) @@ -105,7 +105,7 @@ def harvest_object_show(context,data_dict): return {'success': True} user_obj = User.get(user) - if not obj.source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: + if not user_obj or not obj.source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to read harvest object %s') % (str(user),obj.id)} else: return {'success': True} @@ -122,7 +122,7 @@ def harvest_object_list(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to list harvest objects') % str(user)} source_id = data_dict.get('source_id',False) @@ -149,7 +149,7 @@ def harvesters_info_show(context,data_dict): # Sysadmins and the rest of logged users can see the harvesters info, # as long as they belong to a publisher user_obj = User.get(user) - if not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or not Authorizer().is_sysadmin(user) and len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to see the harvesters info') % str(user)} else: return {'success': True} diff --git a/ckanext/harvest/logic/auth/publisher/update.py b/ckanext/harvest/logic/auth/publisher/update.py index a128760..ea239bc 100644 --- a/ckanext/harvest/logic/auth/publisher/update.py +++ b/ckanext/harvest/logic/auth/publisher/update.py @@ -20,7 +20,7 @@ def 
harvest_source_update(context,data_dict): # Check if the source publisher id exists on the user's groups user_obj = User.get(user) - if not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: + if not user_obj or not source.publisher_id in [g.id for g in user_obj.get_groups(u'publisher',u'admin')]: return {'success': False, 'msg': _('User %s not authorized to update harvest source %s') % (str(user),source.id)} else: return {'success': True} @@ -37,7 +37,7 @@ def harvest_objects_import(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to reimport harvest objects') % str(user)} source_id = data_dict.get('source_id',False) @@ -65,7 +65,7 @@ def harvest_jobs_run(context,data_dict): # Checks for non sysadmin users if not Authorizer().is_sysadmin(user): - if len(user_obj.get_groups(u'publisher',u'admin')) == 0: + if not user_obj or len(user_obj.get_groups(u'publisher',u'admin')) == 0: return {'success': False, 'msg': _('User %s must belong to a publisher to run harvest jobs') % str(user)} source_id = data_dict.get('source_id',False) From 97b390f3c160368aad42f7753128e4b2705a36eb Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 6 Mar 2012 16:01:43 +0000 Subject: [PATCH 27/43] [auth,logic,ui] Handle publishers on the UI Add fields for publishers in the form when using the publihser auth profile. Some changes related to the source schema. 
--- ckanext/harvest/controllers/view.py | 49 ++++++++++++++++--- ckanext/harvest/logic/action/create.py | 2 +- ckanext/harvest/logic/action/update.py | 4 +- ckanext/harvest/logic/schema.py | 10 +++- .../templates/source/new_source_form.html | 12 +++++ ckanext/harvest/templates/source/read.html | 4 +- 6 files changed, 68 insertions(+), 13 deletions(-) diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index 899358a..dfe64b1 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -2,7 +2,9 @@ from lxml import etree from lxml.etree import XMLSyntaxError from pylons.i18n import _ +from ckan.authz import Authorizer from ckan import model +from ckan.model.group import Group import ckan.lib.helpers as h, json from ckan.lib.base import BaseController, c, g, request, \ @@ -19,6 +21,33 @@ class ViewController(BaseController): not_auth_message = _('Not authorized to see this page') + def __before__(self, action, **params): + + super(ViewController,self).__before__(action, **params) + + c.publisher_auth = (config.get('ckan.harvest.auth.profile',None) == 'publisher') + + def _get_publishers(self): + groups = None + + if c.publisher_auth: + if Authorizer().is_sysadmin(c.user): + groups = Group.all(group_type='publisher') + elif c.userobj: + groups = c.userobj.get_groups('publisher') + else: # anonymous user shouldn't have access to this page anyway. 
+ groups = [] + + # Be explicit about which fields we make available in the template + groups = [ { + 'name': g.name, + 'id': g.id, + 'title': g.title, + } for g in groups ] + + return groups + + def index(self): context = {'model':model, 'user':c.user,'session':model.Session} try: @@ -46,6 +75,7 @@ class ViewController(BaseController): vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': harvesters_info} + c.groups = self._get_publishers() c.form = render('source/new_source_form.html', extra_vars=vars) return render('source/new.html') @@ -53,7 +83,9 @@ class ViewController(BaseController): try: data_dict = dict(request.params) self._check_data_dict(data_dict) - context = {'model':model, 'user':c.user, 'session':model.Session} + context = {'model':model, 'user':c.user, 'session':model.Session, + 'schema':harvest_source_form_schema()} + source = get_action('harvest_source_create')(context,data_dict) # Create a harvest job for the new source @@ -61,7 +93,7 @@ class ViewController(BaseController): h.flash_success(_('New harvest source added successfully.' 
'A new harvest job for the source has also been created.')) - redirect(h.url_for('harvest')) + redirect('/harvest/%s' % source['id']) except NotAuthorized,e: abort(401,self.not_auth_message) except DataError,e: @@ -98,6 +130,7 @@ class ViewController(BaseController): vars = {'data': data, 'errors': errors, 'error_summary': error_summary, 'harvesters': harvesters_info} + c.groups = self._get_publishers() c.form = render('source/new_source_form.html', extra_vars=vars) return render('source/edit.html') @@ -106,12 +139,13 @@ class ViewController(BaseController): data_dict = dict(request.params) data_dict['id'] = id self._check_data_dict(data_dict) - context = {'model':model, 'user':c.user, 'session':model.Session} + context = {'model':model, 'user':c.user, 'session':model.Session, + 'schema':harvest_source_form_schema()} source = get_action('harvest_source_update')(context,data_dict) h.flash_success(_('Harvest source edited successfully.')) - redirect(h.url_for('harvest')) + redirect('/harvest/%s' %id) except NotAuthorized,e: abort(401,self.not_auth_message) except DataError,e: @@ -125,11 +159,14 @@ class ViewController(BaseController): def _check_data_dict(self, data_dict): '''Check if the return data is correct''' - surplus_keys_schema = ['id','publisher_id','user_id','active','save','config'] - + surplus_keys_schema = ['id','publisher_id','user_id','config','save'] schema_keys = harvest_source_form_schema().keys() keys_in_schema = set(schema_keys) - set(surplus_keys_schema) + # user_id is not yet used, we'll set the logged user one for the time being + if not data_dict.get('user_id',None): + if c.userobj: + data_dict['user_id'] = c.userobj.id if keys_in_schema - set(data_dict.keys()): log.info(_('Incorrect form fields posted')) raise DataError(data_dict) diff --git a/ckanext/harvest/logic/action/create.py b/ckanext/harvest/logic/action/create.py index 6b0907b..63580a3 100644 --- a/ckanext/harvest/logic/action/create.py +++ b/ckanext/harvest/logic/action/create.py @@ 
-15,8 +15,8 @@ def harvest_source_create(context,data_dict): model = context['model'] session = context['session'] + schema = context.get('schema') or default_harvest_source_schema() - schema = harvest_source_form_schema() data, errors = validate(data_dict, schema) if errors: diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 0aacf39..8b84155 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -28,14 +28,14 @@ def harvest_source_update(context,data_dict): session = context['session'] source_id = data_dict.get('id') - - schema = harvest_source_form_schema() + schema = context.get('schema') or default_harvest_source_schema() source = HarvestSource.get(source_id) if not source: raise NotFound('Harvest source %s does not exist' % source_id) data, errors = validate(data_dict, schema) + if errors: session.rollback() raise ValidationError(errors,_error_summary(errors)) diff --git a/ckanext/harvest/logic/schema.py b/ckanext/harvest/logic/schema.py index 231a530..c95c7c8 100644 --- a/ckanext/harvest/logic/schema.py +++ b/ckanext/harvest/logic/schema.py @@ -1,3 +1,5 @@ +from ckan.lib.base import config + from ckan.lib.navl.validators import (ignore_missing, not_empty, empty, @@ -20,11 +22,15 @@ def default_harvest_source_schema(): 'title': [ignore_missing,unicode], 'description': [ignore_missing,unicode], 'active': [ignore_missing,harvest_source_active_validator], - 'user_id': [ignore_missing], - 'publisher_id': [ignore_missing], + 'user_id': [ignore_missing,unicode], 'config': [ignore_missing,harvest_source_config_validator] } + if config.get('ckan.harvest.auth.profile',None) == 'publisher': + schema['publisher_id'] = [not_empty,unicode] + else: + schema['publisher_id'] = [ignore_missing,unicode] + return schema diff --git a/ckanext/harvest/templates/source/new_source_form.html b/ckanext/harvest/templates/source/new_source_form.html index d3c5adb..b98b2a7 100644 --- 
a/ckanext/harvest/templates/source/new_source_form.html +++ b/ckanext/harvest/templates/source/new_source_form.html @@ -44,6 +44,18 @@
You can add your own notes here about what the URL above represents to remind you later.
+ +
+
+ +
+
Cannot add any publishers.
+ +
diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index 3ca8348..a5fc697 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -51,11 +51,11 @@ - - + User ${c.source.user_id} - + Publisher ${c.source.publisher_id} From d9cfc526436667bcae2427f99bb7d4a36c015742 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 7 Mar 2012 11:49:12 +0000 Subject: [PATCH 28/43] [ui,auth] Aggregate sources by publisher on the sources list --- ckanext/harvest/controllers/view.py | 3 + ckanext/harvest/logic/dictization.py | 17 +++-- .../harvest/public/ckanext/harvest/style.css | 12 ++++ ckanext/harvest/templates/index.html | 70 +++++++++++-------- ckanext/harvest/templates/source/read.html | 10 ++- 5 files changed, 78 insertions(+), 34 deletions(-) diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index dfe64b1..25aa9a8 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -56,6 +56,9 @@ class ViewController(BaseController): except NotAuthorized,e: abort(401,self.not_auth_message) + if c.publisher_auth: + c.sources = sorted(c.sources,key=lambda source : source['publisher_title']) + return render('index.html') def new(self,data = None,errors = None, error_summary = None): diff --git a/ckanext/harvest/logic/dictization.py b/ckanext/harvest/logic/dictization.py index 8eec19c..0016029 100644 --- a/ckanext/harvest/logic/dictization.py +++ b/ckanext/harvest/logic/dictization.py @@ -1,16 +1,20 @@ from sqlalchemy import distinct -from ckan.model import Package +from ckan.model import Package,Group from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject, \ HarvestGatherError, HarvestObjectError def harvest_source_dictize(source, context): out = source.as_dict() - out['jobs'] = [] - for job in source.jobs: - out['jobs'].append(job.as_dict()) + out['publisher_title'] = u'' + + publisher_id = 
out.get('publisher_id') + if publisher_id: + group = Group.get(publisher_id) + if group: + out['publisher_title'] = group.title out['status'] = _get_source_status(source, context) @@ -58,7 +62,10 @@ def _get_source_status(source, context): if not job_count: out['msg'] = 'No jobs yet' return out - out = {'next_harvest':'', + + out = { + 'job_count': job_count, + 'next_harvest':'', 'last_harvest_request':'', 'last_harvest_statistics':{'added':0,'updated':0,'errors':0}, 'last_harvest_errors':{'gather':[],'object':[]}, diff --git a/ckanext/harvest/public/ckanext/harvest/style.css b/ckanext/harvest/public/ckanext/harvest/style.css index 9f5aaca..8c0e591 100644 --- a/ckanext/harvest/public/ckanext/harvest/style.css +++ b/ckanext/harvest/public/ckanext/harvest/style.css @@ -46,6 +46,11 @@ body.index.ViewController #content { color: red; } +#harvest-sources td{ + background-color: white !important; + border-bottom: 1px solid #E3E3E3; +} + .harvester-title{ font-weight: bold; } @@ -67,3 +72,10 @@ body.index.ViewController #content { font-weight:bold; color: red; } + +#harvest-sources .publisher > td{ + background-color: #E3E3E3 !important; + padding: 3px; + font-weight: bold; +} + diff --git a/ckanext/harvest/templates/index.html b/ckanext/harvest/templates/index.html index 9c03476..71b2aa6 100644 --- a/ckanext/harvest/templates/index.html +++ b/ckanext/harvest/templates/index.html @@ -26,7 +26,7 @@ - +
@@ -38,35 +38,49 @@ + + + + + + + + + + + - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + + + + - - + + +
View EditNext Harvest Created
${source['publisher_title']}${source['publisher_id']}
ViewEditRefresh${source.url[:50]}...${source.url}${source.type}${source.active}${source.status.msg}${source.status.msg}Datasets: ${source.status.overall_statistics.added}
- Last errors: ${source.status.last_harvest_statistics.errors}
${source.status.next_harvest}
ViewEditRefresh${source.url[:50]}...${source.url}${source.type}${source.active}${source.status.msg}${source.status.msg}Datasets: ${source.status.overall_statistics.added}
+ Last errors: ${source.status.last_harvest_statistics.errors}
${source.status.next_harvest}${h.render_datetime(source.created)}
${h.render_datetime(source.created)}
diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index a5fc697..8c0c3d9 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -15,7 +15,10 @@

Harvest Source Details

+ RefreshRefresh source | + Sources list + + @@ -57,7 +60,12 @@ + + + + + From 124f3191c897dfd44e9e09363efa6a0ddcbfaafb Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 7 Mar 2012 11:56:18 +0000 Subject: [PATCH 29/43] [ui] Add class to config fields so they can be hidden via CSS --- ckanext/harvest/templates/source/new_source_form.html | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/templates/source/new_source_form.html b/ckanext/harvest/templates/source/new_source_form.html index b98b2a7..6437a95 100644 --- a/ckanext/harvest/templates/source/new_source_form.html +++ b/ckanext/harvest/templates/source/new_source_form.html @@ -56,8 +56,8 @@
Cannot add any publishers.
-
-
+
+
From 6cccbb61c96025e67a37ed64a41175e64f810f07 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 7 Mar 2012 12:10:32 +0000 Subject: [PATCH 30/43] Bug fix, new job count property had not been updated --- ckanext/harvest/commands/harvester.py | 2 +- ckanext/harvest/templates/source/read.html | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 01e91cc..1409b85 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -253,7 +253,7 @@ class Harvester(CkanCommand): print ' active: %s' % source['active'] print ' user: %s' % source['user_id'] print 'publisher: %s' % source['publisher_id'] - print ' jobs: %s' % len(source['jobs']) + print ' jobs: %s' % source['status']['job_count'] print '' def print_harvest_jobs(self, jobs): diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index 8c0c3d9..f75e5e9 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -73,7 +73,7 @@
- + From 9fcaefe8ff0a1a73233d215f9f1aff28bd80577a Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 7 Mar 2012 15:03:33 +0000 Subject: [PATCH 31/43] [ui] Fix source datasets paging --- ckanext/harvest/controllers/view.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/controllers/view.py b/ckanext/harvest/controllers/view.py index 2589159..cbf48bc 100644 --- a/ckanext/harvest/controllers/view.py +++ b/ckanext/harvest/controllers/view.py @@ -13,7 +13,7 @@ from ckanext.harvest.lib import create_harvest_source, edit_harvest_source, \ get_harvest_source, get_harvest_sources, \ create_harvest_job, get_registered_harvesters_info, \ get_harvest_object -from ckan.lib.helpers import Page +from ckan.lib.helpers import Page,pager_url import logging log = logging.getLogger(__name__) @@ -121,7 +121,8 @@ class ViewController(BaseController): c.page = Page( collection=c.source['status']['packages'], page=request.params.get('page', 1), - items_per_page=20 + items_per_page=20, + url=pager_url ) return render('source/read.html') From 763f07fcadfb970e2bb907130519f8d41fad14d4 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 7 Mar 2012 15:20:49 +0000 Subject: [PATCH 32/43] [logic,cli] Add session to the context in cli commands --- ckanext/harvest/commands/harvester.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 1409b85..754d9ac 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -181,7 +181,7 @@ class Harvester(CkanCommand): else: print 'Please provide a source id' sys.exit(1) - context = {'model': model, 'user': self.admin_user['name']} + context = {'model': model, 'user': self.admin_user['name'], 'session':model.Session} get_action('harvest_source_delete')(context,{'id':source_id}) print 'Removed harvest source: %s' % source_id @@ -213,14 +213,14 @@ class Harvester(CkanCommand): 
self.print_there_are('harvest jobs', jobs, condition=status) def list_harvest_jobs(self): - context = {'model': model, 'user': self.admin_user['name']} + context = {'model': model, 'user': self.admin_user['name'], 'session':model.Session} jobs = get_action('harvest_job_list')(context,{}) self.print_harvest_jobs(jobs) self.print_there_are(what='harvest job', sequence=jobs) def run_harvester(self): - context = {'model': model, 'user': self.admin_user['name']} + context = {'model': model, 'user': self.admin_user['name'], 'session':model.Session} jobs = get_action('harvest_jobs_run')(context,{}) #print 'Sent %s jobs to the gather queue' % len(jobs) @@ -236,7 +236,7 @@ class Harvester(CkanCommand): print '%s objects reimported' % len(objs) def create_harvest_job_all(self): - context = {'model': model, 'user': self.admin_user['name']} + context = {'model': model, 'user': self.admin_user['name'], 'session':model.Session} jobs = get_action('harvest_job_create_all')(context,{}) print 'Created %s new harvest jobs' % len(jobs) From 4a7007460be096c155287bbedc930da140b5000d Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 7 Mar 2012 17:08:17 +0000 Subject: [PATCH 33/43] [logic] Fix broken imports --- ckanext/harvest/logic/action/create.py | 2 +- ckanext/harvest/logic/action/update.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/logic/action/create.py b/ckanext/harvest/logic/action/create.py index 63580a3..b3553f1 100644 --- a/ckanext/harvest/logic/action/create.py +++ b/ckanext/harvest/logic/action/create.py @@ -4,7 +4,7 @@ from ckan.logic import NotFound, ValidationError, check_access from ckan.lib.navl.dictization_functions import validate from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) -from ckanext.harvest.logic.schema import harvest_source_form_schema +from ckanext.harvest.logic.schema import default_harvest_source_schema from ckanext.harvest.logic.dictization import (harvest_source_dictize, 
harvest_job_dictize) from ckanext.harvest.logic.action.get import harvest_source_list,harvest_job_list diff --git a/ckanext/harvest/logic/action/update.py b/ckanext/harvest/logic/action/update.py index 8b84155..92315be 100644 --- a/ckanext/harvest/logic/action/update.py +++ b/ckanext/harvest/logic/action/update.py @@ -11,7 +11,7 @@ from ckan.lib.navl.dictization_functions import validate from ckanext.harvest.queue import get_gather_publisher from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) -from ckanext.harvest.logic.schema import harvest_source_form_schema +from ckanext.harvest.logic.schema import default_harvest_source_schema from ckanext.harvest.logic.dictization import (harvest_source_dictize,harvest_object_dictize) from ckanext.harvest.logic.action.create import _error_summary From 4fe38ec49d494bd4ea8dac727ad01e9c769101dd Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 8 Mar 2012 16:14:44 +0000 Subject: [PATCH 34/43] [tests,auth] Add tests for the auth profiles Note that only the tests related to the currently loaded auth profile will be run. 
--- .../{test_form_api.py => _test_form_api.py} | 0 ...vest_source.py => _test_harvest_source.py} | 0 ckanext/harvest/tests/test_auth.py | 223 ++++++++++++++++++ 3 files changed, 223 insertions(+) rename ckanext/harvest/tests/{test_form_api.py => _test_form_api.py} (100%) rename ckanext/harvest/tests/{test_harvest_source.py => _test_harvest_source.py} (100%) create mode 100644 ckanext/harvest/tests/test_auth.py diff --git a/ckanext/harvest/tests/test_form_api.py b/ckanext/harvest/tests/_test_form_api.py similarity index 100% rename from ckanext/harvest/tests/test_form_api.py rename to ckanext/harvest/tests/_test_form_api.py diff --git a/ckanext/harvest/tests/test_harvest_source.py b/ckanext/harvest/tests/_test_harvest_source.py similarity index 100% rename from ckanext/harvest/tests/test_harvest_source.py rename to ckanext/harvest/tests/_test_harvest_source.py diff --git a/ckanext/harvest/tests/test_auth.py b/ckanext/harvest/tests/test_auth.py new file mode 100644 index 0000000..25cb134 --- /dev/null +++ b/ckanext/harvest/tests/test_auth.py @@ -0,0 +1,223 @@ +import logging +from pprint import pprint +from nose.plugins.skip import SkipTest; + +from ckan import model +from ckan.model import Package, Session +from ckan.lib.helpers import url_for,json +from ckan.lib.base import config + + +from ckan.tests import CreateTestData +from ckan.tests.functional.base import FunctionalTestCase + +from ckanext.harvest.plugin import Harvest +from ckanext.harvest.model import HarvestSource, HarvestJob, setup as harvest_model_setup + +log = logging.getLogger(__name__) + + +class HarvestAuthBaseCase(): + @classmethod + def setup_class(cls): + harvest_model_setup() + + @classmethod + def teardown_class(cls): + pass + + def _test_auth_not_allowed(self,user_name = None, source = None, status = 401): + + if not source: + # Create harvest source + source = HarvestSource(url=u'http://test-source.com',type='ckan') + Session.add(source) + Session.commit() + + if user_name: + extra_environ 
= {'REMOTE_USER': user_name.encode('utf8')} + else: + extra_environ = {} + + # List + res = self.app.get('/harvest', status=status, extra_environ=extra_environ) + # Create + res = self.app.get('/harvest/new', status=status, extra_environ=extra_environ) + # Read + res = self.app.get('/harvest/%s' % source.id, status=status, extra_environ=extra_environ) + # Edit + res = self.app.get('/harvest/edit/%s' % source.id, status=status, extra_environ=extra_environ) + # Refresh + res = self.app.get('/harvest/refresh/%s' % source.id, status=status, extra_environ=extra_environ) + + def _test_auth_allowed(self,user_name,auth_profile=None): + + extra_environ={'REMOTE_USER': user_name.encode('utf8')} + + # List + res = self.app.get('/harvest', extra_environ=extra_environ) + assert 'Harvesting Sources' in res + + # Create + res = self.app.get('/harvest/new', extra_environ=extra_environ) + assert 'New harvest source' in res + if auth_profile == 'publisher': + assert 'publisher_id' in res + else: + assert not 'publisher_id' in res + + fv = res.forms['source-new'] + fv['url'] = u'http://test-source.com' + fv['type'] = u'ckan' + fv['title'] = u'Test harvest source' + fv['description'] = u'Test harvest source' + fv['config'] = u'{"a":1,"b":2}' + + if auth_profile == 'publisher': + fv['publisher_id'] = self.publisher1.id + + res = fv.submit('save', extra_environ=extra_environ) + assert not 'Error' in res, res + + source = Session.query(HarvestSource).first() + assert source.url == u'http://test-source.com' + assert source.type == u'ckan' + + # Read + res = self.app.get('/harvest/%s' % source.id, extra_environ=extra_environ) + assert 'Harvest Source Details' in res + assert source.id in res + assert source.title in res + + # Edit + res = self.app.get('/harvest/edit/%s' % source.id, extra_environ=extra_environ) + assert 'Edit harvest source' in res + if auth_profile == 'publisher': + assert 'publisher_id' in res + else: + assert not 'publisher_id' in res + + fv = res.forms['source-new'] + 
fv['title'] = u'Test harvest source Updated' + + res = fv.submit('save', extra_environ=extra_environ) + assert not 'Error' in res, res + + source = Session.query(HarvestSource).first() + assert source.title == u'Test harvest source Updated' + + # Refresh + res = self.app.get('/harvest/refresh/%s' % source.id, extra_environ=extra_environ) + + job = Session.query(HarvestJob).first() + assert job.source_id == source.id + + + + +class TestAuthDefaultProfile(FunctionalTestCase,HarvestAuthBaseCase): + + @classmethod + def setup_class(cls): + if (config.get('ckan.harvest.auth.profile','') != ''): + raise SkipTest('Skipping default auth profile tests. Set ckan.harvest.auth.profile = \'\' to run them') + + super(TestAuthDefaultProfile,cls).setup_class() + + def setup(self): + CreateTestData.create() + self.sysadmin_user = model.User.get('testsysadmin') + self.normal_user = model.User.get('annafan') + + def teardown(self): + model.repo.rebuild_db() + + def test_auth_default_profile_sysadmin(self): + self._test_auth_allowed(self.sysadmin_user.name) + + def test_auth_default_profile_normal(self): + self._test_auth_not_allowed(self.normal_user.name) + + def test_auth_default_profile_notloggedin(self): + self._test_auth_not_allowed(status=302) + +class TestAuthPublisherProfile(FunctionalTestCase,HarvestAuthBaseCase): + + @classmethod + def setup_class(cls): + if (config.get('ckan.harvest.auth.profile') != 'publisher'): + raise SkipTest('Skipping publisher auth profile tests. 
Set ckan.harvest.auth.profile = \'publisher\' to run them') + + super(TestAuthPublisherProfile,cls).setup_class() + + def setup(self): + + model.Session.remove() + CreateTestData.create(auth_profile='publisher') + self.sysadmin_user = model.User.get('testsysadmin') + self.normal_user = model.User.get('annafan') # Does not belong to a publisher + self.publisher1_user = model.User.by_name('russianfan') + self.publisher2_user = model.User.by_name('tester') + + # Create two Publishers + rev = model.repo.new_revision() + self.publisher1 = model.Group(name=u'test-publisher1',title=u'Test Publihser 1',type=u'publisher') + Session.add(self.publisher1) + self.publisher2 = model.Group(name=u'test-publisher2',title=u'Test Publihser 2',type=u'publisher') + Session.add(self.publisher2) + + member1 = model.Member(table_name = 'user', + table_id = self.publisher1_user.id, + group=self.publisher1, + capacity='admin') + Session.add(member1) + member2 = model.Member(table_name = 'user', + table_id = self.publisher2_user.id, + group=self.publisher2, + capacity='admin') + Session.add(member2) + + Session.commit() + + def teardown(self): + model.repo.rebuild_db() + + def test_auth_publisher_profile_normal(self): + self._test_auth_not_allowed(self.normal_user.name) + + def test_auth_publisher_profile_notloggedin(self): + self._test_auth_not_allowed(status=302) + + def test_auth_publisher_profile_sysadmin(self): + self._test_auth_allowed(self.sysadmin_user.name,auth_profile='publisher') + + def test_auth_publisher_profile_publisher(self): + self._test_auth_allowed(self.publisher1_user.name,auth_profile='publisher') + + def test_auth_publisher_profile_different_publisher(self): + + # Create a source for publisher 1 + source = HarvestSource(url=u'http://test-source.com',type='ckan', + publisher_id=self.publisher1.id) + Session.add(source) + Session.commit() + + extra_environ = {'REMOTE_USER': self.publisher2_user.name.encode('utf8')} + + # List (Publihsers can see the sources list) + res = 
self.app.get('/harvest', extra_environ=extra_environ) + assert 'Harvesting Sources' in res + # Create + res = self.app.get('/harvest/new', extra_environ=extra_environ) + assert 'New harvest source' in res + assert 'publisher_id' in res + + # Check that this publihser is not allowed to manage sources from other publishers + status = 401 + # Read + res = self.app.get('/harvest/%s' % source.id, status=status, extra_environ=extra_environ) + # Edit + res = self.app.get('/harvest/edit/%s' % source.id, status=status, extra_environ=extra_environ) + # Refresh + res = self.app.get('/harvest/refresh/%s' % source.id, status=status, extra_environ=extra_environ) + From 60e31094a3b162e8a35c8df8888b29b3836b051b Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 8 Mar 2012 17:29:05 +0000 Subject: [PATCH 35/43] [auth,docs] Add auth profiles docs --- README.rst | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/README.rst b/README.rst index b6143e1..f40c748 100644 --- a/README.rst +++ b/README.rst @@ -100,6 +100,34 @@ the config explicitly though:: paster harvester sources --config=../ckan/development.ini +Authorization Profiles +====================== + +Starting from CKAN 1.6.1, the harvester extension offers the hability to use +different authorization profiles. These can be defined in your ini file as:: + + ckan.harvest.auth.profile = + +The two available profiles right now are: + +* `default`: This is the default profile, the same one that this extension has + used historically. Basically, only sysadmins can manage anything related to + harvesting, including creating and editing harvest sources or running harvest + jobs. + +* `publisher`: When using this profile, sysadmins can still perform any + harvesting related action, but in addition, users belonging to a publisher + (with role `admin`) can manage and run their own harvest sources and jobs. 
+ Note that this requires CKAN core to also use the `publisher` authorization + profile, i.e you will also need to add:: + + ckan.auth.profile = publisher + + To know more about the CKAN publisher auth profile, visit + + http://wiki.ckan.org/Working_with_the_publisher_auth_profile + + The CKAN harverster =================== From 076d8145a6c3591d180957fb7b7ca289c87413d0 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 8 Mar 2012 17:36:16 +0000 Subject: [PATCH 36/43] [tests,auth] Add tests ini files --- test-core.ini | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ test.ini | 54 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 test-core.ini create mode 100644 test.ini diff --git a/test-core.ini b/test-core.ini new file mode 100644 index 0000000..6428eff --- /dev/null +++ b/test-core.ini @@ -0,0 +1,54 @@ +[DEFAULT] +debug = true +# Uncomment and replace with the address which should receive any error reports +#email_to = you@yourdomain.com +smtp_server = localhost +error_email_from = paste@localhost + +[server:main] +use = egg:Paste#http +host = 0.0.0.0 +port = 5000 + + +[app:main] +use = config:../ckan/test-core.ini +# Here we hard-code the database and a flag to make default tests +# run fast. +ckan.plugins = harvest ckan_harvester +# NB: other test configuration should go in test-core.ini, which is +# what the postgres tests use. 
+ + +# Logging configuration +[loggers] +keys = root, ckan, sqlalchemy + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console + +[logger_ckan] +qualname = ckan +handlers = +level = INFO + +[logger_sqlalchemy] +handlers = +qualname = sqlalchemy.engine +level = WARN + +[handler_console] +class = StreamHandler +args = (sys.stdout,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s diff --git a/test.ini b/test.ini new file mode 100644 index 0000000..bdb886e --- /dev/null +++ b/test.ini @@ -0,0 +1,54 @@ +[DEFAULT] +debug = true +# Uncomment and replace with the address which should receive any error reports +#email_to = you@yourdomain.com +smtp_server = localhost +error_email_from = paste@localhost + +[server:main] +use = egg:Paste#http +host = 0.0.0.0 +port = 5000 + + +[app:main] +use = config:../ckan/test.ini +# Here we hard-code the database and a flag to make default tests +# run fast. +ckan.plugins = harvest ckan_harvester +# NB: other test configuration should go in test-core.ini, which is +# what the postgres tests use. 
+ + +# Logging configuration +[loggers] +keys = root, ckan, sqlalchemy + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console + +[logger_ckan] +qualname = ckan +handlers = +level = INFO + +[logger_sqlalchemy] +handlers = +qualname = sqlalchemy.engine +level = WARN + +[handler_console] +class = StreamHandler +args = (sys.stdout,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s From e0bef2ef9cc9c61e732b24d6fda9e04d24a99627 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 12 Mar 2012 14:46:28 +0000 Subject: [PATCH 37/43] [base] Minor fix for harvesters without config --- ckanext/harvest/harvesters/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 865a06f..e857c5f 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -30,6 +30,8 @@ class HarvesterBase(SingletonPlugin): ''' implements(IHarvester) + config = None + def _gen_new_name(self,title): ''' Creates a URL friendly name from a title From f210455aef1c59a1c38dea691e7ffb32a9d3e53c Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 13 Mar 2012 12:38:14 +0000 Subject: [PATCH 38/43] [ckan harvester] Replace title on default extras --- README.rst | 1 + ckanext/harvest/harvesters/base.py | 1 + ckanext/harvest/harvesters/ckanharvester.py | 2 ++ 3 files changed, 4 insertions(+) diff --git a/README.rst b/README.rst index f40c748..2aea2d6 100644 --- a/README.rst +++ b/README.rst @@ -160,6 +160,7 @@ field. 
The currently supported configuration options are: * {dataset_id} * {harvest_source_id} * {harvest_source_url} # Will be stripped of trailing forward slashes (/) + * {harvest_source_title} # Requires CKAN 1.6 * {harvest_job_id} * {harvest_object_id} diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index e857c5f..189a5ad 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -153,6 +153,7 @@ class HarvesterBase(SingletonPlugin): else: log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid) + return except NotFound: # Package needs to be created diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 8a3c5fc..e9f1aeb 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -266,9 +266,11 @@ class CKANHarvester(HarvesterBase): if isinstance(value,basestring): value = value.format(harvest_source_id=harvest_object.job.source.id, harvest_source_url=harvest_object.job.source.url.strip('/'), + harvest_source_title=harvest_object.job.source.title, harvest_job_id=harvest_object.job.id, harvest_object_id=harvest_object.id, dataset_id=package_dict['id']) + package_dict['extras'][key] = value result = self._create_or_update_package(package_dict,harvest_object) From 871eae94b653c15821c8f58e22ac9c71f1ddc8d2 Mon Sep 17 00:00:00 2001 From: amercader Date: Thu, 15 Mar 2012 11:31:12 +0000 Subject: [PATCH 39/43] [ckan harvester] Fix bug on force all check --- ckanext/harvest/harvesters/ckanharvester.py | 62 ++++++++++----------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index e9f1aeb..9ba05db 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -130,42 +130,42 @@ class CKANHarvester(HarvesterBase): base_rest_url = 
base_url + self._get_rest_api_offset() base_search_url = base_url + self._get_search_api_offset() - if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0) \ - or not self.config.get('force_all',False): - get_all_packages = False + if (previous_job and not previous_job.gather_errors and not len(previous_job.objects) == 0): + if not self.config.get('force_all',False): + get_all_packages = False - # Request only the packages modified since last harvest job - last_time = harvest_job.gather_started.isoformat() - url = base_search_url + '/revision?since_time=%s' % last_time + # Request only the packages modified since last harvest job + last_time = harvest_job.gather_started.isoformat() + url = base_search_url + '/revision?since_time=%s' % last_time - try: - content = self._get_content(url) + try: + content = self._get_content(url) - revision_ids = json.loads(content) - if len(revision_ids): - for revision_id in revision_ids: - url = base_rest_url + '/revision/%s' % revision_id - try: - content = self._get_content(url) - except Exception,e: - self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) - continue + revision_ids = json.loads(content) + if len(revision_ids): + for revision_id in revision_ids: + url = base_rest_url + '/revision/%s' % revision_id + try: + content = self._get_content(url) + except Exception,e: + self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) + continue - revision = json.loads(content) - for package_id in revision.packages: - if not package_id in package_ids: - package_ids.append(package_id) - else: - log.info('No packages have been updated on the remote CKAN instance since the last harvest job') - return None + revision = json.loads(content) + for package_id in revision.packages: + if not package_id in package_ids: + package_ids.append(package_id) + else: + log.info('No packages have been updated on the remote CKAN instance since 
the last harvest job') + return None - except urllib2.HTTPError,e: - if e.getcode() == 400: - log.info('CKAN instance %s does not suport revision filtering' % base_url) - get_all_packages = True - else: - self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) - return None + except urllib2.HTTPError,e: + if e.getcode() == 400: + log.info('CKAN instance %s does not suport revision filtering' % base_url) + get_all_packages = True + else: + self._save_gather_error('Unable to get content for URL: %s: %s' % (url, str(e)),harvest_job) + return None From 7f10418f4411d804bc864013aab9bea080c06c55 Mon Sep 17 00:00:00 2001 From: Ian Murray Date: Thu, 15 Mar 2012 18:09:44 +0000 Subject: [PATCH 40/43] [master][auth] get_obj_object() function was missing --- ckanext/harvest/logic/auth/__init__.py | 12 ++++++++++++ ckanext/harvest/logic/auth/publisher/get.py | 3 ++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/logic/auth/__init__.py b/ckanext/harvest/logic/auth/__init__.py index b015aed..47d6664 100644 --- a/ckanext/harvest/logic/auth/__init__.py +++ b/ckanext/harvest/logic/auth/__init__.py @@ -25,3 +25,15 @@ def get_job_object(context, data_dict = {}): job = context['job'] return job + +def get_obj_object(context, data_dict = {}): + if not 'obj' in context: + model = context['model'] + id = data_dict.get('id',None) + job = HarvestObject.get(id) + if not job: + raise NotFound + else: + job = context['job'] + + return job diff --git a/ckanext/harvest/logic/auth/publisher/get.py b/ckanext/harvest/logic/auth/publisher/get.py index d78f336..84da404 100644 --- a/ckanext/harvest/logic/auth/publisher/get.py +++ b/ckanext/harvest/logic/auth/publisher/get.py @@ -1,9 +1,10 @@ from ckan.lib.base import _ +from ckan.logic import NotFound from ckan.authz import Authorizer from ckan.model import User from ckanext.harvest.model import HarvestSource -from ckanext.harvest.logic.auth import get_source_object, get_job_object 
+from ckanext.harvest.logic.auth import get_source_object, get_job_object, get_obj_object def harvest_source_show(context,data_dict): model = context['model'] From 1145e6ea72e4cb9d159cc249b445cbd1d8d94c7f Mon Sep 17 00:00:00 2001 From: Ian Murray Date: Thu, 15 Mar 2012 18:14:57 +0000 Subject: [PATCH 41/43] [master][auth/publisher] Check for 'ignore_auth' in harvest_object_show Use case: In ckanext-dgu we want to index the harvest_object.content field. As indexing is done synchronously we need to provide a way for that harvest_object to be accessed when the current http request is made by a non-sysadmin user. --- ckanext/harvest/logic/auth/publisher/get.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ckanext/harvest/logic/auth/publisher/get.py b/ckanext/harvest/logic/auth/publisher/get.py index 84da404..ed1759b 100644 --- a/ckanext/harvest/logic/auth/publisher/get.py +++ b/ckanext/harvest/logic/auth/publisher/get.py @@ -99,6 +99,9 @@ def harvest_object_show(context,data_dict): obj = get_obj_object(context,data_dict) + if context.get('ignore_auth', False): + return {'success': True} + if not user: return {'success': False, 'msg': _('Non-logged in users are not authorized to see harvest objects')} From 38a9a03355f536b983af3d57f114be527a2e0107 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 19 Mar 2012 17:01:20 +0000 Subject: [PATCH 42/43] [logic] Fix variables naming --- ckanext/harvest/logic/auth/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ckanext/harvest/logic/auth/__init__.py b/ckanext/harvest/logic/auth/__init__.py index 47d6664..2f2b306 100644 --- a/ckanext/harvest/logic/auth/__init__.py +++ b/ckanext/harvest/logic/auth/__init__.py @@ -30,10 +30,10 @@ def get_obj_object(context, data_dict = {}): if not 'obj' in context: model = context['model'] id = data_dict.get('id',None) - job = HarvestObject.get(id) - if not job: + obj = HarvestObject.get(id) + if not obj: raise NotFound else: - job = context['job'] + obj 
= context['obj'] - return job + return obj From e797f50a053e2021cf1e10eb03fba74742e5d742 Mon Sep 17 00:00:00 2001 From: amercader Date: Mon, 19 Mar 2012 17:28:53 +0000 Subject: [PATCH 43/43] [cli] Fix create job command --- ckanext/harvest/commands/harvester.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 754d9ac..d3ee346 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -205,12 +205,12 @@ class Harvester(CkanCommand): print 'Please provide a source id' sys.exit(1) - job = create_harvest_job(source_id) + context = {'model': model,'session':model.Session, 'user': self.admin_user['name']} + job = get_action('harvest_job_create')(context,{'source_id':source_id}) self.print_harvest_job(job) - context = {'model': model,'session':model.Session, 'user': self.admin_user['name']} jobs = get_action('harvest_job_list')(context,{'status':u'New'}) - self.print_there_are('harvest jobs', jobs, condition=status) + self.print_there_are('harvest jobs', jobs, condition=u'New') def list_harvest_jobs(self): context = {'model': model, 'user': self.admin_user['name'], 'session':model.Session}
ID
Publisher${c.source.publisher_title}${c.source.publisher_id}
Created
Total jobs${len(c.source.jobs)}${c.source.status.job_count}
Status