From 2018d9e5130172d3e6546b90466a3a7906660c7f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Fri, 18 Nov 2011 13:20:41 +0000 Subject: [PATCH 01/12] [ckan harvester] Support for default tags and groups --- README.rst | 24 ++++++++++++++++ ckanext/harvest/harvesters/base.py | 8 +++++- ckanext/harvest/harvesters/ckanharvester.py | 32 ++++++++++++++++++++- 3 files changed, 62 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 7d553b7..c540d4f 100644 --- a/README.rst +++ b/README.rst @@ -113,6 +113,30 @@ to add the `ckan_harvester` plugin to your options file: After adding it, a 'CKAN' option should appear in the 'New harvest source' form. +The CKAN harvesters support a number of configuration options to control their +behaviour. Those need to defined as a JSON object in the configuration form +field. The currently supported configuration options are: + +* api_version: You can force the harvester to use eithoer version '1' or + '2' of the CKAN API. Default is '2'. + +* default_tags: A list of tags that will be added to all harvested datasets. + Tags don't need to previously exist. + +* default_groups: A list of groups to which the harvested datasets will be + added to. The groups must exist. 
Note that you must use ids or names to + define the groups according to the API version you defined (names for + version '1', ids for version '2') + +Here is an example of a configuration object (the one that must be entered in +the configuration field):: + + { + "api_version":"1", + "default_tags":["new-tag-1","new-tag-2"], + "default_groups":["my-own-group"] + } + The harvesting interface ======================== diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 965b541..3a8de25 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -106,11 +106,17 @@ class HarvesterBase(SingletonPlugin): schema = default_package_schema() schema["id"] = [ignore_missing, unicode] + # Check API version + if self.config: + api_version = self.config.get('api_version','2') + else: + api_verion = '2' + context = { 'model': model, 'session': Session, 'user': u'harvest', - 'api_version':'2', + 'api_version': api_version, 'schema': schema, } diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 0354bf5..208daa5 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -1,7 +1,9 @@ import urllib2 +from ckan.lib.base import c +from ckan import model from ckan.model import Session, Package -from ckan.logic import ValidationError, NotFound +from ckan.logic import ValidationError, NotFound, get_action from ckan.lib.helpers import json from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ @@ -65,6 +67,16 @@ class CKANHarvester(HarvesterBase): try: config_obj = json.loads(config) + + if 'default_groups' in config_obj: + # Check if default groups exist + context = {'model':model,'user':c.user} + for group_name in config_obj['default_groups']: + try: + group = get_action('group_show')(context,{'id':group_name}) + except NotFound,e: + raise ValueError('Default group not found') + except 
ValueError,e: raise e @@ -196,6 +208,24 @@ class CKANHarvester(HarvesterBase): try: package_dict = json.loads(harvest_object.content) + + # Set default tags if needed + default_tags = self.config.get('default_tags',[]) + if default_tags: + if not 'tags' in package_dict: + package_dict['tags'] = [] + package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']]) + + # Ignore remote groups for the time being + del package_dict['groups'] + + # Set default groups if needed + default_groups = self.config.get('default_groups',[]) + if default_groups: + if not 'groups' in package_dict: + package_dict['groups'] = [] + package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']]) + return self._create_or_update_package(package_dict,harvest_object) except ValidationError,e: self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), From c939d90dbbe2d119b9c71fb05c153e781f7d139d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Fri, 18 Nov 2011 14:12:30 +0000 Subject: [PATCH 02/12] [ckan harvester] Support for defining a custom user to do the harvesting --- README.rst | 9 +++++++-- ckanext/harvest/harvesters/base.py | 5 ++++- ckanext/harvest/harvesters/ckanharvester.py | 8 ++++++++ 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index c540d4f..7ce3098 100644 --- a/README.rst +++ b/README.rst @@ -117,7 +117,7 @@ The CKAN harvesters support a number of configuration options to control their behaviour. Those need to defined as a JSON object in the configuration form field. The currently supported configuration options are: -* api_version: You can force the harvester to use eithoer version '1' or +* api_version: You can force the harvester to use either version '1' or '2' of the CKAN API. Default is '2'. * default_tags: A list of tags that will be added to all harvested datasets. @@ -128,13 +128,18 @@ field. 
The currently supported configuration options are: define the groups according to the API version you defined (names for version '1', ids for version '2') +* user: User who will run the harvesting process. Please note that this user + needs to have permission for creating packages, and if default groups were + defined, the user must have permission to assign packages to these groups. + Here is an example of a configuration object (the one that must be entered in the configuration field):: { "api_version":"1", "default_tags":["new-tag-1","new-tag-2"], - "default_groups":["my-own-group"] + "default_groups":["my-own-group"], + "user":"harverster-user" } diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 3a8de25..3a8e992 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -109,13 +109,16 @@ class HarvesterBase(SingletonPlugin): # Check API version if self.config: api_version = self.config.get('api_version','2') + #TODO: use site user when available + user_name = self.config.get('user',u'harvest') else: api_verion = '2' + user_name = u'harvest' context = { 'model': model, 'session': Session, - 'user': u'harvest', + 'user': user_name, 'api_version': api_version, 'schema': schema, } diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 208daa5..919af83 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -77,6 +77,14 @@ class CKANHarvester(HarvesterBase): except NotFound,e: raise ValueError('Default group not found') + if 'user' in config_obj: + # Check if user exists + context = {'model':model,'user':c.user} + try: + user = get_action('user_show')(context,{'id':config_obj.get('user')}) + except NotFound,e: + raise ValueError('User not found') + except ValueError,e: raise e From 994590531e3059b78f3f423918dfe36f5e284e33 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= 
Date: Fri, 18 Nov 2011 14:30:10 +0000 Subject: [PATCH 03/12] [ckan harvester] Support for creating read-only packages --- README.rst | 11 +++++++++-- ckanext/harvest/harvesters/ckanharvester.py | 21 ++++++++++++++++++++- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 7ce3098..67273ac 100644 --- a/README.rst +++ b/README.rst @@ -132,6 +132,12 @@ field. The currently supported configuration options are: needs to have permission for creating packages, and if default groups were defined, the user must have permission to assign packages to these groups. +* read_only: Create harvested packages in read-only mode. Only the user who + performed the harvest (the one defined in the previous setting or the + 'harvest' sysadmin) will be able to edit and administer the packages + created from this harvesting source. Logged in users and visitors will be + only able to read them. + Here is an example of a configuration object (the one that must be entered in the configuration field):: @@ -139,7 +145,8 @@ the configuration field):: "api_version":"1", "default_tags":["new-tag-1","new-tag-2"], "default_groups":["my-own-group"], - "user":"harverster-user" + "user":"harverster-user", + "read_only": true } @@ -288,7 +295,7 @@ Finally, on a third console, run the following command to start any pending harvesting jobs:: paster harvester run --config=../ckan/development.ini - + After packages have been imported, the search index will have to be updated before the packages appear in search results (from the ckan directory): diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 919af83..4f86032 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -234,7 +234,26 @@ class CKANHarvester(HarvesterBase): package_dict['groups'] = [] package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']]) - return 
self._create_or_update_package(package_dict,harvest_object) + result = self._create_or_update_package(package_dict,harvest_object) + + if result and self.config.get('read_only',False) == True: + + package = model.Package.get(package_dict['id']) + + # Clear default permissions + model.clear_user_roles(package) + + # Setup harvest user as admin + user_name = self.config.get('user',u'harvest') + user = model.User.get(user_name) + pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN) + + # Other users can only read + for user_name in (u'visitor',u'logged_in'): + user = model.User.get(user_name) + pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) + + except ValidationError,e: self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), harvest_object, 'Import') From f02ee45aae149fa305e0fb81316cee0257d8f53d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Fri, 18 Nov 2011 14:35:46 +0000 Subject: [PATCH 04/12] [ui] Show config options in harvest source details page --- ckanext/harvest/templates/source/read.html | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index d60b986..e38902c 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -34,6 +34,15 @@ Description ${c.source.description} + + Configuration + + ${c.source.config} + + + - + + User ${c.source.user_id} From 0ab5c53b479385e215960b4c6328779fbcbc1bd6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Fri, 18 Nov 2011 17:53:01 +0000 Subject: [PATCH 05/12] [ckan harvester] Fix typo --- ckanext/harvest/harvesters/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 3a8e992..5c38701 100644 --- a/ckanext/harvest/harvesters/base.py +++ 
b/ckanext/harvest/harvesters/base.py @@ -112,7 +112,7 @@ class HarvesterBase(SingletonPlugin): #TODO: use site user when available user_name = self.config.get('user',u'harvest') else: - api_verion = '2' + api_version = '2' user_name = u'harvest' context = { From cfaba6e1e8027dba1ca7c60db793f062af6c53ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Mon, 21 Nov 2011 17:29:10 +0000 Subject: [PATCH 06/12] [ckan harvester] Add support for sending an API key --- ckanext/harvest/harvesters/ckanharvester.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 4f86032..50202d7 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -36,6 +36,9 @@ class CKANHarvester(HarvesterBase): ) try: + api_key = self.config.get('api_key',None) + if api_key: + http_request.add_header('Authorization',api_key) http_response = urllib2.urlopen(http_request) return http_response.read() From da469ab08eb1f198394d36255a1017a65b7cf844 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Wed, 23 Nov 2011 11:05:52 +0000 Subject: [PATCH 07/12] [base harvester] Custom tag munge function. 
TODO: check with flexible tags --- ckanext/harvest/harvesters/base.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 5c38701..3a1d9ce 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -1,5 +1,5 @@ import logging - +import re from ckan import model from ckan.model import Session, Package @@ -7,7 +7,7 @@ from ckan.logic import ValidationError, NotFound, get_action from ckan.logic.schema import default_package_schema from ckan.lib.navl.validators import ignore_missing -from ckan.lib.munge import munge_title_to_name, munge_tag +from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ HarvestObjectError @@ -17,6 +17,11 @@ from ckanext.harvest.interfaces import IHarvester log = logging.getLogger(__name__) +def munge_tag(tag): + tag = substitute_ascii_equivalents(tag) + tag = tag.lower().strip() + return re.sub(r'[^a-zA-Z0-9 -]', '', tag).replace(' ', '-') + class HarvesterBase(SingletonPlugin): ''' Generic class for harvesters with helper functions From ac9a9100f832abf974b7d844d41274e12091fd21 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adri=C3=A0=20Mercader?= Date: Wed, 23 Nov 2011 11:09:16 +0000 Subject: [PATCH 08/12] [doc] Document api_key config option --- README.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.rst b/README.rst index 67273ac..42fbb0e 100644 --- a/README.rst +++ b/README.rst @@ -132,6 +132,9 @@ field. The currently supported configuration options are: needs to have permission for creating packages, and if default groups were defined, the user must have permission to assign packages to these groups. +* api_key: If the remote CKAN instance has restricted access to the API you + can provide a CKAN API key, which will be sent in any request. 
+ * read_only: Create harvested packages in read-only mode. Only the user who performed the harvest (the one defined in the previous setting or the 'harvest' sysadmin) will be able to edit and administer the packages @@ -146,6 +149,7 @@ the configuration field):: "default_tags":["new-tag-1","new-tag-2"], "default_groups":["my-own-group"], "user":"harverster-user", + "api_key":"", "read_only": true } From ae5109321361513fac15b97b5e67d3a86e130977 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 10 Jan 2012 14:46:12 +0000 Subject: [PATCH 09/12] [ckan harvester] Ignore __junk field, was causing imports to fail --- ckanext/harvest/harvesters/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 3a1d9ce..3f01f6b 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -6,7 +6,7 @@ from ckan.model import Session, Package from ckan.logic import ValidationError, NotFound, get_action from ckan.logic.schema import default_package_schema -from ckan.lib.navl.validators import ignore_missing +from ckan.lib.navl.validators import ignore_missing,ignore from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \ @@ -109,7 +109,8 @@ class HarvesterBase(SingletonPlugin): try: # Change default schema schema = default_package_schema() - schema["id"] = [ignore_missing, unicode] + schema['id'] = [ignore_missing, unicode] + schema['__junk'] = [ignore] # Check API version if self.config: From eb646b338573e2a00f8660ddbdafe041c716c592 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 10 Jan 2012 17:07:19 +0000 Subject: [PATCH 10/12] [ckan harvester] Add support for defining default extras --- README.rst | 33 +++++++++++++++------ ckanext/harvest/harvesters/ckanharvester.py | 28 +++++++++++++++++ 2 files changed, 52 insertions(+), 9 deletions(-) diff 
--git a/README.rst b/README.rst index 42fbb0e..373052d 100644 --- a/README.rst +++ b/README.rst @@ -117,25 +117,38 @@ The CKAN harvesters support a number of configuration options to control their behaviour. Those need to defined as a JSON object in the configuration form field. The currently supported configuration options are: -* api_version: You can force the harvester to use either version '1' or - '2' of the CKAN API. Default is '2'. +* api_version: You can force the harvester to use either version '1' or '2' of + the CKAN API. Default is '2'. -* default_tags: A list of tags that will be added to all harvested datasets. +* default_tags: A list of tags that will be added to all harvested datasets. Tags don't need to previously exist. -* default_groups: A list of groups to which the harvested datasets will be +* default_groups: A list of groups to which the harvested datasets will be added to. The groups must exist. Note that you must use ids or names to - define the groups according to the API version you defined (names for - version '1', ids for version '2') + define the groups according to the API version you defined (names for version + '1', ids for version '2'). -* user: User who will run the harvesting process. Please note that this user +* default_extras: A dictionary of key value pairs that will be added to extras + of the harvested datasets. You can use the following replacement strings, + that will be replaced before creating or updating the datasets: + + * {dataset_id} + * {harvest_source_id} + * {harvest_source_url} # Will be stripped of trailing forward slashes (/) + * {harvest_job_id} + * {harvest_object_id} + +* override_extras: Assign default extras even if they already exist in the + remote dataset. Default is False (only non existing extras are added). + +* user: User who will run the harvesting process. 
Please note that this user needs to have permission for creating packages, and if default groups were defined, the user must have permission to assign packages to these groups. -* api_key: If the remote CKAN instance has restricted access to the API you +* api_key: If the remote CKAN instance has restricted access to the API you can provide a CKAN API key, which will be sent in any request. -* read_only: Create harvested packages in read-only mode. Only the user who +* read_only: Create harvested packages in read-only mode. Only the user who performed the harvest (the one defined in the previous setting or the 'harvest' sysadmin) will be able to edit and administer the packages created from this harvesting source. Logged in users and visitors will be @@ -148,6 +161,8 @@ the configuration field):: "api_version":"1", "default_tags":["new-tag-1","new-tag-2"], "default_groups":["my-own-group"], + "default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"}, + "override_extras": true, "user":"harverster-user", "api_key":"", "read_only": true diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index 50202d7..7e293e4 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -71,7 +71,14 @@ class CKANHarvester(HarvesterBase): try: config_obj = json.loads(config) + if 'default_tags' in config_obj: + if not isinstance(config_obj['default_tags'],list): + raise ValueError('default_tags must be a list') + if 'default_groups' in config_obj: + if not isinstance(config_obj['default_groups'],list): + raise ValueError('default_groups must be a list') + # Check if default groups exist context = {'model':model,'user':c.user} for group_name in config_obj['default_groups']: @@ -80,6 +87,10 @@ class CKANHarvester(HarvesterBase): except NotFound,e: raise ValueError('Default group not found') + if 'default_extras' in config_obj: + if not 
isinstance(config_obj['default_extras'],dict): + raise ValueError('default_extras must be a dictionary') + if 'user' in config_obj: # Check if user exists context = {'model':model,'user':c.user} @@ -237,6 +248,23 @@ class CKANHarvester(HarvesterBase): package_dict['groups'] = [] package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']]) + # Set default extras if needed + default_extras = self.config.get('default_extras',{}) + if default_extras: + override_extras = self.config.get('override_extras',False) + if not 'extras' in package_dict: + package_dict['extras'] = {} + for key,value in default_extras.iteritems(): + if not key in package_dict['extras'] or override_extras: + # Look for replacement strings + if isinstance(value,basestring): + value = value.format(harvest_source_id=harvest_object.job.source.id, + harvest_source_url=harvest_object.job.source.url.strip('/'), + harvest_job_id=harvest_object.job.id, + harvest_object_id=harvest_object.id, + dataset_id=package_dict['id']) + package_dict['extras'][key] = value + result = self._create_or_update_package(package_dict,harvest_object) if result and self.config.get('read_only',False) == True: From 2ad29df5c53b030dc16de46fe68f037390276a37 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 10 Jan 2012 17:15:56 +0000 Subject: [PATCH 11/12] [lib] Fix bug: couldn't delete source conf --- ckanext/harvest/lib/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/lib/__init__.py b/ckanext/harvest/lib/__init__.py index cd3ef06..c1c5d81 100644 --- a/ckanext/harvest/lib/__init__.py +++ b/ckanext/harvest/lib/__init__.py @@ -249,11 +249,13 @@ def edit_harvest_source(source_id,data_dict): Session.rollback() raise ValidationError(errors,_error_summary(errors)) - fields = ['url','type','active','description','user_id','publisher_id','config'] + fields = ['url','type','active','description','user_id','publisher_id'] for f in fields: if f in data_dict 
and data_dict[f] is not None and data_dict[f] != '': source.__setattr__(f,data_dict[f]) + source.config = data_dict['config'] + source.save() return _source_as_dict(source) From a53b79c18198314f7121d0ff9bf624d5980fb15d Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 10 Jan 2012 17:24:05 +0000 Subject: [PATCH 12/12] [ui] Show edit and refresh links in source page --- ckanext/harvest/public/ckanext/harvest/style.css | 9 +++++++++ ckanext/harvest/templates/source/read.html | 3 +++ 2 files changed, 12 insertions(+) diff --git a/ckanext/harvest/public/ckanext/harvest/style.css b/ckanext/harvest/public/ckanext/harvest/style.css index de04d84..9391159 100644 --- a/ckanext/harvest/public/ckanext/harvest/style.css +++ b/ckanext/harvest/public/ckanext/harvest/style.css @@ -13,3 +13,12 @@ .harvester-title{ font-weight: bold; } + +#harvest-source-actions { + margin-bottom: 10px; +} + +#harvest-source-actions img{ + vertical-align: middle; + margin: 0 5px; +} diff --git a/ckanext/harvest/templates/source/read.html b/ckanext/harvest/templates/source/read.html index e38902c..43fd8a9 100644 --- a/ckanext/harvest/templates/source/read.html +++ b/ckanext/harvest/templates/source/read.html @@ -13,6 +13,9 @@

Harvest Source Details

+
ID