Merge branch 'feature-new-ckan-harvester-features'

This commit is contained in:
amercader 2012-01-10 17:27:33 +00:00
commit eb591521ff
6 changed files with 191 additions and 10 deletions

View File

@ -110,6 +110,61 @@ to add the `ckan_harvester` plugin to your options file::
After adding it, a 'CKAN' option should appear in the 'New harvest source' form.
The CKAN harvesters support a number of configuration options to control their
behaviour. Those need to be defined as a JSON object in the configuration form
field. The currently supported configuration options are:
* api_version: You can force the harvester to use either version '1' or '2' of
the CKAN API. Default is '2'.
* default_tags: A list of tags that will be added to all harvested datasets.
Tags don't need to previously exist.
* default_groups: A list of groups to which the harvested datasets will be
added. The groups must exist. Note that you must use ids or names to
define the groups according to the API version you defined (names for version
'1', ids for version '2').
* default_extras: A dictionary of key value pairs that will be added to extras
of the harvested datasets. You can use the following replacement strings,
that will be replaced before creating or updating the datasets:
* {dataset_id}
* {harvest_source_id}
* {harvest_source_url} # Will be stripped of trailing forward slashes (/)
* {harvest_job_id}
* {harvest_object_id}
* override_extras: Assign default extras even if they already exist in the
remote dataset. Default is False (only non existing extras are added).
* user: User who will run the harvesting process. Please note that this user
needs to have permission for creating packages, and if default groups were
defined, the user must have permission to assign packages to these groups.
* api_key: If the remote CKAN instance has restricted access to the API you
can provide a CKAN API key, which will be sent in any request.
* read_only: Create harvested packages in read-only mode. Only the user who
performed the harvest (the one defined in the previous setting or the
'harvest' sysadmin) will be able to edit and administer the packages
created from this harvesting source. Logged in users and visitors will
only be able to read them.
Here is an example of a configuration object (the one that must be entered in
the configuration field)::
{
"api_version":"1",
"default_tags":["new-tag-1","new-tag-2"],
"default_groups":["my-own-group"],
"default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"},
"override_extras": true,
"user":"harvester-user",
"api_key":"<REMOTE_API_KEY>",
"read_only": true
}
The harvesting interface
========================
@ -256,7 +311,7 @@ Finally, on a third console, run the following command to start any
pending harvesting jobs::
paster harvester run --config=../ckan/development.ini
After packages have been imported, the search index will have to be updated
before the packages appear in search results (from the ckan directory):

View File

@ -1,13 +1,13 @@
import logging
import re
from ckan import model
from ckan.model import Session, Package
from ckan.logic import ValidationError, NotFound, get_action
from ckan.logic.schema import default_package_schema
from ckan.lib.navl.validators import ignore_missing
from ckan.lib.munge import munge_title_to_name, munge_tag
from ckan.lib.navl.validators import ignore_missing,ignore
from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
HarvestObjectError
@ -17,6 +17,11 @@ from ckanext.harvest.interfaces import IHarvester
log = logging.getLogger(__name__)
def munge_tag(tag):
    '''
    Turn a free-text tag into a normalised tag name.

    The tag is passed through substitute_ascii_equivalents, lower-cased and
    stripped of surrounding whitespace. Every character other than ASCII
    letters, digits, spaces and hyphens is then dropped, and the remaining
    spaces are turned into hyphens.
    '''
    normalised = substitute_ascii_equivalents(tag).lower().strip()
    # Remove disallowed characters first, then hyphenate the spaces left over
    cleaned = re.sub(r'[^a-zA-Z0-9 -]', '', normalised)
    return cleaned.replace(' ', '-')
class HarvesterBase(SingletonPlugin):
'''
Generic class for harvesters with helper functions
@ -104,13 +109,23 @@ class HarvesterBase(SingletonPlugin):
try:
# Change default schema
schema = default_package_schema()
schema["id"] = [ignore_missing, unicode]
schema['id'] = [ignore_missing, unicode]
schema['__junk'] = [ignore]
# Check API version
if self.config:
api_version = self.config.get('api_version','2')
#TODO: use site user when available
user_name = self.config.get('user',u'harvest')
else:
api_version = '2'
user_name = u'harvest'
context = {
'model': model,
'session': Session,
'user': u'harvest',
'api_version':'2',
'user': user_name,
'api_version': api_version,
'schema': schema,
}

View File

@ -1,7 +1,9 @@
import urllib2
from ckan.lib.base import c
from ckan import model
from ckan.model import Session, Package
from ckan.logic import ValidationError, NotFound
from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
@ -34,6 +36,9 @@ class CKANHarvester(HarvesterBase):
)
try:
api_key = self.config.get('api_key',None)
if api_key:
http_request.add_header('Authorization',api_key)
http_response = urllib2.urlopen(http_request)
return http_response.read()
@ -65,6 +70,35 @@ class CKANHarvester(HarvesterBase):
try:
config_obj = json.loads(config)
if 'default_tags' in config_obj:
if not isinstance(config_obj['default_tags'],list):
raise ValueError('default_tags must be a list')
if 'default_groups' in config_obj:
if not isinstance(config_obj['default_groups'],list):
raise ValueError('default_groups must be a list')
# Check if default groups exist
context = {'model':model,'user':c.user}
for group_name in config_obj['default_groups']:
try:
group = get_action('group_show')(context,{'id':group_name})
except NotFound,e:
raise ValueError('Default group not found')
if 'default_extras' in config_obj:
if not isinstance(config_obj['default_extras'],dict):
raise ValueError('default_extras must be a dictionary')
if 'user' in config_obj:
# Check if user exists
context = {'model':model,'user':c.user}
try:
user = get_action('user_show')(context,{'id':config_obj.get('user')})
except NotFound,e:
raise ValueError('User not found')
except ValueError,e:
raise e
@ -196,7 +230,61 @@ class CKANHarvester(HarvesterBase):
try:
package_dict = json.loads(harvest_object.content)
return self._create_or_update_package(package_dict,harvest_object)
# Set default tags if needed
default_tags = self.config.get('default_tags',[])
if default_tags:
if not 'tags' in package_dict:
package_dict['tags'] = []
package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']])
# Ignore remote groups for the time being
del package_dict['groups']
# Set default groups if needed
default_groups = self.config.get('default_groups',[])
if default_groups:
if not 'groups' in package_dict:
package_dict['groups'] = []
package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])
# Set default extras if needed
default_extras = self.config.get('default_extras',{})
if default_extras:
override_extras = self.config.get('override_extras',False)
if not 'extras' in package_dict:
package_dict['extras'] = {}
for key,value in default_extras.iteritems():
if not key in package_dict['extras'] or override_extras:
# Look for replacement strings
if isinstance(value,basestring):
value = value.format(harvest_source_id=harvest_object.job.source.id,
harvest_source_url=harvest_object.job.source.url.strip('/'),
harvest_job_id=harvest_object.job.id,
harvest_object_id=harvest_object.id,
dataset_id=package_dict['id'])
package_dict['extras'][key] = value
result = self._create_or_update_package(package_dict,harvest_object)
if result and self.config.get('read_only',False) == True:
package = model.Package.get(package_dict['id'])
# Clear default permissions
model.clear_user_roles(package)
# Setup harvest user as admin
user_name = self.config.get('user',u'harvest')
user = model.User.get(user_name)
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)
# Other users can only read
for user_name in (u'visitor',u'logged_in'):
user = model.User.get(user_name)
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)
except ValidationError,e:
self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
harvest_object, 'Import')

View File

@ -249,11 +249,13 @@ def edit_harvest_source(source_id,data_dict):
Session.rollback()
raise ValidationError(errors,_error_summary(errors))
fields = ['url','type','active','description','user_id','publisher_id','config']
fields = ['url','type','active','description','user_id','publisher_id']
for f in fields:
if f in data_dict and data_dict[f] is not None and data_dict[f] != '':
source.__setattr__(f,data_dict[f])
source.config = data_dict['config']
source.save()
return _source_as_dict(source)

View File

@ -13,3 +13,12 @@
.harvester-title{
font-weight: bold;
}
#harvest-source-actions {
margin-bottom: 10px;
}
#harvest-source-actions img{
vertical-align: middle;
margin: 0 5px;
}

View File

@ -13,6 +13,9 @@
<div class="harvest-content">
<py:if test="c.source">
<h1>Harvest Source Details</h1>
<div id="harvest-source-actions">
<img src="/ckanext/harvest/images/icons/source_edit.png" alt="Edit" /><a href="/harvest/edit/${c.source.id}">Edit source</a> |
<img src="/ckanext/harvest/images/icons/source_refresh.png" alt="Refresh" /><a href="/harvest/refresh/${c.source.id}">Refresh source</a></div>
<table id="harvest-source-details">
<tr>
<th>ID</th>
@ -34,6 +37,15 @@
<th>Description</th>
<td>${c.source.description}</td>
</tr>
<tr>
<th>Configuration</th>
<py:if test="c.source.config">
<td>${c.source.config}</td>
</py:if>
<py:if test="not c.source.config">
<td>-</td>
</py:if>
</tr>
<tr>
<th>User</th>
<td>${c.source.user_id}</td>