Merge branch 'feature-new-ckan-harvester-features'
This commit is contained in:
commit
eb591521ff
55
README.rst
55
README.rst
|
@ -110,6 +110,61 @@ to add the `ckan_harvester` plugin to your options file::
|
|||
|
||||
After adding it, a 'CKAN' option should appear in the 'New harvest source' form.
|
||||
|
||||
The CKAN harvesters support a number of configuration options to control their
|
||||
behaviour. Those need to be defined as a JSON object in the configuration form
|
||||
field. The currently supported configuration options are:
|
||||
|
||||
* api_version: You can force the harvester to use either version '1' or '2' of
|
||||
the CKAN API. Default is '2'.
|
||||
|
||||
* default_tags: A list of tags that will be added to all harvested datasets.
|
||||
Tags don't need to previously exist.
|
||||
|
||||
* default_groups: A list of groups to which the harvested datasets will be
|
||||
added. The groups must exist. Note that you must use ids or names to
|
||||
define the groups according to the API version you defined (names for version
|
||||
'1', ids for version '2').
|
||||
|
||||
* default_extras: A dictionary of key value pairs that will be added to extras
|
||||
of the harvested datasets. You can use the following replacement strings,
|
||||
that will be replaced before creating or updating the datasets:
|
||||
|
||||
* {dataset_id}
|
||||
* {harvest_source_id}
|
||||
* {harvest_source_url} # Will be stripped of trailing forward slashes (/)
|
||||
* {harvest_job_id}
|
||||
* {harvest_object_id}
|
||||
|
||||
* override_extras: Assign default extras even if they already exist in the
|
||||
remote dataset. Default is False (only non existing extras are added).
|
||||
|
||||
* user: User who will run the harvesting process. Please note that this user
|
||||
needs to have permission for creating packages, and if default groups were
|
||||
defined, the user must have permission to assign packages to these groups.
|
||||
|
||||
* api_key: If the remote CKAN instance has restricted access to the API you
|
||||
can provide a CKAN API key, which will be sent in any request.
|
||||
|
||||
* read_only: Create harvested packages in read-only mode. Only the user who
|
||||
performed the harvest (the one defined in the previous setting or the
|
||||
'harvest' sysadmin) will be able to edit and administer the packages
|
||||
created from this harvesting source. Logged in users and visitors will
|
||||
only be able to read them.
|
||||
|
||||
Here is an example of a configuration object (the one that must be entered in
|
||||
the configuration field)::
|
||||
|
||||
{
|
||||
"api_version":"1",
|
||||
"default_tags":["new-tag-1","new-tag-2"],
|
||||
"default_groups":["my-own-group"],
|
||||
"default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"},
|
||||
"override_extras": true,
|
||||
"user":"harvester-user",
|
||||
"api_key":"<REMOTE_API_KEY>",
|
||||
"read_only": true
|
||||
}
|
||||
|
||||
|
||||
The harvesting interface
|
||||
========================
|
||||
|
|
|
@ -1,13 +1,13 @@
|
|||
import logging
|
||||
|
||||
import re
|
||||
|
||||
from ckan import model
|
||||
from ckan.model import Session, Package
|
||||
from ckan.logic import ValidationError, NotFound, get_action
|
||||
|
||||
from ckan.logic.schema import default_package_schema
|
||||
from ckan.lib.navl.validators import ignore_missing
|
||||
from ckan.lib.munge import munge_title_to_name, munge_tag
|
||||
from ckan.lib.navl.validators import ignore_missing,ignore
|
||||
from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents
|
||||
|
||||
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
||||
HarvestObjectError
|
||||
|
@ -17,6 +17,11 @@ from ckanext.harvest.interfaces import IHarvester
|
|||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
def munge_tag(tag):
    '''
    Turn a free-form tag into a normalised tag string.

    The tag is first passed through ``substitute_ascii_equivalents``,
    then lower-cased and stripped of surrounding whitespace. Every
    character that is not an ASCII letter, a digit, a space or a hyphen
    is removed, and finally each remaining space becomes a hyphen.
    '''
    normalised = substitute_ascii_equivalents(tag).lower().strip()
    # Drop disallowed characters before converting spaces, so the spaces
    # that survive the filter are the only source of new hyphens.
    cleaned = re.sub(r'[^a-zA-Z0-9 -]', '', normalised)
    return cleaned.replace(' ', '-')
|
||||
|
||||
class HarvesterBase(SingletonPlugin):
|
||||
'''
|
||||
Generic class for harvesters with helper functions
|
||||
|
@ -104,13 +109,23 @@ class HarvesterBase(SingletonPlugin):
|
|||
try:
|
||||
# Change default schema
|
||||
schema = default_package_schema()
|
||||
schema["id"] = [ignore_missing, unicode]
|
||||
schema['id'] = [ignore_missing, unicode]
|
||||
schema['__junk'] = [ignore]
|
||||
|
||||
# Check API version
|
||||
if self.config:
|
||||
api_version = self.config.get('api_version','2')
|
||||
#TODO: use site user when available
|
||||
user_name = self.config.get('user',u'harvest')
|
||||
else:
|
||||
api_version = '2'
|
||||
user_name = u'harvest'
|
||||
|
||||
context = {
|
||||
'model': model,
|
||||
'session': Session,
|
||||
'user': u'harvest',
|
||||
'api_version':'2',
|
||||
'user': user_name,
|
||||
'api_version': api_version,
|
||||
'schema': schema,
|
||||
}
|
||||
|
||||
|
|
|
@ -1,7 +1,9 @@
|
|||
import urllib2
|
||||
|
||||
from ckan.lib.base import c
|
||||
from ckan import model
|
||||
from ckan.model import Session, Package
|
||||
from ckan.logic import ValidationError, NotFound
|
||||
from ckan.logic import ValidationError, NotFound, get_action
|
||||
from ckan.lib.helpers import json
|
||||
|
||||
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
||||
|
@ -34,6 +36,9 @@ class CKANHarvester(HarvesterBase):
|
|||
)
|
||||
|
||||
try:
|
||||
api_key = self.config.get('api_key',None)
|
||||
if api_key:
|
||||
http_request.add_header('Authorization',api_key)
|
||||
http_response = urllib2.urlopen(http_request)
|
||||
|
||||
return http_response.read()
|
||||
|
@ -65,6 +70,35 @@ class CKANHarvester(HarvesterBase):
|
|||
|
||||
try:
|
||||
config_obj = json.loads(config)
|
||||
|
||||
if 'default_tags' in config_obj:
|
||||
if not isinstance(config_obj['default_tags'],list):
|
||||
raise ValueError('default_tags must be a list')
|
||||
|
||||
if 'default_groups' in config_obj:
|
||||
if not isinstance(config_obj['default_groups'],list):
|
||||
raise ValueError('default_groups must be a list')
|
||||
|
||||
# Check if default groups exist
|
||||
context = {'model':model,'user':c.user}
|
||||
for group_name in config_obj['default_groups']:
|
||||
try:
|
||||
group = get_action('group_show')(context,{'id':group_name})
|
||||
except NotFound,e:
|
||||
raise ValueError('Default group not found')
|
||||
|
||||
if 'default_extras' in config_obj:
|
||||
if not isinstance(config_obj['default_extras'],dict):
|
||||
raise ValueError('default_extras must be a dictionary')
|
||||
|
||||
if 'user' in config_obj:
|
||||
# Check if user exists
|
||||
context = {'model':model,'user':c.user}
|
||||
try:
|
||||
user = get_action('user_show')(context,{'id':config_obj.get('user')})
|
||||
except NotFound,e:
|
||||
raise ValueError('User not found')
|
||||
|
||||
except ValueError,e:
|
||||
raise e
|
||||
|
||||
|
@ -196,7 +230,61 @@ class CKANHarvester(HarvesterBase):
|
|||
|
||||
try:
|
||||
package_dict = json.loads(harvest_object.content)
|
||||
return self._create_or_update_package(package_dict,harvest_object)
|
||||
|
||||
# Set default tags if needed
|
||||
default_tags = self.config.get('default_tags',[])
|
||||
if default_tags:
|
||||
if not 'tags' in package_dict:
|
||||
package_dict['tags'] = []
|
||||
package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']])
|
||||
|
||||
# Ignore remote groups for the time being
|
||||
del package_dict['groups']
|
||||
|
||||
# Set default groups if needed
|
||||
default_groups = self.config.get('default_groups',[])
|
||||
if default_groups:
|
||||
if not 'groups' in package_dict:
|
||||
package_dict['groups'] = []
|
||||
package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])
|
||||
|
||||
# Set default extras if needed
|
||||
default_extras = self.config.get('default_extras',{})
|
||||
if default_extras:
|
||||
override_extras = self.config.get('override_extras',False)
|
||||
if not 'extras' in package_dict:
|
||||
package_dict['extras'] = {}
|
||||
for key,value in default_extras.iteritems():
|
||||
if not key in package_dict['extras'] or override_extras:
|
||||
# Look for replacement strings
|
||||
if isinstance(value,basestring):
|
||||
value = value.format(harvest_source_id=harvest_object.job.source.id,
|
||||
harvest_source_url=harvest_object.job.source.url.strip('/'),
|
||||
harvest_job_id=harvest_object.job.id,
|
||||
harvest_object_id=harvest_object.id,
|
||||
dataset_id=package_dict['id'])
|
||||
package_dict['extras'][key] = value
|
||||
|
||||
result = self._create_or_update_package(package_dict,harvest_object)
|
||||
|
||||
if result and self.config.get('read_only',False) == True:
|
||||
|
||||
package = model.Package.get(package_dict['id'])
|
||||
|
||||
# Clear default permissions
|
||||
model.clear_user_roles(package)
|
||||
|
||||
# Setup harvest user as admin
|
||||
user_name = self.config.get('user',u'harvest')
|
||||
user = model.User.get(user_name)
|
||||
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)
|
||||
|
||||
# Other users can only read
|
||||
for user_name in (u'visitor',u'logged_in'):
|
||||
user = model.User.get(user_name)
|
||||
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)
|
||||
|
||||
|
||||
except ValidationError,e:
|
||||
self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
|
||||
harvest_object, 'Import')
|
||||
|
|
|
@ -249,11 +249,13 @@ def edit_harvest_source(source_id,data_dict):
|
|||
Session.rollback()
|
||||
raise ValidationError(errors,_error_summary(errors))
|
||||
|
||||
fields = ['url','type','active','description','user_id','publisher_id','config']
|
||||
fields = ['url','type','active','description','user_id','publisher_id']
|
||||
for f in fields:
|
||||
if f in data_dict and data_dict[f] is not None and data_dict[f] != '':
|
||||
source.__setattr__(f,data_dict[f])
|
||||
|
||||
source.config = data_dict['config']
|
||||
|
||||
source.save()
|
||||
|
||||
return _source_as_dict(source)
|
||||
|
|
|
@ -13,3 +13,12 @@
|
|||
.harvester-title{
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
#harvest-source-actions {
|
||||
margin-bottom: 10px;
|
||||
}
|
||||
|
||||
#harvest-source-actions img{
|
||||
vertical-align: middle;
|
||||
margin: 0 5px;
|
||||
}
|
||||
|
|
|
@ -13,6 +13,9 @@
|
|||
<div class="harvest-content">
|
||||
<py:if test="c.source">
|
||||
<h1>Harvest Source Details</h1>
|
||||
<div id="harvest-source-actions">
|
||||
<img src="/ckanext/harvest/images/icons/source_edit.png" alt="Edit" /><a href="/harvest/edit/${c.source.id}">Edit source</a> |
|
||||
<img src="/ckanext/harvest/images/icons/source_refresh.png" alt="Refresh" /><a href="/harvest/refresh/${c.source.id}">Refresh source</a></div>
|
||||
<table id="harvest-source-details">
|
||||
<tr>
|
||||
<th>ID</th>
|
||||
|
@ -34,6 +37,15 @@
|
|||
<th>Description</th>
|
||||
<td>${c.source.description}</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>Configuration</th>
|
||||
<py:if test="c.source.config">
|
||||
<td>${c.source.config}</td>
|
||||
</py:if>
|
||||
<py:if test="not c.source.config">
|
||||
<td>-</td>
|
||||
</py:if>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>User</th>
|
||||
<td>${c.source.user_id}</td>
|
||||
|
|
Loading…
Reference in New Issue