Merge branch 'feature-new-ckan-harvester-features'
This commit is contained in:
commit
eb591521ff
57
README.rst
57
README.rst
|
@ -110,6 +110,61 @@ to add the `ckan_harvester` plugin to your options file::
|
||||||
|
|
||||||
After adding it, a 'CKAN' option should appear in the 'New harvest source' form.
|
After adding it, a 'CKAN' option should appear in the 'New harvest source' form.
|
||||||
|
|
||||||
|
The CKAN harvesters support a number of configuration options to control their
|
||||||
|
behaviour. Those need to be defined as a JSON object in the configuration form
|
||||||
|
field. The currently supported configuration options are:
|
||||||
|
|
||||||
|
* api_version: You can force the harvester to use either version '1' or '2' of
|
||||||
|
the CKAN API. Default is '2'.
|
||||||
|
|
||||||
|
* default_tags: A list of tags that will be added to all harvested datasets.
|
||||||
|
Tags don't need to previously exist.
|
||||||
|
|
||||||
|
* default_groups: A list of groups to which the harvested datasets will be
|
||||||
|
added. The groups must exist. Note that you must use ids or names to
|
||||||
|
define the groups according to the API version you defined (names for version
|
||||||
|
'1', ids for version '2').
|
||||||
|
|
||||||
|
* default_extras: A dictionary of key value pairs that will be added to extras
|
||||||
|
of the harvested datasets. You can use the following replacement strings,
|
||||||
|
that will be replaced before creating or updating the datasets:
|
||||||
|
|
||||||
|
* {dataset_id}
|
||||||
|
* {harvest_source_id}
|
||||||
|
* {harvest_source_url} # Will be stripped of trailing forward slashes (/)
|
||||||
|
* {harvest_job_id}
|
||||||
|
* {harvest_object_id}
|
||||||
|
|
||||||
|
* override_extras: Assign default extras even if they already exist in the
|
||||||
|
remote dataset. Default is False (only non existing extras are added).
|
||||||
|
|
||||||
|
* user: User who will run the harvesting process. Please note that this user
|
||||||
|
needs to have permission for creating packages, and if default groups were
|
||||||
|
defined, the user must have permission to assign packages to these groups.
|
||||||
|
|
||||||
|
* api_key: If the remote CKAN instance has restricted access to the API you
|
||||||
|
can provide a CKAN API key, which will be sent in any request.
|
||||||
|
|
||||||
|
* read_only: Create harvested packages in read-only mode. Only the user who
|
||||||
|
performed the harvest (the one defined in the previous setting or the
|
||||||
|
'harvest' sysadmin) will be able to edit and administer the packages
|
||||||
|
created from this harvesting source. Logged in users and visitors will be
|
||||||
|
able to read them only.
|
||||||
|
|
||||||
|
Here is an example of a configuration object (the one that must be entered in
|
||||||
|
the configuration field)::
|
||||||
|
|
||||||
|
{
|
||||||
|
"api_version":"1",
|
||||||
|
"default_tags":["new-tag-1","new-tag-2"],
|
||||||
|
"default_groups":["my-own-group"],
|
||||||
|
"default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"},
|
||||||
|
"override_extras": true,
|
||||||
|
"user":"harvester-user",
|
||||||
|
"api_key":"<REMOTE_API_KEY>",
|
||||||
|
"read_only": true
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
The harvesting interface
|
The harvesting interface
|
||||||
========================
|
========================
|
||||||
|
@ -256,7 +311,7 @@ Finally, on a third console, run the following command to start any
|
||||||
pending harvesting jobs::
|
pending harvesting jobs::
|
||||||
|
|
||||||
paster harvester run --config=../ckan/development.ini
|
paster harvester run --config=../ckan/development.ini
|
||||||
|
|
||||||
After packages have been imported, the search index will have to be updated
|
After packages have been imported, the search index will have to be updated
|
||||||
before the packages appear in search results (from the ckan directory):
|
before the packages appear in search results (from the ckan directory):
|
||||||
|
|
||||||
|
|
|
@ -1,13 +1,13 @@
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
from ckan import model
|
from ckan import model
|
||||||
from ckan.model import Session, Package
|
from ckan.model import Session, Package
|
||||||
from ckan.logic import ValidationError, NotFound, get_action
|
from ckan.logic import ValidationError, NotFound, get_action
|
||||||
|
|
||||||
from ckan.logic.schema import default_package_schema
|
from ckan.logic.schema import default_package_schema
|
||||||
from ckan.lib.navl.validators import ignore_missing
|
from ckan.lib.navl.validators import ignore_missing,ignore
|
||||||
from ckan.lib.munge import munge_title_to_name, munge_tag
|
from ckan.lib.munge import munge_title_to_name,substitute_ascii_equivalents
|
||||||
|
|
||||||
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
||||||
HarvestObjectError
|
HarvestObjectError
|
||||||
|
@ -17,6 +17,11 @@ from ckanext.harvest.interfaces import IHarvester
|
||||||
|
|
||||||
log = logging.getLogger(__name__)
|
log = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
def munge_tag(tag):
|
||||||
|
tag = substitute_ascii_equivalents(tag)
|
||||||
|
tag = tag.lower().strip()
|
||||||
|
return re.sub(r'[^a-zA-Z0-9 -]', '', tag).replace(' ', '-')
|
||||||
|
|
||||||
class HarvesterBase(SingletonPlugin):
|
class HarvesterBase(SingletonPlugin):
|
||||||
'''
|
'''
|
||||||
Generic class for harvesters with helper functions
|
Generic class for harvesters with helper functions
|
||||||
|
@ -104,13 +109,23 @@ class HarvesterBase(SingletonPlugin):
|
||||||
try:
|
try:
|
||||||
# Change default schema
|
# Change default schema
|
||||||
schema = default_package_schema()
|
schema = default_package_schema()
|
||||||
schema["id"] = [ignore_missing, unicode]
|
schema['id'] = [ignore_missing, unicode]
|
||||||
|
schema['__junk'] = [ignore]
|
||||||
|
|
||||||
|
# Check API version
|
||||||
|
if self.config:
|
||||||
|
api_version = self.config.get('api_version','2')
|
||||||
|
#TODO: use site user when available
|
||||||
|
user_name = self.config.get('user',u'harvest')
|
||||||
|
else:
|
||||||
|
api_version = '2'
|
||||||
|
user_name = u'harvest'
|
||||||
|
|
||||||
context = {
|
context = {
|
||||||
'model': model,
|
'model': model,
|
||||||
'session': Session,
|
'session': Session,
|
||||||
'user': u'harvest',
|
'user': user_name,
|
||||||
'api_version':'2',
|
'api_version': api_version,
|
||||||
'schema': schema,
|
'schema': schema,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -1,7 +1,9 @@
|
||||||
import urllib2
|
import urllib2
|
||||||
|
|
||||||
|
from ckan.lib.base import c
|
||||||
|
from ckan import model
|
||||||
from ckan.model import Session, Package
|
from ckan.model import Session, Package
|
||||||
from ckan.logic import ValidationError, NotFound
|
from ckan.logic import ValidationError, NotFound, get_action
|
||||||
from ckan.lib.helpers import json
|
from ckan.lib.helpers import json
|
||||||
|
|
||||||
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
from ckanext.harvest.model import HarvestJob, HarvestObject, HarvestGatherError, \
|
||||||
|
@ -34,6 +36,9 @@ class CKANHarvester(HarvesterBase):
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
api_key = self.config.get('api_key',None)
|
||||||
|
if api_key:
|
||||||
|
http_request.add_header('Authorization',api_key)
|
||||||
http_response = urllib2.urlopen(http_request)
|
http_response = urllib2.urlopen(http_request)
|
||||||
|
|
||||||
return http_response.read()
|
return http_response.read()
|
||||||
|
@ -65,6 +70,35 @@ class CKANHarvester(HarvesterBase):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
config_obj = json.loads(config)
|
config_obj = json.loads(config)
|
||||||
|
|
||||||
|
if 'default_tags' in config_obj:
|
||||||
|
if not isinstance(config_obj['default_tags'],list):
|
||||||
|
raise ValueError('default_tags must be a list')
|
||||||
|
|
||||||
|
if 'default_groups' in config_obj:
|
||||||
|
if not isinstance(config_obj['default_groups'],list):
|
||||||
|
raise ValueError('default_groups must be a list')
|
||||||
|
|
||||||
|
# Check if default groups exist
|
||||||
|
context = {'model':model,'user':c.user}
|
||||||
|
for group_name in config_obj['default_groups']:
|
||||||
|
try:
|
||||||
|
group = get_action('group_show')(context,{'id':group_name})
|
||||||
|
except NotFound,e:
|
||||||
|
raise ValueError('Default group not found')
|
||||||
|
|
||||||
|
if 'default_extras' in config_obj:
|
||||||
|
if not isinstance(config_obj['default_extras'],dict):
|
||||||
|
raise ValueError('default_extras must be a dictionary')
|
||||||
|
|
||||||
|
if 'user' in config_obj:
|
||||||
|
# Check if user exists
|
||||||
|
context = {'model':model,'user':c.user}
|
||||||
|
try:
|
||||||
|
user = get_action('user_show')(context,{'id':config_obj.get('user')})
|
||||||
|
except NotFound,e:
|
||||||
|
raise ValueError('User not found')
|
||||||
|
|
||||||
except ValueError,e:
|
except ValueError,e:
|
||||||
raise e
|
raise e
|
||||||
|
|
||||||
|
@ -196,7 +230,61 @@ class CKANHarvester(HarvesterBase):
|
||||||
|
|
||||||
try:
|
try:
|
||||||
package_dict = json.loads(harvest_object.content)
|
package_dict = json.loads(harvest_object.content)
|
||||||
return self._create_or_update_package(package_dict,harvest_object)
|
|
||||||
|
# Set default tags if needed
|
||||||
|
default_tags = self.config.get('default_tags',[])
|
||||||
|
if default_tags:
|
||||||
|
if not 'tags' in package_dict:
|
||||||
|
package_dict['tags'] = []
|
||||||
|
package_dict['tags'].extend([t for t in default_tags if t not in package_dict['tags']])
|
||||||
|
|
||||||
|
# Ignore remote groups for the time being
|
||||||
|
del package_dict['groups']
|
||||||
|
|
||||||
|
# Set default groups if needed
|
||||||
|
default_groups = self.config.get('default_groups',[])
|
||||||
|
if default_groups:
|
||||||
|
if not 'groups' in package_dict:
|
||||||
|
package_dict['groups'] = []
|
||||||
|
package_dict['groups'].extend([g for g in default_groups if g not in package_dict['groups']])
|
||||||
|
|
||||||
|
# Set default extras if needed
|
||||||
|
default_extras = self.config.get('default_extras',{})
|
||||||
|
if default_extras:
|
||||||
|
override_extras = self.config.get('override_extras',False)
|
||||||
|
if not 'extras' in package_dict:
|
||||||
|
package_dict['extras'] = {}
|
||||||
|
for key,value in default_extras.iteritems():
|
||||||
|
if not key in package_dict['extras'] or override_extras:
|
||||||
|
# Look for replacement strings
|
||||||
|
if isinstance(value,basestring):
|
||||||
|
value = value.format(harvest_source_id=harvest_object.job.source.id,
|
||||||
|
harvest_source_url=harvest_object.job.source.url.strip('/'),
|
||||||
|
harvest_job_id=harvest_object.job.id,
|
||||||
|
harvest_object_id=harvest_object.id,
|
||||||
|
dataset_id=package_dict['id'])
|
||||||
|
package_dict['extras'][key] = value
|
||||||
|
|
||||||
|
result = self._create_or_update_package(package_dict,harvest_object)
|
||||||
|
|
||||||
|
if result and self.config.get('read_only',False) == True:
|
||||||
|
|
||||||
|
package = model.Package.get(package_dict['id'])
|
||||||
|
|
||||||
|
# Clear default permissions
|
||||||
|
model.clear_user_roles(package)
|
||||||
|
|
||||||
|
# Setup harvest user as admin
|
||||||
|
user_name = self.config.get('user',u'harvest')
|
||||||
|
user = model.User.get(user_name)
|
||||||
|
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.ADMIN)
|
||||||
|
|
||||||
|
# Other users can only read
|
||||||
|
for user_name in (u'visitor',u'logged_in'):
|
||||||
|
user = model.User.get(user_name)
|
||||||
|
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)
|
||||||
|
|
||||||
|
|
||||||
except ValidationError,e:
|
except ValidationError,e:
|
||||||
self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
|
self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
|
||||||
harvest_object, 'Import')
|
harvest_object, 'Import')
|
||||||
|
|
|
@ -249,11 +249,13 @@ def edit_harvest_source(source_id,data_dict):
|
||||||
Session.rollback()
|
Session.rollback()
|
||||||
raise ValidationError(errors,_error_summary(errors))
|
raise ValidationError(errors,_error_summary(errors))
|
||||||
|
|
||||||
fields = ['url','type','active','description','user_id','publisher_id','config']
|
fields = ['url','type','active','description','user_id','publisher_id']
|
||||||
for f in fields:
|
for f in fields:
|
||||||
if f in data_dict and data_dict[f] is not None and data_dict[f] != '':
|
if f in data_dict and data_dict[f] is not None and data_dict[f] != '':
|
||||||
source.__setattr__(f,data_dict[f])
|
source.__setattr__(f,data_dict[f])
|
||||||
|
|
||||||
|
source.config = data_dict['config']
|
||||||
|
|
||||||
source.save()
|
source.save()
|
||||||
|
|
||||||
return _source_as_dict(source)
|
return _source_as_dict(source)
|
||||||
|
|
|
@ -13,3 +13,12 @@
|
||||||
.harvester-title{
|
.harvester-title{
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#harvest-source-actions {
|
||||||
|
margin-bottom: 10px;
|
||||||
|
}
|
||||||
|
|
||||||
|
#harvest-source-actions img{
|
||||||
|
vertical-align: middle;
|
||||||
|
margin: 0 5px;
|
||||||
|
}
|
||||||
|
|
|
@ -13,6 +13,9 @@
|
||||||
<div class="harvest-content">
|
<div class="harvest-content">
|
||||||
<py:if test="c.source">
|
<py:if test="c.source">
|
||||||
<h1>Harvest Source Details</h1>
|
<h1>Harvest Source Details</h1>
|
||||||
|
<div id="harvest-source-actions">
|
||||||
|
<img src="/ckanext/harvest/images/icons/source_edit.png" alt="Edit" /><a href="/harvest/edit/${c.source.id}">Edit source</a> |
|
||||||
|
<img src="/ckanext/harvest/images/icons/source_refresh.png" alt="Refresh" /><a href="/harvest/refresh/${c.source.id}">Refresh source</a></div>
|
||||||
<table id="harvest-source-details">
|
<table id="harvest-source-details">
|
||||||
<tr>
|
<tr>
|
||||||
<th>ID</th>
|
<th>ID</th>
|
||||||
|
@ -34,6 +37,15 @@
|
||||||
<th>Description</th>
|
<th>Description</th>
|
||||||
<td>${c.source.description}</td>
|
<td>${c.source.description}</td>
|
||||||
</tr>
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>Configuration</th>
|
||||||
|
<py:if test="c.source.config">
|
||||||
|
<td>${c.source.config}</td>
|
||||||
|
</py:if>
|
||||||
|
<py:if test="not c.source.config">
|
||||||
|
<td>-</td>
|
||||||
|
</py:if>
|
||||||
|
</tr>
|
||||||
<tr>
|
<tr>
|
||||||
<th>User</th>
|
<th>User</th>
|
||||||
<td>${c.source.user_id}</td>
|
<td>${c.source.user_id}</td>
|
||||||
|
|
Loading…
Reference in New Issue