Merge pull request #251 from ckan/249-default-tag

Fix default_groups, plus fix docs for default_tags
This commit is contained in:
Adrià Mercader 2016-06-13 13:59:54 +01:00 committed by GitHub
commit f2d8a5f8cc
7 changed files with 133 additions and 24 deletions

View File

@ -289,12 +289,11 @@ field. The currently supported configuration options are:
the CKAN API. Default is 2. the CKAN API. Default is 2.
* default_tags: A list of tags that will be added to all harvested datasets. * default_tags: A list of tags that will be added to all harvested datasets.
Tags don't need to previously exist. Tags don't need to previously exist. This field takes a list of tag dicts
(see example), which allows you to optinally specify a vocabulary.
* default_groups: A list of groups to which the harvested datasets will be * default_groups: A list of group IDs or names to which the harvested datasets
added to. The groups must exist. Note that you must use ids or names to will be added to. The groups must exist.
define the groups according to the API version you defined (names for version
1, ids for version 2).
* default_extras: A dictionary of key value pairs that will be added to extras * default_extras: A dictionary of key value pairs that will be added to extras
of the harvested datasets. You can use the following replacement strings, of the harvested datasets. You can use the following replacement strings,
@ -367,9 +366,9 @@ the configuration field)::
{ {
"api_version": 1, "api_version": 1,
"default_tags":["new-tag-1","new-tag-2"], "default_tags": [{"name": "geo"}, {"name": "namibia"],
"default_groups":["my-own-group"], "default_groups": ["science", "spend-data"],
"default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"}, "default_extras": {"encoding":"utf8", "harvest_url": "{harvest_source_url}/dataset/{dataset_id}"},
"override_extras": true, "override_extras": true,
"organizations_filter_include": [], "organizations_filter_include": [],
"organizations_filter_exclude": ["remote-organization"], "organizations_filter_exclude": ["remote-organization"],

View File

@ -6,7 +6,6 @@ import socket
from sqlalchemy import exists from sqlalchemy import exists
from ckan.lib.base import c
from ckan import model from ckan import model
from ckan.logic import ValidationError, NotFound, get_action from ckan.logic import ValidationError, NotFound, get_action
from ckan.lib.helpers import json from ckan.lib.helpers import json
@ -68,9 +67,7 @@ class CKANHarvester(HarvesterBase):
data = json.loads(content) data = json.loads(content)
if self.action_api_version == 3: if self.action_api_version == 3:
return data.pop('result') return data.pop('result')
return data return data
except (ContentFetchError, ValueError): except (ContentFetchError, ValueError):
log.debug('Could not fetch/decode remote group') log.debug('Could not fetch/decode remote group')
raise RemoteResourceError('Could not fetch/decode remote group') raise RemoteResourceError('Could not fetch/decode remote group')
@ -121,17 +118,28 @@ class CKANHarvester(HarvesterBase):
if 'default_tags' in config_obj: if 'default_tags' in config_obj:
if not isinstance(config_obj['default_tags'], list): if not isinstance(config_obj['default_tags'], list):
raise ValueError('default_tags must be a list') raise ValueError('default_tags must be a list')
if config_obj['default_tags'] and \
not isinstance(config_obj['default_tags'][0], dict):
raise ValueError('default_tags must be a list of '
'dictionaries')
if 'default_groups' in config_obj: if 'default_groups' in config_obj:
if not isinstance(config_obj['default_groups'], list): if not isinstance(config_obj['default_groups'], list):
raise ValueError('default_groups must be a list') raise ValueError('default_groups must be a *list* of group'
' names/ids')
if config_obj['default_groups'] and \
not isinstance(config_obj['default_groups'][0], str):
raise ValueError('default_groups must be a list of group '
'names/ids (i.e. strings)')
# Check if default groups exist # Check if default groups exist
context = {'model': model, 'user': c.user} context = {'model': model, 'user': toolkit.c.user}
for group_name in config_obj['default_groups']: self.default_group_dicts = []
for group_name_or_id in config_obj['default_groups']:
try: try:
group = get_action('group_show')( group = get_action('group_show')(
context, {'id': group_name}) context, {'id': group_name_or_id})
self.default_group_dicts.append(group)
except NotFound, e: except NotFound, e:
raise ValueError('Default group not found') raise ValueError('Default group not found')
@ -141,7 +149,7 @@ class CKANHarvester(HarvesterBase):
if 'user' in config_obj: if 'user' in config_obj:
# Check if user exists # Check if user exists
context = {'model': model, 'user': c.user} context = {'model': model, 'user': toolkit.c.user}
try: try:
user = get_action('user_show')( user = get_action('user_show')(
context, {'id': config_obj.get('user')}) context, {'id': config_obj.get('user')})
@ -481,9 +489,10 @@ class CKANHarvester(HarvesterBase):
if default_groups: if default_groups:
if not 'groups' in package_dict: if not 'groups' in package_dict:
package_dict['groups'] = [] package_dict['groups'] = []
existing_group_ids = [g['id'] for g in package_dict['groups']]
package_dict['groups'].extend( package_dict['groups'].extend(
[g for g in default_groups [g for g in self.default_group_dicts
if g not in package_dict['groups']]) if g['id'] not in existing_group_ids])
# Set default extras if needed # Set default extras if needed
default_extras = self.config.get('default_extras', {}) default_extras = self.config.get('default_extras', {})

View File

@ -197,8 +197,16 @@ class HarvestObjectError(HarvestDomainObject):
stage=stage, line=line) stage=stage, line=line)
try: try:
err.save() err.save()
except InvalidRequestError: except InvalidRequestError, e:
Session.rollback() # Clear any in-progress sqlalchemy transactions
try:
Session.rollback()
except:
pass
try:
Session.remove()
except:
pass
err.save() err.save()
finally: finally:
log_message = '{0}, line {1}'.format(message, line) \ log_message = '{0}, line {1}'.format(message, line) \

View File

@ -4,6 +4,7 @@ from nose.tools import assert_equal, assert_raises
import json import json
from mock import patch, MagicMock from mock import patch, MagicMock
try: try:
from ckan.tests.helpers import reset_db, call_action from ckan.tests.helpers import reset_db, call_action
from ckan.tests.factories import Organization, Group from ckan.tests.factories import Organization, Group
@ -221,3 +222,93 @@ class TestCkanHarvester(object):
harvester=CKANHarvester()) harvester=CKANHarvester())
assert not results_by_guid assert not results_by_guid
assert not was_last_job_considered_error_free() assert not was_last_job_considered_error_free()
def test_default_tags(self):
config = {'default_tags': [{'name': 'geo'}]}
results_by_guid = run_harvest(
url='http://localhost:%s' % mock_ckan.PORT,
harvester=CKANHarvester(),
config=json.dumps(config))
tags = results_by_guid['dataset1-id']['dataset']['tags']
tag_names = [tag['name'] for tag in tags]
assert 'geo' in tag_names
def test_default_tags_invalid(self):
config = {'default_tags': ['geo']} # should be list of dicts
assert_raises(
run_harvest,
url='http://localhost:%s' % mock_ckan.PORT,
harvester=CKANHarvester(),
config=json.dumps(config))
def test_default_groups(self):
Group(id='group1-id', name='group1')
Group(id='group2-id', name='group2')
Group(id='group3-id', name='group3')
config = {'default_groups': ['group2-id', 'group3'],
'remote_groups': 'only_local'}
tmp_c = toolkit.c
try:
# c.user is used by the validation (annoying),
# however patch doesn't work because it's a weird
# StackedObjectProxy, so we swap it manually
toolkit.c = MagicMock(user='')
results_by_guid = run_harvest(
url='http://localhost:%s' % mock_ckan.PORT,
harvester=CKANHarvester(),
config=json.dumps(config))
finally:
toolkit.c = tmp_c
assert_equal(results_by_guid['dataset1-id']['errors'], [])
groups = results_by_guid['dataset1-id']['dataset']['groups']
group_names = set(group['name'] for group in groups)
# group1 comes from the harvested dataset
# group2 & 3 come from the default_groups
assert_equal(group_names, set(('group1', 'group2', 'group3')))
def test_default_groups_invalid(self):
Group(id='group2-id', name='group2')
# should be list of strings
config = {'default_tags': [{'name': 'group2'}]}
assert_raises(
run_harvest,
url='http://localhost:%s' % mock_ckan.PORT,
harvester=CKANHarvester(),
config=json.dumps(config))
def test_default_extras(self):
config = {
'default_extras': {
'encoding': 'utf8',
'harvest_url': '{harvest_source_url}/dataset/{dataset_id}'
}}
tmp_c = toolkit.c
try:
# c.user is used by the validation (annoying),
# however patch doesn't work because it's a weird
# StackedObjectProxy, so we swap it manually
toolkit.c = MagicMock(user='')
results_by_guid = run_harvest(
url='http://localhost:%s' % mock_ckan.PORT,
harvester=CKANHarvester(),
config=json.dumps(config))
finally:
toolkit.c = tmp_c
assert_equal(results_by_guid['dataset1-id']['errors'], [])
extras = results_by_guid['dataset1-id']['dataset']['extras']
extras_dict = dict((e['key'], e['value']) for e in extras)
assert_equal(extras_dict['encoding'], 'utf8')
assert_equal(extras_dict['harvest_url'],
'http://localhost:8998/dataset/dataset1-id')
def test_default_extras_invalid(self):
config = {
'default_extras': 'utf8', # value should be a dict
}
assert_raises(
run_harvest,
url='http://localhost:%s' % mock_ckan.PORT,
harvester=CKANHarvester(),
config=json.dumps(config))

View File

@ -10,7 +10,8 @@ def run_harvest(url, harvester, config=''):
Queues are avoided as they are a pain in tests. Queues are avoided as they are a pain in tests.
''' '''
# User creates a harvest source # User creates a harvest source
source = HarvestSourceObj(url=url, config=config) source = HarvestSourceObj(url=url, config=config,
source_type=harvester.info()['name'])
# User triggers a harvest, which is the creation of a harvest job. # User triggers a harvest, which is the creation of a harvest job.
# We set run=False so that it doesn't put it on the gather queue. # We set run=False so that it doesn't put it on the gather queue.

View File

@ -36,6 +36,7 @@ setup(
ckan_harvester=ckanext.harvest.harvesters:CKANHarvester ckan_harvester=ckanext.harvest.harvesters:CKANHarvester
[ckan.test_plugins] [ckan.test_plugins]
test_harvester=ckanext.harvest.tests.test_queue:MockHarvester test_harvester=ckanext.harvest.tests.test_queue:MockHarvester
test_harvester2=ckanext.harvest.tests.test_queue2:MockHarvester
test_action_harvester=ckanext.harvest.tests.test_action:MockHarvesterForActionTests test_action_harvester=ckanext.harvest.tests.test_action:MockHarvesterForActionTests
[paste.paster_command] [paste.paster_command]
harvester = ckanext.harvest.commands.harvester:Harvester harvester = ckanext.harvest.commands.harvester:Harvester

View File

@ -15,7 +15,7 @@ port = 5000
use = config:../ckan/test-core.ini use = config:../ckan/test-core.ini
# Here we hard-code the database and a flag to make default tests # Here we hard-code the database and a flag to make default tests
# run fast. # run fast.
ckan.plugins = harvest ckan_harvester test_harvester test_action_harvester ckan.plugins = harvest ckan_harvester test_harvester test_harvester2 test_action_harvester
ckan.harvest.mq.type = redis ckan.harvest.mq.type = redis
ckan.legacy_templates = false ckan.legacy_templates = false
# NB: other test configuration should go in test-core.ini, which is # NB: other test configuration should go in test-core.ini, which is