diff --git a/README.rst b/README.rst index 421861c..45fc386 100644 --- a/README.rst +++ b/README.rst @@ -289,12 +289,11 @@ field. The currently supported configuration options are: the CKAN API. Default is 2. * default_tags: A list of tags that will be added to all harvested datasets. - Tags don't need to previously exist. + Tags don't need to previously exist. This field takes a list of tag dicts + (see example), which allows you to optinally specify a vocabulary. -* default_groups: A list of groups to which the harvested datasets will be - added to. The groups must exist. Note that you must use ids or names to - define the groups according to the API version you defined (names for version - 1, ids for version 2). +* default_groups: A list of group IDs or names to which the harvested datasets + will be added to. The groups must exist. * default_extras: A dictionary of key value pairs that will be added to extras of the harvested datasets. You can use the following replacement strings, @@ -367,9 +366,9 @@ the configuration field):: { "api_version": 1, - "default_tags":["new-tag-1","new-tag-2"], - "default_groups":["my-own-group"], - "default_extras":{"new_extra":"Test","harvest_url":"{harvest_source_url}/dataset/{dataset_id}"}, + "default_tags": [{"name": "geo"}, {"name": "namibia"], + "default_groups": ["science", "spend-data"], + "default_extras": {"encoding":"utf8", "harvest_url": "{harvest_source_url}/dataset/{dataset_id}"}, "override_extras": true, "organizations_filter_include": [], "organizations_filter_exclude": ["remote-organization"], diff --git a/ckanext/harvest/harvesters/ckanharvester.py b/ckanext/harvest/harvesters/ckanharvester.py index f533147..fde710a 100644 --- a/ckanext/harvest/harvesters/ckanharvester.py +++ b/ckanext/harvest/harvesters/ckanharvester.py @@ -6,7 +6,6 @@ import socket from sqlalchemy import exists -from ckan.lib.base import c from ckan import model from ckan.logic import ValidationError, NotFound, get_action from ckan.lib.helpers import json @@ -68,9 +67,7 @@ class CKANHarvester(HarvesterBase): data = json.loads(content) if self.action_api_version == 3: return data.pop('result') - return data - except (ContentFetchError, ValueError): log.debug('Could not fetch/decode remote group') raise RemoteResourceError('Could not fetch/decode remote group') @@ -121,17 +118,28 @@ class CKANHarvester(HarvesterBase): if 'default_tags' in config_obj: if not isinstance(config_obj['default_tags'], list): raise ValueError('default_tags must be a list') + if config_obj['default_tags'] and \ + not isinstance(config_obj['default_tags'][0], dict): + raise ValueError('default_tags must be a list of ' + 'dictionaries') if 'default_groups' in config_obj: if not isinstance(config_obj['default_groups'], list): - raise ValueError('default_groups must be a list') + raise ValueError('default_groups must be a *list* of group' + ' names/ids') + if config_obj['default_groups'] and \ + not isinstance(config_obj['default_groups'][0], str): + raise ValueError('default_groups must be a list of group ' + 'names/ids (i.e. strings)') # Check if default groups exist - context = {'model': model, 'user': c.user} - for group_name in config_obj['default_groups']: + context = {'model': model, 'user': toolkit.c.user} + self.default_group_dicts = [] + for group_name_or_id in config_obj['default_groups']: try: group = get_action('group_show')( - context, {'id': group_name}) + context, {'id': group_name_or_id}) + self.default_group_dicts.append(group) except NotFound, e: raise ValueError('Default group not found') @@ -141,7 +149,7 @@ class CKANHarvester(HarvesterBase): if 'user' in config_obj: # Check if user exists - context = {'model': model, 'user': c.user} + context = {'model': model, 'user': toolkit.c.user} try: user = get_action('user_show')( context, {'id': config_obj.get('user')}) @@ -481,9 +489,10 @@ class CKANHarvester(HarvesterBase): if default_groups: if not 'groups' in package_dict: package_dict['groups'] = [] + existing_group_ids = [g['id'] for g in package_dict['groups']] package_dict['groups'].extend( - [g for g in default_groups - if g not in package_dict['groups']]) + [g for g in self.default_group_dicts + if g['id'] not in existing_group_ids]) # Set default extras if needed default_extras = self.config.get('default_extras', {}) diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index 158a017..e5b41fd 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -197,8 +197,16 @@ class HarvestObjectError(HarvestDomainObject): stage=stage, line=line) try: err.save() - except InvalidRequestError: - Session.rollback() + except InvalidRequestError, e: + # Clear any in-progress sqlalchemy transactions + try: + Session.rollback() + except: + pass + try: + Session.remove() + except: + pass err.save() finally: log_message = '{0}, line {1}'.format(message, line) \ @@ -578,7 +586,7 @@ def migrate_v3_create_datasets(source_ids=None): log.info('Created new package for source {0} ({1})'.format(source.id, source.url)) except logic.ValidationError,e: log.error('Validation Error: %s' % str(e.error_summary)) - + def clean_harvest_log(condition): Session.query(HarvestLog).filter(HarvestLog.created <= condition)\ .delete(synchronize_session=False) @@ -587,5 +595,5 @@ def clean_harvest_log(condition): except InvalidRequestError: Session.rollback() log.error('An error occurred while trying to clean-up the harvest log table') - + log.info('Harvest log table clean-up finished successfully') diff --git a/ckanext/harvest/tests/harvesters/test_ckanharvester.py b/ckanext/harvest/tests/harvesters/test_ckanharvester.py index d8c0349..a7c0126 100644 --- a/ckanext/harvest/tests/harvesters/test_ckanharvester.py +++ b/ckanext/harvest/tests/harvesters/test_ckanharvester.py @@ -4,6 +4,7 @@ from nose.tools import assert_equal, assert_raises import json from mock import patch, MagicMock + try: from ckan.tests.helpers import reset_db, call_action from ckan.tests.factories import Organization, Group @@ -221,3 +222,93 @@ class TestCkanHarvester(object): harvester=CKANHarvester()) assert not results_by_guid assert not was_last_job_considered_error_free() + + def test_default_tags(self): + config = {'default_tags': [{'name': 'geo'}]} + results_by_guid = run_harvest( + url='http://localhost:%s' % mock_ckan.PORT, + harvester=CKANHarvester(), + config=json.dumps(config)) + tags = results_by_guid['dataset1-id']['dataset']['tags'] + tag_names = [tag['name'] for tag in tags] + assert 'geo' in tag_names + + def test_default_tags_invalid(self): + config = {'default_tags': ['geo']} # should be list of dicts + assert_raises( + run_harvest, + url='http://localhost:%s' % mock_ckan.PORT, + harvester=CKANHarvester(), + config=json.dumps(config)) + + def test_default_groups(self): + Group(id='group1-id', name='group1') + Group(id='group2-id', name='group2') + Group(id='group3-id', name='group3') + + config = {'default_groups': ['group2-id', 'group3'], + 'remote_groups': 'only_local'} + tmp_c = toolkit.c + try: + # c.user is used by the validation (annoying), + # however patch doesn't work because it's a weird + # StackedObjectProxy, so we swap it manually + toolkit.c = MagicMock(user='') + results_by_guid = run_harvest( + url='http://localhost:%s' % mock_ckan.PORT, + harvester=CKANHarvester(), + config=json.dumps(config)) + finally: + toolkit.c = tmp_c + assert_equal(results_by_guid['dataset1-id']['errors'], []) + groups = results_by_guid['dataset1-id']['dataset']['groups'] + group_names = set(group['name'] for group in groups) + # group1 comes from the harvested dataset + # group2 & 3 come from the default_groups + assert_equal(group_names, set(('group1', 'group2', 'group3'))) + + def test_default_groups_invalid(self): + Group(id='group2-id', name='group2') + + # should be list of strings + config = {'default_tags': [{'name': 'group2'}]} + assert_raises( + run_harvest, + url='http://localhost:%s' % mock_ckan.PORT, + harvester=CKANHarvester(), + config=json.dumps(config)) + + def test_default_extras(self): + config = { + 'default_extras': { + 'encoding': 'utf8', + 'harvest_url': '{harvest_source_url}/dataset/{dataset_id}' + }} + tmp_c = toolkit.c + try: + # c.user is used by the validation (annoying), + # however patch doesn't work because it's a weird + # StackedObjectProxy, so we swap it manually + toolkit.c = MagicMock(user='') + results_by_guid = run_harvest( + url='http://localhost:%s' % mock_ckan.PORT, + harvester=CKANHarvester(), + config=json.dumps(config)) + finally: + toolkit.c = tmp_c + assert_equal(results_by_guid['dataset1-id']['errors'], []) + extras = results_by_guid['dataset1-id']['dataset']['extras'] + extras_dict = dict((e['key'], e['value']) for e in extras) + assert_equal(extras_dict['encoding'], 'utf8') + assert_equal(extras_dict['harvest_url'], + 'http://localhost:8998/dataset/dataset1-id') + + def test_default_extras_invalid(self): + config = { + 'default_extras': 'utf8', # value should be a dict + } + assert_raises( + run_harvest, + url='http://localhost:%s' % mock_ckan.PORT, + harvester=CKANHarvester(), + config=json.dumps(config)) \ No newline at end of file diff --git a/ckanext/harvest/tests/lib.py b/ckanext/harvest/tests/lib.py index 8b21ec7..e40eb8f 100644 --- a/ckanext/harvest/tests/lib.py +++ b/ckanext/harvest/tests/lib.py @@ -10,7 +10,8 @@ def run_harvest(url, harvester, config=''): Queues are avoided as they are a pain in tests. ''' # User creates a harvest source - source = HarvestSourceObj(url=url, config=config) + source = HarvestSourceObj(url=url, config=config, + source_type=harvester.info()['name']) # User triggers a harvest, which is the creation of a harvest job. # We set run=False so that it doesn't put it on the gather queue. diff --git a/setup.py b/setup.py index 3604e58..ae33712 100644 --- a/setup.py +++ b/setup.py @@ -36,6 +36,7 @@ setup( ckan_harvester=ckanext.harvest.harvesters:CKANHarvester [ckan.test_plugins] test_harvester=ckanext.harvest.tests.test_queue:MockHarvester + test_harvester2=ckanext.harvest.tests.test_queue2:MockHarvester test_action_harvester=ckanext.harvest.tests.test_action:MockHarvesterForActionTests [paste.paster_command] harvester = ckanext.harvest.commands.harvester:Harvester diff --git a/test-core.ini b/test-core.ini index b736803..d18bebc 100644 --- a/test-core.ini +++ b/test-core.ini @@ -15,7 +15,7 @@ port = 5000 use = config:../ckan/test-core.ini # Here we hard-code the database and a flag to make default tests # run fast. -ckan.plugins = harvest ckan_harvester test_harvester test_action_harvester +ckan.plugins = harvest ckan_harvester test_harvester test_harvester2 test_action_harvester ckan.harvest.mq.type = redis ckan.legacy_templates = false # NB: other test configuration should go in test-core.ini, which is