[#107] "unchanged" response tested and related fixes
* fix "existing_package_dict" which wasn't containing metadata_modified (because of the schema in the context) so you never skipped an object. * fix IntegrityError due to resource revision_id being harvested. No idea why this hasn't caused errors before now. * "unchanged" is now checked in base instead of ckanharvester - makes sense. Looking at other harvesters, it's normal to return from the import_stage with the value returned from base._create_or_update_package so I've continued with that. * "unchanged" response is now documented * better report_status tests in test_queue2.
This commit is contained in:
parent
e59760fefe
commit
c7fac36c1c
17
README.rst
17
README.rst
|
@ -430,17 +430,24 @@ following methods::
|
|||
'''
|
||||
The import stage will receive a HarvestObject object and will be
|
||||
responsible for:
|
||||
- performing any necessary action with the fetched object (e.g
|
||||
create a CKAN package).
|
||||
- performing any necessary action with the fetched object (e.g.
|
||||
create, update or delete a CKAN package).
|
||||
Note: if this stage creates or updates a package, a reference
|
||||
to the package should be added to the HarvestObject.
|
||||
- creating the HarvestObject - Package relation (if necessary)
|
||||
- setting the HarvestObject.package (if there is one)
|
||||
- setting the HarvestObject.current for this harvest:
|
||||
- True if successfully created/updated
|
||||
- False if successfully deleted
|
||||
- setting HarvestObject.current to False for previous harvest
|
||||
objects of this harvest source if the action was successful.
|
||||
- creating and storing any suitable HarvestObjectErrors that may
|
||||
occur.
|
||||
- returning True if everything went as expected, False otherwise.
|
||||
- returning True if the action was done, "unchanged" if nothing
|
||||
was needed doing after all or False if there were errors.
|
||||
|
||||
:param harvest_object: HarvestObject object
|
||||
:returns: True if everything went right, False if errors were found
|
||||
:returns: True if the action was done, "unchanged" if nothing was
|
||||
needed doing after all and False if something went wrong.
|
||||
'''
|
||||
|
||||
|
||||
|
|
|
@ -240,6 +240,10 @@ class HarvesterBase(SingletonPlugin):
|
|||
If the remote server provides the modification date of the remote
|
||||
package, add it to package_dict['metadata_modified'].
|
||||
|
||||
:returns: The same as what import_stage should return. i.e. True if the
|
||||
create or update occurred ok, 'unchanged' if it didn't need
|
||||
updating or False if there were errors.
|
||||
|
||||
|
||||
TODO: Not sure it is worth keeping this function. If useful it should
|
||||
use the output of package_show logic function (maybe keeping support
|
||||
|
@ -280,7 +284,10 @@ class HarvesterBase(SingletonPlugin):
|
|||
data_dict = {}
|
||||
data_dict['id'] = package_dict['id']
|
||||
try:
|
||||
existing_package_dict = get_action('package_show')(context, data_dict)
|
||||
package_show_context = {'model': model, 'session': Session,
|
||||
'ignore_auth': True}
|
||||
existing_package_dict = get_action('package_show')(
|
||||
package_show_context, data_dict)
|
||||
|
||||
# In case name has been modified when first importing. See issue #101.
|
||||
package_dict['name'] = existing_package_dict['name']
|
||||
|
@ -297,7 +304,8 @@ class HarvesterBase(SingletonPlugin):
|
|||
|
||||
else:
|
||||
log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
|
||||
return
|
||||
# NB harvest_object.current/package_id are not set
|
||||
return 'unchanged'
|
||||
|
||||
# Flag the other objects linking to this package as not current anymore
|
||||
from ckanext.harvest.model import harvest_object_table
|
||||
|
|
|
@ -423,25 +423,20 @@ class CKANHarvester(HarvesterBase):
|
|||
|
||||
package_dict['extras'][key] = value
|
||||
|
||||
# Clear remote url_type for resources (eg datastore, upload) as we
|
||||
# are only creating normal resources with links to the remote ones
|
||||
for resource in package_dict.get('resources', []):
|
||||
# Clear remote url_type for resources (eg datastore, upload) as
|
||||
# we are only creating normal resources with links to the
|
||||
# remote ones
|
||||
resource.pop('url_type', None)
|
||||
|
||||
# Check if package exists
|
||||
data_dict = {}
|
||||
data_dict['id'] = package_dict['id']
|
||||
try:
|
||||
existing_package_dict = get_action('package_show')(context, data_dict)
|
||||
if 'metadata_modified' in package_dict and \
|
||||
package_dict['metadata_modified'] <= existing_package_dict.get('metadata_modified'):
|
||||
return "unchanged"
|
||||
except NotFound:
|
||||
pass
|
||||
# Clear revision_id as the revision won't exist on this CKAN
|
||||
# and saving it will cause an IntegrityError with the foreign
|
||||
# key.
|
||||
resource.pop('revision_id', None)
|
||||
|
||||
result = self._create_or_update_package(package_dict,harvest_object)
|
||||
|
||||
if result and self.config.get('read_only',False) == True:
|
||||
if result is True and self.config.get('read_only', False) is True:
|
||||
|
||||
package = model.Package.get(package_dict['id'])
|
||||
|
||||
|
@ -458,8 +453,7 @@ class CKANHarvester(HarvesterBase):
|
|||
user = model.User.get(user_name)
|
||||
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)
|
||||
|
||||
|
||||
return True
|
||||
return result
|
||||
except ValidationError,e:
|
||||
self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
|
||||
harvest_object, 'Import')
|
||||
|
|
|
@ -106,15 +106,22 @@ class IHarvester(Interface):
|
|||
'''
|
||||
The import stage will receive a HarvestObject object and will be
|
||||
responsible for:
|
||||
- performing any necessary action with the fetched object (e.g
|
||||
create a CKAN package).
|
||||
- performing any necessary action with the fetched object (e.g.
|
||||
create, update or delete a CKAN package).
|
||||
Note: if this stage creates or updates a package, a reference
|
||||
to the package should be added to the HarvestObject.
|
||||
- creating the HarvestObject - Package relation (if necessary)
|
||||
- setting the HarvestObject.package (if there is one)
|
||||
- setting the HarvestObject.current for this harvest:
|
||||
- True if successfully created/updated
|
||||
- False if successfully deleted
|
||||
- setting HarvestObject.current to False for previous harvest
|
||||
objects of this harvest source if the action was successful.
|
||||
- creating and storing any suitable HarvestObjectErrors that may
|
||||
occur.
|
||||
- returning True if everything went as expected, False otherwise.
|
||||
- returning True if the action was done, "unchanged" if nothing
|
||||
was needed doing after all or False if there were errors.
|
||||
|
||||
:param harvest_object: HarvestObject object
|
||||
:returns: True if everything went right, False if errors were found
|
||||
:returns: True if the action was done, "unchanged" if nothing was
|
||||
needed doing after all and False if something went wrong.
|
||||
'''
|
||||
|
|
|
@ -218,7 +218,7 @@ def define_harvester_tables():
|
|||
Column('guid', types.UnicodeText, default=u''),
|
||||
# When you harvest a dataset multiple times, only the latest
|
||||
# successfully imported harvest_object should be flagged 'current'.
|
||||
# The import_stage reads and writes it.
|
||||
# The import_stage usually reads and writes it.
|
||||
Column('current',types.Boolean,default=False),
|
||||
Column('gathered', types.DateTime, default=datetime.datetime.utcnow),
|
||||
Column('fetch_started', types.DateTime),
|
||||
|
@ -233,6 +233,7 @@ def define_harvester_tables():
|
|||
Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')),
|
||||
Column('harvest_source_id', types.UnicodeText, ForeignKey('harvest_source.id')),
|
||||
Column('package_id', types.UnicodeText, ForeignKey('package.id', deferrable=True), nullable=True),
|
||||
# report_status: 'added', 'updated', 'not modified', 'deleted', 'errored'
|
||||
Column('report_status', types.UnicodeText, nullable=True),
|
||||
)
|
||||
|
||||
|
|
|
@ -440,7 +440,7 @@ REVISIONS = [
|
|||
"approved_timestamp": None,
|
||||
"packages":
|
||||
[
|
||||
"dataset1"
|
||||
DATASETS[1]['name']
|
||||
],
|
||||
"groups": [ ]
|
||||
},
|
||||
|
@ -452,7 +452,7 @@ REVISIONS = [
|
|||
"approved_timestamp": None,
|
||||
"packages":
|
||||
[
|
||||
"dataset1"
|
||||
DATASETS[1]['name']
|
||||
],
|
||||
"groups": [ ]
|
||||
}]
|
||||
|
|
|
@ -1,6 +1,9 @@
|
|||
import copy
|
||||
from nose.tools import assert_equal
|
||||
import json
|
||||
|
||||
from mock import patch
|
||||
|
||||
try:
|
||||
from ckan.tests.helpers import reset_db
|
||||
from ckan.tests.factories import Organization
|
||||
|
@ -92,15 +95,21 @@ class TestCkanHarvester(object):
|
|||
run_harvest(
|
||||
url='http://localhost:%s/' % mock_ckan.PORT,
|
||||
harvester=CKANHarvester())
|
||||
results_by_guid = run_harvest(
|
||||
url='http://localhost:%s/' % mock_ckan.PORT,
|
||||
harvester=CKANHarvester())
|
||||
|
||||
# change the modified date
|
||||
datasets = copy.deepcopy(mock_ckan.DATASETS)
|
||||
datasets[1]['metadata_modified'] = '2050-05-09T22:00:01.486366'
|
||||
with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS',
|
||||
datasets):
|
||||
results_by_guid = run_harvest(
|
||||
url='http://localhost:%s/' % mock_ckan.PORT,
|
||||
harvester=CKANHarvester())
|
||||
|
||||
# updated the dataset which has revisions
|
||||
result = results_by_guid['dataset1']
|
||||
result = results_by_guid[mock_ckan.DATASETS[1]['name']]
|
||||
assert_equal(result['state'], 'COMPLETE')
|
||||
assert_equal(result['report_status'], 'updated')
|
||||
assert_equal(result['dataset']['name'], mock_ckan.DATASETS[0]['name'])
|
||||
assert_equal(result['dataset']['name'], mock_ckan.DATASETS[1]['name'])
|
||||
assert_equal(result['errors'], [])
|
||||
|
||||
# the other dataset is unchanged and not harvested
|
||||
|
@ -134,3 +143,20 @@ class TestCkanHarvester(object):
|
|||
config=json.dumps(config))
|
||||
assert 'dataset1-id' in results_by_guid
|
||||
assert mock_ckan.DATASETS[1]['id'] not in results_by_guid
|
||||
|
||||
def test_harvest_not_modified(self):
|
||||
run_harvest(
|
||||
url='http://localhost:%s/' % mock_ckan.PORT,
|
||||
harvester=CKANHarvester())
|
||||
|
||||
results_by_guid = run_harvest(
|
||||
url='http://localhost:%s/' % mock_ckan.PORT,
|
||||
harvester=CKANHarvester())
|
||||
|
||||
# The metadata_modified was the same for this dataset so the import
|
||||
# would have returned 'unchanged'
|
||||
result = results_by_guid[mock_ckan.DATASETS[1]['name']]
|
||||
assert_equal(result['state'], 'COMPLETE')
|
||||
assert_equal(result['report_status'], 'not modified')
|
||||
assert 'dataset' not in result
|
||||
assert_equal(result['errors'], [])
|
||||
|
|
|
@ -43,7 +43,8 @@ def run_harvest_job(job, harvester):
|
|||
if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
|
||||
results_by_guid[guid]['dataset'] = \
|
||||
toolkit.get_action('package_show')(
|
||||
{}, dict(id=harvest_object.package_id))
|
||||
{'ignore_auth': True},
|
||||
dict(id=harvest_object.package_id))
|
||||
results_by_guid[guid]['errors'] = harvest_object.errors
|
||||
|
||||
# Do 'harvest_jobs_run' to change the job status to 'finished'
|
||||
|
|
|
@ -12,7 +12,7 @@ import ckan.logic as logic
|
|||
from ckan import model
|
||||
|
||||
|
||||
class TestHarvester(SingletonPlugin):
|
||||
class MockHarvester(SingletonPlugin):
|
||||
implements(IHarvester)
|
||||
def info(self):
|
||||
return {'name': 'test', 'title': 'test', 'description': 'test'}
|
||||
|
|
|
@ -0,0 +1,169 @@
|
|||
'''Tests elements of queue.py, but doesn't use the queue subsystem
|
||||
(redis/rabbitmq)
|
||||
'''
|
||||
import json
|
||||
|
||||
from nose.tools import assert_equal
|
||||
|
||||
try:
|
||||
from ckan.tests.helpers import reset_db
|
||||
except ImportError:
|
||||
from ckan.new_tests.helpers import reset_db
|
||||
from ckan import model
|
||||
from ckan import plugins as p
|
||||
from ckan.plugins import toolkit
|
||||
|
||||
from ckanext.harvest.tests.factories import (HarvestObjectObj)
|
||||
from ckanext.harvest.interfaces import IHarvester
|
||||
import ckanext.harvest.model as harvest_model
|
||||
from ckanext.harvest.tests.lib import run_harvest
|
||||
|
||||
|
||||
class MockHarvester(p.SingletonPlugin):
|
||||
p.implements(IHarvester)
|
||||
|
||||
@classmethod
|
||||
def _set_test_params(cls, guid, **test_params):
|
||||
cls._guid = guid
|
||||
cls._test_params = test_params
|
||||
|
||||
def info(self):
|
||||
return {'name': 'test', 'title': 'test', 'description': 'test'}
|
||||
|
||||
def gather_stage(self, harvest_job):
|
||||
obj = HarvestObjectObj(guid=self._guid, job=harvest_job)
|
||||
return [obj.id]
|
||||
|
||||
def fetch_stage(self, harvest_object):
|
||||
harvest_object.content = json.dumps({'name': harvest_object.guid})
|
||||
harvest_object.save()
|
||||
return True
|
||||
|
||||
def import_stage(self, harvest_object):
|
||||
user = toolkit.get_action('get_site_user')(
|
||||
{'model': model, 'ignore_auth': True}, {}
|
||||
)['name']
|
||||
|
||||
package = json.loads(harvest_object.content)
|
||||
name = package['name']
|
||||
|
||||
package_object = model.Package.get(name)
|
||||
if package_object:
|
||||
logic_function = 'package_update'
|
||||
else:
|
||||
logic_function = 'package_create'
|
||||
|
||||
package_dict = toolkit.get_action(logic_function)(
|
||||
{'model': model, 'session': model.Session,
|
||||
'user': user},
|
||||
json.loads(harvest_object.content)
|
||||
)
|
||||
|
||||
if self._test_params.get('object_error'):
|
||||
return False
|
||||
|
||||
# successful, so move 'current' to this object
|
||||
previous_object = model.Session.query(harvest_model.HarvestObject) \
|
||||
.filter_by(guid=harvest_object.guid) \
|
||||
.filter_by(current=True) \
|
||||
.first()
|
||||
if previous_object:
|
||||
previous_object.current = False
|
||||
previous_object.save()
|
||||
harvest_object.package_id = package_dict['id']
|
||||
harvest_object.current = True
|
||||
|
||||
if self._test_params.get('delete'):
|
||||
# 'current=False' is the key step in getting report_status to be
|
||||
# set as 'deleted'
|
||||
harvest_object.current = False
|
||||
package_object.save()
|
||||
|
||||
harvest_object.save()
|
||||
|
||||
if self._test_params.get('object_unchanged'):
|
||||
return 'unchanged'
|
||||
return True
|
||||
|
||||
|
||||
class TestEndStates(object):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
reset_db()
|
||||
harvest_model.setup()
|
||||
|
||||
def test_create_dataset(self):
|
||||
guid = 'obj-create'
|
||||
MockHarvester._set_test_params(guid=guid)
|
||||
|
||||
results_by_guid = run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
|
||||
result = results_by_guid[guid]
|
||||
assert_equal(result['state'], 'COMPLETE')
|
||||
assert_equal(result['report_status'], 'added')
|
||||
assert_equal(result['errors'], [])
|
||||
|
||||
def test_update_dataset(self):
|
||||
guid = 'obj-update'
|
||||
MockHarvester._set_test_params(guid=guid)
|
||||
|
||||
# create the original harvest_object and dataset
|
||||
run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
# update it
|
||||
results_by_guid = run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
|
||||
result = results_by_guid[guid]
|
||||
assert_equal(result['state'], 'COMPLETE')
|
||||
assert_equal(result['report_status'], 'updated')
|
||||
assert_equal(result['errors'], [])
|
||||
|
||||
def test_delete_dataset(self):
|
||||
guid = 'obj-delete'
|
||||
MockHarvester._set_test_params(guid=guid)
|
||||
# create the original harvest_object and dataset
|
||||
run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
MockHarvester._set_test_params(guid=guid, delete=True)
|
||||
|
||||
# delete it
|
||||
results_by_guid = run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
|
||||
result = results_by_guid[guid]
|
||||
assert_equal(result['state'], 'COMPLETE')
|
||||
assert_equal(result['report_status'], 'deleted')
|
||||
assert_equal(result['errors'], [])
|
||||
|
||||
def test_obj_error(self):
|
||||
guid = 'obj-error'
|
||||
MockHarvester._set_test_params(guid=guid, object_error=True)
|
||||
|
||||
results_by_guid = run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
|
||||
result = results_by_guid[guid]
|
||||
assert_equal(result['state'], 'ERROR')
|
||||
assert_equal(result['report_status'], 'errored')
|
||||
assert_equal(result['errors'], [])
|
||||
|
||||
def test_unchanged(self):
|
||||
guid = 'obj-error'
|
||||
MockHarvester._set_test_params(guid=guid, object_unchanged=True)
|
||||
|
||||
results_by_guid = run_harvest(
|
||||
url='http://some-url.com',
|
||||
harvester=MockHarvester())
|
||||
|
||||
result = results_by_guid[guid]
|
||||
assert_equal(result['state'], 'COMPLETE')
|
||||
assert_equal(result['report_status'], 'not modified')
|
||||
assert_equal(result['errors'], [])
|
2
setup.py
2
setup.py
|
@ -35,7 +35,7 @@ setup(
|
|||
harvest=ckanext.harvest.plugin:Harvest
|
||||
ckan_harvester=ckanext.harvest.harvesters:CKANHarvester
|
||||
[ckan.test_plugins]
|
||||
test_harvester=ckanext.harvest.tests.test_queue:TestHarvester
|
||||
test_harvester=ckanext.harvest.tests.test_queue:MockHarvester
|
||||
test_action_harvester=ckanext.harvest.tests.test_action:MockHarvesterForActionTests
|
||||
[paste.paster_command]
|
||||
harvester = ckanext.harvest.commands.harvester:Harvester
|
||||
|
|
Loading…
Reference in New Issue