Merge branch 'yhteentoimivuuspalvelut-job-reporting-fixes'

This commit is contained in:
amercader 2015-11-20 12:20:32 +00:00
commit 36babfb26a
14 changed files with 265 additions and 36 deletions

View File

@ -430,17 +430,24 @@ following methods::
''' '''
The import stage will receive a HarvestObject object and will be The import stage will receive a HarvestObject object and will be
responsible for: responsible for:
- performing any necessary action with the fetched object (e.g - performing any necessary action with the fetched object (e.g.
create a CKAN package). create, update or delete a CKAN package).
Note: if this stage creates or updates a package, a reference Note: if this stage creates or updates a package, a reference
to the package should be added to the HarvestObject. to the package should be added to the HarvestObject.
- creating the HarvestObject - Package relation (if necessary) - setting the HarvestObject.package (if there is one)
- setting the HarvestObject.current for this harvest:
- True if successfully created/updated
- False if successfully deleted
- setting HarvestObject.current to False for previous harvest
objects of this harvest source if the action was successful.
- creating and storing any suitable HarvestObjectErrors that may - creating and storing any suitable HarvestObjectErrors that may
occur. occur.
- returning True if everything went as expected, False otherwise. - returning True if the action was done, "unchanged" if nothing
was needed doing after all or False if there were errors.
:param harvest_object: HarvestObject object :param harvest_object: HarvestObject object
:returns: True if everything went right, False if errors were found :returns: True if the action was done, "unchanged" if nothing was
needed doing after all and False if something went wrong.
''' '''

View File

@ -240,6 +240,10 @@ class HarvesterBase(SingletonPlugin):
If the remote server provides the modification date of the remote If the remote server provides the modification date of the remote
package, add it to package_dict['metadata_modified']. package, add it to package_dict['metadata_modified'].
:returns: The same as what import_stage should return. i.e. True if the
create or update occurred ok, 'unchanged' if it didn't need
updating or False if there were errors.
TODO: Not sure it is worth keeping this function. If useful it should TODO: Not sure it is worth keeping this function. If useful it should
use the output of package_show logic function (maybe keeping support use the output of package_show logic function (maybe keeping support
@ -280,7 +284,10 @@ class HarvesterBase(SingletonPlugin):
data_dict = {} data_dict = {}
data_dict['id'] = package_dict['id'] data_dict['id'] = package_dict['id']
try: try:
existing_package_dict = get_action('package_show')(context, data_dict) package_show_context = {'model': model, 'session': Session,
'ignore_auth': True}
existing_package_dict = get_action('package_show')(
package_show_context, data_dict)
# In case name has been modified when first importing. See issue #101. # In case name has been modified when first importing. See issue #101.
package_dict['name'] = existing_package_dict['name'] package_dict['name'] = existing_package_dict['name']
@ -297,7 +304,8 @@ class HarvesterBase(SingletonPlugin):
else: else:
log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid) log.info('Package with GUID %s not updated, skipping...' % harvest_object.guid)
return # NB harvest_object.current/package_id are not set
return 'unchanged'
# Flag the other objects linking to this package as not current anymore # Flag the other objects linking to this package as not current anymore
from ckanext.harvest.model import harvest_object_table from ckanext.harvest.model import harvest_object_table

View File

@ -206,7 +206,7 @@ class CKANHarvester(HarvesterBase):
package_ids = revision['packages'] package_ids = revision['packages']
else: else:
log.info('No packages have been updated on the remote CKAN instance since the last harvest job') log.info('No packages have been updated on the remote CKAN instance since the last harvest job')
return None return []
except urllib2.HTTPError,e: except urllib2.HTTPError,e:
if e.getcode() == 400: if e.getcode() == 400:
@ -427,14 +427,20 @@ class CKANHarvester(HarvesterBase):
package_dict['extras'][key] = value package_dict['extras'][key] = value
# Clear remote url_type for resources (eg datastore, upload) as we
# are only creating normal resources with links to the remote ones
for resource in package_dict.get('resources', []): for resource in package_dict.get('resources', []):
# Clear remote url_type for resources (eg datastore, upload) as
# we are only creating normal resources with links to the
# remote ones
resource.pop('url_type', None) resource.pop('url_type', None)
# Clear revision_id as the revision won't exist on this CKAN
# and saving it will cause an IntegrityError with the foreign
# key.
resource.pop('revision_id', None)
result = self._create_or_update_package(package_dict,harvest_object) result = self._create_or_update_package(package_dict,harvest_object)
if result and self.config.get('read_only',False) == True: if result is True and self.config.get('read_only', False) is True:
package = model.Package.get(package_dict['id']) package = model.Package.get(package_dict['id'])
@ -451,8 +457,7 @@ class CKANHarvester(HarvesterBase):
user = model.User.get(user_name) user = model.User.get(user_name)
pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER) pkg_role = model.PackageRole(package=package, user=user, role=model.Role.READER)
return result
return True
except ValidationError,e: except ValidationError,e:
self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict), self._save_object_error('Invalid package with GUID %s: %r' % (harvest_object.guid, e.error_dict),
harvest_object, 'Import') harvest_object, 'Import')

View File

@ -106,15 +106,22 @@ class IHarvester(Interface):
''' '''
The import stage will receive a HarvestObject object and will be The import stage will receive a HarvestObject object and will be
responsible for: responsible for:
- performing any necessary action with the fetched object (e.g - performing any necessary action with the fetched object (e.g.
create a CKAN package). create, update or delete a CKAN package).
Note: if this stage creates or updates a package, a reference Note: if this stage creates or updates a package, a reference
to the package should be added to the HarvestObject. to the package should be added to the HarvestObject.
- creating the HarvestObject - Package relation (if necessary) - setting the HarvestObject.package (if there is one)
- setting the HarvestObject.current for this harvest:
- True if successfully created/updated
- False if successfully deleted
- setting HarvestObject.current to False for previous harvest
objects of this harvest source if the action was successful.
- creating and storing any suitable HarvestObjectErrors that may - creating and storing any suitable HarvestObjectErrors that may
occur. occur.
- returning True if everything went as expected, False otherwise. - returning True if the action was done, "unchanged" if nothing
was needed doing after all or False if there were errors.
:param harvest_object: HarvestObject object :param harvest_object: HarvestObject object
:returns: True if everything went right, False if errors were found :returns: True if the action was done, "unchanged" if nothing was
needed doing after all and False if something went wrong.
''' '''

View File

@ -419,6 +419,8 @@ def harvest_jobs_run(context, data_dict):
.first() .first()
if last_object: if last_object:
job_obj.finished = last_object.import_finished job_obj.finished = last_object.import_finished
else:
job_obj.finished = job['gather_finished']
job_obj.save() job_obj.save()
# Reindex the harvest source dataset so it has the latest # Reindex the harvest source dataset so it has the latest
# status # status

View File

@ -37,7 +37,8 @@ def harvest_job_dictize(job, context):
func.count(HarvestObject.id).label('total_objects'))\ func.count(HarvestObject.id).label('total_objects'))\
.filter_by(harvest_job_id=job.id)\ .filter_by(harvest_job_id=job.id)\
.group_by(HarvestObject.report_status).all() .group_by(HarvestObject.report_status).all()
out['stats'] = {'added': 0, 'updated': 0, 'errors': 0, 'deleted': 0} out['stats'] = {'added': 0, 'updated': 0, 'not modified': 0,
'errors': 0, 'deleted': 0}
for status, count in stats: for status, count in stats:
out['stats'][status] = count out['stats'][status] = count

View File

@ -218,7 +218,7 @@ def define_harvester_tables():
Column('guid', types.UnicodeText, default=u''), Column('guid', types.UnicodeText, default=u''),
# When you harvest a dataset multiple times, only the latest # When you harvest a dataset multiple times, only the latest
# successfully imported harvest_object should be flagged 'current'. # successfully imported harvest_object should be flagged 'current'.
# The import_stage reads and writes it. # The import_stage usually reads and writes it.
Column('current',types.Boolean,default=False), Column('current',types.Boolean,default=False),
Column('gathered', types.DateTime, default=datetime.datetime.utcnow), Column('gathered', types.DateTime, default=datetime.datetime.utcnow),
Column('fetch_started', types.DateTime), Column('fetch_started', types.DateTime),
@ -233,6 +233,7 @@ def define_harvester_tables():
Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')), Column('harvest_job_id', types.UnicodeText, ForeignKey('harvest_job.id')),
Column('harvest_source_id', types.UnicodeText, ForeignKey('harvest_source.id')), Column('harvest_source_id', types.UnicodeText, ForeignKey('harvest_source.id')),
Column('package_id', types.UnicodeText, ForeignKey('package.id', deferrable=True), nullable=True), Column('package_id', types.UnicodeText, ForeignKey('package.id', deferrable=True), nullable=True),
# report_status: 'added', 'updated', 'not modified', 'deleted', 'errored'
Column('report_status', types.UnicodeText, nullable=True), Column('report_status', types.UnicodeText, nullable=True),
) )

View File

@ -347,14 +347,16 @@ def fetch_and_import_stages(harvester, obj):
obj.import_finished = datetime.datetime.utcnow() obj.import_finished = datetime.datetime.utcnow()
if success_import: if success_import:
obj.state = "COMPLETE" obj.state = "COMPLETE"
if success_import is 'unchanged':
obj.report_status = 'not modified'
obj.save()
return
else: else:
obj.state = "ERROR" obj.state = "ERROR"
obj.save() obj.save()
else: else:
obj.state = "ERROR" obj.state = "ERROR"
obj.save() obj.save()
if obj.report_status:
return
if obj.state == 'ERROR': if obj.state == 'ERROR':
obj.report_status = 'errored' obj.report_status = 'errored'
elif obj.current == False: elif obj.current == False:

View File

@ -440,7 +440,7 @@ REVISIONS = [
"approved_timestamp": None, "approved_timestamp": None,
"packages": "packages":
[ [
"dataset1" DATASETS[1]['name']
], ],
"groups": [ ] "groups": [ ]
}, },
@ -452,7 +452,7 @@ REVISIONS = [
"approved_timestamp": None, "approved_timestamp": None,
"packages": "packages":
[ [
"dataset1" DATASETS[1]['name']
], ],
"groups": [ ] "groups": [ ]
}] }]

View File

@ -1,6 +1,9 @@
import copy
from nose.tools import assert_equal from nose.tools import assert_equal
import json import json
from mock import patch
try: try:
from ckan.tests.helpers import reset_db from ckan.tests.helpers import reset_db
from ckan.tests.factories import Organization from ckan.tests.factories import Organization
@ -92,15 +95,21 @@ class TestCkanHarvester(object):
run_harvest( run_harvest(
url='http://localhost:%s/' % mock_ckan.PORT, url='http://localhost:%s/' % mock_ckan.PORT,
harvester=CKANHarvester()) harvester=CKANHarvester())
results_by_guid = run_harvest(
url='http://localhost:%s/' % mock_ckan.PORT, # change the modified date
harvester=CKANHarvester()) datasets = copy.deepcopy(mock_ckan.DATASETS)
datasets[1]['metadata_modified'] = '2050-05-09T22:00:01.486366'
with patch('ckanext.harvest.tests.harvesters.mock_ckan.DATASETS',
datasets):
results_by_guid = run_harvest(
url='http://localhost:%s/' % mock_ckan.PORT,
harvester=CKANHarvester())
# updated the dataset which has revisions # updated the dataset which has revisions
result = results_by_guid['dataset1'] result = results_by_guid[mock_ckan.DATASETS[1]['name']]
assert_equal(result['state'], 'COMPLETE') assert_equal(result['state'], 'COMPLETE')
assert_equal(result['report_status'], 'updated') assert_equal(result['report_status'], 'updated')
assert_equal(result['dataset']['name'], mock_ckan.DATASETS[0]['name']) assert_equal(result['dataset']['name'], mock_ckan.DATASETS[1]['name'])
assert_equal(result['errors'], []) assert_equal(result['errors'], [])
# the other dataset is unchanged and not harvested # the other dataset is unchanged and not harvested
@ -134,3 +143,20 @@ class TestCkanHarvester(object):
config=json.dumps(config)) config=json.dumps(config))
assert 'dataset1-id' in results_by_guid assert 'dataset1-id' in results_by_guid
assert mock_ckan.DATASETS[1]['id'] not in results_by_guid assert mock_ckan.DATASETS[1]['id'] not in results_by_guid
def test_harvest_not_modified(self):
run_harvest(
url='http://localhost:%s/' % mock_ckan.PORT,
harvester=CKANHarvester())
results_by_guid = run_harvest(
url='http://localhost:%s/' % mock_ckan.PORT,
harvester=CKANHarvester())
# The metadata_modified was the same for this dataset so the import
# would have returned 'unchanged'
result = results_by_guid[mock_ckan.DATASETS[1]['name']]
assert_equal(result['state'], 'COMPLETE')
assert_equal(result['report_status'], 'not modified')
assert 'dataset' not in result
assert_equal(result['errors'], [])

View File

@ -43,7 +43,8 @@ def run_harvest_job(job, harvester):
if harvest_object.state == 'COMPLETE' and harvest_object.package_id: if harvest_object.state == 'COMPLETE' and harvest_object.package_id:
results_by_guid[guid]['dataset'] = \ results_by_guid[guid]['dataset'] = \
toolkit.get_action('package_show')( toolkit.get_action('package_show')(
{}, dict(id=harvest_object.package_id)) {'ignore_auth': True},
dict(id=harvest_object.package_id))
results_by_guid[guid]['errors'] = harvest_object.errors results_by_guid[guid]['errors'] = harvest_object.errors
# Do 'harvest_jobs_run' to change the job status to 'finished' # Do 'harvest_jobs_run' to change the job status to 'finished'

View File

@ -13,7 +13,7 @@ from ckan import model
from nose.tools import assert_equal from nose.tools import assert_equal
class TestHarvester(SingletonPlugin): class MockHarvester(SingletonPlugin):
implements(IHarvester) implements(IHarvester)
def info(self): def info(self):
return {'name': 'test', 'title': 'test', 'description': 'test'} return {'name': 'test', 'title': 'test', 'description': 'test'}
@ -196,14 +196,14 @@ class TestHarvestQueue(object):
) )
assert_equal(harvest_job['status'], u'Finished') assert_equal(harvest_job['status'], u'Finished')
assert_equal(harvest_job['stats'], {'added': 3, 'updated': 0, 'errors': 0, 'deleted': 0}) assert_equal(harvest_job['stats'], {'added': 3, 'updated': 0, 'not modified': 0, 'errors': 0, 'deleted': 0})
harvest_source_dict = logic.get_action('harvest_source_show')( harvest_source_dict = logic.get_action('harvest_source_show')(
context, context,
{'id': harvest_source['id']} {'id': harvest_source['id']}
) )
assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 3, 'updated': 0, 'errors': 0, 'deleted': 0}) assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 3, 'updated': 0, 'not modified': 0, 'errors': 0, 'deleted': 0})
assert_equal(harvest_source_dict['status']['total_datasets'], 3) assert_equal(harvest_source_dict['status']['total_datasets'], 3)
assert_equal(harvest_source_dict['status']['job_count'], 1) assert_equal(harvest_source_dict['status']['job_count'], 1)
@ -267,7 +267,7 @@ class TestHarvestQueue(object):
context, context,
{'id': job_id} {'id': job_id}
) )
assert_equal(harvest_job['stats'], {'added': 0, 'updated': 2, 'errors': 0, 'deleted': 1}) assert_equal(harvest_job['stats'], {'added': 0, 'updated': 2, 'not modified': 0, 'errors': 0, 'deleted': 1})
context['detailed'] = True context['detailed'] = True
harvest_source_dict = logic.get_action('harvest_source_show')( harvest_source_dict = logic.get_action('harvest_source_show')(
@ -275,6 +275,6 @@ class TestHarvestQueue(object):
{'id': harvest_source['id']} {'id': harvest_source['id']}
) )
assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 0, 'updated': 2, 'errors': 0, 'deleted': 1}) assert_equal(harvest_source_dict['status']['last_job']['stats'], {'added': 0, 'updated': 2, 'not modified': 0, 'errors': 0, 'deleted': 1})
assert_equal(harvest_source_dict['status']['total_datasets'], 2) assert_equal(harvest_source_dict['status']['total_datasets'], 2)
assert_equal(harvest_source_dict['status']['job_count'], 2) assert_equal(harvest_source_dict['status']['job_count'], 2)

View File

@ -0,0 +1,169 @@
'''Tests elements of queue.py, but doesn't use the queue subsystem
(redis/rabbitmq)
'''
import json
from nose.tools import assert_equal
try:
from ckan.tests.helpers import reset_db
except ImportError:
from ckan.new_tests.helpers import reset_db
from ckan import model
from ckan import plugins as p
from ckan.plugins import toolkit
from ckanext.harvest.tests.factories import (HarvestObjectObj)
from ckanext.harvest.interfaces import IHarvester
import ckanext.harvest.model as harvest_model
from ckanext.harvest.tests.lib import run_harvest
class MockHarvester(p.SingletonPlugin):
p.implements(IHarvester)
@classmethod
def _set_test_params(cls, guid, **test_params):
cls._guid = guid
cls._test_params = test_params
def info(self):
return {'name': 'test', 'title': 'test', 'description': 'test'}
def gather_stage(self, harvest_job):
obj = HarvestObjectObj(guid=self._guid, job=harvest_job)
return [obj.id]
def fetch_stage(self, harvest_object):
harvest_object.content = json.dumps({'name': harvest_object.guid})
harvest_object.save()
return True
def import_stage(self, harvest_object):
user = toolkit.get_action('get_site_user')(
{'model': model, 'ignore_auth': True}, {}
)['name']
package = json.loads(harvest_object.content)
name = package['name']
package_object = model.Package.get(name)
if package_object:
logic_function = 'package_update'
else:
logic_function = 'package_create'
package_dict = toolkit.get_action(logic_function)(
{'model': model, 'session': model.Session,
'user': user},
json.loads(harvest_object.content)
)
if self._test_params.get('object_error'):
return False
# successful, so move 'current' to this object
previous_object = model.Session.query(harvest_model.HarvestObject) \
.filter_by(guid=harvest_object.guid) \
.filter_by(current=True) \
.first()
if previous_object:
previous_object.current = False
previous_object.save()
harvest_object.package_id = package_dict['id']
harvest_object.current = True
if self._test_params.get('delete'):
# 'current=False' is the key step in getting report_status to be
# set as 'deleted'
harvest_object.current = False
package_object.save()
harvest_object.save()
if self._test_params.get('object_unchanged'):
return 'unchanged'
return True
class TestEndStates(object):
@classmethod
def setup_class(cls):
reset_db()
harvest_model.setup()
def test_create_dataset(self):
guid = 'obj-create'
MockHarvester._set_test_params(guid=guid)
results_by_guid = run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
result = results_by_guid[guid]
assert_equal(result['state'], 'COMPLETE')
assert_equal(result['report_status'], 'added')
assert_equal(result['errors'], [])
def test_update_dataset(self):
guid = 'obj-update'
MockHarvester._set_test_params(guid=guid)
# create the original harvest_object and dataset
run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
# update it
results_by_guid = run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
result = results_by_guid[guid]
assert_equal(result['state'], 'COMPLETE')
assert_equal(result['report_status'], 'updated')
assert_equal(result['errors'], [])
def test_delete_dataset(self):
guid = 'obj-delete'
MockHarvester._set_test_params(guid=guid)
# create the original harvest_object and dataset
run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
MockHarvester._set_test_params(guid=guid, delete=True)
# delete it
results_by_guid = run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
result = results_by_guid[guid]
assert_equal(result['state'], 'COMPLETE')
assert_equal(result['report_status'], 'deleted')
assert_equal(result['errors'], [])
def test_obj_error(self):
guid = 'obj-error'
MockHarvester._set_test_params(guid=guid, object_error=True)
results_by_guid = run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
result = results_by_guid[guid]
assert_equal(result['state'], 'ERROR')
assert_equal(result['report_status'], 'errored')
assert_equal(result['errors'], [])
def test_unchanged(self):
guid = 'obj-error'
MockHarvester._set_test_params(guid=guid, object_unchanged=True)
results_by_guid = run_harvest(
url='http://some-url.com',
harvester=MockHarvester())
result = results_by_guid[guid]
assert_equal(result['state'], 'COMPLETE')
assert_equal(result['report_status'], 'not modified')
assert_equal(result['errors'], [])

View File

@ -35,7 +35,7 @@ setup(
harvest=ckanext.harvest.plugin:Harvest harvest=ckanext.harvest.plugin:Harvest
ckan_harvester=ckanext.harvest.harvesters:CKANHarvester ckan_harvester=ckanext.harvest.harvesters:CKANHarvester
[ckan.test_plugins] [ckan.test_plugins]
test_harvester=ckanext.harvest.tests.test_queue:TestHarvester test_harvester=ckanext.harvest.tests.test_queue:MockHarvester
test_action_harvester=ckanext.harvest.tests.test_action:MockHarvesterForActionTests test_action_harvester=ckanext.harvest.tests.test_action:MockHarvesterForActionTests
[paste.paster_command] [paste.paster_command]
harvester = ckanext.harvest.commands.harvester:Harvester harvester = ckanext.harvest.commands.harvester:Harvester