From 102bbecebae9b3a8869ceda153c776d84cf42218 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adri=C3=A0=20Mercader?=
Date: Tue, 15 Mar 2011 10:29:39 +0000
Subject: [PATCH] [tests] Add harvesting tests from ckan core

---
 tests/test_api.py   | 206 ++++++++++++++++++++++++
 tests/test_model.py | 384 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 590 insertions(+)
 create mode 100644 tests/test_api.py
 create mode 100644 tests/test_model.py

diff --git a/tests/test_api.py b/tests/test_api.py
new file mode 100644
index 0000000..920d22f
--- /dev/null
+++ b/tests/test_api.py
@@ -0,0 +1,206 @@
+from ckan.tests.functional.api.base import BaseModelApiTestCase
+from ckan.tests.functional.api.base import Api1TestCase as Version1TestCase
+from ckan.tests.functional.api.base import Api2TestCase as Version2TestCase
+from ckan.tests.functional.api.base import ApiUnversionedTestCase as UnversionedTestCase
+
+# Todo: Remove this ckan.model stuff.
+import ckan.model as model
+from ckanext.harvest.model import HarvestSource
+from ckanext.harvest.model import HarvestingJob
+from ckanext.harvest.model import HarvestedDocument
+
+class HarvestingTestCase(BaseModelApiTestCase):
+
+    commit_changesets = False
+    reuse_common_fixtures = True
+
+    def setup(self):
+        #model.repo.init_db()
+        super(HarvestingTestCase, self).setup()
+        self.source = None
+        self.source1 = None
+        self.source2 = None
+        self.source3 = None
+        self.source4 = None
+        self.source5 = None
+        self.job = None
+        self.job1 = None
+        self.job2 = None
+        self.job3 = None
+
+    def teardown(self):
+        model.repo.delete_all()
+
+    def init_extra_environ(self):
+        self.user = model.User.by_name(self.user_name)
+        self.extra_environ={'Authorization' : config.get('ckan.harvesting.api_key')}  # NOTE(review): `config` is not imported in this file -- verify
+
+    def _create_harvest_source_fixture(self, **kwds):
+        source = HarvestSource(**kwds)
+        model.Session.add(source)
+        model.Session.commit()
+        assert source.id
+        return source
+
+    def _create_harvesting_job_fixture(self, **kwds):
+        if not kwds.get('user_ref'):
+            kwds['user_ref'] = u'c_publisher_user'
+        job = HarvestingJob(**kwds)
+        model.Session.add(job)
+        model.Session.commit()
+        assert job.id
+        return job
+
+    def test_harvestsource_entity_get_ok(self):
+        # Setup harvest source fixture.
+        fixture_url = u'http://localhost/'
+        self.source = self._create_harvest_source_fixture(url=fixture_url)
+        offset = self.offset('/rest/harvestsource/%s' % self.source.id)
+        res = self.app.get(offset, status=[200])
+        source_data = self.data_from_res(res)
+        assert 'url' in source_data, "No 'id' in changeset data: %s" % source_data  # NOTE(review): message says 'id' but the check is for 'url'
+        self.assert_equal(source_data.get('url'), fixture_url)
+
+    def test_harvestsource_entity_get_not_found(self):
+        offset = self.offset('/rest/harvestsource/%s' % "notasource")
+        self.app.get(offset, status=[404])
+
+    def test_publisher_harvestsource_register_get_ok(self):
+        # Setup harvest source fixtures.
+        fixture_url = u'http://localhost/'
+        self.source1 = self._create_harvest_source_fixture(url=fixture_url+'1', publisher_ref=u'pub1')
+        self.source2 = self._create_harvest_source_fixture(url=fixture_url+'2', publisher_ref=u'pub1')
+        self.source3 = self._create_harvest_source_fixture(url=fixture_url+'3', publisher_ref=u'pub1')
+        self.source4 = self._create_harvest_source_fixture(url=fixture_url+'4', publisher_ref=u'pub2')
+        self.source5 = self._create_harvest_source_fixture(url=fixture_url+'5', publisher_ref=u'pub2')
+        offset = self.offset('/rest/harvestsource/publisher/pub1')
+        res = self.app.get(offset, status=[200])
+        source_data = self.data_from_res(res)
+        self.assert_equal(len(source_data), 3)
+        offset = self.offset('/rest/harvestsource/publisher/pub2')
+        res = self.app.get(offset, status=[200])
+        source_data = self.data_from_res(res)
+        self.assert_equal(len(source_data), 2)
+
+    def test_harvestingjob_entity_get_ok(self):
+        # Setup harvesting job fixture.
+        fixture_url = u'http://localhost/6'
+        self.source = self._create_harvest_source_fixture(url=fixture_url)
+        self.job = self._create_harvesting_job_fixture(source_id=self.source.id)
+        offset = self.offset('/rest/harvestingjob/%s' % self.job.id)
+        res = self.app.get(offset, status=[200])
+        job_data = self.data_from_res(res)
+        self.assert_equal(job_data.get('source_id'), self.source.id)
+
+    def test_harvestingjob_entity_get_not_found(self):
+        # No fixture is created: request an id that matches no job.
+        offset = self.offset('/rest/harvestingjob/%s' % "notajob")
+        self.app.get(offset, status=[404])
+
+    def test_harvestingjob_register_post_ok(self):
+        # Setup harvest source fixture.
+        fixture_url = u'http://localhost/7'
+        self.source = self._create_harvest_source_fixture(url=fixture_url)
+        # Prepare and send POST request to register.
+        offset = self.offset('/rest/harvestingjob')
+        # - invalid example: made-up source_id.
+        job_details = {
+            'source_id': 'made-up-source-id',
+            'user_ref': u'a_publisher_user',
+        }
+        assert not HarvestingJob.get(u'a_publisher_user', default=None, attr='user_ref')
+        response = self.post(offset, job_details, status=400)
+        job_error = self.data_from_res(response)
+        assert "does not exist" in job_error
+        assert not HarvestingJob.get(u'a_publisher_user', default=None, attr='user_ref')
+        # - invalid example: empty user_ref.
+        job_details = {
+            'source_id': self.source.id,
+            'user_ref': u'',
+        }
+        assert not HarvestingJob.get(u'a_publisher_user', None, 'user_ref')
+        response = self.post(offset, job_details, status=400)
+        job_error = self.data_from_res(response)
+        assert "You must supply a user_ref" in job_error
+        assert not HarvestingJob.get(self.source.id, default=None, attr='source_id')
+        # - valid example.
+        job_details = {
+            'source_id': self.source.id,
+            'user_ref': u'a_publisher_user',
+        }
+        assert not HarvestingJob.get(u'a_publisher_user', None, 'user_ref')
+        response = self.post(offset, job_details)
+        new_job = self.data_from_res(response)
+        assert new_job['id']
+        self.assert_equal(new_job['source_id'], self.source.id)
+        self.assert_equal(new_job['user_ref'], u'a_publisher_user')
+        self.job = HarvestingJob.get(self.source.id, attr='source_id')
+        HarvestingJob.get(u'a_publisher_user', attr='user_ref')  # no default passed; presumably raises if the job is missing -- verify
+
+    def test_harvestingjob_register_get_filter_by_status(self):
+        # Setup harvest source and job fixtures.
+        fixture_url = u'http://localhost/8'
+        self.source = self._create_harvest_source_fixture(url=fixture_url)
+        self.job = self._create_harvesting_job_fixture(source_id=self.source.id)
+        register_offset = self.offset('/rest/harvestingjob')
+        self.assert_equal(self.job.status, 'New')
+
+        filter_offset = '/status/new'
+        offset = register_offset + filter_offset
+        res = self.get(offset)
+        data = self.data_from_res(res)
+        self.assert_equal(data, [self.job.id])
+
+        filter_offset = '/status/error'
+        offset = register_offset + filter_offset
+        res = self.get(offset)
+        data = self.data_from_res(res)
+        self.assert_equal(data, [])
+
+        self.job.status = u'Error'
+        self.job.save()
+        res = self.get(offset)
+        data = self.data_from_res(res)
+        self.assert_equal(data, [self.job.id])
+
+        filter_offset = '/status/new'
+        offset = register_offset + filter_offset
+        res = self.get(offset)
+        data = self.data_from_res(res)
+        self.assert_equal(data, [])
+
+        filter_offset = '/status/error'
+        offset = register_offset + filter_offset
+        res = self.get(offset)
+        data = self.data_from_res(res)
+        self.assert_equal(data, [self.job.id])
+
+    def test_harvestingjob_entity_delete_ok(self):
+        # Setup harvesting job fixture.
+        fixture_url = u'http://localhost/6'
+        self.source = self._create_harvest_source_fixture(url=fixture_url)
+        self.job = self._create_harvesting_job_fixture(source_id=self.source.id)
+        offset = self.offset('/rest/harvestingjob/%s' % self.job.id)
+        self.get(offset, status=[200])
+        res = self.app_delete(offset, status=[200])
+        self.get(offset, status=[404])
+
+    def test_harvestingjob_entity_delete_denied(self):
+        self.send_authorization_header = False
+        # Setup harvesting job fixture.
+        fixture_url = u'http://localhost/6'
+        self.source = self._create_harvest_source_fixture(url=fixture_url)
+        self.job = self._create_harvesting_job_fixture(source_id=self.source.id)
+        offset = self.offset('/rest/harvestingjob/%s' % self.job.id)
+        self.get(offset, status=[200])
+        self.app_delete(offset, status=[403])
+
+    def test_harvestingjob_entity_delete_not_found(self):
+        # NOTE(review): no fixture; this sends GET, not DELETE, to an unknown id -- confirm intent.
+        offset = self.offset('/rest/harvestingjob/%s' % "notajob")
+        self.get(offset, status=[404])
+
+class TestHarvestingVersion1(Version1TestCase, HarvestingTestCase): pass
+class TestHarvestingVersion2(Version2TestCase, HarvestingTestCase): pass
+class TestHarvestingUnversioned(UnversionedTestCase, HarvestingTestCase): pass
+
diff --git a/tests/test_model.py b/tests/test_model.py
new file mode 100644
index 0000000..e6e079c
--- /dev/null
+++ b/tests/test_model.py
@@ -0,0 +1,384 @@
+import os
+from lxml import etree
+
+from nose.plugins.skip import SkipTest
+
+from ckan import model
+from ckanext.harvest.model import HarvestSource
+from ckanext.harvest.model import HarvestingJob
+from ckanext.harvest.model import HarvestedDocument
+from ckanext.harvest.controllers.harvesting import HarvestingJobController
+
+from ckan.tests import *
+from ckan.tests.gemini2_examples.expected_values import expect_values0
+from ckan.tests.gemini2_examples.expected_values import expect_values1
+
+
+class HarvesterTestCase(TestCase):
+
+    require_common_fixtures = False
+
+    def setup(self):
+        CreateTestData.create()
+        self.gemini_example = GeminiExamples()
+
+    def teardown(self):
+        model.repo.rebuild_db()
+
+
+class TestHarvestSource(HarvesterTestCase):
+
+    def test_create_delete_harvest_source(self):
+        url = self.gemini_example.url_for(file_index=0)
+        source = HarvestSource(url=url)
+        source.save()
+        source_id = source.id
+        source = HarvestSource.get(source_id)
+        self.assert_true(source.id)
+        self.assert_equal(source.url, url)
+        self.delete(source)
+        self.commit()
+        self.assert_raises(Exception, HarvestSource.get, source_id)
+
+    def test_write_package_and_delete_source(self):
+        """Create a package, then ensure that deleting its source
+        doesn't delete the package.
+        """
+        #raise SkipTest('This needs fixing, but JG is going to refactor this. 2011-2-10.')
+        url = self.gemini_example.url_for(file_index=0)
+        source = HarvestSource(url=url)
+        count_before_write = self.count_packages()
+        job = HarvestingJob(source=source,
+                            user_ref="me")
+        controller = HarvestingJobController(job)
+        controller.harvest_documents()
+        count_after_write = self.count_packages()
+        self.assert_equal(count_after_write, count_before_write + 1)
+        self.delete_commit(source)
+        count_after_delete = self.count_packages()
+        self.assert_equal(count_after_delete, count_after_write)
+
+    def _make_package_from_source(self):
+        return package, source  # NOTE(review): neither name is defined in this method -- confirm it is dead code
+
+
+class TestHarvestingJob(HarvesterTestCase):
+
+    fixture_user_ref = u'publisheruser1'
+
+    def setup(self):
+        super(TestHarvestingJob, self).setup()
+        self.source = HarvestSource(
+            url=self.gemini_example.url_for(file_index=0)
+        )
+        self.job = HarvestingJob(
+            source=self.source,
+            user_ref=self.fixture_user_ref
+        )
+        self.job.save()
+        self.controller = HarvestingJobController(self.job)
+        self.job2 = None
+        self.source2 = None
+
+    def teardown(self):
+        if self.job2:
+            self.delete(self.job2)
+        if self.source2:
+            self.delete(self.source2)
+        super(TestHarvestingJob, self).teardown()
+
+    def test_create_and_delete_job(self):
+        self.assert_equal(self.job.source_id, self.source.id)
+        self.delete_commit(self.job)
+        self.assert_raises(Exception, HarvestingJob.get, self.job.id)
+        # - check source has not been deleted!
+        HarvestSource.get(self.source.id)
+
+    def test_harvest_documents(self):
+        before_count = self.count_packages()
+        job = self.controller.harvest_documents()
+        after_count = self.count_packages()
+        self.assert_equal(after_count, before_count + 1)
+        self.assert_equal(job.source.documents[0].package.name,
+                          (job.report['added'][0]))
+        self.assert_true(job.report)
+        self.assert_len(job.report['errors'], 0)
+        self.assert_len(job.report['added'], 1)
+
+    def test_harvest_documents_twice_unchanged(self):
+        job = self.controller.harvest_documents()
+        self.assert_len(job.report['errors'], 0)
+        self.assert_len(job.report['added'], 1)
+        job2 = HarvestingJobController(
+            HarvestingJob(
+                source=self.source,
+                user_ref=self.fixture_user_ref
+            )
+        ).harvest_documents()
+        self.assert_len(job2.report['errors'], 0)
+        self.assert_len(job2.report['added'], 0)
+
+    def test_harvest_documents_twice_changed(self):
+        job = self.controller.harvest_documents()
+        self.assert_len(job.report['errors'], 0)
+        self.assert_len(job.report['added'], 1)
+        self.source.url = self.gemini_example.url_for(file_index=2)
+        self.source.save()
+        job2 = HarvestingJobController(
+            HarvestingJob(
+                source=self.source,
+                user_ref=self.fixture_user_ref
+            )
+        ).harvest_documents()
+        self.assert_len(job2.report['errors'], 0)
+        self.assert_len(job2.report['added'], 1)
+
+    def test_harvest_documents_source_guid_contention(self):
+        job = self.controller.harvest_documents()
+        source2 = HarvestSource(
+            url=self.gemini_example.url_for(file_index=2),
+        )
+        # Make sure it has an id by saving it
+        source2.save()
+        job2 = HarvestingJobController(
+            HarvestingJob(
+                source=source2,
+                user_ref=self.fixture_user_ref
+            )
+        ).harvest_documents()
+        error = job2.report['errors'][0]
+        # XXX Should not allow file:// URLs, security implications
+        # The one that is conflicting doesn't have a user or publisher set up, otherwise the integers would show here
+        assert 'Another source' in error
+        assert 'ckan/tests/gemini2_examples/00a743bf-cca4-4c19-a8e5-e64f7edbcadd_gemini2.xml' in error
+        assert 'is using metadata GUID 00a743bf-cca4-4c19-a8e5-e64f7edbcadd' in error
+
+    def test_harvest_bad_source_url(self):
+        source = HarvestSource(
+            url=self.gemini_example.url_for_bad(0)
+        )
+        job = HarvestingJob(
+            source=source,
+            user_ref=self.fixture_user_ref
+        )
+        before_count = self.count_packages()
+        self.assert_false(job.report['added'])
+        self.assert_false(job.report['errors'])
+        job = HarvestingJobController(job).harvest_documents()
+        after_count = self.count_packages()
+        self.assert_equal(after_count, before_count)
+        self.assert_len(job.report['added'], 0)
+        self.assert_len(job.report['errors'], 1)
+        error = job.report['errors'][0]
+        self.assert_contains(error,
+                             'Unable to detect source type from content')
+
+
+class TestHarvesterSourceTypes(HarvesterTestCase):
+
+    fixture_user_ref = u'publisheruser1'
+
+    def setup(self):
+        self.gemini_example = GeminiExamples()
+        # XXX put real-life CSW examples here if you want, and if they
+        # arrive...
+        self.sources = [
+            (
+                'http://127.0.0.1:44444',
+                {
+                    'errors': ["Error harvesting source: Unable to get content for URL: http://127.0.0.1:44444: URLError(error(111, 'Connection refused'),)"],
+                    'packages': 0,
+                    'documents': 0,
+                },
+            ),
+            (
+                'http://www.google.com',
+                {
+                    'errors': ["Couldn't find any links to metadata"],
+                    'packages': 0,
+                    'documents': 0,
+                },
+            ),
+            (
+                self.gemini_example.url_for(file_index='index.html'),
+                {
+                    'errors': [],
+                    'packages': 2,
+                    'documents': 2,
+                },
+            ),
+        ]
+        self.updated_sources = [
+            (
+                self.gemini_example.url_for(file_index='index.updated.html'),
+                {
+                    'errors': [],
+                    'packages': 2,
+                    'documents': 2,
+                },
+            ),
+        ]
+
+    def test_various_sources(self):
+        sources = []
+        for url, expected in self.sources:
+            source = HarvestSource(url=url)
+            # Create an ID for it
+            source.save()
+            sources.append(source)
+            job = HarvestingJob(
+                source=source,
+                user_ref=self.fixture_user_ref
+            )
+            before_count = self.count_packages()
+            self.assert_false(job.report['added'])
+            self.assert_false(job.report['errors'])
+            job = HarvestingJobController(job).harvest_documents()
+            after_count = self.count_packages()
+            self.assert_equal(after_count,
+                              before_count + expected['packages'])
+            for (idx, error) in enumerate(job.report['errors']):
+                assert expected['errors'][idx] in error
+            # report['added'] is a list, appended to each time a
+            # package is touched.
+            self.assert_equal(
+                len(job.source.documents),
+                expected['documents'],
+            )
+            for (idx, doc) in enumerate(job.source.documents):
+                self.assert_true(doc.package)
+                assert (doc.package.name in job.report['added'])
+
+        # Now test updated sources
+        for url, expected in self.updated_sources:
+            sources[-1].url = url
+            sources[-1].save()
+            job = HarvestingJob(
+                # We'll use the last source updated above to test updating a
+                # document
+                source=sources[-1],
+                user_ref=self.fixture_user_ref
+            )
+            self.assert_false(job.report['added'])
+            self.assert_false(job.report['errors'])
+            before_count = self.count_packages()
+            before_content = [doc.content for doc in job.source.documents]
+            job = HarvestingJobController(job).harvest_documents()
+            after_count = self.count_packages()
+            after_content = [doc.content for doc in job.source.documents]
+            self.assert_true(after_count == before_count == long(expected['packages']))
+            # Represents an updated record
+            self.assert_equal(len(job.report['added']), 1)
+            self.assert_equal(
+                len(job.source.documents),
+                expected['documents'],
+            )
+            self.assert_false(before_content == after_content)
+
+class TestHarvestedDocument(HarvesterTestCase):
+    def test_01_document_revisioned(self):
+        url = self.gemini_example.url_for(0)
+        model.repo.new_revision()
+        content = self.gemini_example.get_from_url(url)
+        document = HarvestedDocument(content=content)
+        document.save()
+        assert len(document.all_revisions_unordered) == 1
+
+        model.repo.new_revision()
+        url = self.gemini_example.url_for(1)
+        content = self.gemini_example.get_from_url(url)
+        document.content = content
+        document.save()
+        model.Session().expire(document)
+        assert len(document.all_revisions) == 2
+
+        document_id = document.id
+        self.assert_equal(document.content, content)
+
+        self.delete_commit(document)
+        self.assert_raises(Exception, HarvestedDocument.get, document_id)
+
+    def test_read_values_example0(self):
+        self.assert_read_values(0, expect_values0)
+
+    def test_read_values_example1(self):
+        self.assert_read_values(1, expect_values1)
+
+    def assert_read_values(self, example_index, expect_values):
+        url = self.gemini_example.url_for(file_index=example_index)
+        content = self.gemini_example.get_from_url(url)
+        document = HarvestedDocument(url=url, content=content)
+        values = document.read_values()
+        self.assert_gemini_values(values, expect_values)
+
+    def assert_gemini_values(self, values, expect_values):
+        for name in expect_values:
+            value = values[name]
+            expect = expect_values[name]
+            self.assert_gemini_value(value, expect, name)
+
+    def assert_gemini_value(self, value, expect, name):
+        try:
+            self.assert_equal(value, expect)
+        except AssertionError, inst:  # Python 2-only except syntax
+            msg = "'%s' has unexpected value: %s (expected %s)" %\
+                (name, inst, expect)
+            raise AssertionError(msg)
+
+
+class GeminiExamples(object):
+    """Encapsulates the Gemini example files in ckan/tests/gemini2_examples."""
+
+    file_names = [
+        u'00a743bf-cca4-4c19-a8e5-e64f7edbcadd_gemini2.xml',
+        u'My series sample.xml',
+        u'00a743bf-cca4-4c19-a8e5-e64f7edbcadd_gemini2.update.xml',
+    ]
+
+    file_names_bad = [
+        u'RSS-example.xml',
+    ]
+
+    def url_for(self, file_index=None):
+        if file_index in [None, 'index.html']:
+            name = "index.html"
+        elif file_index in ['index.updated.html']:
+            name = "index.updated.html"
+        else:
+            name = self.file_names[file_index]
+        path = os.path.join(self.folder_path(), name)
+        if not os.path.exists(path):
+            raise Exception("Gemini example not found on path: %s" % path)
+        return "file://%s" % path
+
+    # Todo: Refactor url_for() and url_for_bad().
+    def url_for_bad(self, index=None):
+        if index in [None, 'index.html']:
+            name = "index.html"
+        else:
+            name = self.file_names_bad[index]
+        path = os.path.join(self.folder_path_bad(), name)
+        if not os.path.exists(path):
+            raise Exception("Gemini bad example not found on path: %s" % path)
+        return "file://%s" % path
+
+    # Todo: Refactor folder_path() and folder_path_bad().
+    def folder_path(self):
+        """Return the Gemini examples directory below pylons' config['here']."""
+        from pylons import config
+        return os.path.join(config['here'], 'ckan', 'tests', 'gemini2_examples')
+
+    def folder_path_bad(self):
+        """Return the bad-examples directory: folder_path() plus a '_bad' suffix."""
+        return self.folder_path() + '_bad'
+
+    def get_from_url(self, url):
+        """Fetch `url` and return its XML re-serialised as pretty-printed unicode."""
+        import urllib2
+        resource = urllib2.urlopen(url)
+        try:
+            xml = etree.fromstring(resource.read())  # parse the raw, undecoded bytes
+        finally:
+            resource.close()  # release the handle even if reading or parsing fails
+        return etree.tostring(xml, encoding=unicode, pretty_print=True)
+