import os
from lxml import etree

from nose.plugins.skip import SkipTest

from ckan import model
from ckanext.harvest.model import HarvestSource
from ckanext.harvest.model import HarvestingJob
from ckanext.harvest.model import HarvestedDocument
from ckanext.harvest.controllers.harvesting import HarvestingJobController

from ckan.tests import *
from ckan.tests.gemini2_examples.expected_values import expect_values0
from ckan.tests.gemini2_examples.expected_values import expect_values1


class HarvesterTestCase(TestCase):
    """Shared base class for the harvester tests in this module.

    ``setup`` creates the CKAN test data and a ``GeminiExamples`` helper
    (available to tests as ``self.gemini_example``); ``teardown`` rebuilds
    the database.
    """

    # NOTE(review): read by the test framework, not by this module --
    # presumably opts out of shared fixtures; confirm against ckan.tests.
    require_common_fixtures = False

    def setup(self):
        """Create the test data and the Gemini example-file helper."""
        CreateTestData.create()
        self.gemini_example = GeminiExamples()

    def teardown(self):
        """Rebuild the database once the test has finished."""
        model.repo.rebuild_db()


class TestHarvestSource(HarvesterTestCase):
    """Tests for creating and deleting ``HarvestSource`` records."""

    def test_create_delete_harvest_source(self):
        """A saved source can be read back by id and is gone once deleted."""
        url = self.gemini_example.url_for(file_index=0)
        source = HarvestSource(url=url)
        source.save()
        # Keep the id in a local: it is needed after the source is deleted.
        source_id = source.id
        source = HarvestSource.get(source_id)
        self.assert_true(source.id)
        self.assert_equal(source.url, url)
        self.delete(source)
        self.commit()
        self.assert_raises(Exception, HarvestSource.get, source_id)

    def test_write_package_and_delete_source(self):
        """Create a package, then ensure that deleting its source
        doesn't delete the package.
        """
        # NOTE: this test was once disabled with
        # SkipTest('This needs fixing, but JG is going to refactor this. 2011-2-10.')
        count_before_write = self.count_packages()
        package, source = self._make_package_from_source()
        self.assert_true(package)
        count_after_write = self.count_packages()
        self.assert_equal(count_after_write, count_before_write + 1)
        self.delete_commit(source)
        count_after_delete = self.count_packages()
        self.assert_equal(count_after_delete, count_after_write)

    def _make_package_from_source(self):
        """Harvest the first Gemini example file into a package.

        A new ``HarvestSource`` for example file 0 is harvested through a
        new ``HarvestingJob`` (user_ref "me").

        Returns a ``(package, source)`` tuple: the package attached to the
        first document of the harvested source, and the source itself.
        """
        url = self.gemini_example.url_for(file_index=0)
        source = HarvestSource(url=url)
        job = HarvestingJob(source=source,
                            user_ref="me")
        job = HarvestingJobController(job).harvest_documents()
        package = job.source.documents[0].package
        return package, source


class TestHarvestingJob(HarvesterTestCase):
    """Tests that run ``HarvestingJob`` objects through the controller."""

    fixture_user_ref = u'publisheruser1'

    def setup(self):
        """Create a source for example file 0, a saved job and its controller."""
        super(TestHarvestingJob, self).setup()
        self.source = HarvestSource(
            url=self.gemini_example.url_for(file_index=0)
        )
        self.job = HarvestingJob(
            source=self.source,
            user_ref=self.fixture_user_ref
        )
        self.job.save()
        self.controller = HarvestingJobController(self.job)
        # Optional extra objects for teardown to delete. No test in this
        # class currently assigns them.
        self.job2 = None
        self.source2 = None

    def teardown(self):
        """Delete the optional extra job/source, then run the base teardown."""
        if self.job2:
            self.delete(self.job2)
        if self.source2:
            self.delete(self.source2)
        super(TestHarvestingJob, self).teardown()

    def _harvest_with_new_job(self, source):
        """Harvest ``source`` with a brand new job and return the result of
        ``harvest_documents()``.
        """
        job = HarvestingJob(
            source=source,
            user_ref=self.fixture_user_ref
        )
        return HarvestingJobController(job).harvest_documents()

    def test_create_and_delete_job(self):
        """Deleting a job removes the job but leaves its source in place."""
        self.assert_equal(self.job.source_id, self.source.id)
        # Read the id before deleting, as the other delete tests in this
        # module do, rather than from the already deleted object.
        job_id = self.job.id
        self.delete_commit(self.job)
        self.assert_raises(Exception, HarvestingJob.get, job_id)
        # - check source has not been deleted!
        HarvestSource.get(self.source.id)

    def test_harvest_documents(self):
        """Harvesting example file 0 adds exactly one package, without errors."""
        before_count = self.count_packages()
        job = self.controller.harvest_documents()
        after_count = self.count_packages()
        self.assert_equal(after_count, before_count + 1)
        # Check the shape of the report before indexing into it, so that a
        # failure is reported as an assertion rather than an IndexError.
        self.assert_true(job.report)
        self.assert_len(job.report['errors'], 0)
        self.assert_len(job.report['added'], 1)
        self.assert_equal(job.source.documents[0].package.name,
                          job.report['added'][0])

    def test_harvest_documents_twice_unchanged(self):
        """A second harvest of the same, unchanged source adds nothing."""
        job = self.controller.harvest_documents()
        self.assert_len(job.report['errors'], 0)
        self.assert_len(job.report['added'], 1)
        job2 = self._harvest_with_new_job(self.source)
        self.assert_len(job2.report['errors'], 0)
        self.assert_len(job2.report['added'], 0)

    def test_harvest_documents_twice_changed(self):
        """After pointing the source at example file 2, a second harvest
        reports one added entry.
        """
        job = self.controller.harvest_documents()
        self.assert_len(job.report['errors'], 0)
        self.assert_len(job.report['added'], 1)
        self.source.url = self.gemini_example.url_for(file_index=2)
        self.source.save()
        job2 = self._harvest_with_new_job(self.source)
        self.assert_len(job2.report['errors'], 0)
        self.assert_len(job2.report['added'], 1)

    def test_harvest_documents_source_guid_contention(self):
        """A second source offering an already harvested GUID is reported
        as an error.
        """
        self.controller.harvest_documents()
        source2 = HarvestSource(
            url=self.gemini_example.url_for(file_index=2),
        )
        # Make sure it has an id by saving it
        source2.save()
        job2 = self._harvest_with_new_job(source2)
        errors = job2.report['errors']
        # Fail with an assertion, not an IndexError, if nothing was reported.
        self.assert_true(errors)
        error = errors[0]
        # XXX Should not allow file:// URLs, security implications
        # The one that is conflicting doesn't have a user or publisher set up, otherwise the integers would show here
        assert 'Another source' in error
        assert 'ckan/tests/gemini2_examples/00a743bf-cca4-4c19-a8e5-e64f7edbcadd_gemini2.xml' in error
        assert 'is using metadata GUID 00a743bf-cca4-4c19-a8e5-e64f7edbcadd' in error

    def test_harvest_bad_source_url(self):
        """Harvesting a non-Gemini file adds no package and reports one error."""
        source = HarvestSource(
            url=self.gemini_example.url_for_bad(0)
            )
        job = HarvestingJob(
            source=source,
            user_ref=self.fixture_user_ref
            )
        before_count = self.count_packages()
        # The report of a job that has not run yet must be empty.
        self.assert_false(job.report['added'])
        self.assert_false(job.report['errors'])
        job = HarvestingJobController(job).harvest_documents()
        after_count = self.count_packages()
        self.assert_equal(after_count, before_count)
        self.assert_len(job.report['added'], 0)
        self.assert_len(job.report['errors'], 1)
        error = job.report['errors'][0]
        self.assert_contains(error,
                             'Unable to detect source type from content')


class TestHarvesterSourceTypes(HarvesterTestCase):
    """Harvests several kinds of source URL and checks the job reports."""

    fixture_user_ref = u'publisheruser1'

    def setup(self):
        """Build the tables of (url, expected outcome) pairs used by the test.

        Each expectation holds the expected error substrings, the number of
        packages the harvest should add and the number of documents the
        source should end up with.
        """
        # NOTE(review): unlike the other test classes this does not call the
        # base setup(), so CreateTestData.create() is not run here --
        # confirm that this is intentional.
        self.gemini_example = GeminiExamples()
        # XXX put real-life CSW examples here if you want, and if they
        # arrive...
        self.sources = [
            (
                'http://127.0.0.1:44444',
                {
                    'errors': ["Error harvesting source: Unable to get content for URL: http://127.0.0.1:44444: URLError(error(111, 'Connection refused'),)"],
                    'packages': 0,
                    'documents': 0,
                },
            ),
            (
                'http://www.google.com',
                {
                    'errors': ["Couldn't find any links to metadata"],
                    'packages': 0,
                    'documents': 0,
                },
            ),
            (
                self.gemini_example.url_for(file_index='index.html'),
                {
                    'errors': [],
                    'packages': 2,
                    'documents': 2,
                },
            ),
        ]
        self.updated_sources = [
            (
                self.gemini_example.url_for(file_index='index.updated.html'),
                {
                    'errors': [],
                    'packages': 2,
                    'documents': 2,
                },
            ),
        ]

    def test_various_sources(self):
        """Harvest every configured source, then re-harvest the last one
        from its updated URL.
        """
        sources = []
        for url, expected in self.sources:
            source = HarvestSource(url=url)
            # Create an ID for it
            source.save()
            sources.append(source)
            job = HarvestingJob(
                source=source,
                user_ref=self.fixture_user_ref
            )
            before_count = self.count_packages()
            self.assert_false(job.report['added'])
            self.assert_false(job.report['errors'])
            job = HarvestingJobController(job).harvest_documents()
            after_count = self.count_packages()
            self.assert_equal(after_count,
                              before_count + expected['packages'])
            errors = job.report['errors']
            # Compare the counts first: matching only the reported errors
            # would let a source that reports fewer errors than expected
            # (or none at all) pass unnoticed.
            self.assert_equal(len(errors), len(expected['errors']))
            for expected_error, error in zip(expected['errors'], errors):
                assert expected_error in error
            # report['added'] is a list, appended to each time a
            # package is touched.
            self.assert_equal(
                len(job.source.documents),
                expected['documents'],
            )
            for doc in job.source.documents:
                self.assert_true(doc.package)
                assert (doc.package.name in job.report['added'])

        # Now test updated sources
        for url, expected in self.updated_sources:
            # We'll use the last source updated above to test updating a
            # document
            last_source = sources[-1]
            last_source.url = url
            last_source.save()
            job = HarvestingJob(
                source=last_source,
                user_ref=self.fixture_user_ref
            )
            self.assert_false(job.report['added'])
            self.assert_false(job.report['errors'])
            before_count = self.count_packages()
            before_content = [doc.content for doc in job.source.documents]
            job = HarvestingJobController(job).harvest_documents()
            after_count = self.count_packages()
            after_content = [doc.content for doc in job.source.documents]
            # The update must not change the number of packages.
            self.assert_true(after_count == before_count == long(expected['packages']))
            # Represents an updated record
            self.assert_equal(len(job.report['added']), 1)
            self.assert_equal(
                len(job.source.documents),
                expected['documents'],
            )
            # ...but the stored document content must differ afterwards.
            self.assert_false(before_content == after_content)

class TestHarvestedDocument(HarvesterTestCase):
    def test_01_document_revisioned(self):
        url = self.gemini_example.url_for(0)
        model.repo.new_revision()
        content = self.gemini_example.get_from_url(url)
        document = HarvestedDocument(content=content)
        document.save()
        assert len(document.all_revisions_unordered) == 1

        model.repo.new_revision()
        url = self.gemini_example.url_for(1)
        content = self.gemini_example.get_from_url(url)
        document.content = content
        document.save()
        model.Session().expire(document)
        assert len(document.all_revisions) == 2

        document_id = document.id
        self.assert_equal(document.content, content)

        self.delete_commit(document)
        self.assert_raises(Exception, HarvestedDocument.get, document_id)

    def test_read_values_example0(self):
        self.assert_read_values(0, expect_values0)

    def test_read_values_example1(self):
        self.assert_read_values(1, expect_values1)

    def assert_read_values(self, example_index, expect_values):
        url = self.gemini_example.url_for(file_index=example_index)
        content = self.gemini_example.get_from_url(url)
        document = HarvestedDocument(url=url, content=content)
        values = document.read_values()
        self.assert_gemini_values(values, expect_values)

    def assert_gemini_values(self, values, expect_values):
        for name in expect_values:
            value = values[name]
            expect = expect_values[name]
            self.assert_gemini_value(value, expect, name)

    def assert_gemini_value(self, value, expect, name):
        try:
            self.assert_equal(value, expect)
        except AssertionError, inst:
            msg = "'%s' has unexpected value: %s (expected %s)" %\
                  (name, inst, expect)
            raise AssertionError(msg)


class GeminiExamples(object):
    """Encapsulates the Gemini example files in ckan/tests/gemini2_examples."""

    # Good example documents, addressed by index through url_for().
    file_names = [
        u'00a743bf-cca4-4c19-a8e5-e64f7edbcadd_gemini2.xml',
        u'My series sample.xml',
        u'00a743bf-cca4-4c19-a8e5-e64f7edbcadd_gemini2.update.xml',
    ]

    # Bad example documents, addressed by index through url_for_bad().
    file_names_bad = [
        u'RSS-example.xml',
    ]

    def url_for(self, file_index=None):
        """Return the file:// URL of a good example file.

        ``file_index`` is None or 'index.html' for the index page,
        'index.updated.html' for the updated index page, or otherwise an
        index into ``file_names``.

        Raises Exception if the file does not exist on disk.
        """
        if file_index in (None, 'index.html'):
            name = "index.html"
        elif file_index in ('index.updated.html',):
            name = "index.updated.html"
        else:
            name = self.file_names[file_index]
        return self._file_url(self.folder_path(), name, "Gemini example")

    def url_for_bad(self, index=None):
        """Return the file:// URL of a bad example file.

        ``index`` is None or 'index.html' for the index page, or otherwise
        an index into ``file_names_bad``.

        Raises Exception if the file does not exist on disk.
        """
        if index in (None, 'index.html'):
            name = "index.html"
        else:
            name = self.file_names_bad[index]
        return self._file_url(self.folder_path_bad(), name,
                              "Gemini bad example")

    def _file_url(self, folder, name, description):
        """Return the file:// URL for ``name`` inside ``folder``.

        Raises Exception, with ``description`` in the message, if the path
        does not exist.
        """
        path = os.path.join(folder, name)
        if not os.path.exists(path):
            raise Exception("%s not found on path: %s" % (description, path))
        return "file://%s" % path

    def folder_path(self):
        """Return the path of the folder holding the good example files."""
        return self._tests_folder('gemini2_examples')

    def folder_path_bad(self):
        """Return the path of the folder holding the bad example files."""
        return self._tests_folder('gemini2_examples_bad')

    def _tests_folder(self, folder_name):
        """Return <config['here']>/ckan/tests/<folder_name>."""
        # Imported here rather than at module level, as in the original code.
        from pylons import config
        return os.path.join(config['here'], 'ckan', 'tests', folder_name)

    def get_from_url(self, url):
        """Fetch the XML document at ``url`` and return it as pretty-printed
        unicode text.
        """
        import urllib2
        resource = urllib2.urlopen(url)
        try:
            # This returns the raw data
            data = resource.read()
        finally:
            # Always release the handle, even if the read fails.
            resource.close()
        # Parse the raw data and serialise it again to get unicode text.
        xml = etree.fromstring(data)
        return etree.tostring(xml, encoding=unicode, pretty_print=True)