More debug logging added to WAF harvester.

2013-01-16 13:10:32 +00:00 · 2013-01-16 13:10:32 +00:00 · 7841691fc3
parent 348c1c4dc1
commit 7841691fc3
3 changed files with 15 additions and 4 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,4 @@ syntax: glob
 *.egg-info
 *.swp
 *~
 dist
--- a/README.rst
+++ b/README.rst
@ -12,7 +12,7 @@ The following plugins are currently available:
 * WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`).
 * CSW Server - a basic CSW server - to serve metadata from the CKAN instance (`cswserver`)
 * GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`)
-* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`inspire_api`)
+* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`spatial_harvest_metadata_api`)
 These libraries:
 * CSW Client - a basic client for accessing a CSW server
--- a/ckanext/spatial/harvesters.py
+++ b/ckanext/spatial/harvesters.py
@ -21,6 +21,7 @@ import sys
 import uuid
 import os
 import logging
 import difflib
 from lxml import etree
 from pylons import config
@ -232,7 +233,11 @@ class GeminiHarvester(SpatialHarvester):
            else:
                if last_harvested_object.content != self.obj.content and \
                 last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
-                    raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated' % gemini_guid)
+                    diff_generator = difflib.HtmlDiff().make_table(
                        last_harvested_object.content.split('\n'),
                        self.obj.content.split('\n'))
                    diff = '\n'.join([line for line in diff_generator])
                    raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff))
                else:
                    # The content hasn't changed, no need to update the package
                    log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
@ -241,7 +246,6 @@ class GeminiHarvester(SpatialHarvester):
            log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid)
        extras = {
            'published_by': self.obj.source.publisher_id or '',
            'UKLP': 'True',
            'harvest_object_id': self.obj.id
        }
@ -783,7 +787,7 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
        if len(ids) > 0:
            return ids
        else:
-            self._save_gather_error('Couldn''t find any links to metadata files',
+            self._save_gather_error('Couldn\'t find any links to metadata files',
                                     harvest_job)
            return None
@ -809,19 +813,25 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
            if not url:
                continue
            if '?' in url:
                log.debug('Ignoring link in WAF because it has "?": %s', url)
                continue
            if '/' in url:
                log.debug('Ignoring link in WAF because it has "/": %s', url)
                continue
            if '#' in url:
                log.debug('Ignoring link in WAF because it has "#": %s', url)
                continue
            if 'mailto:' in url:
                log.debug('Ignoring link in WAF because it has "mailto:": %s', url)
                continue
            log.debug('WAF contains file: %s', url)
            urls.append(url)
        base_url = base_url.rstrip('/').split('/')
        if 'index' in base_url[-1]:
            base_url.pop()
        base_url = '/'.join(base_url)
        base_url += '/'
        log.debug('WAF base URL: %s', base_url)
        return [base_url + i for i in urls]