From 7841691fc3ec61a02d7902e7c2c59db4d85d610a Mon Sep 17 00:00:00 2001 From: David Read Date: Wed, 16 Jan 2013 13:10:32 +0000 Subject: [PATCH] More debug logging added to WAF harvester. --- .gitignore | 1 + README.rst | 2 +- ckanext/spatial/harvesters.py | 16 +++++++++++++--- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/.gitignore b/.gitignore index 11e5c7f..39f7065 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ syntax: glob *.egg-info *.swp *~ +dist diff --git a/README.rst b/README.rst index 0a82c83..7e1db6f 100644 --- a/README.rst +++ b/README.rst @@ -12,7 +12,7 @@ The following plugins are currently available: * WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`). * CSW Server - a basic CSW server - to server metadata from the CKAN instance (`cswserver`) * GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`) -* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`inspire_api`) +* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`spatial_harvest_metadata_api`) These libraries: * CSW Client - a basic client for accessing a CSW server diff --git a/ckanext/spatial/harvesters.py b/ckanext/spatial/harvesters.py index 47e9b12..3f16542 100644 --- a/ckanext/spatial/harvesters.py +++ b/ckanext/spatial/harvesters.py @@ -21,6 +21,7 @@ import sys import uuid import os import logging +import difflib from lxml import etree from pylons import config @@ -232,7 +233,11 @@ class GeminiHarvester(SpatialHarvester): else: if last_harvested_object.content != self.obj.content and \ last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date: - raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated' % gemini_guid) + diff_generator = difflib.HtmlDiff().make_table( + last_harvested_object.content.split('\n'), + self.obj.content.split('\n')) + diff = '\n'.join([line for line in diff_generator]) + raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff)) else: # The content hasn't changed, no need to update the package log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid)) @@ -241,7 +246,6 @@ class GeminiHarvester(SpatialHarvester): log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid) extras = { - 'published_by': self.obj.source.publisher_id or '', 'UKLP': 'True', 'harvest_object_id': self.obj.id } @@ -783,7 +787,7 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): if len(ids) > 0: return ids else: - self._save_gather_error('Couldn''t find any links to metadata files', + self._save_gather_error('Couldn\'t find any links to metadata files', harvest_job) return None @@ -809,19 +813,25 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin): if not url: continue if '?' in url: + log.debug('Ignoring link in WAF because it has "?": %s', url) continue if '/' in url: + log.debug('Ignoring link in WAF because it has "/": %s', url) continue if '#' in url: + log.debug('Ignoring link in WAF because it has "#": %s', url) continue if 'mailto:' in url: + log.debug('Ignoring link in WAF because it has "mailto:": %s', url) continue + log.debug('WAF contains file: %s', url) urls.append(url) base_url = base_url.rstrip('/').split('/') if 'index' in base_url[-1]: base_url.pop() base_url = '/'.join(base_url) base_url += '/' + log.debug('WAF base URL: %s', base_url) return [base_url + i for i in urls]