More debug logging added to WAF harvester.

This commit is contained in:
David Read 2013-01-16 13:10:32 +00:00
parent 348c1c4dc1
commit 7841691fc3
3 changed files with 15 additions and 4 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ syntax: glob
*.egg-info
*.swp
*~
dist

View File

@ -12,7 +12,7 @@ The following plugins are currently available:
* WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`).
* CSW Server - a basic CSW server - to serve metadata from the CKAN instance (`cswserver`)
* GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`)
* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`inspire_api`)
* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`spatial_harvest_metadata_api`)
These libraries:
* CSW Client - a basic client for accessing a CSW server

View File

@ -21,6 +21,7 @@ import sys
import uuid
import os
import logging
import difflib
from lxml import etree
from pylons import config
@ -232,7 +233,11 @@ class GeminiHarvester(SpatialHarvester):
else:
if last_harvested_object.content != self.obj.content and \
last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated' % gemini_guid)
diff_generator = difflib.HtmlDiff().make_table(
last_harvested_object.content.split('\n'),
self.obj.content.split('\n'))
diff = '\n'.join([line for line in diff_generator])
raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff))
else:
# The content hasn't changed, no need to update the package
log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
@ -241,7 +246,6 @@ class GeminiHarvester(SpatialHarvester):
log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid)
extras = {
'published_by': self.obj.source.publisher_id or '',
'UKLP': 'True',
'harvest_object_id': self.obj.id
}
@ -783,7 +787,7 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
if len(ids) > 0:
return ids
else:
self._save_gather_error('Couldn''t find any links to metadata files',
self._save_gather_error('Couldn\'t find any links to metadata files',
harvest_job)
return None
@ -809,19 +813,25 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
if not url:
continue
if '?' in url:
log.debug('Ignoring link in WAF because it has "?": %s', url)
continue
if '/' in url:
log.debug('Ignoring link in WAF because it has "/": %s', url)
continue
if '#' in url:
log.debug('Ignoring link in WAF because it has "#": %s', url)
continue
if 'mailto:' in url:
log.debug('Ignoring link in WAF because it has "mailto:": %s', url)
continue
log.debug('WAF contains file: %s', url)
urls.append(url)
base_url = base_url.rstrip('/').split('/')
if 'index' in base_url[-1]:
base_url.pop()
base_url = '/'.join(base_url)
base_url += '/'
log.debug('WAF base URL: %s', base_url)
return [base_url + i for i in urls]