More debug logging added to WAF harvester.

This commit is contained in:
David Read 2013-01-16 13:10:32 +00:00
parent 348c1c4dc1
commit 7841691fc3
3 changed files with 15 additions and 4 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ syntax: glob
*.egg-info *.egg-info
*.swp *.swp
*~ *~
dist

View File

@ -12,7 +12,7 @@ The following plugins are currently available:
* WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`). * WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`).
* CSW Server - a basic CSW server - to serve metadata from the CKAN instance (`cswserver`) * CSW Server - a basic CSW server - to serve metadata from the CKAN instance (`cswserver`)
* GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`) * GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`)
* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`inspire_api`) * Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`spatial_harvest_metadata_api`)
These libraries: These libraries:
* CSW Client - a basic client for accessing a CSW server * CSW Client - a basic client for accessing a CSW server

View File

@ -21,6 +21,7 @@ import sys
import uuid import uuid
import os import os
import logging import logging
import difflib
from lxml import etree from lxml import etree
from pylons import config from pylons import config
@ -232,7 +233,11 @@ class GeminiHarvester(SpatialHarvester):
else: else:
if last_harvested_object.content != self.obj.content and \ if last_harvested_object.content != self.obj.content and \
last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date: last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated' % gemini_guid) diff_generator = difflib.HtmlDiff().make_table(
last_harvested_object.content.split('\n'),
self.obj.content.split('\n'))
diff = '\n'.join([line for line in diff_generator])
raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff))
else: else:
# The content hasn't changed, no need to update the package # The content hasn't changed, no need to update the package
log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid)) log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
@ -241,7 +246,6 @@ class GeminiHarvester(SpatialHarvester):
log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid) log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid)
extras = { extras = {
'published_by': self.obj.source.publisher_id or '',
'UKLP': 'True', 'UKLP': 'True',
'harvest_object_id': self.obj.id 'harvest_object_id': self.obj.id
} }
@ -783,7 +787,7 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
if len(ids) > 0: if len(ids) > 0:
return ids return ids
else: else:
self._save_gather_error('Couldn''t find any links to metadata files', self._save_gather_error('Couldn\'t find any links to metadata files',
harvest_job) harvest_job)
return None return None
@ -809,19 +813,25 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
if not url: if not url:
continue continue
if '?' in url: if '?' in url:
log.debug('Ignoring link in WAF because it has "?": %s', url)
continue continue
if '/' in url: if '/' in url:
log.debug('Ignoring link in WAF because it has "/": %s', url)
continue continue
if '#' in url: if '#' in url:
log.debug('Ignoring link in WAF because it has "#": %s', url)
continue continue
if 'mailto:' in url: if 'mailto:' in url:
log.debug('Ignoring link in WAF because it has "mailto:": %s', url)
continue continue
log.debug('WAF contains file: %s', url)
urls.append(url) urls.append(url)
base_url = base_url.rstrip('/').split('/') base_url = base_url.rstrip('/').split('/')
if 'index' in base_url[-1]: if 'index' in base_url[-1]:
base_url.pop() base_url.pop()
base_url = '/'.join(base_url) base_url = '/'.join(base_url)
base_url += '/' base_url += '/'
log.debug('WAF base URL: %s', base_url)
return [base_url + i for i in urls] return [base_url + i for i in urls]