More debug logging added to WAF harvester.
This commit is contained in:
parent
348c1c4dc1
commit
7841691fc3
|
@ -3,3 +3,4 @@ syntax: glob
|
||||||
*.egg-info
|
*.egg-info
|
||||||
*.swp
|
*.swp
|
||||||
*~
|
*~
|
||||||
|
dist
|
||||||
|
|
|
@ -12,7 +12,7 @@ The following plugins are currently available:
|
||||||
* WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`).
|
* WMS Preview - a Web Map Service (WMS) previewer (`wms_preview`).
|
||||||
* CSW Server - a basic CSW server - to serve metadata from the CKAN instance (`cswserver`)
|
* CSW Server - a basic CSW server - to serve metadata from the CKAN instance (`cswserver`)
|
||||||
* GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`)
|
* GEMINI Harvesters - for importing INSPIRE-style metadata into CKAN (`gemini_csw_harvester`, `gemini_doc_harvester`, `gemini_waf_harvester`)
|
||||||
* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`inspire_api`)
|
* Harvest Metadata API - a way for a user to view the harvested metadata XML, either as a raw file or styled to view in a web browser. (`spatial_harvest_metadata_api`)
|
||||||
|
|
||||||
These libraries:
|
These libraries:
|
||||||
* CSW Client - a basic client for accessing a CSW server
|
* CSW Client - a basic client for accessing a CSW server
|
||||||
|
|
|
@ -21,6 +21,7 @@ import sys
|
||||||
import uuid
|
import uuid
|
||||||
import os
|
import os
|
||||||
import logging
|
import logging
|
||||||
|
import difflib
|
||||||
|
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
from pylons import config
|
from pylons import config
|
||||||
|
@ -232,7 +233,11 @@ class GeminiHarvester(SpatialHarvester):
|
||||||
else:
|
else:
|
||||||
if last_harvested_object.content != self.obj.content and \
|
if last_harvested_object.content != self.obj.content and \
|
||||||
last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
|
last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
|
||||||
raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated' % gemini_guid)
|
diff_generator = difflib.HtmlDiff().make_table(
|
||||||
|
last_harvested_object.content.split('\n'),
|
||||||
|
self.obj.content.split('\n'))
|
||||||
|
diff = '\n'.join([line for line in diff_generator])
|
||||||
|
raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated.\nDiff:\n%s' % (gemini_guid, diff))
|
||||||
else:
|
else:
|
||||||
# The content hasn't changed, no need to update the package
|
# The content hasn't changed, no need to update the package
|
||||||
log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
|
log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
|
||||||
|
@ -241,7 +246,6 @@ class GeminiHarvester(SpatialHarvester):
|
||||||
log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid)
|
log.info('No package with GEMINI guid %s found, let''s create one' % gemini_guid)
|
||||||
|
|
||||||
extras = {
|
extras = {
|
||||||
'published_by': self.obj.source.publisher_id or '',
|
|
||||||
'UKLP': 'True',
|
'UKLP': 'True',
|
||||||
'harvest_object_id': self.obj.id
|
'harvest_object_id': self.obj.id
|
||||||
}
|
}
|
||||||
|
@ -783,7 +787,7 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
|
||||||
if len(ids) > 0:
|
if len(ids) > 0:
|
||||||
return ids
|
return ids
|
||||||
else:
|
else:
|
||||||
self._save_gather_error('Couldn''t find any links to metadata files',
|
self._save_gather_error('Couldn\'t find any links to metadata files',
|
||||||
harvest_job)
|
harvest_job)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
@ -809,19 +813,25 @@ class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
|
||||||
if not url:
|
if not url:
|
||||||
continue
|
continue
|
||||||
if '?' in url:
|
if '?' in url:
|
||||||
|
log.debug('Ignoring link in WAF because it has "?": %s', url)
|
||||||
continue
|
continue
|
||||||
if '/' in url:
|
if '/' in url:
|
||||||
|
log.debug('Ignoring link in WAF because it has "/": %s', url)
|
||||||
continue
|
continue
|
||||||
if '#' in url:
|
if '#' in url:
|
||||||
|
log.debug('Ignoring link in WAF because it has "#": %s', url)
|
||||||
continue
|
continue
|
||||||
if 'mailto:' in url:
|
if 'mailto:' in url:
|
||||||
|
log.debug('Ignoring link in WAF because it has "mailto:": %s', url)
|
||||||
continue
|
continue
|
||||||
|
log.debug('WAF contains file: %s', url)
|
||||||
urls.append(url)
|
urls.append(url)
|
||||||
base_url = base_url.rstrip('/').split('/')
|
base_url = base_url.rstrip('/').split('/')
|
||||||
if 'index' in base_url[-1]:
|
if 'index' in base_url[-1]:
|
||||||
base_url.pop()
|
base_url.pop()
|
||||||
base_url = '/'.join(base_url)
|
base_url = '/'.join(base_url)
|
||||||
base_url += '/'
|
base_url += '/'
|
||||||
|
log.debug('WAF base URL: %s', base_url)
|
||||||
return [base_url + i for i in urls]
|
return [base_url + i for i in urls]
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue