821 lines
31 KiB
Python
821 lines
31 KiB
Python
'''
|
|
Different harvesters for spatial metadata
|
|
|
|
These are designed for harvesting GEMINI2 for the UK Location Programme
|
|
but can be easily adapted for other INSPIRE/ISO19139 XML metadata
|
|
- GeminiCswHarvester - CSW servers
|
|
- GeminiDocHarvester - An individual GEMINI resource
|
|
- GeminiWafHarvester - An index page with links to GEMINI resources
|
|
|
|
TODO: Harvesters for generic INSPIRE CSW servers
|
|
|
|
'''
|
|
import cgitb
|
|
import warnings
|
|
import urllib2
|
|
from urlparse import urlparse
|
|
from datetime import datetime
|
|
from string import Template
|
|
from numbers import Number
|
|
import sys
|
|
import uuid
|
|
import os
|
|
import logging
|
|
|
|
from lxml import etree
|
|
from pylons import config
|
|
from sqlalchemy.sql import update, bindparam
|
|
from sqlalchemy.exc import InvalidRequestError
|
|
from owslib import wms
|
|
|
|
from ckan import model
|
|
from ckan.model import Session, Package
|
|
from ckan.lib.munge import munge_title_to_name
|
|
from ckan.plugins.core import SingletonPlugin, implements
|
|
from ckan.lib.helpers import json
|
|
|
|
from ckan import logic
|
|
from ckan.logic import get_action, ValidationError
|
|
from ckan.lib.navl.validators import not_empty
|
|
|
|
from ckanext.harvest.interfaces import IHarvester
|
|
from ckanext.harvest.model import HarvestObject, HarvestGatherError, \
|
|
HarvestObjectError
|
|
|
|
from ckanext.spatial.model import GeminiDocument
|
|
from ckanext.spatial.lib.csw_client import CswService
|
|
from ckanext.spatial.validation import Validators
|
|
|
|
# Module-level logger; the harvest stages below create more specific child
# loggers (e.g. __name__ + '.import') for their own messages.
log = logging.getLogger(__name__)
|
|
|
|
def text_traceback():
    '''Return the plain-text traceback of the exception currently being
    handled.

    cgitb produces a long report; only the part that follows the
    'the original traceback:' marker is kept, stripped of surrounding
    whitespace. Warnings raised while building the report are silenced.
    '''
    marker = 'the original traceback:'
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        report = cgitb.text(sys.exc_info())
        # Everything after the first marker, re-joined in case the marker
        # text happens to occur again further down the report.
        pieces_after_marker = report.split(marker)[1:]
        traceback_text = marker.join(pieces_after_marker).strip()
    return traceback_text
|
|
|
|
# When developing, it can be helpful to 'export DEBUG=1' so that exceptions
# are re-raised rather than being caught and recorded.
debug_exception_mode = bool(os.environ.get('DEBUG'))
|
|
|
|
class SpatialHarvester(object):
    '''Helpers shared by the spatial harvesters: WMS detection, access to
    the configured validator, persistence of harvest errors and plain HTTP
    fetching.
    '''
    # Q: Why does this not inherit from HarvesterBase in ckanext-harvest?

    def _is_wms(self, url):
        '''Return True if `url` looks like a working WMS endpoint.

        The capabilities document is downloaded (10 second timeout) and
        parsed with owslib; the check passes only when the parsed service
        exposes a non-empty `contents` dict. Any exception raised along the
        way is logged and reported as False.
        '''
        try:
            capabilities_url = wms.WMSCapabilitiesReader().capabilities_url(url)
            res = urllib2.urlopen(capabilities_url, None, 10)
            try:
                xml = res.read()
            finally:
                # Always release the connection, even if the read fails.
                res.close()

            s = wms.WebMapService(url, xml=xml)
            return isinstance(s.contents, dict) and s.contents != {}
        except Exception as e:
            log.error('WMS check for %s failed with exception: %s', url, e)
        return False

    def _get_validator(self):
        '''Return the Validators instance for this harvester, creating it on
        first use.

        The profiles come from the comma separated
        'ckan.spatial.validator.profiles' config option and default to
        'iso19139,gemini2'.
        '''
        if not hasattr(self, '_validator'):
            profiles = [
                x.strip() for x in
                config.get(
                    'ckan.spatial.validator.profiles',
                    'iso19139,gemini2',
                ).split(',')
            ]
            self._validator = Validators(profiles=profiles)
        return self._validator

    def _persist_error(self, err, message):
        '''Save the error object `err` and log `message`.

        If the session is in a state that rejects the save
        (InvalidRequestError) it is rolled back and the save is retried
        once. The message is logged whether or not the save succeeds.
        '''
        try:
            err.save()
        except InvalidRequestError:
            Session.rollback()
            err.save()
        finally:
            log.error(message)

    def _save_gather_error(self, message, job):
        '''Record `message` as a HarvestGatherError against `job`.'''
        err = HarvestGatherError(message=message, job=job)
        self._persist_error(err, message)

    def _save_object_error(self, message, obj, stage=u'Fetch'):
        '''Record `message` as a HarvestObjectError against the harvest
        object `obj`, tagged with the given `stage` name.
        '''
        err = HarvestObjectError(message=message, object=obj, stage=stage)
        self._persist_error(err, message)

    def _get_content(self, url):
        '''Download `url` and return the response body.

        Spaces in the URL are percent-encoded first. Errors raised by
        urllib2 are not handled here and propagate to the caller.
        '''
        url = url.replace(' ', '%20')
        http_response = urllib2.urlopen(url)
        try:
            return http_response.read()
        finally:
            # Always release the connection, even if the read fails.
            http_response.close()
|
|
|
|
class GeminiHarvester(SpatialHarvester):
    '''Base class for spatial harvesting GEMINI2 documents for the UK Location
    Programme. May be easily adaptable for other INSPIRE and spatial projects.

    All three harvesters share the same import stage
    '''

    # When true, a previously harvested document is re-imported even if its
    # metadata date is not newer than the stored one.
    force_import = False

    # GeoJSON polygon skeleton; the placeholders are filled in from the
    # bounding box extras and the result is stripped before being stored.
    extent_template = Template('''
    {"type":"Polygon","coordinates":[[[$minx, $miny],[$minx, $maxy], [$maxx, $maxy], [$maxx, $miny], [$minx, $miny]]]}
    ''')

    def import_stage(self, harvest_object):
        '''Import the content of `harvest_object` as a package.

        Returns True on success and False otherwise. Failures are recorded
        as object errors for the 'Import' stage; when DEBUG mode is on the
        original exception is re-raised after being recorded.
        '''
        log = logging.getLogger(__name__ + '.import')
        log.debug('Import stage for harvest object: %r', harvest_object)

        if not harvest_object:
            log.error('No harvest object received')
            return False

        # Save a reference
        self.obj = harvest_object

        if harvest_object.content is None:
            self._save_object_error('Empty content for object %s' % harvest_object.id, harvest_object, 'Import')
            return False

        try:
            self.import_gemini_object(harvest_object.content)
            return True
        except Exception as e:
            log.error('Exception during import: %s' % text_traceback())
            if not str(e).strip():
                self._save_object_error('Error importing Gemini document.', harvest_object, 'Import')
            else:
                self._save_object_error('Error importing Gemini document: %s' % str(e), harvest_object, 'Import')

            if debug_exception_mode:
                raise
            # Report the failure explicitly instead of falling off the end
            # of the function and returning None.
            return False

    def import_gemini_object(self, gemini_string):
        '''Parse and validate `gemini_string`, then write it as a package.

        Validation problems are recorded as an object error but do not stop
        the import.
        '''
        log = logging.getLogger(__name__ + '.import')
        xml = etree.fromstring(gemini_string)

        valid, messages = self._get_validator().is_valid(xml)
        if not valid:
            log.error('Errors found for object with GUID %s:' % self.obj.guid)
            out = messages[0] + ':\n' + '\n'.join(messages[1:])
            self._save_object_error(out, self.obj, 'Import')

        unicode_gemini_string = etree.tostring(xml, encoding=unicode, pretty_print=True)

        self.write_package_from_gemini_string(unicode_gemini_string)

    def _parse_metadata_date(self, date_string, gemini_guid):
        '''Parse the metadata date, accepting either 'YYYY-MM-DD' or
        'YYYY-MM-DDTHH:MM:SS'. Raises Exception if neither format matches.
        '''
        try:
            return datetime.strptime(date_string, '%Y-%m-%d')
        except ValueError:
            pass
        try:
            return datetime.strptime(date_string, '%Y-%m-%dT%H:%M:%S')
        except Exception:
            raise Exception('Could not extract reference date for GUID %s (%s)'
                            % (gemini_guid, date_string))

    def _responsible_party_extras(self, responsible_parties):
        '''Summarise the responsible organisations.

        Returns a tuple (summary, provider): `summary` lists every
        organisation with its distinct roles as 'name (role, role); ...',
        and `provider` is the first organisation with the 'owner' role,
        failing that the first 'publisher', failing that u''.
        '''
        parties = {}
        owners = []
        publishers = []
        for responsible_party in responsible_parties:
            role = responsible_party['role']
            organisation = responsible_party['organisation-name']

            if role == 'owner':
                owners.append(organisation)
            elif role == 'publisher':
                publishers.append(organisation)

            roles = parties.setdefault(organisation, [])
            if role not in roles:
                roles.append(role)

        summary = '; '.join(
            '%s (%s)' % (party_name, ', '.join(parties[party_name]))
            for party_name in parties
        )

        if owners:
            provider = owners[0]
        elif publishers:
            provider = publishers[0]
        else:
            provider = u''
        return summary, provider

    def _resources_from_locators(self, resource_locators, resource_type):
        '''Build the package resource dicts from the GEMINI resource
        locators, skipping locators without a URL.

        For documents whose resource type is 'service', each URL (without
        its query string) is probed with `_is_wms`; those that pass are
        marked as verified and given the 'WMS' format. One WMS resource, if
        any, is flagged as the recommended preview.
        '''
        resources = []
        for resource_locator in resource_locators:
            url = resource_locator.get('url', '')
            if not url:
                continue

            resource_format = ''
            resource = {}
            if resource_type == 'service':
                # Check if the service is a view service
                test_url = url.split('?')[0]
                if self._is_wms(test_url):
                    resource['verified'] = True
                    resource['verified_date'] = datetime.now().isoformat()
                    resource_format = 'WMS'

            resource.update({
                'url': url,
                'name': resource_locator.get('name', ''),
                'description': resource_locator.get('description') or 'Resource locator',
                'format': resource_format or None,
                'resource_locator_protocol': resource_locator.get('protocol', ''),
                'resource_locator_function': resource_locator.get('function', ''),
            })
            resources.append(resource)

        # Guess the best view service to use in WMS preview: prefer a
        # verified WMS resource, otherwise any WMS resource.
        verified_view_resources = [r for r in resources
                                   if 'verified' in r and r['format'] == 'WMS']
        if verified_view_resources:
            verified_view_resources[0]['ckan_recommended_wms_preview'] = True
        else:
            view_resources = [r for r in resources if r['format'] == 'WMS']
            if view_resources:
                view_resources[0]['ckan_recommended_wms_preview'] = True

        return resources

    def write_package_from_gemini_string(self, content):
        '''Create or update a Package based on some content that has
        come from a URL.

        Returns the package dict produced by the create/update action, or
        None when the document does not need to be imported (unchanged, or
        not newer than a deleted package). Raises Exception when the
        metadata date cannot be parsed, when more than one current harvest
        object exists for the GUID, when the content changed without a new
        metadata date, or when no unique package name can be generated.
        '''
        log = logging.getLogger(__name__ + '.import')
        package = None
        gemini_document = GeminiDocument(content)
        gemini_values = gemini_document.read_values()
        gemini_guid = gemini_values['guid']

        # Save the metadata reference date in the Harvest Object
        self.obj.metadata_modified_date = self._parse_metadata_date(
            gemini_values['metadata-date'], gemini_guid)
        self.obj.save()

        # '== True' is deliberate: it builds a SQL expression.
        current_objects = Session.query(HarvestObject) \
            .filter(HarvestObject.guid == gemini_guid) \
            .filter(HarvestObject.current == True) \
            .all()

        if len(current_objects) > 1:
            raise Exception('Application Error: more than one current record for GUID %s' % gemini_guid)
        last_harvested_object = current_objects[0] if current_objects else None

        reactivate_package = False
        if last_harvested_object:
            # We've previously harvested this (i.e. it's an update)

            # Use metadata modified date instead of content to determine if the package
            # needs to be updated
            if last_harvested_object.metadata_modified_date is None \
                or last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date \
                or self.force_import \
                or (last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date and
                    last_harvested_object.source.active is False):

                if self.force_import:
                    log.info('Import forced for object %s with GUID %s' % (self.obj.id, gemini_guid))
                else:
                    log.info('Package for object with GUID %s needs to be created or updated' % gemini_guid)

                package = last_harvested_object.package

                # If the package has a deleted state, we will only update it and reactivate it if the
                # new document has a more recent modified date
                if package.state == u'deleted':
                    if last_harvested_object.metadata_modified_date < self.obj.metadata_modified_date:
                        log.info('Package for object with GUID %s will be re-activated' % gemini_guid)
                        reactivate_package = True
                    else:
                        log.info('Remote record with GUID %s is not more recent than a deleted package, skipping... ' % gemini_guid)
                        return None

            else:
                if last_harvested_object.content != self.obj.content and \
                   last_harvested_object.metadata_modified_date == self.obj.metadata_modified_date:
                    raise Exception('The contents of document with GUID %s changed, but the metadata date has not been updated' % gemini_guid)
                else:
                    # The content hasn't changed, no need to update the package
                    log.info('Document with GUID %s unchanged, skipping...' % (gemini_guid))
                    return None
        else:
            log.info("No package with GEMINI guid %s found, let's create one" % gemini_guid)

        extras = {
            'published_by': self.obj.source.publisher_id or '',
            'UKLP': 'True',
            'harvest_object_id': self.obj.id
        }

        # Just add some of the metadata as extras, not the whole lot
        for key in [
            # Essentials
            'bbox-east-long',
            'bbox-north-lat',
            'bbox-south-lat',
            'bbox-west-long',
            'spatial-reference-system',
            'guid',
            # Usefuls
            'dataset-reference-date',
            'resource-type',
            'metadata-language',  # Language
            'metadata-date',  # Released
            'coupled-resource',
            'contact-email',
            'frequency-of-update',
            'spatial-data-service-type',
        ]:
            extras[key] = gemini_values[key]

        extras['licence'] = gemini_values.get('use-constraints', '')
        if extras['licence']:
            license_url_extracted = self._extract_first_license_url(extras['licence'])
            if license_url_extracted:
                extras['licence_url'] = license_url_extracted

        extras['access_constraints'] = gemini_values.get('limitations-on-public-access', '')
        if 'temporal-extent-begin' in gemini_values:
            extras['temporal_coverage-from'] = gemini_values['temporal-extent-begin']
        if 'temporal-extent-end' in gemini_values:
            extras['temporal_coverage-to'] = gemini_values['temporal-extent-end']

        # Save responsible organization roles, and the provider in a
        # separate extra
        extras['responsible-party'], extras['provider'] = \
            self._responsible_party_extras(gemini_values['responsible-organisation'])

        # Construct a GeoJSON extent so ckanext-spatial can register the extent geometry
        # NOTE(review): minx/maxx receive the east/west longitudes
        # respectively. The ring still describes the same rectangle, so the
        # mapping is left untouched to keep the stored string unchanged.
        extent_string = self.extent_template.substitute(
            minx=extras['bbox-east-long'],
            miny=extras['bbox-south-lat'],
            maxx=extras['bbox-west-long'],
            maxy=extras['bbox-north-lat']
        )

        extras['spatial'] = extent_string.strip()

        # Tag names are cut down to 50 characters
        tags = [{'name': tag[:50]} for tag in gemini_values['tags']]

        package_dict = {
            'title': gemini_values['title'],
            'notes': gemini_values['abstract'],
            'tags': tags,
            'resources': []
        }

        if self.obj.source.publisher_id:
            package_dict['groups'] = [{'id': self.obj.source.publisher_id}]

        if reactivate_package:
            package_dict['state'] = u'active'

        if package is None or package.title != gemini_values['title']:
            # New package or changed title: derive a fresh name from the
            # title, falling back to the GUID.
            name = self.gen_new_name(gemini_values['title'])
            if not name:
                name = self.gen_new_name(str(gemini_guid))
            if not name:
                raise Exception('Could not generate a unique name from the title or the GUID. Please choose a more unique title.')
            package_dict['name'] = name
        else:
            package_dict['name'] = package.name

        resource_locators = gemini_values.get('resource-locator', [])
        if resource_locators:
            package_dict['resources'] = self._resources_from_locators(
                resource_locators, extras['resource-type'])

        # Strings and numbers are stored as they are, anything else is
        # serialised to JSON.
        extras_as_dict = []
        for key, value in extras.iteritems():
            if isinstance(value, (basestring, Number)):
                extras_as_dict.append({'key': key, 'value': value})
            else:
                extras_as_dict.append({'key': key, 'value': json.dumps(value)})

        package_dict['extras'] = extras_as_dict

        if package is None:
            # Create new package from data.
            package = self._create_package_from_data(package_dict)
            log.info('Created new package ID %s with GEMINI guid %s', package['id'], gemini_guid)
        else:
            package = self._create_package_from_data(package_dict, package=package)
            log.info('Updated existing package ID %s with existing GEMINI guid %s', package['id'], gemini_guid)

        # Flag the other objects of this source as not current anymore
        from ckanext.harvest.model import harvest_object_table
        u = update(harvest_object_table) \
            .where(harvest_object_table.c.package_id == bindparam('b_package_id')) \
            .values(current=False)
        Session.execute(u, params={'b_package_id': package['id']})
        Session.commit()

        # Refresh current object from session, otherwise the
        # import paster command fails
        Session.remove()
        Session.add(self.obj)
        Session.refresh(self.obj)

        # Set reference to package in the HarvestObject and flag it as
        # the current one
        if not self.obj.package_id:
            self.obj.package_id = package['id']

        self.obj.current = True
        self.obj.save()

        # Sanity checks on the package returned by the action
        assert gemini_guid == [e['value'] for e in package['extras'] if e['key'] == 'guid'][0]
        assert self.obj.id == [e['value'] for e in package['extras'] if e['key'] == 'harvest_object_id'][0]

        return package

    def gen_new_name(self, title):
        '''Return an unused package name derived from `title`, or None.

        The munged title is used as it is when free; otherwise the suffixes
        1 to 100 are tried in order. Only the first 100 existing names that
        start with the munged title are looked at.
        '''
        name = munge_title_to_name(title).replace('_', '-')
        while '--' in name:
            name = name.replace('--', '-')
        like_q = u'%s%%' % name
        pkg_query = Session.query(Package).filter(Package.name.ilike(like_q)).limit(100)
        taken = set(pkg.name for pkg in pkg_query)
        if name not in taken:
            return name
        for counter in range(1, 101):
            candidate = name + str(counter)
            if candidate not in taken:
                return candidate
        return None

    def _extract_first_license_url(self, licences):
        '''Return the first item of `licences` that parses as a URL with
        both a scheme and a network location, or None if there is none.
        '''
        for licence in licences:
            o = urlparse(licence)
            if o.scheme and o.netloc:
                return licence
        return None

    def _create_package_from_data(self, package_dict, package=None):
        '''Create a package from `package_dict`, or update `package` with it
        when one is given, and return the dict produced by the action.

        Example of a package_dict:

        {'name': 'council-owned-litter-bins',
         'notes': 'Location of Council owned litter bins within Borough.',
         'resources': [{'description': 'Resource locator',
                        'format': 'Unverified',
                        'url': 'http://www.barrowbc.gov.uk'}],
         'tags': [{'name':'Utility and governmental services'}],
         'title': 'Council Owned Litter Bins',
         'extras': [{'key':'INSPIRE','value':'True'},
                    {'key':'bbox-east-long','value': '-3.12442'},
                    {'key':'bbox-north-lat','value': '54.218407'},
                    {'key':'bbox-south-lat','value': '54.039634'},
                    {'key':'bbox-west-long','value': '-3.32485'},
                    # etc.
                    ]
        }

        A ValidationError from the action is re-raised as a plain Exception
        carrying the error summary; in DEBUG mode the original
        ValidationError is propagated instead.
        '''
        if not package:
            package_schema = logic.schema.default_create_package_schema()
        else:
            package_schema = logic.schema.default_update_package_schema()

        # The default package schema does not like Upper case tags
        tag_schema = logic.schema.default_tags_schema()
        tag_schema['name'] = [not_empty, unicode]
        package_schema['tags'] = tag_schema

        # TODO: user
        context = {'model': model,
                   'session': Session,
                   'user': 'harvest',
                   'schema': package_schema,
                   'extras_as_string': True,
                   'api_version': '2'}
        if not package:
            # We need to explicitly provide a package ID, otherwise ckanext-spatial
            # won't be be able to link the extent to the package.
            package_dict['id'] = unicode(uuid.uuid4())
            package_schema['id'] = [unicode]

            action_function = get_action('package_create')
        else:
            action_function = get_action('package_update')
            package_dict['id'] = package.id

        try:
            package_dict = action_function(context, package_dict)
        except ValidationError as e:
            # The debug check has to come first: placed after the raise
            # below it could never run.
            if debug_exception_mode:
                raise
            raise Exception('Validation Error: %s' % str(e.error_summary))

        return package_dict

    def get_gemini_string_and_guid(self, content, url=None):
        '''Extract the GEMINI document and its GUID from `content`.

        Returns a tuple (gemini_string, gemini_guid). When `content` holds
        no MD_Metadata element, a gather error is recorded and (None, None)
        is returned. Validation errors are recorded as gather errors
        (mentioning `url` when given) but do not stop the extraction.

        Uses `self.harvest_job`, which the calling gather stage must have
        set beforehand.
        '''
        xml = etree.fromstring(content)

        # The validator and GeminiDocument don't like the container
        metadata_tag = '{http://www.isotc211.org/2005/gmd}MD_Metadata'
        if xml.tag == metadata_tag:
            gemini_xml = xml
        else:
            gemini_xml = xml.find(metadata_tag)

        if gemini_xml is None:
            self._save_gather_error('Content is not a valid Gemini document', self.harvest_job)
            # Nothing to validate or serialise; callers treat a missing
            # GUID as "no document".
            return None, None

        valid, messages = self._get_validator().is_valid(gemini_xml)
        if not valid:
            out = messages[0] + ':\n' + '\n'.join(messages[1:])
            if url:
                self._save_gather_error('Validation error for %s - %s' % (url, out), self.harvest_job)
            else:
                self._save_gather_error('Validation error - %s' % out, self.harvest_job)

        gemini_string = etree.tostring(gemini_xml)
        gemini_document = GeminiDocument(gemini_string)
        gemini_values = gemini_document.read_values()
        gemini_guid = gemini_values['guid']

        return gemini_string, gemini_guid
|
|
|
|
class GeminiCswHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester for CSW servers
    '''
    implements(IHarvester)

    # CswService client, created by _setup_csw_client()
    csw = None

    def info(self):
        '''Return the name, title and description describing this
        harvester type.
        '''
        return {
            'name': 'csw',
            'title': 'CSW Server',
            'description': 'A server that implements OGC\'s Catalog Service for the Web (CSW) standard'
        }

    def gather_stage(self, harvest_job):
        '''Ask the CSW server for its record identifiers and create one
        HarvestObject per distinct identifier.

        Returns the list of created harvest object ids, or None (after
        recording a gather error) when the server cannot be contacted, the
        listing fails, or no record was received.
        '''
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('GeminiCswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url

        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_gather_error('Error contacting the CSW server: %s' % e, harvest_job)
            return None

        log.debug('Starting gathering for %s' % url)
        # A set keeps the duplicate check cheap on large catalogues
        used_identifiers = set()
        ids = []
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier is None:
                        log.error('CSW returned identifier %r, skipping...' % identifier)
                        ## log an error here? happens with the dutch data
                        continue
                    if identifier in used_identifiers:
                        log.error('CSW identifier %r already used, skipping...' % identifier)
                        continue

                    # Create a new HarvestObject for this identifier
                    obj = HarvestObject(guid=identifier, job=harvest_job)
                    obj.save()

                    ids.append(obj.id)
                    used_identifiers.add(identifier)
                except Exception as e:
                    self._save_gather_error('Error for the identifier %s [%r]' % (identifier, e), harvest_job)
                    continue

        except Exception as e:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error('Error gathering the identifiers from the CSW server [%s]' % str(e), harvest_job)
            return None

        if not ids:
            self._save_gather_error('No records received from the CSW server', harvest_job)
            return None

        return ids

    def fetch_stage(self, harvest_object):
        '''Fetch the record identified by the harvest object's GUID from the
        CSW server and store its XML in `harvest_object.content`.

        Returns True on success; on any failure an object error is recorded
        and False is returned.
        '''
        log = logging.getLogger(__name__ + '.CSW.fetch')
        log.debug('GeminiCswHarvester fetch_stage for object: %r', harvest_object)

        url = harvest_object.source.url
        try:
            self._setup_csw_client(url)
        except Exception as e:
            self._save_object_error('Error contacting the CSW server: %s' % e,
                                    harvest_object)
            return False

        identifier = harvest_object.guid
        try:
            record = self.csw.getrecordbyid([identifier])
        except Exception as e:
            # Keep the cause in the stored error instead of dropping it
            self._save_object_error('Error getting the CSW record with GUID %s [%r]' %
                                    (identifier, e), harvest_object)
            return False

        if record is None:
            self._save_object_error('Empty record for GUID %s' % identifier,
                                    harvest_object)
            return False

        try:
            # Save the fetch contents in the HarvestObject
            harvest_object.content = record['xml']
            harvest_object.save()
        except Exception as e:
            self._save_object_error('Error saving the harvest object for GUID %s [%r]' %
                                    (identifier, e), harvest_object)
            return False

        log.debug('XML content saved (len %s)', len(record['xml']))
        return True

    def _setup_csw_client(self, url):
        '''Create the CswService client for `url` and keep it in self.csw.'''
        self.csw = CswService(url)
|
|
|
|
|
|
class GeminiDocHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester for individual GEMINI documents
    '''

    implements(IHarvester)

    def info(self):
        '''Return the name, title and description describing this
        harvester type.
        '''
        details = {}
        details['name'] = 'gemini-single'
        details['title'] = 'Single GEMINI 2 document'
        details['description'] = 'A single GEMINI 2.1 document'
        return details

    def gather_stage(self, harvest_job):
        '''Download the document at the source URL and create a single
        HarvestObject holding its content.

        Returns a one-element list with the harvest object id, or None
        (after recording a gather error) when the download fails, the
        document cannot be parsed, or no GUID is found.
        '''
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('GeminiDocHarvester gather_stage for job: %r', harvest_job)

        # get_gemini_string_and_guid reads the job from the instance
        self.harvest_job = harvest_job

        source_url = harvest_job.source.url

        # Get contents
        try:
            content = self._get_content(source_url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        try:
            # We need to extract the guid to pass it to the next stage
            gemini_string, gemini_guid = self.get_gemini_string_and_guid(content, source_url)

            if not gemini_guid:
                self._save_gather_error('Could not get the GUID for source %s' % source_url, harvest_job)
                return None

            # Create a new HarvestObject for this identifier
            # Generally the content will be set in the fetch stage, but as we already
            # have it, we might as well save a request
            harvest_object = HarvestObject(guid=gemini_guid,
                                           job=harvest_job,
                                           content=gemini_string)
            harvest_object.save()

            log.info('Got GUID %s' % gemini_guid)
            return [harvest_object.id]
        except Exception as e:
            self._save_gather_error('Error parsing the document. Is this a valid Gemini document?: %s [%r]' % (source_url, e), harvest_job)
            if debug_exception_mode:
                raise
            return None

    def fetch_stage(self, harvest_object):
        '''Nothing to fetch: the content was stored during the gather
        stage. Always returns True.
        '''
        return True
|
|
|
|
|
|
class GeminiWafHarvester(GeminiHarvester, SingletonPlugin):
    '''
    A Harvester from a WAF server containing GEMINI documents.
    e.g. Apache serving a directory of GEMINI files.
    '''

    implements(IHarvester)

    def info(self):
        '''Return the name, title and description describing this
        harvester type.
        '''
        return {
            'name': 'gemini-waf',
            'title': 'Web Accessible Folder (WAF) - GEMINI',
            'description': 'A Web Accessible Folder (WAF) displaying a list of GEMINI 2.1 documents'
        }

    def gather_stage(self, harvest_job):
        '''Download the WAF index page, follow its links and create one
        HarvestObject (content included) per document that yields a GUID.

        Links that cannot be downloaded or parsed are recorded as gather
        errors and skipped. Returns the list of harvest object ids, or None
        (after recording a gather error) when the index cannot be fetched
        or processed, or when no document was gathered.
        '''
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('GeminiWafHarvester gather_stage for job: %r', harvest_job)

        # get_gemini_string_and_guid reads the job from the instance
        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        # Get contents
        try:
            index_content = self._get_content(source_url)
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (source_url, e), harvest_job)
            return None

        ids = []
        try:
            # The link URL gets its own name so that the error below always
            # reports the WAF index URL, not the last link visited.
            for doc_url in self._extract_urls(index_content, source_url):
                try:
                    content = self._get_content(doc_url)
                except Exception as e:
                    msg = 'Couldn\'t harvest WAF link: %s: %s' % (doc_url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue

                # We need to extract the guid to pass it to the next stage
                try:
                    gemini_string, gemini_guid = self.get_gemini_string_and_guid(content, doc_url)
                    if gemini_guid:
                        log.debug('Got GUID %s' % gemini_guid)
                        # Create a new HarvestObject for this identifier
                        # Generally the content will be set in the fetch stage, but as we already
                        # have it, we might as well save a request
                        obj = HarvestObject(guid=gemini_guid,
                                            job=harvest_job,
                                            content=gemini_string)
                        obj.save()

                        ids.append(obj.id)
                except Exception as e:
                    msg = 'Could not get GUID for source %s: %r' % (doc_url, e)
                    self._save_gather_error(msg, harvest_job)
                    continue
        except Exception as e:
            msg = 'Error extracting URLs from %s: %r' % (source_url, e)
            self._save_gather_error(msg, harvest_job)
            return None

        if ids:
            return ids

        self._save_gather_error("Couldn't find any links to metadata files",
                                harvest_job)
        return None

    def fetch_stage(self, harvest_object):
        '''Nothing to fetch: the content was stored during the gather
        stage. Always returns True.
        '''
        return True

    def _extract_urls(self, content, base_url):
        '''
        Get the URLs out of a WAF index page

        Every non-empty href that contains none of '?', '/', '#' or
        'mailto:' is kept and appended to the base directory, which is
        `base_url` without a trailing slash and without a last path segment
        containing 'index'. Raises Exception if the page cannot be parsed.
        '''
        try:
            parser = etree.HTMLParser()
            tree = etree.fromstring(content, parser=parser)
        except Exception as inst:
            msg = "Couldn't parse content into a tree: %s: %s" \
                  % (inst, content)
            raise Exception(msg)

        # Markers of links that do not point at a file in this folder
        rejected_markers = ('?', '/', '#', 'mailto:')
        urls = []
        for url in tree.xpath('//a/@href'):
            url = url.strip()
            if not url:
                continue
            if any(marker in url for marker in rejected_markers):
                continue
            urls.append(url)

        base_url = base_url.rstrip('/').split('/')
        if 'index' in base_url[-1]:
            base_url.pop()
        base_url = '/'.join(base_url)
        base_url += '/'
        return [base_url + i for i in urls]
|
|
|
|
|