# spatial-d4science/ckanext/spatial/harvesters/doc.py
#
# 114 lines
# 3.6 KiB
# Python
# Raw Normal View History

import hashlib
import logging
from ckan import model
from ckan.plugins.core import SingletonPlugin, implements
from ckanext.harvest.interfaces import IHarvester
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
from ckanext.spatial.harvesters.base import SpatialHarvester, guess_standard
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for individual spatial metadata documents

    The harvest source URL locates one document; a gather run produces at
    most one harvest object for it.

    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        '''
        Return the name, title and description that identify this harvester.
        '''
        return {
            'name': 'single-doc',
            'title': 'Single spatial metadata document',
            'description': 'A single spatial metadata document'
        }

    def get_original_url(self, harvest_object_id):
        '''
        Return the URL of the source a harvest object belongs to.

        :param harvest_object_id: value matched against ``HarvestObject.id``
        :returns: ``source.url`` of the matching harvest object, or ``None``
                  when no harvest object has that id
        '''
        obj = (
            model.Session.query(HarvestObject)
            .filter(HarvestObject.id == harvest_object_id)
            .first()
        )
        if not obj:
            return None
        return obj.source.url

    def gather_stage(self, harvest_job):
        '''
        Download the source document and register one harvest object for it.

        The document is stored on the harvest object right away: as its
        ``content`` when ``guess_standard`` reports ``'iso'``, otherwise as
        ``original_document`` / ``original_format`` extras.

        :param harvest_job: job whose ``source.url`` locates the document and
                            whose ``source.config`` is applied beforehand
        :returns: a one-element list with the id of the saved harvest object,
                  or ``None`` when the document could not be retrieved (a
                  gather error is recorded for the job in that case)
        '''
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception as e:
            # Any retrieval failure is reported against the job rather than
            # raised, so the caller only sees a ``None`` result.
            self._save_gather_error(
                'Unable to get content for URL: %s: %r' % (url, e),
                harvest_job)
            return None

        # Look for the harvest object currently flagged as ``current`` for
        # this source; its guid and package are reused if there is one.
        existing_object = (
            model.Session.query(HarvestObject.guid, HarvestObject.package_id)
            # ``== True`` is deliberate: the comparison is handed to the query
            # filter as an expression, so ``is True`` would not be equivalent.
            .filter(HarvestObject.current == True)  # noqa: E712
            .filter(HarvestObject.harvest_source_id == harvest_job.source.id)
            .first()
        )

        def create_extras(url, status):
            # Extras attached to every harvest object created by this stage
            return [HOExtra(key='doc_location', value=url),
                    HOExtra(key='status', value=status)]

        if not existing_object:
            # Nothing harvested from this source yet: derive the guid from
            # the URL so that it is reproducible.
            guid = hashlib.md5(url.encode('utf8', errors='ignore')).hexdigest()
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'new'),
                guid=guid
            )
        else:
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id
            )

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            # Non-ISO documents are kept as extras, together with the
            # detected format, instead of being stored as the content.
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]

    def fetch_stage(self, harvest_object):
        '''
        Report success without doing any work.

        :param harvest_object: unused
        :returns: always ``True``
        '''
        # The fetching was already done in the previous stage
        return True