113 lines
3.5 KiB
Python
113 lines
3.5 KiB
Python
|
import hashlib
|
||
|
import logging
|
||
|
|
||
|
from ckan import model
|
||
|
|
||
|
from ckan.plugins.core import SingletonPlugin, implements
|
||
|
|
||
|
from ckanext.harvest.interfaces import IHarvester
|
||
|
from ckanext.harvest.model import HarvestObject
|
||
|
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
|
||
|
|
||
|
from ckanext.spatial.harvesters.base import SpatialHarvester, guess_standard
|
||
|
|
||
|
|
||
|
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for individual spatial metadata documents.

    The harvest source URL points directly at one metadata document
    rather than at a catalogue service, so each run produces at most
    one harvest object.

    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        '''Return the harvester descriptor shown in the CKAN harvest UI.'''
        return {
            'name': 'single-doc',
            'title': 'Single spatial metadata document',
            'description': 'A single spatial metadata document'
        }

    def get_original_url(self, harvest_object_id):
        '''Return the source URL for the given harvest object id.

        Returns None if no harvest object with that id exists.
        '''
        obj = model.Session.query(HarvestObject).\
            filter(HarvestObject.id == harvest_object_id).\
            first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        '''Fetch the remote document and create a single HarvestObject.

        Returns a one-element list containing the new harvest object id,
        or None when the document could not be retrieved (the error is
        recorded against the job via _save_gather_error).
        '''
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        # NOTE: `except Exception as e` replaces the Python-3-invalid
        # `except Exception,e` comma syntax; valid since Python 2.6.
        except Exception as e:
            self._save_gather_error('Unable to get content for URL: %s: %r' %
                                    (url, e), harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
            filter(HarvestObject.current == True).\
            filter(HarvestObject.harvest_source_id == harvest_job.source.id).\
            first()

        def create_extras(url, status):
            # Record where the document came from and whether this run is
            # creating a new dataset or changing an existing one.
            return [HOExtra(key='doc_location', value=url),
                    HOExtra(key='status', value=status)]

        if not existing_object:
            # First harvest of this source: derive a stable guid from the
            # source URL so re-runs against the same URL match up.
            guid = hashlib.md5(url.encode('utf8', errors='ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'new'),
                                           guid=guid)
        else:
            # Re-harvest: reuse the current object's guid so the existing
            # dataset is updated rather than duplicated.
            harvest_object = HarvestObject(job=harvest_job,
                                           extras=create_extras(url, 'change'),
                                           guid=existing_object.guid)

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            # Not ISO: stash the raw document and its guessed format so a
            # later stage can transform it.
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]

    def fetch_stage(self, harvest_object):
        '''No-op: the document content was already fetched in gather_stage.'''
        # The fetching was already done in the previous stage
        return True