114 lines
3.6 KiB
Python
114 lines
3.6 KiB
Python
import hashlib
|
|
import logging
|
|
|
|
from ckan import model
|
|
|
|
from ckan.plugins.core import SingletonPlugin, implements
|
|
|
|
from ckanext.harvest.interfaces import IHarvester
|
|
from ckanext.harvest.model import HarvestObject
|
|
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
|
|
|
|
from ckanext.spatial.harvesters.base import SpatialHarvester, guess_standard
|
|
|
|
|
|
class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for individual spatial metadata documents

    The whole document is downloaded during the gather stage, so the
    fetch stage has nothing left to do.

    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        '''
        Return the static description of this harvester.

        :returns: a dict with the ``name``, ``title`` and ``description``
            keys identifying the harvester type.
        '''
        return {
            'name': 'single-doc',
            'title': 'Single spatial metadata document',
            'description': 'A single spatial metadata document'
        }

    def get_original_url(self, harvest_object_id):
        '''
        Return the URL of the source a harvest object belongs to.

        :param harvest_object_id: id of the ``HarvestObject`` to look up.
        :returns: ``obj.source.url`` of the matching object, or ``None``
            when no harvest object has that id.
        '''
        obj = model.Session.query(HarvestObject).\
            filter(HarvestObject.id == harvest_object_id).\
            first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        '''
        Download the source document and create one harvest object for it.

        The document at ``harvest_job.source.url`` is fetched here. If a
        current harvest object already exists for the source, the new
        object reuses its guid and package id and is flagged ``change``;
        otherwise it is flagged ``new`` and gets a guid derived from the
        MD5 of the URL.

        :param harvest_job: the job being run; its ``source`` provides the
            ``url``, ``config`` and ``id`` used here.
        :returns: a one-element list holding the id of the created harvest
            object, or ``None`` if the document could not be retrieved
            (a gather error is saved in that case).
        '''
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        # `except X as e` works on Python 2.6+ and 3; the old comma form
        # is a SyntaxError on Python 3.
        except Exception as e:
            self._save_gather_error(
                'Unable to get content for URL: %s: %r' % (url, e),
                harvest_job)
            return None

        # These comparisons build query filter expressions, hence `==`
        # rather than `is`.
        existing_object = model.Session.query(
            HarvestObject.guid, HarvestObject.package_id).\
            filter(HarvestObject.current == True).\
            filter(HarvestObject.harvest_source_id == harvest_job.source.id).\
            first()

        def create_extras(url, status):
            # Extras attached to every harvest object created by this stage.
            return [HOExtra(key='doc_location', value=url),
                    HOExtra(key='status', value=status)]

        if not existing_object:
            # First harvest of this source: derive a stable guid from the URL.
            guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'new'),
                guid=guid
            )
        else:
            # Re-harvest: keep the guid and package of the current object.
            harvest_object = HarvestObject(
                job=harvest_job,
                extras=create_extras(url, 'change'),
                guid=existing_object.guid,
                package_id=existing_object.package_id
            )

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            # Non-ISO documents are kept as extras (raw document plus the
            # detected format) instead of as the object's content.
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]

    def fetch_stage(self, harvest_object):
        '''
        No-op fetch stage.

        :param harvest_object: unused.
        :returns: always ``True``.
        '''
        # The fetching was already done in the previous stage
        return True
|
|
|