[#8] Update harvesters for CSW, WAF and Doc sources
These are the new versions of the spatial harvesters, with significant improvements over the previous ones.
parent f153b0f4ba
commit 1d8a4c17c4
ckanext/spatial/harvesters/__init__.py
@@ -5,3 +5,7 @@ try:
except ImportError:
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)

from ckanext.spatial.harvesters.csw import CSWHarvester
from ckanext.spatial.harvesters.waf import WAFHarvester
from ckanext.spatial.harvesters.doc import DocHarvester
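With the package __init__ re-exporting the three classes, the setup.py entry points further down can reference them from the package root. A minimal sketch of the resulting import path:

from ckanext.spatial.harvesters import CSWHarvester, WAFHarvester, DocHarvester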
ckanext/spatial/harvesters/csw.py
@@ -0,0 +1,178 @@
import urllib
import urlparse

import logging

from ckan import model

from ckan.plugins.core import SingletonPlugin, implements

from ckanext.harvest.interfaces import IHarvester
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra

from ckanext.spatial.lib.csw_client import CswService
from ckanext.spatial.harvesters.base import SpatialHarvester, text_traceback


class CSWHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for CSW servers
    '''
    implements(IHarvester)

    csw = None

    def info(self):
        return {
            'name': 'csw',
            'title': 'CSW Server',
            'description': 'A server that implements OGC\'s Catalog Service for the Web (CSW) standard'
            }

    def get_original_url(self, harvest_object_id):
        obj = model.Session.query(HarvestObject).\
                    filter(HarvestObject.id==harvest_object_id).\
                    first()

        parts = urlparse.urlparse(obj.source.url)

        params = {
            'SERVICE': 'CSW',
            'VERSION': '2.0.2',
            'REQUEST': 'GetRecordById',
            'OUTPUTSCHEMA': 'http://www.isotc211.org/2005/gmd',
            'OUTPUTFORMAT': 'application/xml',
            'ID': obj.guid
        }

        url = urlparse.urlunparse((
            parts.scheme,
            parts.netloc,
            parts.path,
            None,
            urllib.urlencode(params),
            None
        ))

        return url

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.CSW.gather')
        log.debug('CswHarvester gather_stage for job: %r', harvest_job)
        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        try:
            self._setup_csw_client(url)
        except Exception, e:
            self._save_gather_error('Error contacting the CSW server: %s' % e, harvest_job)
            return None

        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)
        guid_to_package_id = {}

        for guid, package_id in query:
            guid_to_package_id[guid] = package_id

        guids_in_db = set(guid_to_package_id.keys())

        log.debug('Starting gathering for %s' % url)
        guids_in_harvest = set()
        try:
            for identifier in self.csw.getidentifiers(page=10):
                try:
                    log.info('Got identifier %s from the CSW', identifier)
                    if identifier is None:
                        log.error('CSW returned identifier %r, skipping...' % identifier)
                        continue

                    guids_in_harvest.add(identifier)
                except Exception, e:
                    self._save_gather_error('Error for the identifier %s [%r]' % (identifier, e), harvest_job)
                    continue

        except Exception, e:
            log.error('Exception: %s' % text_traceback())
            self._save_gather_error('Error gathering the identifiers from the CSW server [%s]' % str(e), harvest_job)
            return None

        new = guids_in_harvest - guids_in_db
        delete = guids_in_db - guids_in_harvest
        change = guids_in_db & guids_in_harvest

        ids = []
        for guid in new:
            obj = HarvestObject(guid=guid, job=harvest_job,
                                extras=[HOExtra(key='status', value='new')])
            obj.save()
            ids.append(obj.id)
        for guid in change:
            obj = HarvestObject(guid=guid, job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                extras=[HOExtra(key='status', value='change')])
            obj.save()
            ids.append(obj.id)
        for guid in delete:
            obj = HarvestObject(guid=guid, job=harvest_job,
                                package_id=guid_to_package_id[guid],
                                extras=[HOExtra(key='status', value='delete')])
            ids.append(obj.id)
            model.Session.query(HarvestObject).\
                  filter_by(guid=guid).\
                  update({'current': False}, False)
            obj.save()

        if len(ids) == 0:
            self._save_gather_error('No records received from the CSW server', harvest_job)
            return None

        return ids

    def fetch_stage(self, harvest_object):
        log = logging.getLogger(__name__ + '.CSW.fetch')
        log.debug('CswHarvester fetch_stage for object: %s', harvest_object.id)

        url = harvest_object.source.url
        try:
            self._setup_csw_client(url)
        except Exception, e:
            self._save_object_error('Error contacting the CSW server: %s' % e,
                                    harvest_object)
            return False

        identifier = harvest_object.guid
        try:
            record = self.csw.getrecordbyid([identifier])
        except Exception, e:
            self._save_object_error('Error getting the CSW record with GUID %s' % identifier, harvest_object)
            return False

        if record is None:
            self._save_object_error('Empty record for GUID %s' % identifier,
                                    harvest_object)
            return False

        try:
            # Save the fetch contents in the HarvestObject
            # Contents come from csw_client already declared and encoded as utf-8
            harvest_object.content = record['xml']
            harvest_object.save()
        except Exception, e:
            self._save_object_error('Error saving the harvest object for GUID %s [%r]' % \
                                    (identifier, e), harvest_object)
            return False

        log.debug('XML content saved (len %s)', len(record['xml']))
        return True

    def _setup_csw_client(self, url):
        self.csw = CswService(url)
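For reference, get_original_url above turns the harvest source URL into a CSW GetRecordById request for a given record. A standalone sketch of the same transformation, outside CKAN (the endpoint and GUID below are made up for illustration):

import urllib
import urlparse

def csw_record_url(source_url, guid):
    # Rebuild the source URL with GetRecordById query parameters,
    # mirroring CSWHarvester.get_original_url.
    parts = urlparse.urlparse(source_url)
    params = {
        'SERVICE': 'CSW',
        'VERSION': '2.0.2',
        'REQUEST': 'GetRecordById',
        'OUTPUTSCHEMA': 'http://www.isotc211.org/2005/gmd',
        'OUTPUTFORMAT': 'application/xml',
        'ID': guid,
    }
    return urlparse.urlunparse((parts.scheme, parts.netloc, parts.path,
                                None, urllib.urlencode(params), None))

# Hypothetical endpoint and GUID, for illustration only:
print csw_record_url('http://example.com/csw', '9e1d2c5b-example-guid')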
ckanext/spatial/harvesters/doc.py
@@ -0,0 +1,112 @@
import hashlib
import logging

from ckan import model

from ckan.plugins.core import SingletonPlugin, implements

from ckanext.harvest.interfaces import IHarvester
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra

from ckanext.spatial.harvesters.base import SpatialHarvester, guess_standard


class DocHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for individual spatial metadata documents
    TODO: Move to new logic
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name': 'single-doc',
            'title': 'Single spatial metadata document',
            'description': 'A single spatial metadata document'
            }

    def get_original_url(self, harvest_object_id):
        obj = model.Session.query(HarvestObject).\
                    filter(HarvestObject.id==harvest_object_id).\
                    first()
        if not obj:
            return None

        return obj.source.url

    def gather_stage(self, harvest_job):
        log = logging.getLogger(__name__ + '.individual.gather')
        log.debug('DocHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception, e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (url, e), harvest_job)
            return None

        existing_object = model.Session.query(HarvestObject.guid, HarvestObject.package_id).\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id).\
                                    first()

        def create_extras(url, status):
            return [HOExtra(key='doc_location', value=url),
                    HOExtra(key='status', value=status)]

        if not existing_object:
            guid = hashlib.md5(url.encode('utf8', errors='ignore')).hexdigest()
            harvest_object = HarvestObject(job=harvest_job,
                                extras=create_extras(url,
                                                     'new'),
                                guid=guid
                            )
        else:
            harvest_object = HarvestObject(job=harvest_job,
                                extras=create_extras(url,
                                                     'change'),
                                guid=existing_object.guid
                            )

        harvest_object.add()

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
        else:
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        harvest_object.save()

        return [harvest_object.id]

    def fetch_stage(self, harvest_object):
        # The fetching was already done in the previous stage
        return True
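Note how gather_stage keys a new document on an MD5 digest of the source URL, so harvesting the same URL again reuses the existing GUID (via existing_object) and is flagged as a change rather than a new record. A minimal sketch of the GUID derivation (the URL is invented for illustration):

import hashlib

url = u'http://example.com/metadata/dataset.xml'  # hypothetical source URL
# Same derivation as in DocHarvester.gather_stage: one stable GUID per URL.
guid = hashlib.md5(url.encode('utf8', 'ignore')).hexdigest()
print guid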
ckanext/spatial/harvesters/waf.py
@@ -0,0 +1,313 @@
import logging
import hashlib
from urlparse import urljoin
import dateutil.parser
import pyparsing as parse
import requests
from sqlalchemy.orm import aliased
from sqlalchemy.exc import DataError

from ckan import model

from ckan.plugins.core import SingletonPlugin, implements

from ckanext.harvest.interfaces import IHarvester
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
import ckanext.harvest.queue as queue

from ckanext.spatial.harvesters.base import SpatialHarvester, get_extra, guess_standard

class WAFHarvester(SpatialHarvester, SingletonPlugin):
    '''
    A Harvester for WAF (Web Accessible Folders) containing spatial metadata documents.
    e.g. Apache serving a directory of ISO 19139 files.
    '''

    implements(IHarvester)

    def info(self):
        return {
            'name': 'waf',
            'title': 'Web Accessible Folder (WAF)',
            'description': 'A Web Accessible Folder (WAF) displaying a list of spatial metadata documents'
            }

    def get_original_url(self, harvest_object_id):
        url = model.Session.query(HOExtra.value).\
                    filter(HOExtra.key=='waf_location').\
                    filter(HOExtra.harvest_object_id==harvest_object_id).\
                    first()

        return url[0] if url else None

    def gather_stage(self, harvest_job, collection_package_id=None):
        log = logging.getLogger(__name__ + '.WAF.gather')
        log.debug('WafHarvester gather_stage for job: %r', harvest_job)

        self.harvest_job = harvest_job

        # Get source URL
        source_url = harvest_job.source.url

        self._set_source_config(harvest_job.source.config)

        # Get contents
        try:
            response = requests.get(source_url, timeout=60)
            content = response.content
            scraper = _get_scraper(response.headers.get('server'))
        except Exception, e:
            self._save_gather_error('Unable to get content for URL: %s: %r' % \
                                        (source_url, e), harvest_job)
            return None

        ###### Get current harvest object out of db ######

        url_to_modified_db = {} ## mapping of url to last_modified in db
        url_to_ids = {} ## mapping of url to guid in db

        HOExtraAlias1 = aliased(HOExtra)
        HOExtraAlias2 = aliased(HOExtra)
        query = model.Session.query(HarvestObject.guid, HarvestObject.package_id, HOExtraAlias1.value, HOExtraAlias2.value).\
                                    join(HOExtraAlias1, HarvestObject.extras).\
                                    join(HOExtraAlias2, HarvestObject.extras).\
                                    filter(HOExtraAlias1.key=='waf_modified_date').\
                                    filter(HOExtraAlias2.key=='waf_location').\
                                    filter(HarvestObject.current==True).\
                                    filter(HarvestObject.harvest_source_id==harvest_job.source.id)

        for guid, package_id, modified_date, url in query:
            url_to_modified_db[url] = modified_date
            url_to_ids[url] = (guid, package_id)

        ###### Get current list of records from source ######

        url_to_modified_harvest = {} ## mapping of url to last_modified in harvest
        try:
            for url, modified_date in _extract_waf(content, source_url, scraper):
                url_to_modified_harvest[url] = modified_date
        except Exception, e:
            msg = 'Error extracting URLs from %s, error was %s' % (source_url, e)
            self._save_gather_error(msg, harvest_job)
            return None

        ###### Compare source and db ######

        harvest_locations = set(url_to_modified_harvest.keys())
        old_locations = set(url_to_modified_db.keys())

        new = harvest_locations - old_locations
        delete = old_locations - harvest_locations
        possible_changes = old_locations & harvest_locations
        change = []

        for item in possible_changes:
            if (not url_to_modified_harvest[item] or not url_to_modified_db[item] # if there is no date assume change
                or url_to_modified_harvest[item] > url_to_modified_db[item]):
                change.append(item)

        def create_extras(url, date, status):
            extras = [HOExtra(key='waf_modified_date', value=date),
                      HOExtra(key='waf_location', value=url),
                      HOExtra(key='status', value=status)]
            if collection_package_id:
                extras.append(
                    HOExtra(key='collection_package_id',
                            value=collection_package_id)
                )
            return extras

        ids = []
        for location in new:
            guid = hashlib.md5(location.encode('utf8', errors='ignore')).hexdigest()
            obj = HarvestObject(job=harvest_job,
                                extras=create_extras(location,
                                                     url_to_modified_harvest[location],
                                                     'new'),
                                guid=guid
                               )
            obj.save()
            ids.append(obj.id)

        for location in change:
            obj = HarvestObject(job=harvest_job,
                                extras=create_extras(location,
                                                     url_to_modified_harvest[location],
                                                     'change'),
                                guid=url_to_ids[location][0],
                                package_id=url_to_ids[location][1],
                               )
            obj.save()
            ids.append(obj.id)

        for location in delete:
            obj = HarvestObject(job=harvest_job,
                                extras=create_extras('', '', 'delete'),
                                guid=url_to_ids[location][0],
                                package_id=url_to_ids[location][1],
                               )
            model.Session.query(HarvestObject).\
                  filter_by(guid=url_to_ids[location][0]).\
                  update({'current': False}, False)

            obj.save()
            ids.append(obj.id)

        if len(ids) > 0:
            log.debug('{0} objects sent to the next stage: {1} new, {2} change, {3} delete'.format(
                      len(ids), len(new), len(change), len(delete)))
            return ids
        else:
            self._save_gather_error('No records to change',
                                    harvest_job)
            return None

    def fetch_stage(self, harvest_object):

        # Check harvest object status
        status = get_extra(harvest_object, 'status')

        if status == 'delete':
            # No need to fetch anything, just pass to the import stage
            return True

        # We need to fetch the remote document

        # Get location
        url = get_extra(harvest_object, 'waf_location')
        if not url:
            self._save_object_error(
                'No location defined for object {0}'.format(harvest_object.id),
                harvest_object)
            return False

        # Get contents
        try:
            content = self._get_content_as_unicode(url)
        except Exception, e:
            msg = 'Could not harvest WAF link {0}: {1}'.format(url, e)
            self._save_object_error(msg, harvest_object)
            return False

        # Check if it is an ISO document
        document_format = guess_standard(content)
        if document_format == 'iso':
            harvest_object.content = content
            harvest_object.save()
        else:
            extra = HOExtra(
                object=harvest_object,
                key='original_document',
                value=content)
            extra.save()

            extra = HOExtra(
                object=harvest_object,
                key='original_format',
                value=document_format)
            extra.save()

        return True


apache = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
       + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url') \
       + parse.SkipTo("</a>", include=True).suppress() \
       + parse.Optional(parse.Literal('</td><td align="right">')).suppress() \
       + parse.Optional(parse.Combine(
           parse.Word(parse.alphanums+'-') +
           parse.Word(parse.alphanums+':')
           , adjacent=False, joinString=' ').setResultsName('date')
       )

iis = parse.SkipTo("<br>").suppress() \
    + parse.OneOrMore("<br>").suppress() \
    + parse.Optional(parse.Combine(
        parse.Word(parse.alphanums+'/') +
        parse.Word(parse.alphanums+':') +
        parse.Word(parse.alphas)
        , adjacent=False, joinString=' ').setResultsName('date')
    ) \
    + parse.Word(parse.nums).suppress() \
    + parse.Literal('<A HREF=').suppress() \
    + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')

other = parse.SkipTo(parse.CaselessLiteral("<a href="), include=True).suppress() \
      + parse.quotedString.setParseAction(parse.removeQuotes).setResultsName('url')


scrapers = {'apache': parse.OneOrMore(parse.Group(apache)),
            'other': parse.OneOrMore(parse.Group(other)),
            'iis': parse.OneOrMore(parse.Group(iis))}

def _get_scraper(server):
    if not server or 'apache' in server.lower():
        return 'apache'
    if server == 'Microsoft-IIS/7.5':
        return 'iis'
    else:
        return 'other'

def _extract_waf(content, base_url, scraper, results=None, depth=0):
    if results is None:
        results = []

    base_url = base_url.rstrip('/').split('/')
    if 'index' in base_url[-1]:
        base_url.pop()
    base_url = '/'.join(base_url)
    base_url += '/'

    try:
        parsed = scrapers[scraper].parseString(content)
    except parse.ParseException:
        parsed = scrapers['other'].parseString(content)

    for record in parsed:
        url = record.url
        if not url:
            continue
        if url.startswith('_'):
            continue
        if '?' in url:
            continue
        if '#' in url:
            continue
        if 'mailto:' in url:
            continue
        if '..' not in url and url[0] != '/' and url[-1] == '/':
            new_depth = depth + 1
            if depth > 10:
                print 'max depth reached'
                continue
            new_url = urljoin(base_url, url)
            if not new_url.startswith(base_url):
                continue
            print 'new_url', new_url
            try:
                response = requests.get(new_url)
                content = response.content
            except Exception, e:
                print str(e)
                continue
            _extract_waf(content, new_url, scraper, results, new_depth)
            continue
        if not url.endswith('.xml'):
            continue
        date = record.date
        if date:
            try:
                date = str(dateutil.parser.parse(date))
            except Exception, e:
                raise
                date = None
        results.append((urljoin(base_url, record.url), date))

    return results
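The gather stage above decides whether a document changed by comparing the dateutil-normalised modification dates scraped from the directory listing, and treats a missing date on either side as a change. A small sketch of that rule (the sample dates are invented):

import dateutil.parser

def waf_doc_changed(harvest_date, db_date):
    # Mirrors WAFHarvester.gather_stage: no date on either side means
    # we assume the document changed; otherwise compare the strings,
    # which sort chronologically once normalised by dateutil.
    if not harvest_date or not db_date:
        return True
    return harvest_date > db_date

remote = str(dateutil.parser.parse('06-Sep-2012 10:30'))
stored = str(dateutil.parser.parse('01-Jan-2012 09:00'))
print waf_doc_changed(remote, stored)  # True: the WAF copy is newer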
setup.py
@@ -35,6 +35,10 @@ setup(
    cswserver=ckanext.spatial.plugin:CatalogueServiceWeb
    spatial_harvest_metadata_api=ckanext.spatial.plugin:HarvestMetadataApi

    csw_harvester=ckanext.spatial.harvesters:CSWHarvester
    waf_harvester=ckanext.spatial.harvesters:WAFHarvester
    doc_harvester=ckanext.spatial.harvesters:DocHarvester

    # Legacy harvesters
    gemini_csw_harvester=ckanext.spatial.harvesters.gemini:GeminiCswHarvester
    gemini_doc_harvester=ckanext.spatial.harvesters.gemini:GeminiDocHarvester
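With these entry points registered, the new harvesters can be switched on like any other CKAN plugin, alongside the ckanext-harvest extension they build on. A sketch of the relevant CKAN config line (plugin list is illustrative; adjust to your install):

ckan.plugins = harvest csw_harvester waf_harvester doc_harvester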