From 66fa23232d9672d3fb0034aee960bf15c8dacfee Mon Sep 17 00:00:00 2001 From: etj Date: Tue, 24 Mar 2015 11:16:58 +0100 Subject: [PATCH] Extract group info from GeoNetwork categories. Closes #2. Collect values from old gml: namespace. Closes #3. Parse TimeIstant extents. --- ckanext/geonetwork/harvesters/__init__.py | 10 +- ckanext/geonetwork/harvesters/geonetwork.py | 201 ++++++++++++++------ ckanext/geonetwork/harvesters/utils.py | 72 +++++++ setup.py | 48 ++--- 4 files changed, 246 insertions(+), 85 deletions(-) create mode 100644 ckanext/geonetwork/harvesters/utils.py diff --git a/ckanext/geonetwork/harvesters/__init__.py b/ckanext/geonetwork/harvesters/__init__.py index f0d1d67..84a1c93 100644 --- a/ckanext/geonetwork/harvesters/__init__.py +++ b/ckanext/geonetwork/harvesters/__init__.py @@ -1 +1,9 @@ -from geonetwork import GeoNetworkHarvester +try: + import pkg_resources + pkg_resources.declare_namespace(__name__) +except ImportError: + import pkgutil + __path__ = pkgutil.extend_path(__path__, __name__) + +from ckanext.geonetwork.harvesters.geonetwork import GeoNetworkHarvester +from ckanext.geonetwork.harvesters.utils import GeoNetworkClient diff --git a/ckanext/geonetwork/harvesters/geonetwork.py b/ckanext/geonetwork/harvesters/geonetwork.py index 27111fc..1cee598 100644 --- a/ckanext/geonetwork/harvesters/geonetwork.py +++ b/ckanext/geonetwork/harvesters/geonetwork.py @@ -1,85 +1,166 @@ +from .utils import GeoNetworkClient +from .utils import GEONETWORK_V210, GEONETWORK_V26 + import re -import urllib -import urlparse import logging from ckan import model +from ckan.model import Session -from ckan.plugins.core import SingletonPlugin, implements - -from ckanext.harvest.interfaces import IHarvester -from ckanext.harvest.model import HarvestObject -from ckanext.harvest.model import HarvestObjectExtra as HOExtra +from ckan.plugins.core import SingletonPlugin from ckanext.spatial.lib.csw_client import CswService -from ckanext.spatial.harvesters.base import SpatialHarvester, text_traceback from ckanext.spatial.harvesters.csw import CSWHarvester +from ckanext.spatial.model import ISODocument +from ckanext.spatial.model import ISOElement -from ckan.lib.helpers import json -import math +from ckan.logic import ValidationError, NotFound, get_action log = logging.getLogger(__name__) +# Extend the ISODocument definitions by adding some more useful elements + +log.info('GeoNetwork harvester: extending ISODocument with TimeInstant') +ISODocument.elements.append( + ISOElement( + name="temporal-extent-instant", + search_paths=[ + "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimeInstant/gml:timePosition/text()", + ], + multiplicity="*", + )) + +# Some old GN instances still uses the old GML URL +# We'll add more xpath for addressing this issue +log.info('GeoNetwork harvester: adding old GML URI') +ISOElement.namespaces['oldgml'] = "http://www.opengis.net/gml" + +for element in ISODocument.elements: + newpaths = [] + + for path in element.search_paths: + if "gml:" in path: + newpath = path.replace('gml:', 'oldgml:') + newpaths.append(newpath) + + for newpath in newpaths: + element.search_paths.append(newpath) + log.info("Added old URI for gml to %s", element.name) + + class GeoNetworkHarvester(CSWHarvester, SingletonPlugin): - def info(self): - return { - 'name': 'geonetwork', - 'title': 'CSW server (GeoNetwork)', - 'description': 'Harvests GeoNetwork instances via CSW', - 'form_config_interface': 'Text' - } + def info(self): + return { + 'name': 'geonetwork', + 'title': 'CSW server (GeoNetwork)', + 'description': 'Harvests GeoNetwork instances via CSW', + 'form_config_interface': 'Text' + } - def get_package_dict(self, iso_values, harvest_object): + def get_package_dict(self, iso_values, harvest_object): - package_dict = super(GeoNetworkHarvester, self).get_package_dict(iso_values, harvest_object) + package_dict = super(GeoNetworkHarvester, self).get_package_dict(iso_values, harvest_object) - # Add default_tags from config - default_tags = self.source_config.get('default_tags',[]) - if default_tags: - for tag in default_tags: - package_dict['tags'].append({'name': tag}) + # Add default_tags from config + default_tags = self.source_config.get('default_tags', []) + if default_tags: + for tag in default_tags: + package_dict['tags'].append({'name': tag}) - # Add default_extras from config - default_extras = self.source_config.get('default_extras',{}) - if default_extras: - override_extras = self.source_config.get('override_extras',False) + # Add default_extras from config + default_extras = self.source_config.get('default_extras', {}) + if default_extras: + override_extras = self.source_config.get('override_extras', False) - existing_keys = [entry.get('key') for entry in package_dict['extras'] ] + existing_keys = [entry.get('key') for entry in package_dict['extras']] - for key,value in default_extras.iteritems(): - log.debug('Processing extra %s', key) - if not key in existing_keys or override_extras: - # Look for replacement strings - if isinstance(value,basestring): - value = value.format( - harvest_source_id=str(harvest_object.job.source.id), - harvest_source_url=str(harvest_object.job.source.url).strip('/'), - harvest_source_title=str(harvest_object.job.source.title), - harvest_job_id=str(harvest_object.job.id), - harvest_object_id=str(harvest_object.id), - guid=str(harvest_object.guid)) - package_dict['extras'].append( {'key': key, 'value': value }) - else: - log.debug('Skipping existing extra %s', key) + for key, value in default_extras.iteritems(): + log.debug('Processing extra %s', key) + if not key in existing_keys or override_extras: + # Look for replacement strings + if isinstance(value, basestring): + value = value.format( + harvest_source_id=str(harvest_object.job.source.id), + harvest_source_url=str(harvest_object.job.source.url).strip('/'), + harvest_source_title=str(harvest_object.job.source.title), + harvest_job_id=str(harvest_object.job.id), + harvest_object_id=str(harvest_object.id), + guid=str(harvest_object.guid)) + package_dict['extras'].append({'key': key, 'value': value}) + else: + log.debug('Skipping existing extra %s', key) + + # Add GeoNetwork specific extras + gn_localized_url = harvest_object.job.source.url.strip('/') + + if gn_localized_url[-3:] == 'csw': + gn_localized_url = gn_localized_url[:-3] + + log.debug('GN localized URL %s', gn_localized_url) + #log.debug('Package dict is %r ', package_dict['extras']) + + package_dict['extras'].append({'key': 'gn_view_metadata_url', 'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid}) + package_dict['extras'].append({'key': 'gn_localized_url', 'value': gn_localized_url}) + + # Add other elements from ISO metadata + time_extents = self.infer_timeinstants(iso_values) + if time_extents: + log.info("Adding Time Instants...") + package_dict['extras'].append({'key': 'temporal-extent-instant', 'value': time_extents}) + + # Handle groups mapping using GeoNetwork categories + group_mapping = self.source_config.get('group_mapping', {}) + + if group_mapping: + try: + context = {'model': model, 'session': Session, 'user': 'harvest'} + validated_groups = [] + + version = self.source_config.get('version') + client = GeoNetworkClient(gn_localized_url, version) + cats = client.retrieveMetadataCategories(harvest_object.guid) + for cat in cats: + groupname = group_mapping[cat] + + printname = groupname if not None else "NONE" + log.debug("category %s mapped into %s" % (cat, printname)) + + if groupname: + try: + data_dict = {'id': groupname} + group = get_action('group_show')(context, data_dict) + #log.info('Group %s found %s' % (groupname, group)) + #if self.api_version == 1: + #validated_groups.append(group['name']) + #else: + #validated_groups.append(group['id']) + validated_groups.append({'name': groupname}) + except NotFound, e: + log.warning('Group %s from category %s is not available' % (groupname, cat)) + + package_dict['groups'] = validated_groups + except e: + log.warning('Error handling groups for metadata %s' % harvest_object.guid) + + # End of processing, return the modified package + return package_dict + + def infer_timeinstants(self, values): + extents = [] + + for extent in values["temporal-extent-instant"]: + if extent not in extents: + extents.append(extent) + + log.info("%d TIME ISTANTS FOUND", len(extents)) + + if len(extents) > 0: + return ",".join(extents) + + return - # Add GeoNetowrk specific extras - gn_localized_url = harvest_object.job.source.url.strip('/') - - if gn_localized_url[-3:] == 'csw' : - gn_localized_url = gn_localized_url[:-3] - - log.debug('GN localized URL %s', gn_localized_url) - #log.debug('Package dict is %r ', package_dict['extras']) - - package_dict['extras'].append( {'key': 'gn_view_metadata_url', 'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid }) - package_dict['extras'].append( {'key': 'gn_localized_url', 'value': gn_localized_url }) - - - # End of processing, return the modified package - return package_dict - diff --git a/ckanext/geonetwork/harvesters/utils.py b/ckanext/geonetwork/harvesters/utils.py new file mode 100644 index 0000000..31d1f7d --- /dev/null +++ b/ckanext/geonetwork/harvesters/utils.py @@ -0,0 +1,72 @@ +# -*- coding: utf-8 -*- +import logging +#import re +import urllib +import urllib2 +import zipfile +from StringIO import StringIO +from lxml import etree + +GEONETWORK_V26 = "2.6" +GEONETWORK_V210 = "2.10" +GEONETWORK_VERSIONS = [GEONETWORK_V26, GEONETWORK_V210] + +logger = logging.getLogger(__name__) + + +class GeoNetworkClient(object): + + def __init__(self, base, version): + if version is None: + version = GEONETWORK_V210 + + assert version in GEONETWORK_VERSIONS + self.version = version + self.base = base + + def retrieveInfo(self, uuid): + + if self.version == GEONETWORK_V26: + url = "%s/srv/en/mef.export" % self.base + #headers = { + #"Content-Type": "application/x-www-form-urlencoded", + #"Accept": "text/plain" + #} + query = urllib.urlencode({ + "uuid": uuid + }) + + logger.info('Loading MEF for %s', uuid) + request = urllib2.Request(url, query) + opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler()) + + response = opener.open(request) # will get a ZIP file + content = response.read() + + #logger.info('----> %s', content) + #print 'RESPONSE ', content + + zdata = StringIO(content) + zfile = zipfile.ZipFile(zdata) + + xml = None + + for name in zfile.namelist(): + #logger.info(' MEF entry: %s', name) + #print ' MEF entry: ', name + if name == 'info.xml': + uncompressed = zfile.read(name) + xml = etree.fromstring(uncompressed) + + return xml + + def retrieveMetadataCategories(self, uuid): + xml = self.retrieveInfo(uuid) + + cats = [] + + for cat in xml.findall('categories/category'): + cats.append(cat.get('name')) + + return cats + diff --git a/setup.py b/setup.py index a9f822e..df29556 100644 --- a/setup.py +++ b/setup.py @@ -1,32 +1,32 @@ from setuptools import setup, find_packages import sys, os -version = '1.0' +version = '1.1' setup( - name='ckanext-geonetwork', - version=version, - description="CKAN harvester for GeoNetwork", - long_description="""\ - """, - classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers - keywords='', - author='Emanuele Tajariol', - author_email='etj@geo-solutions.it', - url='', - license='', - packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), - namespace_packages=['ckanext', 'ckanext.geonetwork'], - include_package_data=True, - zip_safe=False, - install_requires=[ - # -*- Extra requirements: -*- - ], - entry_points=\ - """ + name='ckanext-geonetwork', + version=version, + description="CKAN harvester for GeoNetwork", + long_description="""\ + """, + classifiers=[], # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers + keywords='', + author='Emanuele Tajariol', + author_email='etj@geo-solutions.it', + url='', + license='', + packages=find_packages(exclude=['ez_setup', 'examples', 'tests']), + namespace_packages=['ckanext', 'ckanext.geonetwork'], + include_package_data=True, + zip_safe=False, + install_requires=[ + # -*- Extra requirements: -*- + ], + entry_points= + """ [ckan.plugins] - # Add plugins here, eg - # myplugin=ckanext.geonetwork:PluginClass + # Add plugins here, eg + # myplugin=ckanext.geonetwork:PluginClass geonetwork_harvester=ckanext.geonetwork.harvesters:GeoNetworkHarvester - """, + """, )