Extract group info from GeoNetwork categories. Closes #2.

Collect values from old gml: namespace. Closes #3.
Parse TimeInstant extents.
This commit is contained in:
etj 2015-03-24 11:16:58 +01:00
parent f51ca2d117
commit 66fa23232d
4 changed files with 246 additions and 85 deletions

View File

@ -1 +1,9 @@
from geonetwork import GeoNetworkHarvester try:
import pkg_resources
pkg_resources.declare_namespace(__name__)
except ImportError:
import pkgutil
__path__ = pkgutil.extend_path(__path__, __name__)
from ckanext.geonetwork.harvesters.geonetwork import GeoNetworkHarvester
from ckanext.geonetwork.harvesters.utils import GeoNetworkClient

View File

@ -1,28 +1,56 @@
from .utils import GeoNetworkClient
from .utils import GEONETWORK_V210, GEONETWORK_V26
import re import re
import urllib
import urlparse
import logging import logging
from ckan import model from ckan import model
from ckan.model import Session
from ckan.plugins.core import SingletonPlugin, implements from ckan.plugins.core import SingletonPlugin
from ckanext.harvest.interfaces import IHarvester
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
from ckanext.spatial.lib.csw_client import CswService from ckanext.spatial.lib.csw_client import CswService
from ckanext.spatial.harvesters.base import SpatialHarvester, text_traceback
from ckanext.spatial.harvesters.csw import CSWHarvester from ckanext.spatial.harvesters.csw import CSWHarvester
from ckanext.spatial.model import ISODocument
from ckanext.spatial.model import ISOElement
from ckan.lib.helpers import json from ckan.logic import ValidationError, NotFound, get_action
import math
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
# Teach ISODocument to also extract temporal extents that are expressed as
# a single gml:TimeInstant position.
log.info('GeoNetwork harvester: extending ISODocument with TimeInstant')
ISODocument.elements.append(ISOElement(
    name="temporal-extent-instant",
    search_paths=[
        "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimeInstant/gml:timePosition/text()",
    ],
    multiplicity="*",
))

# Some old GeoNetwork instances still use the old GML namespace URI.
# Register it under the 'oldgml' prefix and, for every search path that
# mentions the gml: prefix, add a twin path that uses oldgml: instead.
log.info('GeoNetwork harvester: adding old GML URI')
ISOElement.namespaces['oldgml'] = "http://www.opengis.net/gml"

for element in ISODocument.elements:
    # Build the full list of twin paths first, so that the paths appended
    # below are not themselves rescanned.
    newpaths = [path.replace('gml:', 'oldgml:')
                for path in element.search_paths
                if "gml:" in path]
    for newpath in newpaths:
        element.search_paths.append(newpath)
        log.info("Added old URI for gml to %s", element.name)
class GeoNetworkHarvester(CSWHarvester, SingletonPlugin): class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
def info(self): def info(self):
@ -66,8 +94,7 @@ class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
else: else:
log.debug('Skipping existing extra %s', key) log.debug('Skipping existing extra %s', key)
# Add GeoNetwork specific extras
# Add GeoNetowrk specific extras
gn_localized_url = harvest_object.job.source.url.strip('/') gn_localized_url = harvest_object.job.source.url.strip('/')
if gn_localized_url[-3:] == 'csw': if gn_localized_url[-3:] == 'csw':
@ -79,7 +106,61 @@ class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
package_dict['extras'].append({'key': 'gn_view_metadata_url', 'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid}) package_dict['extras'].append({'key': 'gn_view_metadata_url', 'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid})
package_dict['extras'].append({'key': 'gn_localized_url', 'value': gn_localized_url}) package_dict['extras'].append({'key': 'gn_localized_url', 'value': gn_localized_url})
# Add other elements from ISO metadata
time_extents = self.infer_timeinstants(iso_values)
if time_extents:
log.info("Adding Time Instants...")
package_dict['extras'].append({'key': 'temporal-extent-instant', 'value': time_extents})
# Handle groups mapping using GeoNetwork categories
group_mapping = self.source_config.get('group_mapping', {})
if group_mapping:
try:
context = {'model': model, 'session': Session, 'user': 'harvest'}
validated_groups = []
version = self.source_config.get('version')
client = GeoNetworkClient(gn_localized_url, version)
cats = client.retrieveMetadataCategories(harvest_object.guid)
for cat in cats:
groupname = group_mapping[cat]
printname = groupname if not None else "NONE"
log.debug("category %s mapped into %s" % (cat, printname))
if groupname:
try:
data_dict = {'id': groupname}
group = get_action('group_show')(context, data_dict)
#log.info('Group %s found %s' % (groupname, group))
#if self.api_version == 1:
#validated_groups.append(group['name'])
#else:
#validated_groups.append(group['id'])
validated_groups.append({'name': groupname})
except NotFound, e:
log.warning('Group %s from category %s is not available' % (groupname, cat))
package_dict['groups'] = validated_groups
except e:
log.warning('Error handling groups for metadata %s' % harvest_object.guid)
# End of processing, return the modified package # End of processing, return the modified package
return package_dict return package_dict
def infer_timeinstants(self, values):
    """Join the distinct time instants found in the ISO values.

    :param values: mapping of extracted ISO values; its
        ``temporal-extent-instant`` entry, when present, is iterated as a
        sequence of strings.
    :returns: the distinct instants, in first-seen order, joined by commas,
        or ``None`` when there are none.
    """
    # A missing or empty entry means "no time instants"; it must not abort
    # the construction of the whole package dict.
    try:
        instants = values["temporal-extent-instant"]
    except KeyError:
        instants = None

    extents = []
    for extent in instants or []:
        if extent not in extents:
            extents.append(extent)

    log.info("%d TIME ISTANTS FOUND", len(extents))

    if not extents:
        return None
    return ",".join(extents)

View File

@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
import logging
#import re
import urllib
import urllib2
import zipfile
from StringIO import StringIO
from lxml import etree
GEONETWORK_V26 = "2.6"
GEONETWORK_V210 = "2.10"
GEONETWORK_VERSIONS = [GEONETWORK_V26, GEONETWORK_V210]
logger = logging.getLogger(__name__)
class GeoNetworkClient(object):
    """Small HTTP client for the GeoNetwork services used by the harvester.

    It downloads the MEF export of a metadata record and reads the
    category names listed in its ``info.xml`` entry.
    """

    def __init__(self, base, version):
        """Store the connection settings.

        :param base: base URL of the GeoNetwork instance; service paths are
            appended to it as-is.
        :param version: one of ``GEONETWORK_VERSIONS``, or ``None`` to fall
            back to ``GEONETWORK_V210``.
        :raises AssertionError: if ``version`` is not a known version.
        """
        if version is None:
            version = GEONETWORK_V210

        # Kept as an assert so callers see the same exception type as
        # before; note that the check is skipped when Python runs with -O.
        assert version in GEONETWORK_VERSIONS
        self.version = version
        self.base = base

    def retrieveInfo(self, uuid):
        """Fetch the MEF export of a record and parse its ``info.xml``.

        Only GeoNetwork 2.6 is handled here. For any other version a
        warning is logged and ``None`` is returned.

        :param uuid: UUID of the metadata record to export.
        :returns: the parsed ``info.xml`` root element, or ``None`` when the
            version is unsupported or the archive has no ``info.xml`` entry.
        """
        if self.version != GEONETWORK_V26:
            logger.warning('MEF info retrieval is not implemented for GeoNetwork %s',
                           self.version)
            return None

        url = "%s/srv/en/mef.export" % self.base
        query = urllib.urlencode({"uuid": uuid})

        logger.info('Loading MEF for %s', uuid)
        # Passing a data payload makes urllib2 issue a POST request.
        request = urllib2.Request(url, query)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(),
                                      urllib2.HTTPRedirectHandler())

        response = opener.open(request)  # will get a ZIP file
        try:
            content = response.read()
        finally:
            response.close()

        zfile = zipfile.ZipFile(StringIO(content))
        try:
            if 'info.xml' not in zfile.namelist():
                return None
            return etree.fromstring(zfile.read('info.xml'))
        finally:
            zfile.close()

    def retrieveMetadataCategories(self, uuid):
        """Return the category names attached to a metadata record.

        :param uuid: UUID of the metadata record.
        :returns: list with the ``name`` attribute of every
            ``categories/category`` element of the record's ``info.xml``;
            an empty list when no ``info.xml`` could be retrieved.
        """
        xml = self.retrieveInfo(uuid)
        if xml is None:
            # Nothing to inspect: report "no categories" instead of failing
            # with an AttributeError on None.
            return []

        return [cat.get('name') for cat in xml.findall('categories/category')]

View File

@ -1,7 +1,7 @@
from setuptools import setup, find_packages from setuptools import setup, find_packages
import sys, os import sys, os
version = '1.0' version = '1.1'
setup( setup(
name='ckanext-geonetwork', name='ckanext-geonetwork',
@ -22,7 +22,7 @@ setup(
install_requires=[ install_requires=[
# -*- Extra requirements: -*- # -*- Extra requirements: -*-
], ],
entry_points=\ entry_points=
""" """
[ckan.plugins] [ckan.plugins]
# Add plugins here, eg # Add plugins here, eg