Extract group info from GeoNetwork categories. Closes #2.

Collect values from old gml: namespace. Closes #3.
Parse TimeInstant extents.
This commit is contained in:
etj 2015-03-24 11:16:58 +01:00
parent f51ca2d117
commit 66fa23232d
4 changed files with 246 additions and 85 deletions

View File

@ -1 +1,9 @@
from geonetwork import GeoNetworkHarvester
# Declare this package as a namespace package so that other distributions can
# contribute sub-packages to it. setuptools' pkg_resources is preferred; when
# it cannot be imported, fall back to the standard-library pkgutil mechanism.
try:
    import pkg_resources
except ImportError:
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)
else:
    pkg_resources.declare_namespace(__name__)
from ckanext.geonetwork.harvesters.geonetwork import GeoNetworkHarvester
from ckanext.geonetwork.harvesters.utils import GeoNetworkClient

View File

@ -1,85 +1,166 @@
from .utils import GeoNetworkClient
from .utils import GEONETWORK_V210, GEONETWORK_V26
import re
import urllib
import urlparse
import logging
from ckan import model
from ckan.model import Session
from ckan.plugins.core import SingletonPlugin, implements
from ckanext.harvest.interfaces import IHarvester
from ckanext.harvest.model import HarvestObject
from ckanext.harvest.model import HarvestObjectExtra as HOExtra
from ckan.plugins.core import SingletonPlugin
from ckanext.spatial.lib.csw_client import CswService
from ckanext.spatial.harvesters.base import SpatialHarvester, text_traceback
from ckanext.spatial.harvesters.csw import CSWHarvester
from ckanext.spatial.model import ISODocument
from ckanext.spatial.model import ISOElement
from ckan.lib.helpers import json
import math
from ckan.logic import ValidationError, NotFound, get_action
log = logging.getLogger(__name__)
# Register an additional ISO element so that gml:TimeInstant positions found in
# the temporal extent are collected under the "temporal-extent-instant" key.
log.info('GeoNetwork harvester: extending ISODocument with TimeInstant')
ISODocument.elements.append(
    ISOElement(
        name="temporal-extent-instant",
        search_paths=[
            "gmd:identificationInfo/gmd:MD_DataIdentification/gmd:extent/gmd:EX_Extent/gmd:temporalElement/gmd:EX_TemporalExtent/gmd:extent/gml:TimeInstant/gml:timePosition/text()",
        ],
        multiplicity="*",
    ))

# Some old GeoNetwork instances still use the old GML namespace URI.
# Bind it to the "oldgml" prefix and, for every search path that mentions
# "gml:", add a twin path that uses the "oldgml:" prefix instead.
log.info('GeoNetwork harvester: adding old GML URI')
ISOElement.namespaces['oldgml'] = "http://www.opengis.net/gml"

for element in ISODocument.elements:
    # Compute the translated paths up front: the list being scanned must not
    # grow while it is iterated.
    newpaths = [path.replace('gml:', 'oldgml:')
                for path in element.search_paths
                if "gml:" in path]

    for newpath in newpaths:
        element.search_paths.append(newpath)
        log.info("Added old URI for gml to %s", element.name)
class GeoNetworkHarvester(CSWHarvester, SingletonPlugin):
    """CSW harvester that adds GeoNetwork-specific data to harvested packages.

    The package dict produced by ``CSWHarvester`` is extended with:

    * tags and extras configured on the harvest source (``default_tags``,
      ``default_extras``, ``override_extras``);
    * the ``gn_view_metadata_url`` and ``gn_localized_url`` extras;
    * a ``temporal-extent-instant`` extra, when time instants were parsed;
    * groups mapped from GeoNetwork categories (``group_mapping``).
    """

    def info(self):
        """Return the descriptor of this harvester (name, title, description, form type)."""
        return {
            'name': 'geonetwork',
            'title': 'CSW server (GeoNetwork)',
            'description': 'Harvests GeoNetwork instances via CSW',
            'form_config_interface': 'Text'
        }

    def get_package_dict(self, iso_values, harvest_object):
        """Build the package dict for one harvested ISO record.

        :param iso_values: values parsed from the ISO document; handed to the
            parent implementation and read for ``temporal-extent-instant``.
        :param harvest_object: the harvest object being imported; its ``id``,
            ``guid`` and ``job`` (job id, source id/url/title) are read.
        :returns: the parent's package dict with the additional tags, extras
            and groups described on the class.
        """
        package_dict = super(GeoNetworkHarvester, self).get_package_dict(iso_values, harvest_object)

        # Add default_tags from config
        for tag in self.source_config.get('default_tags', []):
            package_dict['tags'].append({'name': tag})

        # Add default_extras from config
        default_extras = self.source_config.get('default_extras', {})
        if default_extras:
            override_extras = self.source_config.get('override_extras', False)
            existing_keys = [entry.get('key') for entry in package_dict['extras']]

            for key, value in default_extras.iteritems():
                log.debug('Processing extra %s', key)
                if key not in existing_keys or override_extras:
                    # Look for replacement strings
                    if isinstance(value, basestring):
                        value = value.format(
                            harvest_source_id=str(harvest_object.job.source.id),
                            harvest_source_url=str(harvest_object.job.source.url).strip('/'),
                            harvest_source_title=str(harvest_object.job.source.title),
                            harvest_job_id=str(harvest_object.job.id),
                            harvest_object_id=str(harvest_object.id),
                            guid=str(harvest_object.guid))
                    # NOTE(review): with override_extras an existing key is not
                    # replaced, a second entry with the same key is appended.
                    package_dict['extras'].append({'key': key, 'value': value})
                else:
                    log.debug('Skipping existing extra %s', key)

        # Add GeoNetwork specific extras.
        # The source URL is the CSW endpoint: drop a trailing "csw" to obtain
        # the URL the other GeoNetwork services are appended to.
        gn_localized_url = harvest_object.job.source.url.strip('/')
        if gn_localized_url.endswith('csw'):
            gn_localized_url = gn_localized_url[:-3]

        log.debug('GN localized URL %s', gn_localized_url)

        package_dict['extras'].append({
            'key': 'gn_view_metadata_url',
            'value': gn_localized_url + '/metadata.show?uuid=' + harvest_object.guid})
        package_dict['extras'].append({'key': 'gn_localized_url', 'value': gn_localized_url})

        # Add other elements from ISO metadata
        time_extents = self.infer_timeinstants(iso_values)
        if time_extents:
            log.info("Adding Time Instants...")
            package_dict['extras'].append({'key': 'temporal-extent-instant', 'value': time_extents})

        # Handle groups mapping using GeoNetwork categories
        group_mapping = self.source_config.get('group_mapping', {})
        if group_mapping:
            try:
                package_dict['groups'] = self._groups_from_categories(
                    harvest_object, gn_localized_url, group_mapping)
            except Exception:
                # Group mapping is best effort: a failure while talking to
                # GeoNetwork must not prevent the dataset from being imported.
                log.warning('Error handling groups for metadata %s',
                            harvest_object.guid, exc_info=True)

        # End of processing, return the modified package
        return package_dict

    def _groups_from_categories(self, harvest_object, gn_localized_url, group_mapping):
        """Return the CKAN groups mapped from the record's GeoNetwork categories.

        :param harvest_object: harvest object whose ``guid`` identifies the record.
        :param gn_localized_url: base URL handed to ``GeoNetworkClient``.
        :param group_mapping: dict mapping a GeoNetwork category name to a
            CKAN group name.
        :returns: a list of ``{'name': <group name>}`` dicts, one for each
            mapped group that ``group_show`` could find.
        """
        context = {'model': model, 'session': Session, 'user': 'harvest'}
        version = self.source_config.get('version')
        client = GeoNetworkClient(gn_localized_url, version)

        validated_groups = []
        for cat in client.retrieveMetadataCategories(harvest_object.guid):
            # Categories without a mapping are skipped instead of raising.
            groupname = group_mapping.get(cat)
            log.debug("category %s mapped into %s",
                      cat, groupname if groupname is not None else "NONE")
            if not groupname:
                continue

            try:
                # Only used to check that the group exists.
                get_action('group_show')(context, {'id': groupname})
            except NotFound:
                log.warning('Group %s from category %s is not available', groupname, cat)
            else:
                validated_groups.append({'name': groupname})

        return validated_groups

    def infer_timeinstants(self, values):
        """Return the distinct time instants as a comma-separated string.

        :param values: dict of parsed ISO values; the ``temporal-extent-instant``
            entry is read and treated as empty when missing.
        :returns: the unique instants joined by ``,`` in first-seen order, or
            ``None`` when there are none.
        """
        extents = []
        for extent in values.get("temporal-extent-instant") or []:
            if extent not in extents:
                extents.append(extent)

        log.info("%d TIME ISTANTS FOUND", len(extents))

        if extents:
            return ",".join(extents)
        return None

View File

@ -0,0 +1,72 @@
# -*- coding: utf-8 -*-
import logging
#import re
import urllib
import urllib2
import zipfile
from StringIO import StringIO
from lxml import etree
# Identifiers of the GeoNetwork releases known to this module.
GEONETWORK_V26 = "2.6"
GEONETWORK_V210 = "2.10"
# Every accepted version string; GeoNetworkClient asserts membership in this list.
GEONETWORK_VERSIONS = [GEONETWORK_V26, GEONETWORK_V210]
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class GeoNetworkClient(object):
    """Small HTTP client for the GeoNetwork services used by the harvester."""

    def __init__(self, base, version):
        """Create a client.

        :param base: URL prefix to which service paths such as
            ``/srv/en/mef.export`` are appended.
        :param version: one of ``GEONETWORK_VERSIONS``; ``None`` selects
            ``GEONETWORK_V210``.
        :raises AssertionError: if ``version`` is not a known version.
        """
        if version is None:
            version = GEONETWORK_V210

        assert version in GEONETWORK_VERSIONS, \
            "Unsupported GeoNetwork version: %r" % (version,)
        self.version = version
        self.base = base

    def retrieveInfo(self, uuid):
        """Download the MEF archive of a record and return its parsed ``info.xml``.

        :param uuid: identifier of the metadata record, sent as the ``uuid``
            request parameter.
        :returns: the root element of ``info.xml`` parsed with lxml, or
            ``None`` when the archive has no ``info.xml`` entry or the
            configured version is not handled.
        """
        if self.version != GEONETWORK_V26:
            # NOTE(review): only the 2.6 export URL is known here; confirm the
            # endpoint to use for the other versions before adding a branch.
            logger.warning('MEF info retrieval is only implemented for GeoNetwork %s (configured: %s)',
                           GEONETWORK_V26, self.version)
            return None

        url = "%s/srv/en/mef.export" % self.base
        query = urllib.urlencode({"uuid": uuid})

        logger.info('Loading MEF for %s', uuid)
        request = urllib2.Request(url, query)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(), urllib2.HTTPRedirectHandler())

        response = opener.open(request)  # will get a ZIP file
        try:
            content = response.read()
        finally:
            response.close()

        zfile = zipfile.ZipFile(StringIO(content))
        try:
            if 'info.xml' not in zfile.namelist():
                return None
            return etree.fromstring(zfile.read('info.xml'))
        finally:
            zfile.close()

    def retrieveMetadataCategories(self, uuid):
        """Return the category names listed in the record's ``info.xml``.

        :param uuid: identifier of the metadata record.
        :returns: the ``name`` attribute of every ``categories/category``
            element; an empty list when no ``info.xml`` could be retrieved.
        """
        xml = self.retrieveInfo(uuid)
        if xml is None:
            return []

        return [cat.get('name') for cat in xml.findall('categories/category')]

View File

@ -1,32 +1,32 @@
from setuptools import setup, find_packages
import sys, os
# Version of the ckanext-geonetwork distribution, passed to setup() below.
version = '1.1'

# Packaging metadata. The [ckan.plugins] entry point registers the harvester
# under the plugin name "geonetwork_harvester".
setup(
    name='ckanext-geonetwork',
    version=version,
    description="CKAN harvester for GeoNetwork",
    long_description="""\
    """,
    classifiers=[],  # Get strings from http://pypi.python.org/pypi?%3Aaction=list_classifiers
    keywords='',
    author='Emanuele Tajariol',
    author_email='etj@geo-solutions.it',
    url='',
    license='',
    packages=find_packages(exclude=['ez_setup', 'examples', 'tests']),
    namespace_packages=['ckanext', 'ckanext.geonetwork'],
    include_package_data=True,
    zip_safe=False,
    install_requires=[
        # -*- Extra requirements: -*-
    ],
    entry_points=
    """
    [ckan.plugins]
    # Add plugins here, eg
    # myplugin=ckanext.geonetwork:PluginClass
    geonetwork_harvester=ckanext.geonetwork.harvesters:GeoNetworkHarvester
    """,
)