Merge branch 'master' into release-v2.0

This commit is contained in:
amercader 2014-06-10 18:17:11 +01:00
commit f1e27c717c
3 changed files with 354 additions and 61 deletions

View File

@ -23,12 +23,14 @@ from ckan import model
from ckan.lib.helpers import json
from ckan import logic
from ckan.lib.navl.validators import not_empty
from ckan.lib.search.index import PackageSearchIndex
from ckanext.harvest.harvesters.base import HarvesterBase
from ckanext.harvest.model import HarvestObject
from ckanext.spatial.validation import Validators, all_validators
from ckanext.spatial.model import ISODocument
from ckanext.spatial.interfaces import ISpatialHarvester
log = logging.getLogger(__name__)
@ -109,6 +111,8 @@ class SpatialHarvester(HarvesterBase):
_user_name = None
_site_user = None
source_config = {}
force_import = False
@ -146,10 +150,6 @@ class SpatialHarvester(HarvesterBase):
## SpatialHarvester
'''
These methods can be safely overridden by classes extending
SpatialHarvester
'''
def get_package_dict(self, iso_values, harvest_object):
'''
@ -157,19 +157,23 @@ class SpatialHarvester(HarvesterBase):
package_update. See documentation on
ckan.logic.action.create.package_create for more details
Tipically, custom harvesters would only want to add or modify the
extras, but the whole method can be replaced if necessary. Note that
if only minor modifications need to be made you can call the parent
method from your custom harvester and modify the output, eg:
Extensions willing to modify the dict should do so implementing the
ISpatialHarvester interface
class MyHarvester(SpatialHarvester):
import ckan.plugins as p
from ckanext.spatial.interfaces import ISpatialHarvester
def get_package_dict(self, iso_values, harvest_object):
class MyHarvester(p.SingletonPlugin):
package_dict = super(MyHarvester, self).get_package_dict(iso_values, harvest_object)
p.implements(ISpatialHarvester, inherit=True)
package_dict['extras']['my-custom-extra-1'] = 'value1'
package_dict['extras']['my-custom-extra-2'] = 'value2'
def get_package_dict(self, context, data_dict):
package_dict = data_dict['package_dict']
package_dict['extras'].append(
{'key': 'my-custom-extra', 'value': 'my-custom-value'}
)
return package_dict
@ -364,34 +368,18 @@ class SpatialHarvester(HarvesterBase):
def transform_to_iso(self, original_document, original_format, harvest_object):
'''
Transforms an XML document to ISO 19139
This method will be only called from the import stage if the
harvest_object content is null and original_document and
original_format harvest object extras exist (eg if an FGDC document
was harvested).
In that case, this method should do the necessary to provide an
ISO 1939 like document, otherwise the import process will stop.
:param original_document: Original XML document
:type original_document: string
:param original_format: Original format (eg 'fgdc')
:type original_format: string
:param harvest_object: HarvestObject domain object (with access to
job and source objects)
:type harvest_object: HarvestObject
:returns: An ISO 19139 document or None if the transformation was not
successful
:rtype: string
DEPRECATED: Use the transform_to_iso method of the ISpatialHarvester
interface
'''
self.__base_transform_to_iso_called = True
return None
def import_stage(self, harvest_object):
context = {
'model': model,
'session': model.Session,
'user': self._get_user_name(),
}
log = logging.getLogger(__name__ + '.import')
log.debug('Import stage for harvest object: %s', harvest_object.id)
@ -415,8 +403,9 @@ class SpatialHarvester(HarvesterBase):
if status == 'delete':
# Delete package
context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
context.update({
'ignore_auth': True,
})
p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id})
log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid))
@ -426,7 +415,16 @@ class SpatialHarvester(HarvesterBase):
original_document = self._get_object_extra(harvest_object, 'original_document')
original_format = self._get_object_extra(harvest_object, 'original_format')
if original_document and original_format:
#DEPRECATED use the ISpatialHarvester interface method
self.__base_transform_to_iso_called = False
content = self.transform_to_iso(original_document, original_format, harvest_object)
if not self.__base_transform_to_iso_called:
log.warn('Deprecation warning: calling transform_to_iso directly is deprecated. ' +
'Please use the ISpatialHarvester interface method instead.')
for harvester in p.PluginImplementations(ISpatialHarvester):
content = harvester.transform_to_iso(original_document, original_format, harvest_object)
if content:
harvest_object.content = content
else:
@ -449,7 +447,9 @@ class SpatialHarvester(HarvesterBase):
# Parse ISO document
try:
iso_values = ISODocument(harvest_object.content).read_values()
iso_parser = ISODocument(harvest_object.content)
iso_values = iso_parser.read_values()
except Exception, e:
self._save_object_error('Error parsing ISO document for object {0}: {1}'.format(harvest_object.id, str(e)),
harvest_object, 'Import')
@ -495,21 +495,27 @@ class SpatialHarvester(HarvesterBase):
harvest_object.metadata_modified_date = metadata_modified_date
harvest_object.add()
# Build the package dict
package_dict = self.get_package_dict(iso_values, harvest_object)
for harvester in p.PluginImplementations(ISpatialHarvester):
package_dict = harvester.get_package_dict(context, {
'package_dict': package_dict,
'iso_values': iso_values,
'xml_tree': iso_parser.xml_tree,
'harvest_object': harvest_object,
})
if not package_dict:
log.error('No package dict returned, aborting import for object {0}'.format(harvest_object.id))
return False
# Create / update the package
context.update({
'extras_as_string': True,
'api_version': '2',
'return_id_only': True})
context = {'model': model,
'session': model.Session,
'user': self._get_user_name(),
'extras_as_string': True,
'api_version': '2',
'return_id_only': True}
if context['user'] == self._site_user['name']:
if self._site_user and context['user'] == self._site_user['name']:
context['ignore_auth'] = True
@ -550,7 +556,7 @@ class SpatialHarvester(HarvesterBase):
elif status == 'change':
# Check if the modified date is more recent
if not self.force_import and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:
if not self.force_import and previous_object and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:
# Assign the previous job id to the new object to
# avoid losing history
@ -560,6 +566,25 @@ class SpatialHarvester(HarvesterBase):
# Delete the previous object to avoid cluttering the object table
previous_object.delete()
# Reindex the corresponding package to update the reference to the
# harvest object
if ((config.get('ckanext.spatial.harvest.reindex_unchanged', True) != 'False'
or self.source_config.get('reindex_unchanged') != 'False')
and harvest_object.package_id):
context.update({'validate': False, 'ignore_auth': True})
try:
package_dict = logic.get_action('package_show')(context,
{'id': harvest_object.package_id})
except p.toolkit.ObjectNotFound:
pass
else:
for extra in package_dict.get('extras', []):
if extra['key'] == 'harvest_object_id':
extra['value'] = harvest_object.id
if package_dict:
package_index = PackageSearchIndex()
package_index.index_package(package_dict)
log.info('Document with GUID %s unchanged, skipping...' % (harvest_object.guid))
else:
package_schema = logic.schema.default_update_package_schema()
@ -637,6 +662,15 @@ class SpatialHarvester(HarvesterBase):
else:
profiles = DEFAULT_VALIDATOR_PROFILES
self._validator = Validators(profiles=profiles)
# Add any custom validators from extensions
for plugin_with_validators in p.PluginImplementations(ISpatialHarvester):
custom_validators = plugin_with_validators.get_validators()
for custom_validator in custom_validators:
if custom_validator not in all_validators:
self._validator.add_validator(custom_validator)
return self._validator
def _get_user_name(self):

View File

@ -0,0 +1,94 @@
from ckan.plugins.interfaces import Interface
class ISpatialHarvester(Interface):
    '''
    Extension points that plugins can implement to customize the spatial
    harvesters: tweaking the dataset dict, registering extra validators and
    transforming other metadata formats into ISO 19139.

    Every method has a pass-through default, so plugins only need to
    override the ones they are interested in.
    '''

    def get_package_dict(self, context, data_dict):
        '''
        Allows to modify the dataset dict that will be created or updated

        This is the dict that the harvesters will pass to the `package_create`
        or `package_update` actions. Extensions can modify it to suit their
        needs, adding or removing fields, modifying the default ones, etc.

        This method should always return a package_dict. Note that, although
        unlikely in a particular instance, this method could be implemented by
        more than one plugin.

        If a dict is not returned by this function, the import stage will be
        cancelled.

        .. note:: Make sure to run ``model.Session.flush()`` if you perform
            queries using the model included in the ``context`` object.

        :param context: Contains a reference to the model, eg to
                        perform DB queries, and the user name used for
                        authorization.
        :type context: dict
        :param data_dict: Available data. Contains four keys:

            * `package_dict`
               The default package_dict generated by the harvester. Modify this
               or create a brand new one.
            * `iso_values`
               The parsed ISO XML document values. These contain more fields
               that are not added by default to the ``package_dict``.
            * `xml_tree`
               The full XML etree object. If some values not present in
               ``iso_values`` are needed, these can be extracted via xpath.
            * `harvest_object`
               A ``HarvestObject`` domain object which contains a reference
               to the original metadata document (``harvest_object.content``)
               and the harvest source (``harvest_object.source``).

        :type data_dict: dict

        :returns: A dataset dict ready to be used by ``package_create`` or
                  ``package_update``
        :rtype: dict
        '''
        # Default behaviour: hand back the harvester-generated dict untouched
        default_package_dict = data_dict['package_dict']
        return default_package_dict

    def get_validators(self):
        '''
        Allows to register custom Validators that can be applied to harvested
        metadata documents.

        Validators are classes that implement the ``is_valid`` method. Check
        the `Writing custom validators`_ section in the docs to know more
        about writing custom validators.

        :returns: A list of Validator classes
        :rtype: list
        '''
        # Default behaviour: no extra validators are contributed
        custom_validators = []
        return custom_validators

    def transform_to_iso(self, original_document, original_format, harvest_object):
        '''
        Transforms an XML document to ISO 19139

        This method will be only called from the import stage if the
        harvest_object content is null and original_document and
        original_format harvest object extras exist (eg if an FGDC document
        was harvested).

        In that case, this method should do the necessary to provide an
        ISO 19139-like document, otherwise the import process will stop.

        :param original_document: Original XML document
        :type original_document: string
        :param original_format: Original format (eg 'fgdc')
        :type original_format: string
        :param harvest_object: HarvestObject domain object (with access to
            job and source objects)
        :type harvest_object: HarvestObject

        :returns: An ISO 19139 document or None if the transformation was not
            successful
        :rtype: string
        '''
        # Default behaviour: no transformation available
        transformed_document = None
        return transformed_document

View File

@ -31,7 +31,8 @@ separate stages:
content into a CKAN dataset: validates the document, parses it, converts it
to a CKAN dataset dict and saves it in the database.
The extension provides different XSD and schematron based validators. You can
The extension provides different XSD and schematron based validators, and you
can also write your own (see `Writing custom validators`_). You can
specify which validators to use for the remote documents with the following
configuration option::
@ -51,27 +52,191 @@ hardcoded 'harvest' user::
ckanext.spatial.harvest.user_name = harvest
When a document has not been updated remotely, the previous harvest object is
replaced by the current one rather than keeping it, to avoid cluttering the
``harvest_object`` table. This means that the ``harvest_object_id`` reference
on the linked dataset needs to be updated, by reindexing it. This will happen
by default, but if you want to turn it off (eg if you are doing separate
reindexing) it can be turned off with the following option::
ckanext.spatial.harvest.reindex_unchanged = False
Customizing the harvesters
--------------------------
The default harvesters provided in this extension can be overridden from
extensions to customize to your needs. You can either extend ``CswHarvester``,
``WAFHarvester`` or the main ``SpatialHarvester`` class. There are some
extension points that can be safely overridden from your extension. Probably the
most useful is ``get_package_dict``, which allows you to tweak the dataset fields
before creating or updating them. ``transform_to_iso`` allows you to hook into
transformation mechanisms to transform other formats into ISO 19139, the only one
directly supported by the spatial harvesters. Finally, the whole
``import_stage`` can be overridden if the default logic does not suit your
needs.
The default harvesters provided in this extension can be extended from
extensions implementing the ``ISpatialHarvester`` interface.
Check the source code of ``ckanext/spatial/harvesters/base.py`` for more
details on these functions.
Probably the most useful extension point is ``get_package_dict``, which
allows you to tweak the dataset fields before creating or updating it::
import ckan.plugins as p
from ckanext.spatial.interfaces import ISpatialHarvester
class MyPlugin(p.SingletonPlugin):
p.implements(ISpatialHarvester, inherit=True)
def get_package_dict(self, context, data_dict):
# Check the reference below to see all that's included on data_dict
package_dict = data_dict['package_dict']
iso_values = data_dict['iso_values']
package_dict['extras'].append(
{'key': 'topic-category', 'value': iso_values.get('topic-category')}
)
package_dict['extras'].append(
{'key': 'my-custom-extra', 'value': 'my-custom-value'}
)
return package_dict
``get_validators`` allows you to register custom validation classes that can
be applied to the harvested documents. Check the `Writing custom validators`_
section to know more about how to write your custom validators::
import ckan.plugins as p
from ckanext.spatial.interfaces import ISpatialHarvester
from ckanext.spatial.validation.validation import BaseValidator
class MyPlugin(p.SingletonPlugin):
p.implements(ISpatialHarvester, inherit=True)
def get_validators(self):
return [MyValidator]
class MyValidator(BaseValidator):
name = 'my-validator'
title= 'My very own validator'
@classmethod
def is_valid(cls, xml):
return True, []
``transform_to_iso`` allows you to hook into transformation mechanisms to
transform other formats into ISO 19139, the only one directly supported by
the spatial harvesters.
Here is the full reference for the provided extension points:
.. autoclass:: ckanext.spatial.interfaces.ISpatialHarvester
:members:
If you need to further customize the default behaviour of the harvesters, you
can either extend ``CswHarvester``, ``WAFHarvester`` or the main
``SpatialHarvester`` class, for instance to override the whole
``import_stage`` if the default logic does not suit your
needs.
The `ckanext-geodatagov`_ extension contains live examples on how to extend
the default spatial harvesters and create new ones for other spatial services
like ArcGIS REST APIs.
Writing custom validators
-------------------------
Validator classes extend the ``BaseValidator`` class:
.. autoclass:: ckanext.spatial.validation.validation.BaseValidator
:members:
Helper classes are provided for XSD and schematron based validation, and
completely custom logic can also be implemented. Here are some examples of
the most common types:
* XSD based validators::
class ISO19139NGDCSchema(XsdValidator):
'''
XSD based validation for ISO 19139 documents.
Uses XSD schema from the NOAA National Geophysical Data Center:
http://ngdc.noaa.gov/metadata/published/xsd/
'''
name = 'iso19139ngdc'
title = 'ISO19139 XSD Schema (NGDC)'
@classmethod
def is_valid(cls, xml):
xsd_path = 'xml/iso19139ngdc'
xsd_filepath = os.path.join(os.path.dirname(__file__),
xsd_path, 'schema.xsd')
return cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)')
* Schematron validators::
class Gemini2Schematron(SchematronValidator):
name = 'gemini2'
title = 'GEMINI 2.1 Schematron 1.2'
@classmethod
def get_schematrons(cls):
with resource_stream("ckanext.spatial",
"validation/xml/gemini2/gemini2-schematron-20110906-v1.2.sch") as schema:
return [cls.schematron(schema)]
* Custom validators::
class MinimalFGDCValidator(BaseValidator):
name = 'fgdc_minimal'
title = 'FGDC Minimal Validation'
_elements = [
('Identification Citation Title', '/metadata/idinfo/citation/citeinfo/title'),
('Identification Citation Originator', '/metadata/idinfo/citation/citeinfo/origin'),
('Identification Citation Publication Date', '/metadata/idinfo/citation/citeinfo/pubdate'),
('Identification Description Abstract', '/metadata/idinfo/descript/abstract'),
('Identification Spatial Domain West Bounding Coordinate', '/metadata/idinfo/spdom/bounding/westbc'),
('Identification Spatial Domain East Bounding Coordinate', '/metadata/idinfo/spdom/bounding/eastbc'),
('Identification Spatial Domain North Bounding Coordinate', '/metadata/idinfo/spdom/bounding/northbc'),
('Identification Spatial Domain South Bounding Coordinate', '/metadata/idinfo/spdom/bounding/southbc'),
('Metadata Reference Information Contact Address Type', '/metadata/metainfo/metc/cntinfo/cntaddr/addrtype'),
('Metadata Reference Information Contact Address State', '/metadata/metainfo/metc/cntinfo/cntaddr/state'),
]
@classmethod
def is_valid(cls, xml):
errors = []
for title, xpath in cls._elements:
element = xml.xpath(xpath)
if len(element) == 0 or not element[0].text:
errors.append(('Element not found: {0}'.format(title), None))
if len(errors):
return False, errors
return True, []
The `validation.py`_ file included in the ckanext-spatial extension contains
more examples of the different types.
Remember that after registering your own validators you must specify them on
the following configuration option::
ckan.spatial.validator.profiles = iso19139eden,my-validator
.. _validation.py: https://github.com/ckan/ckanext-spatial/blob/master/ckanext/spatial/validation/validation.py
Harvest Metadata API
--------------------