Merge branch 'master' into release-v2.0
This commit is contained in:
commit
f1e27c717c
|
@ -23,12 +23,14 @@ from ckan import model
|
|||
from ckan.lib.helpers import json
|
||||
from ckan import logic
|
||||
from ckan.lib.navl.validators import not_empty
|
||||
from ckan.lib.search.index import PackageSearchIndex
|
||||
|
||||
from ckanext.harvest.harvesters.base import HarvesterBase
|
||||
from ckanext.harvest.model import HarvestObject
|
||||
|
||||
from ckanext.spatial.validation import Validators, all_validators
|
||||
from ckanext.spatial.model import ISODocument
|
||||
from ckanext.spatial.interfaces import ISpatialHarvester
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -109,6 +111,8 @@ class SpatialHarvester(HarvesterBase):
|
|||
|
||||
_user_name = None
|
||||
|
||||
_site_user = None
|
||||
|
||||
source_config = {}
|
||||
|
||||
force_import = False
|
||||
|
@ -146,10 +150,6 @@ class SpatialHarvester(HarvesterBase):
|
|||
|
||||
## SpatialHarvester
|
||||
|
||||
'''
|
||||
These methods can be safely overridden by classes extending
|
||||
SpatialHarvester
|
||||
'''
|
||||
|
||||
def get_package_dict(self, iso_values, harvest_object):
|
||||
'''
|
||||
|
@ -157,19 +157,23 @@ class SpatialHarvester(HarvesterBase):
|
|||
package_update. See documentation on
|
||||
ckan.logic.action.create.package_create for more details
|
||||
|
||||
Typically, custom harvesters would only want to add or modify the
|
||||
extras, but the whole method can be replaced if necessary. Note that
|
||||
if only minor modifications need to be made you can call the parent
|
||||
method from your custom harvester and modify the output, eg:
|
||||
Extensions willing to modify the dict should do so implementing the
|
||||
ISpatialHarvester interface
|
||||
|
||||
class MyHarvester(SpatialHarvester):
|
||||
import ckan.plugins as p
|
||||
from ckanext.spatial.interfaces import ISpatialHarvester
|
||||
|
||||
def get_package_dict(self, iso_values, harvest_object):
|
||||
class MyHarvester(p.SingletonPlugin):
|
||||
|
||||
package_dict = super(MyHarvester, self).get_package_dict(iso_values, harvest_object)
|
||||
p.implements(ISpatialHarvester, inherit=True)
|
||||
|
||||
package_dict['extras']['my-custom-extra-1'] = 'value1'
|
||||
package_dict['extras']['my-custom-extra-2'] = 'value2'
|
||||
def get_package_dict(self, context, data_dict):
|
||||
|
||||
package_dict = data_dict['package_dict']
|
||||
|
||||
package_dict['extras'].append(
|
||||
{'key': 'my-custom-extra', 'value': 'my-custom-value'}
|
||||
)
|
||||
|
||||
return package_dict
|
||||
|
||||
|
@ -364,34 +368,18 @@ class SpatialHarvester(HarvesterBase):
|
|||
|
||||
def transform_to_iso(self, original_document, original_format, harvest_object):
|
||||
'''
|
||||
Transforms an XML document to ISO 19139
|
||||
|
||||
This method will be only called from the import stage if the
|
||||
harvest_object content is null and original_document and
|
||||
original_format harvest object extras exist (eg if an FGDC document
|
||||
was harvested).
|
||||
|
||||
In that case, this method should do the necessary to provide an
|
||||
ISO 19139 like document, otherwise the import process will stop.
|
||||
|
||||
|
||||
:param original_document: Original XML document
|
||||
:type original_document: string
|
||||
:param original_format: Original format (eg 'fgdc')
|
||||
:type original_format: string
|
||||
:param harvest_object: HarvestObject domain object (with access to
|
||||
job and source objects)
|
||||
:type harvest_object: HarvestObject
|
||||
|
||||
:returns: An ISO 19139 document or None if the transformation was not
|
||||
successful
|
||||
:rtype: string
|
||||
|
||||
DEPRECATED: Use the transform_to_iso method of the ISpatialHarvester
|
||||
interface
|
||||
'''
|
||||
|
||||
self.__base_transform_to_iso_called = True
|
||||
return None
|
||||
|
||||
def import_stage(self, harvest_object):
|
||||
context = {
|
||||
'model': model,
|
||||
'session': model.Session,
|
||||
'user': self._get_user_name(),
|
||||
}
|
||||
|
||||
log = logging.getLogger(__name__ + '.import')
|
||||
log.debug('Import stage for harvest object: %s', harvest_object.id)
|
||||
|
@ -415,8 +403,9 @@ class SpatialHarvester(HarvesterBase):
|
|||
|
||||
if status == 'delete':
|
||||
# Delete package
|
||||
context = {'model': model, 'session': model.Session, 'user': self._get_user_name()}
|
||||
|
||||
context.update({
|
||||
'ignore_auth': True,
|
||||
})
|
||||
p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id})
|
||||
log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid))
|
||||
|
||||
|
@ -426,7 +415,16 @@ class SpatialHarvester(HarvesterBase):
|
|||
original_document = self._get_object_extra(harvest_object, 'original_document')
|
||||
original_format = self._get_object_extra(harvest_object, 'original_format')
|
||||
if original_document and original_format:
|
||||
#DEPRECATED use the ISpatialHarvester interface method
|
||||
self.__base_transform_to_iso_called = False
|
||||
content = self.transform_to_iso(original_document, original_format, harvest_object)
|
||||
if not self.__base_transform_to_iso_called:
|
||||
log.warn('Deprecation warning: calling transform_to_iso directly is deprecated. ' +
|
||||
'Please use the ISpatialHarvester interface method instead.')
|
||||
|
||||
for harvester in p.PluginImplementations(ISpatialHarvester):
|
||||
content = harvester.transform_to_iso(original_document, original_format, harvest_object)
|
||||
|
||||
if content:
|
||||
harvest_object.content = content
|
||||
else:
|
||||
|
@ -449,7 +447,9 @@ class SpatialHarvester(HarvesterBase):
|
|||
|
||||
# Parse ISO document
|
||||
try:
|
||||
iso_values = ISODocument(harvest_object.content).read_values()
|
||||
|
||||
iso_parser = ISODocument(harvest_object.content)
|
||||
iso_values = iso_parser.read_values()
|
||||
except Exception, e:
|
||||
self._save_object_error('Error parsing ISO document for object {0}: {1}'.format(harvest_object.id, str(e)),
|
||||
harvest_object, 'Import')
|
||||
|
@ -495,21 +495,27 @@ class SpatialHarvester(HarvesterBase):
|
|||
harvest_object.metadata_modified_date = metadata_modified_date
|
||||
harvest_object.add()
|
||||
|
||||
|
||||
# Build the package dict
|
||||
package_dict = self.get_package_dict(iso_values, harvest_object)
|
||||
for harvester in p.PluginImplementations(ISpatialHarvester):
|
||||
package_dict = harvester.get_package_dict(context, {
|
||||
'package_dict': package_dict,
|
||||
'iso_values': iso_values,
|
||||
'xml_tree': iso_parser.xml_tree,
|
||||
'harvest_object': harvest_object,
|
||||
})
|
||||
if not package_dict:
|
||||
log.error('No package dict returned, aborting import for object {0}'.format(harvest_object.id))
|
||||
return False
|
||||
|
||||
# Create / update the package
|
||||
context.update({
|
||||
'extras_as_string': True,
|
||||
'api_version': '2',
|
||||
'return_id_only': True})
|
||||
|
||||
context = {'model': model,
|
||||
'session': model.Session,
|
||||
'user': self._get_user_name(),
|
||||
'extras_as_string': True,
|
||||
'api_version': '2',
|
||||
'return_id_only': True}
|
||||
if context['user'] == self._site_user['name']:
|
||||
if self._site_user and context['user'] == self._site_user['name']:
|
||||
context['ignore_auth'] = True
|
||||
|
||||
|
||||
|
@ -550,7 +556,7 @@ class SpatialHarvester(HarvesterBase):
|
|||
elif status == 'change':
|
||||
|
||||
# Check if the modified date is more recent
|
||||
if not self.force_import and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:
|
||||
if not self.force_import and previous_object and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date:
|
||||
|
||||
# Assign the previous job id to the new object to
|
||||
# avoid losing history
|
||||
|
@ -560,6 +566,25 @@ class SpatialHarvester(HarvesterBase):
|
|||
# Delete the previous object to avoid cluttering the object table
|
||||
previous_object.delete()
|
||||
|
||||
# Reindex the corresponding package to update the reference to the
|
||||
# harvest object
|
||||
if ((config.get('ckanext.spatial.harvest.reindex_unchanged', True) != 'False'
|
||||
or self.source_config.get('reindex_unchanged') != 'False')
|
||||
and harvest_object.package_id):
|
||||
context.update({'validate': False, 'ignore_auth': True})
|
||||
try:
|
||||
package_dict = logic.get_action('package_show')(context,
|
||||
{'id': harvest_object.package_id})
|
||||
except p.toolkit.ObjectNotFound:
|
||||
pass
|
||||
else:
|
||||
for extra in package_dict.get('extras', []):
|
||||
if extra['key'] == 'harvest_object_id':
|
||||
extra['value'] = harvest_object.id
|
||||
if package_dict:
|
||||
package_index = PackageSearchIndex()
|
||||
package_index.index_package(package_dict)
|
||||
|
||||
log.info('Document with GUID %s unchanged, skipping...' % (harvest_object.guid))
|
||||
else:
|
||||
package_schema = logic.schema.default_update_package_schema()
|
||||
|
@ -637,6 +662,15 @@ class SpatialHarvester(HarvesterBase):
|
|||
else:
|
||||
profiles = DEFAULT_VALIDATOR_PROFILES
|
||||
self._validator = Validators(profiles=profiles)
|
||||
|
||||
# Add any custom validators from extensions
|
||||
for plugin_with_validators in p.PluginImplementations(ISpatialHarvester):
|
||||
custom_validators = plugin_with_validators.get_validators()
|
||||
for custom_validator in custom_validators:
|
||||
if custom_validator not in all_validators:
|
||||
self._validator.add_validator(custom_validator)
|
||||
|
||||
|
||||
return self._validator
|
||||
|
||||
def _get_user_name(self):
|
||||
|
|
|
@ -0,0 +1,94 @@
|
|||
from ckan.plugins.interfaces import Interface
|
||||
|
||||
|
||||
class ISpatialHarvester(Interface):
|
||||
|
||||
def get_package_dict(self, context, data_dict):
|
||||
'''
|
||||
Allows to modify the dataset dict that will be created or updated
|
||||
|
||||
This is the dict that the harvesters will pass to the `package_create`
|
||||
or `package_update` actions. Extensions can modify it to suit their
|
||||
needs, adding or removing fields, modifying the default ones, etc.
|
||||
|
||||
This method should always return a package_dict. Note that, although
|
||||
unlikely in a particular instance, this method could be implemented by
|
||||
more than one plugin.
|
||||
|
||||
If a dict is not returned by this function, the import stage will be
|
||||
cancelled.
|
||||
|
||||
.. note:: Make sure to run ``model.Session.flush()`` if you perform
|
||||
queries using the model included in the ``context`` object.
|
||||
|
||||
|
||||
:param context: Contains a reference to the model, eg to
|
||||
perform DB queries, and the user name used for
|
||||
authorization.
|
||||
:type context: dict
|
||||
:param data_dict: Available data. Contains four keys:
|
||||
|
||||
* `package_dict`
|
||||
The default package_dict generated by the harvester. Modify this
|
||||
or create a brand new one.
|
||||
* `iso_values`
|
||||
The parsed ISO XML document values. These contain more fields
|
||||
that are not added by default to the ``package_dict``.
|
||||
* `xml_tree`
|
||||
The full XML etree object. If some values not present in
|
||||
``iso_values`` are needed, these can be extracted via xpath.
|
||||
* `harvest_object`
|
||||
A ``HarvestObject`` domain object which contains a reference
|
||||
to the original metadata document (``harvest_object.content``)
|
||||
and the harvest source (``harvest_object.source``).
|
||||
|
||||
:type data_dict: dict
|
||||
|
||||
:returns: A dataset dict ready to be used by ``package_create`` or
|
||||
``package_update``
|
||||
:rtype: dict
|
||||
'''
|
||||
return data_dict['package_dict']
|
||||
|
||||
def get_validators(self):
|
||||
'''
|
||||
Allows to register custom Validators that can be applied to harvested
|
||||
metadata documents.
|
||||
|
||||
Validators are classes that implement the ``is_valid`` method. Check
|
||||
the `Writing custom validators`_ section in the docs to know more
|
||||
about writing custom validators.
|
||||
|
||||
:returns: A list of Validator classes
|
||||
:rtype: list
|
||||
'''
|
||||
return []
|
||||
|
||||
def transform_to_iso(self, original_document, original_format, harvest_object):
|
||||
'''
|
||||
Transforms an XML document to ISO 19139
|
||||
|
||||
This method will be only called from the import stage if the
|
||||
harvest_object content is null and original_document and
|
||||
original_format harvest object extras exist (eg if an FGDC document
|
||||
was harvested).
|
||||
|
||||
In that case, this method should do the necessary to provide an
|
||||
ISO 19139 like document, otherwise the import process will stop.
|
||||
|
||||
|
||||
:param original_document: Original XML document
|
||||
:type original_document: string
|
||||
:param original_format: Original format (eg 'fgdc')
|
||||
:type original_format: string
|
||||
:param harvest_object: HarvestObject domain object (with access to
|
||||
job and source objects)
|
||||
:type harvest_object: HarvestObject
|
||||
|
||||
:returns: An ISO 19139 document or None if the transformation was not
|
||||
successful
|
||||
:rtype: string
|
||||
|
||||
'''
|
||||
return None
|
||||
|
|
@ -31,7 +31,8 @@ separate stages:
|
|||
content into a CKAN dataset: validates the document, parses it, converts it
|
||||
to a CKAN dataset dict and saves it in the database.
|
||||
|
||||
The extension provides different XSD and schematron based validators. You can
|
||||
The extension provides different XSD and schematron based validators, and you
|
||||
can also write your own (see `Writing custom validators`_). You can
|
||||
specify which validators to use for the remote documents with the following
|
||||
configuration option::
|
||||
|
||||
|
@ -51,27 +52,191 @@ hardcoded 'harvest' user::
|
|||
|
||||
ckanext.spatial.harvest.user_name = harvest
|
||||
|
||||
When a document has not been updated remotely, the previous harvest object is
|
||||
replaced by the current one rather than keeping it, to avoid cluttering the
|
||||
``harvest_object`` table. This means that the ``harvest_object_id`` reference
|
||||
on the linked dataset needs to be updated, by reindexing it. This will happen
|
||||
by default, but if you want to turn it off (eg if you are doing separate
|
||||
reindexing) it can be turned off with the following option::
|
||||
|
||||
ckanext.spatial.harvest.reindex_unchanged = False
|
||||
|
||||
|
||||
Customizing the harvesters
|
||||
--------------------------
|
||||
|
||||
The default harvesters provided in this extension can be overridden from
|
||||
extensions to customize to your needs. You can either extend ``CswHarvester``,
|
||||
``WAFHarvester`` or the main ``SpatialHarvester`` class. There are some
|
||||
extension points that can be safely overridden from your extension. Probably the
|
||||
most useful is ``get_package_dict``, which allows to tweak the dataset fields
|
||||
before creating or updating them. ``transform_to_iso`` allows to hook into
|
||||
transformation mechanisms to transform other formats into ISO 19139, the only one
|
||||
directly supported by the spatial harvesters. Finally, the whole
|
||||
``import_stage`` can be overridden if the default logic does not suit your
|
||||
needs.
|
||||
The default harvesters provided in this extension can be extended from
|
||||
extensions implementing the ``ISpatialHarvester`` interface.
|
||||
|
||||
Check the source code of ``ckanext/spatial/harvesters/base.py`` for more
|
||||
details on these functions.
|
||||
Probably the most useful extension point is ``get_package_dict``, which
|
||||
allows to tweak the dataset fields before creating or updating it::
|
||||
|
||||
import ckan.plugins as p
|
||||
from ckanext.spatial.interfaces import ISpatialHarvester
|
||||
|
||||
class MyPlugin(p.SingletonPlugin):
|
||||
|
||||
p.implements(ISpatialHarvester, inherit=True)
|
||||
|
||||
def get_package_dict(self, context, data_dict):
|
||||
|
||||
# Check the reference below to see all that's included on data_dict
|
||||
|
||||
package_dict = data_dict['package_dict']
|
||||
iso_values = data_dict['iso_values']
|
||||
|
||||
package_dict['extras'].append(
|
||||
{'key': 'topic-category', 'value': iso_values.get('topic-category')}
|
||||
)
|
||||
|
||||
package_dict['extras'].append(
|
||||
{'key': 'my-custom-extra', 'value': 'my-custom-value'}
|
||||
)
|
||||
|
||||
return package_dict
|
||||
|
||||
``get_validators`` allows to register custom validation classes that can be
|
||||
applied to the harvested documents. Check the `Writing custom validators`_
|
||||
section to know more about how to write your custom validators::
|
||||
|
||||
import ckan.plugins as p
|
||||
from ckanext.spatial.interfaces import ISpatialHarvester
|
||||
from ckanext.spatial.validation.validation import BaseValidator
|
||||
|
||||
class MyPlugin(p.SingletonPlugin):
|
||||
|
||||
p.implements(ISpatialHarvester, inherit=True)
|
||||
|
||||
def get_validators(self):
|
||||
return [MyValidator]
|
||||
|
||||
|
||||
class MyValidator(BaseValidator):
|
||||
|
||||
name = 'my-validator'
|
||||
|
||||
title= 'My very own validator'
|
||||
|
||||
@classmethod
|
||||
def is_valid(cls, xml):
|
||||
|
||||
return True, []
|
||||
|
||||
|
||||
``transform_to_iso`` allows to hook into transformation mechanisms to
|
||||
transform other formats into ISO 19139, the only one directly supported by
|
||||
the spatial harvesters.
|
||||
|
||||
Here is the full reference for the provided extension points:
|
||||
|
||||
.. autoclass:: ckanext.spatial.interfaces.ISpatialHarvester
|
||||
:members:
|
||||
|
||||
If you need to further customize the default behaviour of the harvesters, you
|
||||
can either extend ``CswHarvester``, ``WAFHarvester`` or the main
|
||||
``SpatialHarvester`` class, for instance to override the whole
|
||||
``import_stage`` if the default logic does not suit your
|
||||
needs.
|
||||
|
||||
The `ckanext-geodatagov`_ extension contains live examples on how to extend
|
||||
the default spatial harvesters and create new ones for other spatial services
|
||||
like ArcGIS REST APIs.
|
||||
|
||||
Writing custom validators
|
||||
-------------------------
|
||||
|
||||
|
||||
Validator classes extend the ``BaseValidator`` class:
|
||||
|
||||
.. autoclass:: ckanext.spatial.validation.validation.BaseValidator
|
||||
:members:
|
||||
|
||||
Helper classes are provided for XSD and schematron based validation, and
|
||||
completely custom logic can be also implemented. Here are some examples of
|
||||
the most common types:
|
||||
|
||||
* XSD based validators::
|
||||
|
||||
class ISO19139NGDCSchema(XsdValidator):
|
||||
'''
|
||||
XSD based validation for ISO 19139 documents.
|
||||
|
||||
Uses XSD schema from the NOAA National Geophysical Data Center:
|
||||
|
||||
http://ngdc.noaa.gov/metadata/published/xsd/
|
||||
|
||||
'''
|
||||
name = 'iso19139ngdc'
|
||||
title = 'ISO19139 XSD Schema (NGDC)'
|
||||
|
||||
@classmethod
|
||||
def is_valid(cls, xml):
|
||||
xsd_path = 'xml/iso19139ngdc'
|
||||
|
||||
xsd_filepath = os.path.join(os.path.dirname(__file__),
|
||||
xsd_path, 'schema.xsd')
|
||||
return cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)')
|
||||
|
||||
|
||||
|
||||
* Schematron validators::
|
||||
|
||||
class Gemini2Schematron(SchematronValidator):
|
||||
name = 'gemini2'
|
||||
title = 'GEMINI 2.1 Schematron 1.2'
|
||||
|
||||
@classmethod
|
||||
def get_schematrons(cls):
|
||||
with resource_stream("ckanext.spatial",
|
||||
"validation/xml/gemini2/gemini2-schematron-20110906-v1.2.sch") as schema:
|
||||
return [cls.schematron(schema)]
|
||||
|
||||
|
||||
* Custom validators::
|
||||
|
||||
class MinimalFGDCValidator(BaseValidator):
|
||||
|
||||
name = 'fgdc_minimal'
|
||||
title = 'FGDC Minimal Validation'
|
||||
|
||||
_elements = [
|
||||
('Identification Citation Title', '/metadata/idinfo/citation/citeinfo/title'),
|
||||
('Identification Citation Originator', '/metadata/idinfo/citation/citeinfo/origin'),
|
||||
('Identification Citation Publication Date', '/metadata/idinfo/citation/citeinfo/pubdate'),
|
||||
('Identification Description Abstract', '/metadata/idinfo/descript/abstract'),
|
||||
('Identification Spatial Domain West Bounding Coordinate', '/metadata/idinfo/spdom/bounding/westbc'),
|
||||
('Identification Spatial Domain East Bounding Coordinate', '/metadata/idinfo/spdom/bounding/eastbc'),
|
||||
('Identification Spatial Domain North Bounding Coordinate', '/metadata/idinfo/spdom/bounding/northbc'),
|
||||
('Identification Spatial Domain South Bounding Coordinate', '/metadata/idinfo/spdom/bounding/southbc'),
|
||||
('Metadata Reference Information Contact Address Type', '/metadata/metainfo/metc/cntinfo/cntaddr/addrtype'),
|
||||
('Metadata Reference Information Contact Address State', '/metadata/metainfo/metc/cntinfo/cntaddr/state'),
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def is_valid(cls, xml):
|
||||
|
||||
errors = []
|
||||
|
||||
for title, xpath in cls._elements:
|
||||
element = xml.xpath(xpath)
|
||||
if len(element) == 0 or not element[0].text:
|
||||
errors.append(('Element not found: {0}'.format(title), None))
|
||||
if len(errors):
|
||||
return False, errors
|
||||
|
||||
return True, []
|
||||
|
||||
|
||||
The `validation.py`_ file included in the ckanext-spatial extension contains
|
||||
more examples of the different types.
|
||||
|
||||
Remember that after registering your own validators you must specify them on
|
||||
the following configuration option::
|
||||
|
||||
ckan.spatial.validator.profiles = iso19139eden,my-validator
|
||||
|
||||
|
||||
.. _validation.py: https://github.com/ckan/ckanext-spatial/blob/master/ckanext/spatial/validation/validation.py
|
||||
|
||||
Harvest Metadata API
|
||||
--------------------
|
||||
|
|
Loading…
Reference in New Issue