From badd723259f2b1f7937efc4056b1b430e2fb2869 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 12 Mar 2014 18:21:05 +0000 Subject: [PATCH 1/8] [#63] Add new ISpatialHarvest interface Two extension points: ``get_package_dict`` and ``transform_to_iso``, with the same expected behaviour as the old hooks meant to be overriden. For ``get_package_dict`` we now pass, apart from the generated package_dict, the parsed iso_values and the harvest object. Updated docs and added autodocs. --- ckanext/spatial/harvesters/base.py | 85 ++++++++++++++---------------- ckanext/spatial/interfaces.py | 70 ++++++++++++++++++++++++ doc/harvesters.rst | 58 +++++++++++++++----- 3 files changed, 157 insertions(+), 56 deletions(-) create mode 100644 ckanext/spatial/interfaces.py diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index 245ead6..dea659d 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -29,6 +29,7 @@ from ckanext.harvest.model import HarvestObject from ckanext.spatial.validation import Validators, all_validators from ckanext.spatial.model import ISODocument +from ckanext.spatial.interfaces import ISpatialHarvester log = logging.getLogger(__name__) @@ -146,10 +147,6 @@ class SpatialHarvester(HarvesterBase): ## SpatialHarvester - ''' - These methods can be safely overridden by classes extending - SpatialHarvester - ''' def get_package_dict(self, iso_values, harvest_object): ''' @@ -157,19 +154,23 @@ class SpatialHarvester(HarvesterBase): package_update. See documentation on ckan.logic.action.create.package_create for more details - Tipically, custom harvesters would only want to add or modify the - extras, but the whole method can be replaced if necessary. 
Note that - if only minor modifications need to be made you can call the parent - method from your custom harvester and modify the output, eg: + Extensions willing to modify the dict should do so implementing the + ISpatialHarvester interface - class MyHarvester(SpatialHarvester): + import ckan.plugins as p + from ckanext.spatial.interfaces import ISpatialHarvester - def get_package_dict(self, iso_values, harvest_object): + class MyHarvester(p.SingletonPlugin): - package_dict = super(MyHarvester, self).get_package_dict(iso_values, harvest_object) + p.implements(ISpatialHarvester, inherit=True) - package_dict['extras']['my-custom-extra-1'] = 'value1' - package_dict['extras']['my-custom-extra-2'] = 'value2' + def get_package_dict(self, context, data_dict): + + package_dict = data_dict['package_dict'] + + package_dict['extras'].append( + {'key': 'my-custom-extra', 'value': 'my-custom-value'} + ) return package_dict @@ -364,34 +365,17 @@ class SpatialHarvester(HarvesterBase): def transform_to_iso(self, original_document, original_format, harvest_object): ''' - Transforms an XML document to ISO 19139 - - This method will be only called from the import stage if the - harvest_object content is null and original_document and - original_format harvest object extras exist (eg if an FGDC document - was harvested). - - In that case, this method should do the necessary to provide an - ISO 1939 like document, otherwise the import process will stop. 
- - - :param original_document: Original XML document - :type original_document: string - :param original_format: Original format (eg 'fgdc') - :type original_format: string - :param harvest_object: HarvestObject domain object (with access to - job and source objects) - :type harvest_object: HarvestObject - - :returns: An ISO 19139 document or None if the transformation was not - successful - :rtype: string - + DEPRECATED: Use the transform_to_iso method of the ISpatialHarvester + interface ''' - + self.__base_transform_to_iso_called = True return None def import_stage(self, harvest_object): + context = { + 'model': model, + 'session': model.Session, + } log = logging.getLogger(__name__ + '.import') log.debug('Import stage for harvest object: %s', harvest_object.id) @@ -415,9 +399,8 @@ class SpatialHarvester(HarvesterBase): if status == 'delete': # Delete package - context = {'model': model, 'session': model.Session, 'user': self._get_user_name()} - p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) + p.toolkit.get_action('package_delete')(context.update({'ignore_auth': True}), {'id': harvest_object.package_id}) log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid)) return True @@ -426,7 +409,16 @@ class SpatialHarvester(HarvesterBase): original_document = self._get_object_extra(harvest_object, 'original_document') original_format = self._get_object_extra(harvest_object, 'original_format') if original_document and original_format: + #DEPRECATED use the ISpatialHarvester interface method + self.__base_transform_to_iso_called = False content = self.transform_to_iso(original_document, original_format, harvest_object) + if not self.__base_transform_to_iso_called: + log.warn('Deprecation warning: calling transform_to_iso directly is deprecated. 
' + + 'Please use the ISpatialHarvester interface method instead.') + + for harvester in p.PluginImplementations(ISpatialHarvester): + content = harvester.transform_to_iso(original_document, original_format, harvest_object) + if content: harvest_object.content = content else: @@ -495,20 +487,26 @@ class SpatialHarvester(HarvesterBase): harvest_object.metadata_modified_date = metadata_modified_date harvest_object.add() + # Build the package dict package_dict = self.get_package_dict(iso_values, harvest_object) + for harvester in p.PluginImplementations(ISpatialHarvester): + package_dict = harvester.get_package_dict(context, { + 'package_dict': package_dict, + 'iso_values': iso_values, + 'harvest_object': harvest_object, + }) if not package_dict: log.error('No package dict returned, aborting import for object {0}'.format(harvest_object.id)) return False # Create / update the package - - context = {'model': model, - 'session': model.Session, + context.update({ 'user': self._get_user_name(), 'extras_as_string': True, 'api_version': '2', - 'return_id_only': True} + 'return_id_only': True}) + if context['user'] == self._site_user['name']: context['ignore_auth'] = True @@ -652,7 +650,6 @@ class SpatialHarvester(HarvesterBase): ''' if self._user_name: return self._user_name - self._site_user = p.toolkit.get_action('get_site_user')({'model': model, 'ignore_auth': True}, {}) config_user_name = config.get('ckanext.spatial.harvest.user_name') diff --git a/ckanext/spatial/interfaces.py b/ckanext/spatial/interfaces.py new file mode 100644 index 0000000..faed0b6 --- /dev/null +++ b/ckanext/spatial/interfaces.py @@ -0,0 +1,70 @@ +from ckan.plugins.interfaces import Interface + + +class ISpatialHarvester(Interface): + + def get_package_dict(self, context, data_dict): + ''' + Allows to modify the dataset dict that will be created or updated + + This is the dict that the harvesters will pass to the `package_create` + or `package_update` actions. 
Extensions can modify it to suit their + needs, adding or removing fields, modifying the default ones, etc. + + This method should always return a package_dict. Note that, although + unlikely in a particular instance, this method could be implemented by + more than one plugin. + + + :param context: Contains a reference to the model, eg to + perform DB queries + :type context: dict + :param data_dict: Available data. Contains three keys: + + * `package_dict` + The default package_dict generated by the harvester. Modify this + or create a brand new one. + * `iso_values` + The parsed ISO XML document values. These contain more fields + that are not added by default to the ``package_dict``. + * `harvest_object` + A ``HarvestObject`` domain object which contains a reference + to the original metadata document (``harvest_object.content``) + and the harvest source (``harvest_object.source``). + + :type data_dict: dict + + :returns: A dataset dict ready to be used by ``package_create`` or + ``package_update`` + :rtype: dict + ''' + return data_dict['package_dict'] + + def transform_to_iso(self, original_document, original_format, harvest_object): + ''' + Transforms an XML document to ISO 19139 + + This method will be only called from the import stage if the + harvest_object content is null and original_document and + original_format harvest object extras exist (eg if an FGDC document + was harvested). + + In that case, this method should do the necessary to provide an + ISO 19139 like document, otherwise the import process will stop. 
+ + + :param original_document: Original XML document + :type original_document: string + :param original_format: Original format (eg 'fgdc') + :type original_format: string + :param harvest_object: HarvestObject domain object (with access to + job and source objects) + :type harvest_object: HarvestObject + + :returns: An ISO 19139 document or None if the transformation was not + successful + :rtype: string + + ''' + return None + diff --git a/doc/harvesters.rst b/doc/harvesters.rst index 69cd9b4..e05a38a 100644 --- a/doc/harvesters.rst +++ b/doc/harvesters.rst @@ -54,19 +54,53 @@ hardcoded 'harvest' user:: Customizing the harvesters -------------------------- -The default harvesters provided in this extension can be overriden from -extensions to customize to your needs. You can either extend ``CswHarvester``, -``WAFfHarverster`` or the main ``SpatialHarvester`` class. There are some -extension points that can be safely overriden from your extension. Probably the -most useful is ``get_package_dict``, which allows to tweak the dataset fields -before creating or updating them. ``transform_to_iso`` allows to hook into -transformation mechanisms to transform other formats into ISO1939, the only one -directly supported byt he spatial harvesters. Finally, the whole -``import_stage`` can be overriden if the default logic does not suit your -needs. +The default harvesters provided in this extension can be extended from +extensions implementing the ``ISpatialHarvester`` interface. -Check the source code of ``ckanext/spatial/harvesters/base.py`` for more -details on these functions. 
+Probably the most useful extension point is ``get_package_dict``, which +allows to tweak the dataset fields before creating or updating them:: + + import ckan.plugins as p + from ckanext.spatial.interfaces import ISpatialHarvester + + class MyPlugin(p.SingletonPlugin): + + p.implements(ISpatialHarvester, inherit=True) + + def get_package_dict(self, context, data_dict): + + # Check the reference below to see all that's included on data_dict + + package_dict = data_dict['package_dict'] + iso_values = data_dict['iso_values'] + + package_dict['extras'].append( + {'key': 'topic-category', 'value': iso_values.get('topic-category')} + ) + + package_dict['extras'].append( + {'key': 'my-custom-extra', 'value': 'my-custom-value'} + ) + + return package_dict + + + + +``transform_to_iso`` allows to hook into transformation mechanisms to +transform other formats into ISO 19139, the only one directly supported by +the spatial harvesters. + +Here is the full reference for the provided extension points: + +.. autoclass:: ckanext.spatial.interfaces.ISpatialHarvester + :members: + +If you need to further customize the default behaviour of the harvesters, you +can either extend ``CswHarvester``, ``WAFHarvester`` or the main +``SpatialHarvester`` class, for instance to override the whole +``import_stage`` if the default logic does not suit your +needs. 
The `ckanext-geodatagov`_ extension contains live examples on how to extend the default spatial harvesters and create new ones for other spatial services From 119c0fd40c6c6aa53509e3b8dd7339a20341f069 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Mar 2014 12:45:49 +0000 Subject: [PATCH 2/8] [#63] Add user to delete context to avoid exception --- ckanext/spatial/harvesters/base.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index 9a79d5e..69e79dd 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -399,8 +399,11 @@ class SpatialHarvester(HarvesterBase): if status == 'delete': # Delete package - - p.toolkit.get_action('package_delete')(context.update({'ignore_auth': True}), {'id': harvest_object.package_id}) + context.update({ + 'ignore_auth': True, + 'user': self._get_user_name(), + }) + p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid)) return True From 0513e360e9ffd8ee7d25d9c0380fc36796ed305f Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Mar 2014 12:46:01 +0000 Subject: [PATCH 3/8] [#63] Add previous_object check In rare cases (eg if there was a previous error of two objects sharing a guid) we can have a "changed" state and no previous_object --- ckanext/spatial/harvesters/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index 69e79dd..3d61aa2 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -551,7 +551,7 @@ class SpatialHarvester(HarvesterBase): elif status == 'change': # Check if the modified date is more recent - if not self.force_import and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date: + if not 
self.force_import and previous_object and harvest_object.metadata_modified_date <= previous_object.metadata_modified_date: # Assign the previous job id to the new object to # avoid losing history From 211f3e4a95093ce6e90c0a408dddb2b5c083796f Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 19 Mar 2014 13:02:03 +0000 Subject: [PATCH 4/8] [#63] Tweak docs --- ckanext/spatial/interfaces.py | 8 +++++++- doc/harvesters.rst | 2 +- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/ckanext/spatial/interfaces.py b/ckanext/spatial/interfaces.py index faed0b6..a40908d 100644 --- a/ckanext/spatial/interfaces.py +++ b/ckanext/spatial/interfaces.py @@ -15,9 +15,15 @@ class ISpatialHarvester(Interface): unlikely in a particular instance, this method could be implemented by more than one plugin. + If a dict is not returned by this function, the import stage will be + cancelled. + + .. note:: Make sure to run ``model.Session.flush()`` if you perform + queries using the model included in the ``context`` object. + :param context: Contains a reference to the model, eg to - perform DB queries + perform DB queries. :type context: dict :param data_dict: Available data. Contains three keys: diff --git a/doc/harvesters.rst b/doc/harvesters.rst index e05a38a..2ddb9e9 100644 --- a/doc/harvesters.rst +++ b/doc/harvesters.rst @@ -58,7 +58,7 @@ The default harvesters provided in this extension can be extended from extensions implementing the ``ISpatialHarvester`` interface. 
Probably the most useful extension point is ``get_package_dict``, which -allows to tweak the dataset fields before creating or updating them:: +allows to tweak the dataset fields before creating or updating it:: import ckan.plugins as p from ckanext.spatial.interfaces import ISpatialHarvester From e979d08e77fefc0a51cdb706679a931fc15cd605 Mon Sep 17 00:00:00 2001 From: amercader Date: Wed, 30 Apr 2014 18:01:42 +0100 Subject: [PATCH 5/8] [#69] Reindex dataset if harvest object did not change We replace the old harvest object with the new one, and if we don't reindex the reference to the old harvest object will remain in the dataset dict --- ckanext/spatial/harvesters/base.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index 9122b61..80f9abe 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -23,6 +23,7 @@ from ckan import model from ckan.lib.helpers import json from ckan import logic from ckan.lib.navl.validators import not_empty +from ckan.lib.search.index import PackageSearchIndex from ckanext.harvest.harvesters.base import HarvesterBase from ckanext.harvest.model import HarvestObject @@ -560,6 +561,23 @@ class SpatialHarvester(HarvesterBase): # Delete the previous object to avoid cluttering the object table previous_object.delete() + # Reindex the corresponding package to update the reference to the + # harvest object + if harvest_object.package_id: + context.update({'validate': False, 'ignore_auth': True}) + try: + package_dict = logic.get_action('package_show')(context, + {'id': harvest_object.package_id}) + except p.toolkit.ObjectNotFound: + pass + else: + for extra in package_dict.get('extras', []): + if extra['key'] == 'harvest_object_id': + extra['value'] = harvest_object.id + if package_dict: + package_index = PackageSearchIndex() + package_index.index_package(package_dict) + log.info('Document with GUID %s unchanged, 
skipping...' % (harvest_object.guid)) else: package_schema = logic.schema.default_update_package_schema() From 6c55aad2236fb9a829e67b738ee3bf9760a0bba4 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 13 May 2014 18:03:12 +0100 Subject: [PATCH 6/8] [#63] Add extra stuff to the get_package_dict extension point Moved the call to get_site_user higher on base.py so it's available to extensions. Also added the parsed XML etree so it does not need to be parsed from the string again. --- ckanext/spatial/harvesters/base.py | 18 +++++++++++------- ckanext/spatial/interfaces.py | 6 +++++- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index 017d722..f91f42c 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -111,6 +111,8 @@ class SpatialHarvester(HarvesterBase): _user_name = None + _site_user = None + source_config = {} force_import = False @@ -376,6 +378,7 @@ class SpatialHarvester(HarvesterBase): context = { 'model': model, 'session': model.Session, + 'user': self._get_user_name(), } log = logging.getLogger(__name__ + '.import') @@ -402,7 +405,6 @@ class SpatialHarvester(HarvesterBase): # Delete package context.update({ 'ignore_auth': True, - 'user': self._get_user_name(), }) p.toolkit.get_action('package_delete')(context, {'id': harvest_object.package_id}) log.info('Deleted package {0} with guid {1}'.format(harvest_object.package_id, harvest_object.guid)) @@ -445,7 +447,9 @@ class SpatialHarvester(HarvesterBase): # Parse ISO document try: - iso_values = ISODocument(harvest_object.content).read_values() + + iso_parser = ISODocument(harvest_object.content) + iso_values = iso_parser.read_values() except Exception, e: self._save_object_error('Error parsing ISO document for object {0}: {1}'.format(harvest_object.id, str(e)), harvest_object, 'Import') @@ -498,6 +502,7 @@ class SpatialHarvester(HarvesterBase): package_dict = 
harvester.get_package_dict(context, { 'package_dict': package_dict, 'iso_values': iso_values, + 'xml_tree': iso_parser.xml_tree, 'harvest_object': harvest_object, }) if not package_dict: @@ -506,12 +511,11 @@ class SpatialHarvester(HarvesterBase): # Create / update the package context.update({ - 'user': self._get_user_name(), - 'extras_as_string': True, - 'api_version': '2', - 'return_id_only': True}) + 'extras_as_string': True, + 'api_version': '2', + 'return_id_only': True}) - if context['user'] == self._site_user['name']: + if self._site_user and context['user'] == self._site_user['name']: context['ignore_auth'] = True diff --git a/ckanext/spatial/interfaces.py b/ckanext/spatial/interfaces.py index a40908d..375a664 100644 --- a/ckanext/spatial/interfaces.py +++ b/ckanext/spatial/interfaces.py @@ -23,7 +23,8 @@ class ISpatialHarvester(Interface): :param context: Contains a reference to the model, eg to - perform DB queries. + perform DB queries, and the user name used for + authorization. :type context: dict :param data_dict: Available data. Contains three keys: @@ -33,6 +34,9 @@ class ISpatialHarvester(Interface): * `iso_values` The parsed ISO XML document values. These contain more fields that are not added by default to the ``package_dict``. + * `xml_tree` + The full XML etree object. If some values not present in + ``iso_values`` are needed, these can be extracted via xpath. 
* `harvest_object` A ``HarvestObject`` domain object which contains a reference to the original metadata document (``harvest_object.content``) From dbf139e732c87c3558825c049d07d4d63b2603da Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 13 May 2014 18:07:14 +0100 Subject: [PATCH 7/8] [#63] Added extension point for defining custom validators --- ckanext/spatial/harvesters/base.py | 9 +++ ckanext/spatial/interfaces.py | 14 ++++ doc/harvesters.rst | 123 ++++++++++++++++++++++++++++- 3 files changed, 145 insertions(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index f91f42c..5fe9fdd 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -660,6 +660,15 @@ class SpatialHarvester(HarvesterBase): else: profiles = DEFAULT_VALIDATOR_PROFILES self._validator = Validators(profiles=profiles) + + # Add any custom validators from extensions + for plugin_with_validators in p.PluginImplementations(ISpatialHarvester): + custom_validators = plugin_with_validators.get_validators() + for custom_validator in custom_validators: + if custom_validator not in all_validators: + self._validator.add_validator(custom_validator) + + return self._validator def _get_user_name(self): diff --git a/ckanext/spatial/interfaces.py b/ckanext/spatial/interfaces.py index 375a664..904c4f6 100644 --- a/ckanext/spatial/interfaces.py +++ b/ckanext/spatial/interfaces.py @@ -50,6 +50,20 @@ class ISpatialHarvester(Interface): ''' return data_dict['package_dict'] + def get_validators(self): + ''' + Allows to register custom Validators that can be applied to harvested + metadata documents. + + Validators are classes that implement the ``is_valid`` method. Check + the `Writing custom validators`_ section in the docs to know more + about writing custom validators. 
+ + :returns: A list of Validator classes + :rtype: list + ''' + return [] + def transform_to_iso(self, original_document, original_format, harvest_object): ''' Transforms an XML document to ISO 19139 diff --git a/doc/harvesters.rst b/doc/harvesters.rst index 2ddb9e9..4f24c27 100644 --- a/doc/harvesters.rst +++ b/doc/harvesters.rst @@ -31,7 +31,8 @@ separate stages: content into a CKAN dataset: validates the document, parses it, converts it to a CKAN dataset dict and saves it in the database. -The extension provides different XSD and schematron based validators. You can +The extension provides different XSD and schematron based validators, and you +can also write your own (see `Writing custom validators`_). You can specify which validators to use for the remote documents with the following configuration option:: @@ -84,7 +85,32 @@ allows to tweak the dataset fields before creating or updating it:: return package_dict +``get_validators`` allows to register custom validation classes that can be +applied to the harvested documents. Check the `Writing custom validators`_ +section to know more about how to write your custom validators:: + import ckan.plugins as p + from ckanext.spatial.interfaces import ISpatialHarvester + from ckanext.spatial.validation.validation import BaseValidator + + class MyPlugin(p.SingletonPlugin): + + p.implements(ISpatialHarvester, inherit=True) + + def get_validators(self): + return [MyValidator] + + + class MyValidator(BaseValidator): + + name = 'my-validator' + + title= 'My very own validator' + + @classmethod + def is_valid(cls, xml): + + return True, [] ``transform_to_iso`` allows to hook into transformation mechanisms to @@ -106,6 +132,101 @@ The `ckanext-geodatagov`_ extension contains live examples on how to extend the default spatial harvesters and create new ones for other spatial services like ArcGIS REST APIs. +Writing custom validators +------------------------- + + +Validator classes extend the ``BaseValidator`` class: + +.. 
autoclass:: ckanext.spatial.validation.validation.BaseValidator + :members: + +Helper classes are provided for XSD and schematron based validation, and +completely custom logic can be also implemented. Here are some examples of +the most common types: + +* XSD based validators:: + + class ISO19139NGDCSchema(XsdValidator): + ''' + XSD based validation for ISO 19139 documents. + + Uses XSD schema from the NOAA National Geophysical Data Center: + + http://ngdc.noaa.gov/metadata/published/xsd/ + + ''' + name = 'iso19139ngdc' + title = 'ISO19139 XSD Schema (NGDC)' + + @classmethod + def is_valid(cls, xml): + xsd_path = 'xml/iso19139ngdc' + + xsd_filepath = os.path.join(os.path.dirname(__file__), + xsd_path, 'schema.xsd') + return cls._is_valid(xml, xsd_filepath, 'NGDC Schema (schema.xsd)') + + + +* Schematron validators:: + + class Gemini2Schematron(SchematronValidator): + name = 'gemini2' + title = 'GEMINI 2.1 Schematron 1.2' + + @classmethod + def get_schematrons(cls): + with resource_stream("ckanext.spatial", + "validation/xml/gemini2/gemini2-schematron-20110906-v1.2.sch") as schema: + return [cls.schematron(schema)] + + +* Custom validators:: + + class MinimalFGDCValidator(BaseValidator): + + name = 'fgdc_minimal' + title = 'FGDC Minimal Validation' + + _elements = [ + ('Identification Citation Title', '/metadata/idinfo/citation/citeinfo/title'), + ('Identification Citation Originator', '/metadata/idinfo/citation/citeinfo/origin'), + ('Identification Citation Publication Date', '/metadata/idinfo/citation/citeinfo/pubdate'), + ('Identification Description Abstract', '/metadata/idinfo/descript/abstract'), + ('Identification Spatial Domain West Bounding Coordinate', '/metadata/idinfo/spdom/bounding/westbc'), + ('Identification Spatial Domain East Bounding Coordinate', '/metadata/idinfo/spdom/bounding/eastbc'), + ('Identification Spatial Domain North Bounding Coordinate', '/metadata/idinfo/spdom/bounding/northbc'), + ('Identification Spatial Domain South Bounding 
Coordinate', '/metadata/idinfo/spdom/bounding/southbc'), + ('Metadata Reference Information Contact Address Type', '/metadata/metainfo/metc/cntinfo/cntaddr/addrtype'), + ('Metadata Reference Information Contact Address State', '/metadata/metainfo/metc/cntinfo/cntaddr/state'), + ] + + @classmethod + def is_valid(cls, xml): + + errors = [] + + for title, xpath in cls._elements: + element = xml.xpath(xpath) + if len(element) == 0 or not element[0].text: + errors.append(('Element not found: {0}'.format(title), None)) + if len(errors): + return False, errors + + return True, [] + + +The `validation.py`_ file included in the ckanext-spatial extension contains +more examples of the different types. + +Remember that after registering your own validators you must specify them on +the following configuration option:: + + ckan.spatial.validator.profiles = iso19139eden,my-validator + + +.. _validation.py: https://github.com/ckan/ckanext-spatial/blob/master/ckanext/spatial/validation/validation.py Harvest Metadata API -------------------- From b428c33ff609a1ae2dc3417d2400b8dc9efe0dc9 Mon Sep 17 00:00:00 2001 From: amercader Date: Tue, 10 Jun 2014 18:08:38 +0100 Subject: [PATCH 8/8] [#69] Add config option to keep old behaviour (not reindex) --- ckanext/spatial/harvesters/base.py | 4 +++- doc/harvesters.rst | 10 ++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/ckanext/spatial/harvesters/base.py b/ckanext/spatial/harvesters/base.py index 5fe9fdd..7b2a993 100644 --- a/ckanext/spatial/harvesters/base.py +++ b/ckanext/spatial/harvesters/base.py @@ -568,7 +568,9 @@ class SpatialHarvester(HarvesterBase): # Reindex the corresponding package to update the reference to the # harvest object - if harvest_object.package_id: + if ((config.get('ckanext.spatial.harvest.reindex_unchanged', True) != 'False' + or self.source_config.get('reindex_unchanged') != 'False') + and harvest_object.package_id): context.update({'validate': False, 'ignore_auth': True}) try: 
package_dict = logic.get_action('package_show')(context, diff --git a/doc/harvesters.rst b/doc/harvesters.rst index 4f24c27..99333bf 100644 --- a/doc/harvesters.rst +++ b/doc/harvesters.rst @@ -52,6 +52,16 @@ hardcoded 'harvest' user:: ckanext.spatial.harvest.user_name = harvest +When a document has not been updated remotely, the previous harvest object is +replaced by the current one rather than keeping it, to avoid cluttering the +``harvest_object`` table. This means that the ``harvest_object_id`` reference +on the linked dataset needs to be updated, by reindexing it. This will happen +by default, but if you want to turn it off (eg if you are doing separate +reindexing) it can be turned off with the following option:: + + ckanext.spatial.harvest.reindex_unchanged = False + + Customizing the harvesters --------------------------