spatial-d4science/ckanext/spatial/tests/test_harvest.py

1144 lines
45 KiB
Python
Raw Normal View History

2019-12-11 13:23:03 +01:00
from __future__ import absolute_import
from builtins import str
from builtins import object
import os
from datetime import datetime, date
import lxml
import json
from uuid import uuid4
from nose.plugins.skip import SkipTest
from nose.tools import assert_equal, assert_in, assert_raises
from ckan.lib.base import config
from ckan import model
from ckan.model import Session, Package, Group, User
from ckan.logic.schema import default_update_package_schema, default_create_package_schema
from ckan.logic import get_action
try:
from ckan.new_tests.helpers import call_action
except ImportError:
from ckan.tests.helpers import call_action
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
from ckanext.spatial.validation import Validators
from ckanext.spatial.harvesters.gemini import (GeminiDocHarvester,
GeminiWafHarvester,
GeminiHarvester)
from ckanext.spatial.harvesters.base import SpatialHarvester
from ckanext.spatial.tests.base import SpatialTestBase
2019-12-11 13:23:03 +01:00
from .xml_file_server import serve
# Start a simple HTTP server that serves the XML test files.
# This call runs at import time, before any test class is set up.
# NOTE(review): the tests below fetch fixtures from http://127.0.0.1:8999/,
# presumably the address this server listens on - confirm in xml_file_server.
serve()
class HarvestFixtureBase(SpatialTestBase):
    """Shared fixture for the harvester tests.

    Before each test a ``harvest`` sysadmin user and a default action context
    are created; after each test the database is rebuilt. Helper methods
    create harvest sources and jobs and run a single-document harvest through
    the gather, fetch and import stages.
    """

    def setup(self):
        """Create the ``harvest`` sysadmin user and store ``self.context``."""
        # Add sysadmin user
        harvest_user = model.User(name=u'harvest', password=u'test', sysadmin=True)
        Session.add(harvest_user)
        Session.commit()

        package_schema = default_update_package_schema()
        self.context = {
            'model': model,
            'session': Session,
            'user': u'harvest',
            'schema': package_schema,
            'api_version': '2',
        }

    def teardown(self):
        """Rebuild the database so every test starts from a clean state."""
        model.repo.rebuild_db()

    def _create_job(self, source_id):
        """Create a harvest job for ``source_id`` and return the HarvestJob."""
        context = {
            'model': model,
            'session': Session,
            'user': u'harvest',
        }
        job_dict = get_action('harvest_job_create')(context, {'source_id': source_id})
        job = HarvestJob.get(job_dict['id'])
        assert job
        return job

    def _create_source_and_job(self, source_fixture):
        """Create a harvest source from ``source_fixture`` plus a job for it.

        Note that ``source_fixture`` is modified in place when the publisher
        auth profile is active and no ``publisher_id`` was supplied.

        Returns a ``(source, job)`` tuple.
        """
        context = {
            'model': model,
            'session': Session,
            'user': u'harvest',
        }

        if (config.get('ckan.harvest.auth.profile') == u'publisher'
                and 'publisher_id' not in source_fixture):
            source_fixture['publisher_id'] = self.publisher.id

        source_dict = get_action('harvest_source_create')(context, source_fixture)
        source = HarvestSource.get(source_dict['id'])
        assert source

        job = self._create_job(source.id)
        return source, job

    def _run_job_for_single_document(self, job, force_import=False,
                                     expect_gather_errors=False,
                                     expect_obj_errors=False):
        """Run gather, fetch and import for a single-document job.

        ``force_import`` is copied onto the harvester before gathering. The
        ``expect_*`` flags select whether gather/object errors must be present
        or absent. The job is marked as finished and the imported
        HarvestObject is returned.
        """
        harvester = GeminiDocHarvester()
        harvester.force_import = force_import

        object_ids = harvester.gather_stage(job)
        # Two separate assertions: ``assert a, b`` would only use ``b`` as the
        # failure message and never check it.
        assert object_ids
        assert len(object_ids) == 1

        if expect_gather_errors:
            assert len(job.gather_errors) > 0
        else:
            assert len(job.gather_errors) == 0

        assert harvester.fetch_stage(object_ids) == True

        obj = HarvestObject.get(object_ids[0])
        # The content is only used as the message shown if the object is missing
        assert obj, obj.content

        harvester.import_stage(obj)
        Session.refresh(obj)
        if expect_obj_errors:
            assert len(obj.errors) > 0
        else:
            assert len(obj.errors) == 0

        job.status = u'Finished'
        job.save()

        return obj
class TestHarvest(HarvestFixtureBase):
@classmethod
def setup_class(cls):
    """Install a validator limited to the ``gemini2`` profile, then run the base class setup."""
    gemini_validator = Validators(profiles=['gemini2'])
    SpatialHarvester._validator = gemini_validator
    HarvestFixtureBase.setup_class()
def clean_tags(self, tags):
    """Return ``tags`` reduced to dicts that hold only each tag's ``name``."""
    names_only = []
    for tag in tags:
        names_only.append({u'name': tag['name']})
    return names_only
def find_extra(self, pkg, key):
    """Return the value of extra ``key`` in ``pkg``.

    ``None`` is returned unless exactly one extra carries that key.
    """
    matches = []
    for extra in pkg['extras']:
        if extra['key'] == key:
            matches.append(extra['value'])
    if len(matches) != 1:
        return None
    return matches[0]
def test_harvest_basic(self):
    """Harvest a WAF source and check that two packages are created."""
    # Create source
    waf_source = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1-waf/index.html',
        'source_type': u'gemini-waf'
    }
    _source, harvest_job = self._create_source_and_job(waf_source)

    waf_harvester = GeminiWafHarvester()

    # The gather stage needs the actual job object, not its dict form
    gathered_ids = waf_harvester.gather_stage(harvest_job)
    assert len(gathered_ids) == 2

    # Fetch stage always returns True for Waf harvesters
    assert waf_harvester.fetch_stage(gathered_ids) == True

    harvested = []
    for gathered_id in gathered_ids:
        harvest_object = HarvestObject.get(gathered_id)
        assert harvest_object
        harvested.append(harvest_object)
        waf_harvester.import_stage(harvest_object)

    # Only count real datasets, not the packages backing harvest sources
    dataset_query = Session.query(Package).filter(Package.type != u'harvest')
    datasets = dataset_query.all()
    assert_equal(len(datasets), 2)

    dataset_ids = [dataset.id for dataset in datasets]
    for harvest_object in harvested:
        assert harvest_object.current == True
        assert harvest_object.package_id in dataset_ids
def test_harvest_fields_service(self):
    """Harvest a single service document and check the resulting package.

    Verifies the core package fields, the extras and the first resource.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)
    # Checked separately: ``assert a, b`` would only use ``b`` as the message
    assert object_ids
    assert len(object_ids) == 1

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    assert obj, obj.content
    assert obj.guid == u'test-service-1'

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action('package_show')(self.context, {'id': obj.package_id})

    assert package_dict

    expected = {
        'name': u'one-scotland-address-gazetteer-web-map-service-wms',
        'title': u'One Scotland Address Gazetteer Web Map Service (WMS)',
        'tags': [{u'name': u'Addresses'}, {u'name': u'Scottish National Gazetteer'}],
        'notes': u'This service displays its contents at larger scale than 1:10000. [edited]',
    }

    package_dict['tags'] = self.clean_tags(package_dict['tags'])

    for key, value in expected.items():
        if package_dict[key] != value:
            raise AssertionError('Unexpected value for %s: %s (was expecting %s)' %
                                 (key, package_dict[key], value))

    if config.get('ckan.harvest.auth.profile') == u'publisher':
        assert package_dict['groups'] == [self.publisher.id]

    expected_extras = {
        # Basic
        'guid': obj.guid,
        'UKLP': u'True',
        'resource-type': u'service',
        'access_constraints': u'["No restriction on public access"]',
        'responsible-party': u'The Improvement Service (owner)',
        'provider':u'The Improvement Service',
        'contact-email': u'OSGCM@improvementservice.org.uk',
        # Spatial
        'bbox-east-long': u'0.5242365625',
        'bbox-north-lat': u'61.0243',
        'bbox-south-lat': u'54.4764484375',
        'bbox-west-long': u'-9.099786875',
        'spatial': u'{"type": "Polygon", "coordinates": [[[0.5242365625, 54.4764484375], [-9.099786875, 54.4764484375], [-9.099786875, 61.0243], [0.5242365625, 61.0243], [0.5242365625, 54.4764484375]]]}',
        # Other
        'coupled-resource': u'[{"href": ["http://scotgovsdi.edina.ac.uk/srv/en/csw?service=CSW&request=GetRecordById&version=2.0.2&outputSchema=http://www.isotc211.org/2005/gmd&elementSetName=full&id=250ea276-48e2-4189-8a89-fcc4ca92d652"], "uuid": ["250ea276-48e2-4189-8a89-fcc4ca92d652"], "title": []}]',
        'dataset-reference-date': u'[{"type": "publication", "value": "2011-09-08"}]',
        'frequency-of-update': u'daily',
        'licence': u'["Use of the One Scotland Gazetteer data used by this this service is available to any organisation that is a member of the One Scotland Mapping Agreement. It is not currently commercially available", "http://www.test.gov.uk/licenseurl"]',
        'licence_url': u'http://www.test.gov.uk/licenseurl',
        'metadata-date': u'2011-09-08T16:07:32',
        'metadata-language': u'eng',
        'spatial-data-service-type': u'other',
        'spatial-reference-system': u'OSGB 1936 / British National Grid (EPSG:27700)',
        'temporal_coverage-from': u'["1904-06-16"]',
        'temporal_coverage-to': u'["2004-06-16"]',
    }

    for key, value in expected_extras.items():
        extra_value = self.find_extra(package_dict, key)
        if extra_value is None:
            raise AssertionError('Extra %s not present in package' % key)

        if extra_value != value:
            # Report the value that was found; ``package_dict['extras']`` is a
            # list of dicts and cannot be indexed by key.
            raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' %
                                 (key, extra_value, value))

    expected_resource = {
        'ckan_recommended_wms_preview': 'True',
        'description': 'Link to the GetCapabilities request for this service',
        'name': 'Web Map Service (WMS)',
        'resource_locator_function': 'download',
        'resource_locator_protocol': 'OGC:WMS-1.3.0-http-get-capabilities',
        'url': u'http://127.0.0.1:8999/wms/capabilities.xml',
        'verified': 'True',
    }

    resource = package_dict['resources'][0]
    for key, value in expected_resource.items():
        if key not in resource:
            raise AssertionError('Expected key not in resource: %s' % (key))
        if resource[key] != value:
            raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' %
                                 (key, resource[key], value))
    assert datetime.strptime(resource['verified_date'], '%Y-%m-%dT%H:%M:%S.%f').date() == date.today()
    assert resource['format'].lower() == 'wms'
def test_harvest_fields_dataset(self):
    """Harvest a single dataset document and check the resulting package.

    Verifies the core package fields, the extras and the first resource.
    """
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)
    # Checked separately: ``assert a, b`` would only use ``b`` as the message
    assert object_ids
    assert len(object_ids) == 1

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    assert obj, obj.content
    assert obj.guid == u'test-dataset-1'

    harvester.import_stage(obj)

    # No object errors
    assert len(obj.errors) == 0

    package_dict = get_action('package_show')(self.context, {'id': obj.package_id})

    assert package_dict

    expected = {
        'name': u'country-parks-scotland',
        'title': u'Country Parks (Scotland)',
        'tags': [{u'name': u'Nature conservation'}],
        'notes': u'Parks are set up by Local Authorities to provide open-air recreation facilities close to towns and cities. [edited]'
    }

    package_dict['tags'] = self.clean_tags(package_dict['tags'])

    for key, value in expected.items():
        if package_dict[key] != value:
            raise AssertionError('Unexpected value for %s: %s (was expecting %s)' %
                                 (key, package_dict[key], value))

    if config.get('ckan.harvest.auth.profile') == u'publisher':
        assert package_dict['groups'] == [self.publisher.id]

    expected_extras = {
        # Basic
        'guid': obj.guid,
        'resource-type': u'dataset',
        'responsible-party': u'Scottish Natural Heritage (custodian, distributor)',
        'access_constraints': u'["Copyright Scottish Natural Heritage"]',
        'contact-email': u'data_supply@snh.gov.uk',
        'provider':'',
        # Spatial
        'bbox-east-long': u'0.205857204',
        'bbox-north-lat': u'61.06066944',
        'bbox-south-lat': u'54.529947158',
        'bbox-west-long': u'-8.97114288',
        'spatial': u'{"type": "Polygon", "coordinates": [[[0.205857204, 54.529947158], [-8.97114288, 54.529947158], [-8.97114288, 61.06066944], [0.205857204, 61.06066944], [0.205857204, 54.529947158]]]}',
        # Other
        'coupled-resource': u'[]',
        'dataset-reference-date': u'[{"type": "creation", "value": "2004-02"}, {"type": "revision", "value": "2006-07-03"}]',
        'frequency-of-update': u'irregular',
        'licence': u'["Reference and PSMA Only", "http://www.test.gov.uk/licenseurl"]',
        'licence_url': u'http://www.test.gov.uk/licenseurl',
        'metadata-date': u'2011-09-23T10:06:08',
        'metadata-language': u'eng',
        'spatial-reference-system': u'urn:ogc:def:crs:EPSG::27700',
        'temporal_coverage-from': u'["1998"]',
        'temporal_coverage-to': u'["2010"]',
    }

    for key, value in expected_extras.items():
        extra_value = self.find_extra(package_dict, key)
        if extra_value is None:
            raise AssertionError('Extra %s not present in package' % key)

        if extra_value != value:
            # Report the value that was found; ``package_dict['extras']`` is a
            # list of dicts and cannot be indexed by key.
            raise AssertionError('Unexpected value for extra %s: %s (was expecting %s)' %
                                 (key, extra_value, value))

    expected_resource = {
        'description': 'Test Resource Description',
        'format': u'',
        'name': 'Test Resource Name',
        'resource_locator_function': 'download',
        'resource_locator_protocol': 'test-protocol',
        'url': u'https://gateway.snh.gov.uk/pls/apex_ddtdb2/f?p=101',
    }

    resource = package_dict['resources'][0]
    for key, value in expected_resource.items():
        if resource[key] != value:
            raise AssertionError('Unexpected value in resource for %s: %s (was expecting %s)' %
                                 (key, resource[key], value))
def test_harvest_error_bad_xml(self):
    """A malformed XML document must produce exactly one gather error."""
    # Create source
    broken_source = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/error_bad_xml.xml',
        'source_type': u'gemini-single'
    }
    _source, harvest_job = self._create_source_and_job(broken_source)

    doc_harvester = GeminiDocHarvester()

    try:
        gathered = doc_harvester.gather_stage(harvest_job)
    except lxml.etree.XMLSyntaxError:
        # this only occurs in debug_exception_mode
        pass
    else:
        assert gathered is None

    # Check gather errors
    assert len(harvest_job.gather_errors) == 1
    first_error = harvest_job.gather_errors[0]
    assert first_error.harvest_job_id == harvest_job.id
    assert 'Error parsing the document' in first_error.message
def test_harvest_error_404(self):
    """A source URL that cannot be fetched must produce one gather error."""
    # Create source
    missing_source = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/not_there.xml',
        'source_type': u'gemini-single'
    }
    _source, harvest_job = self._create_source_and_job(missing_source)

    doc_harvester = GeminiDocHarvester()
    gathered = doc_harvester.gather_stage(harvest_job)
    assert gathered is None

    # Check gather errors
    assert len(harvest_job.gather_errors) == 1
    first_error = harvest_job.gather_errors[0]
    assert first_error.harvest_job_id == harvest_job.id
    assert 'Unable to get content for URL' in first_error.message
def test_harvest_error_validation(self):
    """A document failing validation is imported but records one object error."""
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/error_validation.xml',
        'source_type': u'gemini-single'
    }

    source, job = self._create_source_and_job(source_fixture)

    harvester = GeminiDocHarvester()

    object_ids = harvester.gather_stage(job)

    # Right now the import process goes ahead even with validation errors.
    # Checked separately: ``assert a, b`` would only use ``b`` as the message.
    assert object_ids
    assert len(object_ids) == 1

    # No gather errors
    assert len(job.gather_errors) == 0

    # Fetch stage always returns True for Single Doc harvesters
    assert harvester.fetch_stage(object_ids) == True

    obj = HarvestObject.get(object_ids[0])
    assert obj, obj.content
    assert obj.guid == u'test-error-validation-1'

    harvester.import_stage(obj)

    # Check errors
    assert len(obj.errors) == 1
    assert obj.errors[0].harvest_object_id == obj.id

    message = obj.errors[0].message

    assert_in('One email address shall be provided', message)
    assert_in('Service type shall be one of \'discovery\', \'view\', \'download\', \'transformation\', \'invoke\' or \'other\' following INSPIRE generic names', message)
    assert_in('Limitations on public access code list value shall be \'otherRestrictions\'', message)
    assert_in('One organisation name shall be provided', message)
def test_harvest_update_records(self):
    """Re-harvesting an unchanged document only updates the package when forced."""
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }

    source, first_job = self._create_source_and_job(source_fixture)

    first_obj = self._run_job_for_single_document(first_job)

    first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_obj.current == True
    assert first_obj.package

    # Create and run a second job, the package should not be updated
    second_job = self._create_job(source.id)

    second_obj = self._run_job_for_single_document(second_job)

    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)

    second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was not updated.
    # Each condition is its own assertion: in ``assert a, b`` the second
    # expression is only the failure message and is never checked.
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True

    # Create and run a third job, forcing the importing to simulate an update in the package
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)

    # For some reason first_obj does not get updated after the import_stage,
    # and we have to force a refresh to get the actual DB values.
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)

    third_package_dict = get_action('package_show')(self.context, {'id': third_obj.package_id})

    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_deleted_record(self):
    """A deleted package stays deleted until a newer document is harvested."""
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }

    source, first_job = self._create_source_and_job(source_fixture)

    first_obj = self._run_job_for_single_document(first_job)

    first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True

    # Delete package
    first_package_dict['state'] = u'deleted'
    self.context.update({'id': first_package_dict['id']})
    updated_package_dict = get_action('package_update')(self.context, first_package_dict)

    # Create and run a second job, the date has not changed, so the package should not be updated
    # and remain deleted
    first_job.status = u'Finished'
    first_job.save()
    second_job = self._create_job(source.id)

    second_obj = self._run_job_for_single_document(second_job)

    second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was not updated.
    # Each condition is its own assertion: in ``assert a, b`` the second
    # expression is only the failure message and is never checked.
    assert second_package_dict
    assert updated_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True

    # Harvest an updated document, with a more recent modified date, package should be
    # updated and reactivated
    source.url = u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml'
    source.save()

    third_job = self._create_job(source.id)

    third_obj = self._run_job_for_single_document(third_job)

    third_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)

    # Package was updated
    assert third_package_dict
    assert third_package_dict['id'] == second_package_dict['id']
    assert third_obj.package
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False

    assert 'NEWER' in third_package_dict['title']
    assert third_package_dict['state'] == u'active'
def test_harvest_different_sources_same_document(self):
    """The same document from a second source only updates the package when forced."""
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
        'source_type': u'gemini-single'
    }

    source1, first_job = self._create_source_and_job(source1_fixture)

    first_obj = self._run_job_for_single_document(first_job)

    first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True

    # Harvest the same document, unchanged, from another source, the package
    # is not updated.
    # (As of https://github.com/okfn/ckanext-inspire/commit/9fb67
    # we are no longer throwing an exception when this happens)
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
        'source_type': u'gemini-single'
    }

    source2, second_job = self._create_source_and_job(source2_fixture)

    second_obj = self._run_job_for_single_document(second_job)

    second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was not updated.
    # Each condition is its own assertion: in ``assert a, b`` the second
    # expression is only the failure message and is never checked.
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert not second_obj.package_id
    assert second_obj.current == False
    assert first_obj.current == True

    # Inactivate source1 and reharvest from source2, package should be updated
    third_job = self._create_job(source2.id)
    third_obj = self._run_job_for_single_document(third_job, force_import=True)

    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)

    third_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was updated
    assert third_package_dict
    assert first_package_dict['id'] == third_package_dict['id']
    assert third_obj.package
    assert third_obj.package_id == first_package_dict['id']
    assert third_obj.current == True
    assert second_obj.current == False
    assert first_obj.current == False
def test_harvest_different_sources_same_document_but_deleted_inbetween(self):
    """A deleted package is not revived by the same document from another source."""
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/source1/same_dataset.xml',
        'source_type': u'gemini-single'
    }

    source1, first_job = self._create_source_and_job(source1_fixture)

    first_obj = self._run_job_for_single_document(first_job)

    first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True

    # Delete/withdraw the package, then re-read it. The return value of
    # package_delete is not needed.
    get_action('package_delete')(self.context, {'id': first_obj.package_id})
    first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Harvest the same document, unchanged, from another source
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/source2/same_dataset.xml',
        'source_type': u'gemini-single'
    }

    source2, second_job = self._create_source_and_job(source2_fixture)

    second_obj = self._run_job_for_single_document(second_job)

    second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # It would be good if the package was updated, but we see that it isn't.
    # Checked separately: ``assert a, b`` would only use ``b`` as the message.
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert not second_obj.package
    assert second_obj.current == False
    assert first_obj.current == True
def test_harvest_moves_sources(self):
    """A newer document with the same GUID from another source yields a second package."""
    # Create source1
    source1_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1.xml',
        'source_type': u'gemini-single'
    }

    source1, first_job = self._create_source_and_job(source1_fixture)

    first_obj = self._run_job_for_single_document(first_job)

    first_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert first_package_dict
    assert first_package_dict['state'] == u'active'
    assert first_obj.current == True

    # Harvest the same document GUID but with a newer date, from another source.
    source2_fixture = {
        'title': 'Test Source 2',
        'name': 'test-source-2',
        'url': u'http://127.0.0.1:8999/gemini2.1/service1_newer.xml',
        'source_type': u'gemini-single'
    }

    source2, second_job = self._create_source_and_job(source2_fixture)

    second_obj = self._run_job_for_single_document(second_job)

    second_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Now we have two packages.
    # Checked separately: ``assert a, b`` would only use ``b`` as the message.
    assert second_package_dict
    assert first_package_dict['id'] == second_package_dict['id']
    assert second_obj.package
    assert second_obj.current == True
    assert first_obj.current == True
    # so currently, if you move a Gemini between harvest sources you need
    # to update the date to get it to reharvest, and then you should
    # withdraw the package relating to the original harvest source.
def test_harvest_import_command(self):
    """Running ``harvest_objects_import`` keeps the first object as the current one."""
    # Create source
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single'
    }

    source, first_job = self._create_source_and_job(source_fixture)

    first_obj = self._run_job_for_single_document(first_job)

    before_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was created
    assert before_package_dict
    assert first_obj.current == True
    assert first_obj.package

    # Create and run two more jobs, the package should not be updated
    second_job = self._create_job(source.id)
    second_obj = self._run_job_for_single_document(second_job)
    third_job = self._create_job(source.id)
    third_obj = self._run_job_for_single_document(third_job)

    # Run the import command manually (its return value is not inspected)
    get_action('harvest_objects_import')(self.context, {'source_id': source.id})
    Session.remove()
    Session.add(first_obj)
    Session.add(second_obj)
    Session.add(third_obj)

    Session.refresh(first_obj)
    Session.refresh(second_obj)
    Session.refresh(third_obj)

    after_package_dict = get_action('package_show')(self.context, {'id': first_obj.package_id})

    # Package was updated, and the current object remains the same.
    # Checked separately: ``assert a, b`` would only use ``b`` as the message.
    assert after_package_dict
    assert before_package_dict['id'] == after_package_dict['id']
    assert third_obj.current == False
    assert second_obj.current == False
    assert first_obj.current == True

    source_dict = get_action('harvest_source_show')(self.context, {'id': source.id})
    assert source_dict['status']['total_datasets'] == 1
def test_clean_tags(self):
    """Check tag handling in ``get_package_dict`` with ``clean_tags`` off and on.

    With ``clean_tags`` disabled, invalid characters are expected to survive
    while over-long tags are shortened to 100 characters. With it enabled,
    the cleaned form of the tag is expected instead.
    """
    # Create source
    # Standard strftime directives; the previous pattern
    # '%YYYY-%MM-%DD %HH:%MM:%s' did not produce a valid timestamp.
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    source_fixture = {
        'title': 'Test Source',
        'name': 'test-source',
        'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
        'source_type': u'gemini-single',
        'owner_org': 'test-org',
        'metadata_created': timestamp,
        'metadata_modified': timestamp,
    }

    user = User.get('dummy')
    if not user:
        # call_action returns a dict, whereas User.get returns a model object
        user = call_action('user_create',
                           name='dummy',
                           password='dummybummy',
                           email='dummy@dummy.com')
        user_name = user['name']
    else:
        user_name = user.name
    org = Group.by_name('test-org')
    if org is None:
        org = call_action('organization_create',
                          context={'user': user_name},
                          name='test-org')
    existing_g = Group.by_name('existing-group')
    if existing_g is None:
        existing_g = call_action('group_create',
                                 context={'user': user_name},
                                 name='existing-group')

    context = {'user': 'dummy'}
    package_schema = default_update_package_schema()
    context['schema'] = package_schema
    package_dict = {'frequency': 'manual',
                    'publisher_name': 'dummy',
                    'extras': [{'key':'theme', 'value':['non-mappable', 'thememap1']}],
                    'groups': [],
                    'title': 'fakename',
                    'holder_name': 'dummy',
                    'holder_identifier': 'dummy',
                    'name': 'fakename',
                    'notes': 'dummy',
                    'owner_org': 'test-org',
                    'modified': datetime.now(),
                    'publisher_identifier': 'dummy',
                    'metadata_created' : datetime.now(),
                    'metadata_modified' : datetime.now(),
                    'guid': str(uuid4()),
                    'identifier': 'dummy'}

    package_data = call_action('package_create', context=context, **package_dict)

    package = Package.get('fakename')
    source, job = self._create_source_and_job(source_fixture)
    job.package = package
    job.guid = uuid4()
    harvester = SpatialHarvester()
    # NOTE(review): this path is resolved against the current working
    # directory, so the test depends on where it is run from.
    with open(os.path.join('..', 'data', 'dataset.json')) as f:
        dataset = json.load(f)

    # long tags are invalid in all cases
    TAG_LONG_INVALID = 'abcdefghij' * 20
    # 50-character prefix, only used for the sanity checks below
    TAG_LONG_VALID = TAG_LONG_INVALID[:50]
    # default truncate to 100
    TAG_LONG_VALID_LONG = TAG_LONG_INVALID[:100]

    assert len(TAG_LONG_VALID) == 50
    assert TAG_LONG_VALID[-1] == 'j'
    TAG_CHARS_INVALID = 'Pretty-inv@lid.tag!'
    TAG_CHARS_VALID = 'pretty-invlidtag'

    dataset['tags'].append(TAG_LONG_INVALID)
    dataset['tags'].append(TAG_CHARS_INVALID)

    harvester.source_config = {'clean_tags': False}
    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']

    # no clean tags, so invalid chars are in
    # but tags are truncated to 100 chars
    assert {'name': TAG_CHARS_VALID} not in tags
    assert {'name': TAG_CHARS_INVALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
    assert {'name': TAG_LONG_INVALID} not in tags

    harvester.source_config = {'clean_tags': True}

    out = harvester.get_package_dict(dataset, job)
    tags = out['tags']
    assert {'name': TAG_CHARS_VALID} in tags
    assert {'name': TAG_LONG_VALID_LONG} in tags
# Small Gemini/ISO metadata document containing only a file identifier and a
# hierarchy level; passed to get_gemini_string_and_guid in TestGatherMethods.
BASIC_GEMINI = '''<gmd:MD_Metadata xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:gco="http://www.isotc211.org/2005/gco">
<gmd:fileIdentifier xmlns:gml="http://www.opengis.net/gml">
<gco:CharacterString>e269743a-cfda-4632-a939-0c8416ae801e</gco:CharacterString>
</gmd:fileIdentifier>
<gmd:hierarchyLevel>
<gmd:MD_ScopeCode codeList="http://standards.iso.org/ittf/PubliclyAvailableStandards/ISO_19139_Schemas/resources/Codelist/gmxCodelists.xml#MD_ScopeCode" codeListValue="service">service</gmd:MD_ScopeCode>
</gmd:hierarchyLevel>
</gmd:MD_Metadata>'''
# The file identifier embedded in BASIC_GEMINI
GUID = 'e269743a-cfda-4632-a939-0c8416ae801e'
# Empty metadata root element, i.e. a document without any file identifier
GEMINI_MISSING_GUID = '''<gmd:MD_Metadata xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:gco="http://www.isotc211.org/2005/gco"/>'''
class TestGatherMethods(HarvestFixtureBase):
    """Tests for ``GeminiHarvester.get_gemini_string_and_guid``."""

    def setup(self):
        """Run the base setup, then bind a new harvest job to a ``GeminiHarvester``."""
        HarvestFixtureBase.setup(self)
        # Create source
        _source, harvest_job = self._create_source_and_job({
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/dataset1.xml',
            'source_type': u'gemini-single'
        })
        gemini_harvester = GeminiHarvester()
        gemini_harvester.harvest_job = harvest_job
        self.harvester = gemini_harvester

    def teardown(self):
        """Rebuild the database after each test."""
        model.repo.rebuild_db()

    def test_get_gemini_string_and_guid(self):
        """A document with a file identifier yields that identifier as GUID."""
        extracted = self.harvester.get_gemini_string_and_guid(BASIC_GEMINI, url=None)
        assert_equal(extracted, (BASIC_GEMINI, GUID))

    def test_get_gemini_string_and_guid__no_guid(self):
        """A document without a file identifier yields an empty GUID."""
        extracted = self.harvester.get_gemini_string_and_guid(GEMINI_MISSING_GUID, url=None)
        assert_equal(extracted, (GEMINI_MISSING_GUID, ''))

    def test_get_gemini_string_and_guid__non_parsing(self):
        """A root element that is never closed raises ``XMLSyntaxError``."""
        # no closing tag
        unterminated = '<gmd:MD_Metadata xmlns:gmd="http://www.isotc211.org/2005/gmd" xmlns:gco="http://www.isotc211.org/2005/gco">'
        parse = self.harvester.get_gemini_string_and_guid
        assert_raises(lxml.etree.XMLSyntaxError, parse, unterminated)

    def test_get_gemini_string_and_guid__empty(self):
        """Empty content raises ``XMLSyntaxError``."""
        nothing = ''
        parse = self.harvester.get_gemini_string_and_guid
        assert_raises(lxml.etree.XMLSyntaxError, parse, nothing)
2019-12-11 13:23:03 +01:00
class TestImportStageTools(object):
    """Tests for the ``GeminiHarvester`` helpers used during the import stage."""

    def test_licence_url_normal(self):
        """The URL entry of the licence list is returned."""
        licences = ['Reference and PSMA Only',
                    'http://www.test.gov.uk/licenseurl']
        found = GeminiHarvester._extract_first_licence_url(licences)
        assert_equal(found, 'http://www.test.gov.uk/licenseurl')

    def test_licence_url_multiple_urls(self):
        """With several URL entries only the first one is extracted."""
        licences = ['Reference and PSMA Only',
                    'http://www.test.gov.uk/licenseurl',
                    'http://www.test.gov.uk/2nd_licenseurl']
        found = GeminiHarvester._extract_first_licence_url(licences)
        assert_equal(found, 'http://www.test.gov.uk/licenseurl')

    def test_licence_url_embedded(self):
        """A URL embedded within a text entry is not extracted."""
        licences = ['Reference and PSMA Only http://www.test.gov.uk/licenseurl']
        found = GeminiHarvester._extract_first_licence_url(licences)
        assert_equal(found, None)

    def test_licence_url_embedded_at_start(self):
        """A URL at the start of a text entry returns the whole entry.

        Noting this unusual behaviour.
        """
        whole_entry = 'http://www.test.gov.uk/licenseurl Reference and PSMA Only'
        found = GeminiHarvester._extract_first_licence_url([whole_entry])
        assert_equal(found, 'http://www.test.gov.uk/licenseurl Reference and PSMA Only')

    def test_responsible_organisation_basic(self):
        """An owner is present, so it is the expected provider."""
        parties = [
            {'organisation-name': 'Ordnance Survey', 'role': 'owner'},
            {'organisation-name': 'Maps Ltd', 'role': 'distributor'},
        ]
        expected = ('Ordnance Survey',
                    ['Maps Ltd (distributor)',
                     'Ordnance Survey (owner)'])
        assert_equal(GeminiHarvester._process_responsible_organisation(parties), expected)

    def test_responsible_organisation_publisher(self):
        """No owner, so the provider falls back to the publisher."""
        parties = [
            {'organisation-name': 'Ordnance Survey', 'role': 'publisher'},
            {'organisation-name': 'Maps Ltd', 'role': 'distributor'},
        ]
        expected = ('Ordnance Survey',
                    ['Maps Ltd (distributor)',
                     'Ordnance Survey (publisher)'])
        assert_equal(GeminiHarvester._process_responsible_organisation(parties), expected)

    def test_responsible_organisation_owner(self):
        """The provider is the owner (the publisher is ignored)."""
        parties = [
            {'organisation-name': 'Ordnance Survey', 'role': 'publisher'},
            {'organisation-name': 'Owner', 'role': 'owner'},
            {'organisation-name': 'Maps Ltd', 'role': 'distributor'},
        ]
        expected = ('Owner',
                    ['Owner (owner)',
                     'Maps Ltd (distributor)',
                     'Ordnance Survey (publisher)'])
        assert_equal(GeminiHarvester._process_responsible_organisation(parties), expected)

    def test_responsible_organisation_multiple_roles(self):
        """Several roles of one organisation are expected in a single entry."""
        parties = [
            {'organisation-name': 'Ordnance Survey', 'role': 'publisher'},
            {'organisation-name': 'Ordnance Survey', 'role': 'custodian'},
            {'organisation-name': 'Distributor', 'role': 'distributor'},
        ]
        expected = ('Ordnance Survey',
                    ['Distributor (distributor)',
                     'Ordnance Survey (publisher, custodian)'])
        assert_equal(GeminiHarvester._process_responsible_organisation(parties), expected)

    def test_responsible_organisation_blank_provider(self):
        """No owner or publisher, so the provider is blank."""
        parties = [
            {'organisation-name': 'Ordnance Survey', 'role': 'resourceProvider'},
            {'organisation-name': 'Maps Ltd', 'role': 'distributor'},
        ]
        expected = ('',
                    ['Maps Ltd (distributor)',
                     'Ordnance Survey (resourceProvider)'])
        assert_equal(GeminiHarvester._process_responsible_organisation(parties), expected)

    def test_responsible_organisation_blank(self):
        """An empty input gives a blank provider and no responsible parties."""
        parties = []
        expected = ('', [])
        assert_equal(GeminiHarvester._process_responsible_organisation(parties), expected)
class TestValidation(HarvestFixtureBase):
    # Runs the GeminiDocHarvester gather stage against the XML fixtures under
    # gemini2.1/validation/ and checks the gather errors recorded on the job.
    #
    # The whole class is currently skipped from setup_class (see the TODO).

    @classmethod
    def setup_class(cls):
        # TODO: Fix these tests, broken since 27c4ee81e
        raise SkipTest('Validation tests not working since 27c4ee81e')

        # Unreachable while the skip above is in place; kept on purpose so
        # the original fixture set-up is still here when the tests are fixed.
        SpatialHarvester._validator = Validators(profiles=['iso19139eden', 'constraints', 'gemini2'])
        HarvestFixtureBase.setup_class()

    def get_validation_errors(self, validation_test_filename):
        # Harvest a single validation fixture and return its gather errors.
        #
        # :param validation_test_filename: file name of the fixture, appended
        #     to the gemini2.1/validation/ URL of the harvest source
        # :returns: the messages of ``job.gather_errors`` joined with '; '
        #     (an empty string when there are none)

        # Create source
        source_fixture = {
            'title': 'Test Source',
            'name': 'test-source',
            'url': u'http://127.0.0.1:8999/gemini2.1/validation/%s' % validation_test_filename,
            'source_type': u'gemini-single'
        }
        source, job = self._create_source_and_job(source_fixture)

        # Gather stage for GeminiDocHarvester includes validation. Only the
        # errors it records on the job are of interest, not its return value.
        GeminiDocHarvester().gather_stage(job)

        # Check the validation errors
        return '; '.join(gather_error.message
                         for gather_error in job.gather_errors)

    def _assert_invalid(self, validation_test_filename, expected_message):
        # The fixture must produce gather errors that include
        # ``expected_message``. (The name deliberately avoids nose's test
        # pattern so that the helper is not collected as a test.)
        errors = self.get_validation_errors(validation_test_filename)
        assert errors, 'No validation errors for %s' % validation_test_filename
        assert_in(expected_message, errors)

    def _assert_valid(self, validation_test_filename):
        # The fixture must produce no gather errors; any that do occur are
        # shown in the failure message.
        errors = self.get_validation_errors(validation_test_filename)
        assert not errors, errors

    def test_01_dataset_fail_iso19139_schema(self):
        self._assert_invalid('01_Dataset_Invalid_XSD_No_Such_Element.xml',
                             'Could not get the GUID')

    def test_02_dataset_fail_constraints_schematron(self):
        self._assert_invalid('02_Dataset_Invalid_19139_Missing_Data_Format.xml',
                             'MD_Distribution / MD_Format: count(distributionFormat + distributorFormat) > 0')

    def test_03_dataset_fail_gemini_schematron(self):
        self._assert_invalid('03_Dataset_Invalid_GEMINI_Missing_Keyword.xml',
                             'Descriptive keywords are mandatory')

    def test_04_dataset_valid(self):
        self._assert_valid('04_Dataset_Valid.xml')

    def test_05_series_fail_iso19139_schema(self):
        self._assert_invalid('05_Series_Invalid_XSD_No_Such_Element.xml',
                             'Could not get the GUID')

    def test_06_series_fail_constraints_schematron(self):
        self._assert_invalid('06_Series_Invalid_19139_Missing_Data_Format.xml',
                             'MD_Distribution / MD_Format: count(distributionFormat + distributorFormat) > 0')

    def test_07_series_fail_gemini_schematron(self):
        self._assert_invalid('07_Series_Invalid_GEMINI_Missing_Keyword.xml',
                             'Descriptive keywords are mandatory')

    def test_08_series_valid(self):
        self._assert_valid('08_Series_Valid.xml')

    def test_09_service_fail_iso19139_schema(self):
        self._assert_invalid('09_Service_Invalid_No_Such_Element.xml',
                             'Could not get the GUID')

    def test_10_service_fail_constraints_schematron(self):
        self._assert_invalid('10_Service_Invalid_19139_Level_Description.xml',
                             "DQ_Scope: 'levelDescription' is mandatory if 'level' notEqual 'dataset' or 'series'.")

    def test_11_service_fail_gemini_schematron(self):
        self._assert_invalid('11_Service_Invalid_GEMINI_Service_Type.xml',
                             "Service type shall be one of 'discovery', 'view', 'download', 'transformation', 'invoke' or 'other' following INSPIRE generic names.")

    def test_12_service_valid(self):
        self._assert_valid('12_Service_Valid.xml')

    def test_13_dataset_fail_iso19139_schema_2(self):
        # This test Dataset has srv tags and only Service metadata should.
        self._assert_invalid('13_Dataset_Invalid_Element_srv.xml',
                             'Element \'{http://www.isotc211.org/2005/srv}SV_ServiceIdentification\': This element is not expected.')