diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 41a5061..1fd0ebc 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -8,7 +8,7 @@ from pylons import config from ckan import plugins as p from ckan import model -from ckan.model import Session, Package +from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH from ckan.logic import ValidationError, NotFound, get_action from ckan.logic.schema import default_create_package_schema @@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin): _user_name = None - def _gen_new_name(self, title): + @classmethod + def _gen_new_name(cls, title, existing_name=None, + append_type='number-sequence'): ''' - Creates a URL friendly name from a title + Returns a 'name' for the dataset (URL friendly), based on the title. - If the name already exists, it will add some random characters at the end + If the ideal name is already used, it will append a number to it to + ensure it is unique. + + If generating a new name because the title of the dataset has changed, + specify the existing name, in case the name doesn't need to change + after all. + + :param existing_name: the current name of the dataset - only specify + this if the dataset exists + :type existing_name: string + :param append_type: the type of characters to add to make it unique - + either 'number-sequence' or 'random-hex'. + :type append_type: string ''' - name = munge_title_to_name(title).replace('_', '-') - while '--' in name: - name = name.replace('--', '-') - pkg_obj = Session.query(Package).filter(Package.name == name).first() - if pkg_obj: - return name + str(uuid.uuid4())[:5] + ideal_name = munge_title_to_name(title) + ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes + return cls._ensure_name_is_unique(ideal_name, + existing_name=existing_name, + append_type=append_type) + + @staticmethod + def _ensure_name_is_unique(ideal_name, existing_name=None, + append_type='number-sequence'): + ''' + Returns a dataset name based on the ideal_name, only it will be + guaranteed to be different than all the other datasets, by adding a + number on the end if necessary. + + If generating a new name because the title of the dataset has changed, + specify the existing name, in case the name doesn't need to change + after all. + + The maximum dataset name length is taken account of. + + :param ideal_name: the desired name for the dataset, if its not already + been taken (usually derived by munging the dataset + title) + :type ideal_name: string + :param existing_name: the current name of the dataset - only specify + this if the dataset exists + :type existing_name: string + :param append_type: the type of characters to add to make it unique - + either 'number-sequence' or 'random-hex'. + :type append_type: string + ''' + ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH] + if existing_name == ideal_name: + return ideal_name + if append_type == 'number-sequence': + MAX_NUMBER_APPENDED = 999 + APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED)) + elif append_type == 'random-hex': + APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations else: - return name + raise NotImplementedError('append_type cannot be %s' % append_type) + # Find out which package names have been taken. Restrict it to names + # derived from the ideal name plus and numbers added + like_q = u'%s%%' % \ + ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + name_results = Session.query(Package.name)\ + .filter(Package.name.ilike(like_q))\ + .all() + taken = set([name_result[0] for name_result in name_results]) + if existing_name and existing_name in taken: + taken.remove(existing_name) + if ideal_name not in taken: + # great, the ideal name is available + return ideal_name + elif existing_name and existing_name.startswith(ideal_name): + # the ideal name is not available, but its an existing dataset with + # a name based on the ideal one, so there's no point changing it to + # a different number + return existing_name + elif append_type == 'number-sequence': + # find the next available number + counter = 1 + while counter <= MAX_NUMBER_APPENDED: + candidate_name = \ + ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \ + str(counter) + if candidate_name not in taken: + return candidate_name + counter = counter + 1 + return None + elif append_type == 'random-hex': + return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \ + str(uuid.uuid4())[:APPEND_MAX_CHARS] def _save_gather_error(self, message, job): diff --git a/ckanext/harvest/tests/harvesters/__init__.py b/ckanext/harvest/tests/harvesters/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/ckanext/harvest/tests/harvesters/test_base.py b/ckanext/harvest/tests/harvesters/test_base.py new file mode 100644 index 0000000..6e1da21 --- /dev/null +++ b/ckanext/harvest/tests/harvesters/test_base.py @@ -0,0 +1,96 @@ +import re + +from nose.tools import assert_equal + +from ckanext.harvest.harvesters.base import HarvesterBase +try: + from ckan.tests import helpers + from ckan.tests import factories +except ImportError: + from ckan.new_tests import helpers + from ckan.new_tests import factories + + +_ensure_name_is_unique = HarvesterBase._ensure_name_is_unique + + +class TestGenNewName(object): + @classmethod + def setup_class(cls): + helpers.reset_db() + + def test_basic(self): + assert_equal(HarvesterBase._gen_new_name('Trees'), 'trees') + + def test_munge(self): + assert_equal( + HarvesterBase._gen_new_name('Trees and branches - survey.'), + 'trees-and-branches-survey') + + +class TestEnsureNameIsUnique(object): + def setup(self): + helpers.reset_db() + + def test_no_existing_datasets(self): + factories.Dataset(name='unrelated') + assert_equal(_ensure_name_is_unique('trees'), 'trees') + + def test_existing_dataset(self): + factories.Dataset(name='trees') + assert_equal(_ensure_name_is_unique('trees'), 'trees1') + + def test_two_existing_datasets(self): + factories.Dataset(name='trees') + factories.Dataset(name='trees1') + assert_equal(_ensure_name_is_unique('trees'), 'trees2') + + def test_no_existing_datasets_and_long_name(self): + assert_equal(_ensure_name_is_unique('x'*101), 'x'*100) + + def test_existing_dataset_and_long_name(self): + # because PACKAGE_NAME_MAX_LENGTH = 100 + factories.Dataset(name='x'*100) + assert_equal(_ensure_name_is_unique('x'*101), 'x'*99 + '1') + + def test_update_dataset_with_new_name(self): + factories.Dataset(name='trees1') + assert_equal(_ensure_name_is_unique('tree', existing_name='trees1'), + 'tree') + + def test_update_dataset_but_with_same_name(self): + # this can happen if you remove a trailing space from the title - the + # harvester sees the title changed and thinks it should have a new + # name, but clearly it can reuse its existing one + factories.Dataset(name='trees') + factories.Dataset(name='trees1') + assert_equal(_ensure_name_is_unique('trees', existing_name='trees'), + 'trees') + + def test_update_dataset_to_available_shorter_name(self): + # this can be handy when if reharvesting, you got duplicates and + # managed to purge one set and through a minor title change you can now + # lose the appended number. users don't like unnecessary numbers. + factories.Dataset(name='trees1') + assert_equal(_ensure_name_is_unique('trees', existing_name='trees1'), + 'trees') + + def test_update_dataset_but_doesnt_change_to_other_number(self): + # there's no point changing one number for another though + factories.Dataset(name='trees') + factories.Dataset(name='trees2') + assert_equal(_ensure_name_is_unique('trees', existing_name='trees2'), + 'trees2') + + def test_update_dataset_with_new_name_with_numbers(self): + factories.Dataset(name='trees') + factories.Dataset(name='trees2') + factories.Dataset(name='frogs') + assert_equal(_ensure_name_is_unique('frogs', existing_name='trees2'), + 'frogs1') + + def test_existing_dataset_appending_hex(self): + factories.Dataset(name='trees') + name = _ensure_name_is_unique('trees', append_type='random-hex') + # e.g. 'trees0b53f' + assert re.match('trees[\da-f]{5}', name)