Merge branch 'unique-names-improved'

2015-10-23 11:02:49 +01:00 · 2015-10-23 11:02:49 +01:00 · d950b13400
parent 9b10f9e227 82bdff2f34
commit d950b13400
3 changed files with 186 additions and 11 deletions
--- a/ckanext/harvest/harvesters/base.py
+++ b/ckanext/harvest/harvesters/base.py
@ -8,7 +8,7 @@ from pylons import config
 from ckan import plugins as p
 from ckan import model
-from ckan.model import Session, Package
+from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH
 from ckan.logic import ValidationError, NotFound, get_action
 from ckan.logic.schema import default_create_package_schema
@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin):
    _user_name = None
-    def _gen_new_name(self, title):
+    @classmethod
    def _gen_new_name(cls, title, existing_name=None,
                      append_type='number-sequence'):
        '''
-        Creates a URL friendly name from a title
+        Returns a 'name' for the dataset (URL friendly), based on the title.
-        If the name already exists, it will add some random characters at the end
+        If the ideal name is already used, it will append a number to it to
        ensure it is unique.
        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.
        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''
-        name = munge_title_to_name(title).replace('_', '-')
+        ideal_name = munge_title_to_name(title)
-        while '--' in name:
+        ideal_name = re.sub('-+', '-', ideal_name)  # collapse multiple dashes
-            name = name.replace('--', '-')
+        return cls._ensure_name_is_unique(ideal_name,
-        pkg_obj = Session.query(Package).filter(Package.name == name).first()
+                                          existing_name=existing_name,
-        if pkg_obj:
+                                          append_type=append_type)
-            return name + str(uuid.uuid4())[:5]
+
    @staticmethod
    def _ensure_name_is_unique(ideal_name, existing_name=None,
                               append_type='number-sequence'):
        '''
        Returns a dataset name based on the ideal_name, only it will be
        guaranteed to be different than all the other datasets, by adding a
        number on the end if necessary.
        If generating a new name because the title of the dataset has changed,
        specify the existing name, in case the name doesn't need to change
        after all.
        The maximum dataset name length is taken account of.
        :param ideal_name: the desired name for the dataset, if its not already
                           been taken (usually derived by munging the dataset
                           title)
        :type ideal_name: string
        :param existing_name: the current name of the dataset - only specify
                              this if the dataset exists
        :type existing_name: string
        :param append_type: the type of characters to add to make it unique -
                            either 'number-sequence' or 'random-hex'.
        :type append_type: string
        '''
        ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
        if existing_name == ideal_name:
            return ideal_name
        if append_type == 'number-sequence':
            MAX_NUMBER_APPENDED = 999
            APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
        elif append_type == 'random-hex':
            APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
        else:
-            return name
+            raise NotImplementedError('append_type cannot be %s' % append_type)
        # Find out which package names have been taken. Restrict it to names
        # derived from the ideal name plus and numbers added
        like_q = u'%s%%' % \
            ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
        name_results = Session.query(Package.name)\
                              .filter(Package.name.ilike(like_q))\
                              .all()
        taken = set([name_result[0] for name_result in name_results])
        if existing_name and existing_name in taken:
            taken.remove(existing_name)
        if ideal_name not in taken:
            # great, the ideal name is available
            return ideal_name
        elif existing_name and existing_name.startswith(ideal_name):
            # the ideal name is not available, but its an existing dataset with
            # a name based on the ideal one, so there's no point changing it to
            # a different number
            return existing_name
        elif append_type == 'number-sequence':
            # find the next available number
            counter = 1
            while counter <= MAX_NUMBER_APPENDED:
                candidate_name = \
                    ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
                    str(counter)
                if candidate_name not in taken:
                    return candidate_name
                counter = counter + 1
            return None
        elif append_type == 'random-hex':
            return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
                str(uuid.uuid4())[:APPEND_MAX_CHARS]
    def _save_gather_error(self, message, job):
--- a/ckanext/harvest/tests/harvesters/init.py
+++ b/ckanext/harvest/tests/harvesters/init.py
--- a/ckanext/harvest/tests/harvesters/test_base.py
+++ b/ckanext/harvest/tests/harvesters/test_base.py
@ -0,0 +1,96 @@
 import re
 from nose.tools import assert_equal
 from ckanext.harvest.harvesters.base import HarvesterBase
 try:
    from ckan.tests import helpers
    from ckan.tests import factories
 except ImportError:
    from ckan.new_tests import helpers
    from ckan.new_tests import factories
 _ensure_name_is_unique = HarvesterBase._ensure_name_is_unique
 class TestGenNewName(object):
    @classmethod
    def setup_class(cls):
        helpers.reset_db()
    def test_basic(self):
        assert_equal(HarvesterBase._gen_new_name('Trees'), 'trees')
    def test_munge(self):
        assert_equal(
            HarvesterBase._gen_new_name('Trees and branches - survey.'),
            'trees-and-branches-survey')
 class TestEnsureNameIsUnique(object):
    def setup(self):
        helpers.reset_db()
    def test_no_existing_datasets(self):
        factories.Dataset(name='unrelated')
        assert_equal(_ensure_name_is_unique('trees'), 'trees')
    def test_existing_dataset(self):
        factories.Dataset(name='trees')
        assert_equal(_ensure_name_is_unique('trees'), 'trees1')
    def test_two_existing_datasets(self):
        factories.Dataset(name='trees')
        factories.Dataset(name='trees1')
        assert_equal(_ensure_name_is_unique('trees'), 'trees2')
    def test_no_existing_datasets_and_long_name(self):
        assert_equal(_ensure_name_is_unique('x'*101), 'x'*100)
    def test_existing_dataset_and_long_name(self):
        # because PACKAGE_NAME_MAX_LENGTH = 100
        factories.Dataset(name='x'*100)
        assert_equal(_ensure_name_is_unique('x'*101), 'x'*99 + '1')
    def test_update_dataset_with_new_name(self):
        factories.Dataset(name='trees1')
        assert_equal(_ensure_name_is_unique('tree', existing_name='trees1'),
                     'tree')
    def test_update_dataset_but_with_same_name(self):
        # this can happen if you remove a trailing space from the title - the
        # harvester sees the title changed and thinks it should have a new
        # name, but clearly it can reuse its existing one
        factories.Dataset(name='trees')
        factories.Dataset(name='trees1')
        assert_equal(_ensure_name_is_unique('trees', existing_name='trees'),
                     'trees')
    def test_update_dataset_to_available_shorter_name(self):
        # this can be handy when if reharvesting, you got duplicates and
        # managed to purge one set and through a minor title change you can now
        # lose the appended number. users don't like unnecessary numbers.
        factories.Dataset(name='trees1')
        assert_equal(_ensure_name_is_unique('trees', existing_name='trees1'),
                     'trees')
    def test_update_dataset_but_doesnt_change_to_other_number(self):
        # there's no point changing one number for another though
        factories.Dataset(name='trees')
        factories.Dataset(name='trees2')
        assert_equal(_ensure_name_is_unique('trees', existing_name='trees2'),
                     'trees2')
    def test_update_dataset_with_new_name_with_numbers(self):
        factories.Dataset(name='trees')
        factories.Dataset(name='trees2')
        factories.Dataset(name='frogs')
        assert_equal(_ensure_name_is_unique('frogs', existing_name='trees2'),
                     'frogs1')
    def test_existing_dataset_appending_hex(self):
        factories.Dataset(name='trees')
        name = _ensure_name_is_unique('trees', append_type='random-hex')
        # e.g. 'trees0b53f'
        assert re.match('trees[\da-f]{5}', name)