Merge branch 'unique-names-improved'

2015-10-23 11:02:49 +01:00 · 2015-10-23 11:02:49 +01:00 · d950b13400
parent 9b10f9e227 82bdff2f34
commit d950b13400
3 changed files with 186 additions and 11 deletions
--- a/ckanext/harvest/harvesters/base.py
+++ b/ckanext/harvest/harvesters/base.py
@ -8,7 +8,7 @@ from pylons import config

 from ckan import plugins as p
 from ckan import model
-from ckan.model import Session, Package
+from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH
 from ckan.logic import ValidationError, NotFound, get_action

 from ckan.logic.schema import default_create_package_schema
@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin):

    _user_name = None

-    def _gen_new_name(self, title):
+    @classmethod
+    def _gen_new_name(cls, title, existing_name=None,
+                      append_type='number-sequence'):
        '''
-        Creates a URL friendly name from a title
+        Returns a 'name' for the dataset (URL friendly), based on the title.

-        If the name already exists, it will add some random characters at the end
+        If the ideal name is already used, it will append a number to it to
+        ensure it is unique.
+
+        If generating a new name because the title of the dataset has changed,
+        specify the existing name, in case the name doesn't need to change
+        after all.
+
+        :param existing_name: the current name of the dataset - only specify
+                              this if the dataset exists
+        :type existing_name: string
+        :param append_type: the type of characters to add to make it unique -
+                            either 'number-sequence' or 'random-hex'.
+        :type append_type: string
        '''

-        name = munge_title_to_name(title).replace('_', '-')
-        while '--' in name:
-            name = name.replace('--', '-')
-        pkg_obj = Session.query(Package).filter(Package.name == name).first()
-        if pkg_obj:
-            return name + str(uuid.uuid4())[:5]
+        ideal_name = munge_title_to_name(title)
+        ideal_name = re.sub('-+', '-', ideal_name)  # collapse multiple dashes
+        return cls._ensure_name_is_unique(ideal_name,
+                                          existing_name=existing_name,
+                                          append_type=append_type)
+
+    @staticmethod
+    def _ensure_name_is_unique(ideal_name, existing_name=None,
+                               append_type='number-sequence'):
+        '''
+        Returns a dataset name based on the ideal_name, only it will be
+        guaranteed to be different than all the other datasets, by adding a
+        number on the end if necessary.
+
+        If generating a new name because the title of the dataset has changed,
+        specify the existing name, in case the name doesn't need to change
+        after all.
+
+        The maximum dataset name length is taken account of.
+
+        :param ideal_name: the desired name for the dataset, if its not already
+                           been taken (usually derived by munging the dataset
+                           title)
+        :type ideal_name: string
+        :param existing_name: the current name of the dataset - only specify
+                              this if the dataset exists
+        :type existing_name: string
+        :param append_type: the type of characters to add to make it unique -
+                            either 'number-sequence' or 'random-hex'.
+        :type append_type: string
+        '''
+        ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
+        if existing_name == ideal_name:
+            return ideal_name
+        if append_type == 'number-sequence':
+            MAX_NUMBER_APPENDED = 999
+            APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
+        elif append_type == 'random-hex':
+            APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
        else:
-            return name
+            raise NotImplementedError('append_type cannot be %s' % append_type)
+        # Find out which package names have been taken. Restrict it to names
+        # derived from the ideal name plus and numbers added
+        like_q = u'%s%%' % \
+            ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
+        name_results = Session.query(Package.name)\
+                              .filter(Package.name.ilike(like_q))\
+                              .all()
+        taken = set([name_result[0] for name_result in name_results])
+        if existing_name and existing_name in taken:
+            taken.remove(existing_name)
+        if ideal_name not in taken:
+            # great, the ideal name is available
+            return ideal_name
+        elif existing_name and existing_name.startswith(ideal_name):
+            # the ideal name is not available, but its an existing dataset with
+            # a name based on the ideal one, so there's no point changing it to
+            # a different number
+            return existing_name
+        elif append_type == 'number-sequence':
+            # find the next available number
+            counter = 1
+            while counter <= MAX_NUMBER_APPENDED:
+                candidate_name = \
+                    ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
+                    str(counter)
+                if candidate_name not in taken:
+                    return candidate_name
+                counter = counter + 1
+            return None
+        elif append_type == 'random-hex':
+            return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
+                str(uuid.uuid4())[:APPEND_MAX_CHARS]


    def _save_gather_error(self, message, job):
--- a/ckanext/harvest/tests/harvesters/init.py
+++ b/ckanext/harvest/tests/harvesters/init.py
--- a/ckanext/harvest/tests/harvesters/test_base.py
+++ b/ckanext/harvest/tests/harvesters/test_base.py
@ -0,0 +1,96 @@
+import re
+
+from nose.tools import assert_equal
+
+from ckanext.harvest.harvesters.base import HarvesterBase
+try:
+    from ckan.tests import helpers
+    from ckan.tests import factories
+except ImportError:
+    from ckan.new_tests import helpers
+    from ckan.new_tests import factories
+
+
+_ensure_name_is_unique = HarvesterBase._ensure_name_is_unique
+
+
+class TestGenNewName(object):
+    @classmethod
+    def setup_class(cls):
+        helpers.reset_db()
+
+    def test_basic(self):
+        assert_equal(HarvesterBase._gen_new_name('Trees'), 'trees')
+
+    def test_munge(self):
+        assert_equal(
+            HarvesterBase._gen_new_name('Trees and branches - survey.'),
+            'trees-and-branches-survey')
+
+
+class TestEnsureNameIsUnique(object):
+    def setup(self):
+        helpers.reset_db()
+
+    def test_no_existing_datasets(self):
+        factories.Dataset(name='unrelated')
+        assert_equal(_ensure_name_is_unique('trees'), 'trees')
+
+    def test_existing_dataset(self):
+        factories.Dataset(name='trees')
+        assert_equal(_ensure_name_is_unique('trees'), 'trees1')
+
+    def test_two_existing_datasets(self):
+        factories.Dataset(name='trees')
+        factories.Dataset(name='trees1')
+        assert_equal(_ensure_name_is_unique('trees'), 'trees2')
+
+    def test_no_existing_datasets_and_long_name(self):
+        assert_equal(_ensure_name_is_unique('x'*101), 'x'*100)
+
+    def test_existing_dataset_and_long_name(self):
+        # because PACKAGE_NAME_MAX_LENGTH = 100
+        factories.Dataset(name='x'*100)
+        assert_equal(_ensure_name_is_unique('x'*101), 'x'*99 + '1')
+
+    def test_update_dataset_with_new_name(self):
+        factories.Dataset(name='trees1')
+        assert_equal(_ensure_name_is_unique('tree', existing_name='trees1'),
+                     'tree')
+
+    def test_update_dataset_but_with_same_name(self):
+        # this can happen if you remove a trailing space from the title - the
+        # harvester sees the title changed and thinks it should have a new
+        # name, but clearly it can reuse its existing one
+        factories.Dataset(name='trees')
+        factories.Dataset(name='trees1')
+        assert_equal(_ensure_name_is_unique('trees', existing_name='trees'),
+                     'trees')
+
+    def test_update_dataset_to_available_shorter_name(self):
+        # this can be handy when if reharvesting, you got duplicates and
+        # managed to purge one set and through a minor title change you can now
+        # lose the appended number. users don't like unnecessary numbers.
+        factories.Dataset(name='trees1')
+        assert_equal(_ensure_name_is_unique('trees', existing_name='trees1'),
+                     'trees')
+
+    def test_update_dataset_but_doesnt_change_to_other_number(self):
+        # there's no point changing one number for another though
+        factories.Dataset(name='trees')
+        factories.Dataset(name='trees2')
+        assert_equal(_ensure_name_is_unique('trees', existing_name='trees2'),
+                     'trees2')
+
+    def test_update_dataset_with_new_name_with_numbers(self):
+        factories.Dataset(name='trees')
+        factories.Dataset(name='trees2')
+        factories.Dataset(name='frogs')
+        assert_equal(_ensure_name_is_unique('frogs', existing_name='trees2'),
+                     'frogs1')
+
+    def test_existing_dataset_appending_hex(self):
+        factories.Dataset(name='trees')
+        name = _ensure_name_is_unique('trees', append_type='random-hex')
+        # e.g. 'trees0b53f'
+        assert re.match('trees[\da-f]{5}', name)