Generating unique names improved

* Harvesters that change the name when the title changes had a problem: when the change was small, a number was unnecessarily appended, e.g. "Trees "->"Trees" meant _gen_new_name("Trees") returned "trees1". Now you can specify the existing name and it will be returned if it is still valid.
* Maximum dataset name length is now adhered to.
* To make a name unique, a sequential number is now appended, since that is more understandable and pleasant for users. However, hex digits are still an option for those who want to harvest concurrently.
2015-10-01 17:53:03 +01:00 · 2015-10-01 17:53:03 +01:00 · be3e88086a
parent 89b6ad2ce1
commit be3e88086a
1 changed files with 90 additions and 11 deletions
--- a/ckanext/harvest/harvesters/base.py
+++ b/ckanext/harvest/harvesters/base.py
@@ -8,7 +8,7 @@ from pylons import config

 from ckan import plugins as p
 from ckan import model
-from ckan.model import Session, Package
+from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH
 from ckan.logic import ValidationError, NotFound, get_action

 from ckan.logic.schema import default_create_package_schema
@@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin):

    _user_name = None

-    def _gen_new_name(self, title):
+    @classmethod
+    def _gen_new_name(cls, title, existing_name=None,
+                      append_type='number-sequence'):
        '''
-        Creates a URL friendly name from a title
+        Returns a 'name' for the dataset (URL friendly), based on the title.

-        If the name already exists, it will add some random characters at the end
+        If the ideal name is already used, it will append a number to it to
+        ensure it is unique.
+
+        If generating a new name because the title of the dataset has changed,
+        specify the existing name, in case the name doesn't need to change
+        after all.
+
+        :param existing_name: the current name of the dataset - only specify
+                              this if the dataset exists
+        :type existing_name: string
+        :param append_type: the type of characters to add to make it unique -
+                            either 'number-sequence' or 'random-hex'.
+        :type append_type: string
        '''

-        name = munge_title_to_name(title).replace('_', '-')
-        while '--' in name:
-            name = name.replace('--', '-')
-        pkg_obj = Session.query(Package).filter(Package.name == name).first()
-        if pkg_obj:
-            return name + str(uuid.uuid4())[:5]
+        ideal_name = munge_title_to_name(title)
+        ideal_name = re.sub('-+', '-', ideal_name)  # collapse multiple dashes
+        return cls._ensure_name_is_unique(ideal_name,
+                                          existing_name=existing_name,
+                                          append_type=append_type)
+
+    @staticmethod
+    def _ensure_name_is_unique(ideal_name, existing_name=None,
+                               append_type='number-sequence'):
+        '''
+        Returns a dataset name based on the ideal_name, only it will be
+        guaranteed to be different than all the other datasets, by adding a
+        number on the end if necessary.
+
+        If generating a new name because the title of the dataset has changed,
+        specify the existing name, in case the name doesn't need to change
+        after all.
+
+        The maximum dataset name length is taken into account.
+
+        :param ideal_name: the desired name for the dataset, if it has not
+                           already been taken (usually derived by munging
+                           the dataset title)
+        :type ideal_name: string
+        :param existing_name: the current name of the dataset - only specify
+                              this if the dataset exists
+        :type existing_name: string
+        :param append_type: the type of characters to add to make it unique -
+                            either 'number-sequence' or 'random-hex'.
+        :type append_type: string
+        '''
+        ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
+        if existing_name == ideal_name:
+            return ideal_name
+        if append_type == 'number-sequence':
+            MAX_NUMBER_APPENDED = 999
+            APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
+        elif append_type == 'random-hex':
+            APPEND_MAX_CHARS = 5  # 16^5 = 1 million combinations
        else:
-            return name
+            raise NotImplementedError('append_type cannot be %s' % append_type)
+        # Find out which package names have been taken. Restrict it to names
+        # derived from the ideal name plus any numbers added
+        like_q = u'%s%%' % \
+            ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
+        name_results = Session.query(Package.name)\
+                              .filter(Package.name.ilike(like_q))\
+                              .all()
+        taken = set([name_result[0] for name_result in name_results])
+        if existing_name and existing_name in taken:
+            taken.remove(existing_name)
+        if ideal_name not in taken:
+            # great, the ideal name is available
+            return ideal_name
+        elif existing_name and existing_name.startswith(ideal_name):
+            # the ideal name is not available, but it's an existing dataset
+            # with a name based on the ideal one, so there's no point changing
+            # it to a different number
+            return existing_name
+        elif append_type == 'number-sequence':
+            # find the next available number
+            counter = 1
+            while counter <= MAX_NUMBER_APPENDED:
+                candidate_name = \
+                    ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
+                    str(counter)
+                if candidate_name not in taken:
+                    return candidate_name
+                counter = counter + 1
+            return None
+        elif append_type == 'random-hex':
+            return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
+                str(uuid.uuid4())[:APPEND_MAX_CHARS]


    def _save_gather_error(self, message, job):