diff --git a/ckanext/harvest/harvesters/base.py b/ckanext/harvest/harvesters/base.py index 41a5061..1fd0ebc 100644 --- a/ckanext/harvest/harvesters/base.py +++ b/ckanext/harvest/harvesters/base.py @@ -8,7 +8,7 @@ from pylons import config from ckan import plugins as p from ckan import model -from ckan.model import Session, Package +from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH from ckan.logic import ValidationError, NotFound, get_action from ckan.logic.schema import default_create_package_schema @@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin): _user_name = None - def _gen_new_name(self, title): + @classmethod + def _gen_new_name(cls, title, existing_name=None, + append_type='number-sequence'): ''' - Creates a URL friendly name from a title + Returns a 'name' for the dataset (URL friendly), based on the title. - If the name already exists, it will add some random characters at the end + If the ideal name is already used, it will append a number to it to + ensure it is unique. + + If generating a new name because the title of the dataset has changed, + specify the existing name, in case the name doesn't need to change + after all. + + :param existing_name: the current name of the dataset - only specify + this if the dataset exists + :type existing_name: string + :param append_type: the type of characters to add to make it unique - + either 'number-sequence' or 'random-hex'. + :type append_type: string ''' - name = munge_title_to_name(title).replace('_', '-') - while '--' in name: - name = name.replace('--', '-') - pkg_obj = Session.query(Package).filter(Package.name == name).first() - if pkg_obj: - return name + str(uuid.uuid4())[:5] + ideal_name = munge_title_to_name(title) + ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes + return cls._ensure_name_is_unique(ideal_name, + existing_name=existing_name, + append_type=append_type) + + @staticmethod + def _ensure_name_is_unique(ideal_name, existing_name=None, + append_type='number-sequence'): + ''' + Returns a dataset name based on the ideal_name, only it will be + guaranteed to be different than all the other datasets, by adding a + number on the end if necessary. + + If generating a new name because the title of the dataset has changed, + specify the existing name, in case the name doesn't need to change + after all. + + The maximum dataset name length is taken account of. + + :param ideal_name: the desired name for the dataset, if its not already + been taken (usually derived by munging the dataset + title) + :type ideal_name: string + :param existing_name: the current name of the dataset - only specify + this if the dataset exists + :type existing_name: string + :param append_type: the type of characters to add to make it unique - + either 'number-sequence' or 'random-hex'. + :type append_type: string + ''' + ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH] + if existing_name == ideal_name: + return ideal_name + if append_type == 'number-sequence': + MAX_NUMBER_APPENDED = 999 + APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED)) + elif append_type == 'random-hex': + APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations else: - return name + raise NotImplementedError('append_type cannot be %s' % append_type) + # Find out which package names have been taken. Restrict it to names + # derived from the ideal name plus and numbers added + like_q = u'%s%%' % \ + ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + name_results = Session.query(Package.name)\ + .filter(Package.name.ilike(like_q))\ + .all() + taken = set([name_result[0] for name_result in name_results]) + if existing_name and existing_name in taken: + taken.remove(existing_name) + if ideal_name not in taken: + # great, the ideal name is available + return ideal_name + elif existing_name and existing_name.startswith(ideal_name): + # the ideal name is not available, but its an existing dataset with + # a name based on the ideal one, so there's no point changing it to + # a different number + return existing_name + elif append_type == 'number-sequence': + # find the next available number + counter = 1 + while counter <= MAX_NUMBER_APPENDED: + candidate_name = \ + ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \ + str(counter) + if candidate_name not in taken: + return candidate_name + counter = counter + 1 + return None + elif append_type == 'random-hex': + return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \ + str(uuid.uuid4())[:APPEND_MAX_CHARS] def _save_gather_error(self, message, job):