Generating unique names improved

* Harvesters that change the name when the title changes have had a
  problem when the change is small and a number was unnecessarily
  appended. e.g. "Trees "->"Trees" meant _gen_new_name("Trees") returned
  "trees1". Now you can specify the existing value and it will return
  that if it still holds.
* Maximum dataset name length is now adhered to.
* To make a name unique, a sequential number is now added, since for
  users that is more understandable and pleasant. However hex digits are
  still an option, for those that want to harvest concurrently.
This commit is contained in:
David Read 2015-10-01 17:53:03 +01:00
parent 89b6ad2ce1
commit be3e88086a
1 changed files with 90 additions and 11 deletions

View File

@ -8,7 +8,7 @@ from pylons import config
from ckan import plugins as p from ckan import plugins as p
from ckan import model from ckan import model
from ckan.model import Session, Package from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH
from ckan.logic import ValidationError, NotFound, get_action from ckan.logic import ValidationError, NotFound, get_action
from ckan.logic.schema import default_create_package_schema from ckan.logic.schema import default_create_package_schema
@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin):
_user_name = None _user_name = None
def _gen_new_name(self, title): @classmethod
def _gen_new_name(cls, title, existing_name=None,
append_type='number-sequence'):
''' '''
Creates a URL friendly name from a title Returns a 'name' for the dataset (URL friendly), based on the title.
If the name already exists, it will add some random characters at the end If the ideal name is already used, it will append a number to it to
ensure it is unique.
If generating a new name because the title of the dataset has changed,
specify the existing name, in case the name doesn't need to change
after all.
:param existing_name: the current name of the dataset - only specify
this if the dataset exists
:type existing_name: string
:param append_type: the type of characters to add to make it unique -
either 'number-sequence' or 'random-hex'.
:type append_type: string
''' '''
name = munge_title_to_name(title).replace('_', '-') ideal_name = munge_title_to_name(title)
while '--' in name: ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes
name = name.replace('--', '-') return cls._ensure_name_is_unique(ideal_name,
pkg_obj = Session.query(Package).filter(Package.name == name).first() existing_name=existing_name,
if pkg_obj: append_type=append_type)
return name + str(uuid.uuid4())[:5]
@staticmethod
def _ensure_name_is_unique(ideal_name, existing_name=None,
append_type='number-sequence'):
'''
Returns a dataset name based on the ideal_name, only it will be
guaranteed to be different than all the other datasets, by adding a
number on the end if necessary.
If generating a new name because the title of the dataset has changed,
specify the existing name, in case the name doesn't need to change
after all.
The maximum dataset name length is taken account of.
:param ideal_name: the desired name for the dataset, if its not already
been taken (usually derived by munging the dataset
title)
:type ideal_name: string
:param existing_name: the current name of the dataset - only specify
this if the dataset exists
:type existing_name: string
:param append_type: the type of characters to add to make it unique -
either 'number-sequence' or 'random-hex'.
:type append_type: string
'''
ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
if existing_name == ideal_name:
return ideal_name
if append_type == 'number-sequence':
MAX_NUMBER_APPENDED = 999
APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
elif append_type == 'random-hex':
APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations
else: else:
return name raise NotImplementedError('append_type cannot be %s' % append_type)
# Find out which package names have been taken. Restrict it to names
# derived from the ideal name plus and numbers added
like_q = u'%s%%' % \
ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
name_results = Session.query(Package.name)\
.filter(Package.name.ilike(like_q))\
.all()
taken = set([name_result[0] for name_result in name_results])
if existing_name and existing_name in taken:
taken.remove(existing_name)
if ideal_name not in taken:
# great, the ideal name is available
return ideal_name
elif existing_name and existing_name.startswith(ideal_name):
# the ideal name is not available, but its an existing dataset with
# a name based on the ideal one, so there's no point changing it to
# a different number
return existing_name
elif append_type == 'number-sequence':
# find the next available number
counter = 1
while counter <= MAX_NUMBER_APPENDED:
candidate_name = \
ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
str(counter)
if candidate_name not in taken:
return candidate_name
counter = counter + 1
return None
elif append_type == 'random-hex':
return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
str(uuid.uuid4())[:APPEND_MAX_CHARS]
def _save_gather_error(self, message, job): def _save_gather_error(self, message, job):