Generating unique names improved
* Harvesters that change the name when the title changes have had a problem when the change is small and a number was unnecessarily appended. e.g. "Trees "->"Trees" meant _gen_new_name("Trees") returned "trees1". Now you can specify the existing value and it will return that if it still holds. * Maximum dataset name length is now adhered to. * To make a name unique, a sequential number is now added, since for users that is more understandable and pleasant. However hex digits are still an option, for those that want to harvest concurrently.
This commit is contained in:
parent
89b6ad2ce1
commit
be3e88086a
|
@ -8,7 +8,7 @@ from pylons import config
|
|||
|
||||
from ckan import plugins as p
|
||||
from ckan import model
|
||||
from ckan.model import Session, Package
|
||||
from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH
|
||||
from ckan.logic import ValidationError, NotFound, get_action
|
||||
|
||||
from ckan.logic.schema import default_create_package_schema
|
||||
|
@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin):
|
|||
|
||||
_user_name = None
|
||||
|
||||
def _gen_new_name(self, title):
|
||||
@classmethod
|
||||
def _gen_new_name(cls, title, existing_name=None,
|
||||
append_type='number-sequence'):
|
||||
'''
|
||||
Creates a URL friendly name from a title
|
||||
Returns a 'name' for the dataset (URL friendly), based on the title.
|
||||
|
||||
If the name already exists, it will add some random characters at the end
|
||||
If the ideal name is already used, it will append a number to it to
|
||||
ensure it is unique.
|
||||
|
||||
If generating a new name because the title of the dataset has changed,
|
||||
specify the existing name, in case the name doesn't need to change
|
||||
after all.
|
||||
|
||||
:param existing_name: the current name of the dataset - only specify
|
||||
this if the dataset exists
|
||||
:type existing_name: string
|
||||
:param append_type: the type of characters to add to make it unique -
|
||||
either 'number-sequence' or 'random-hex'.
|
||||
:type append_type: string
|
||||
'''
|
||||
|
||||
name = munge_title_to_name(title).replace('_', '-')
|
||||
while '--' in name:
|
||||
name = name.replace('--', '-')
|
||||
pkg_obj = Session.query(Package).filter(Package.name == name).first()
|
||||
if pkg_obj:
|
||||
return name + str(uuid.uuid4())[:5]
|
||||
ideal_name = munge_title_to_name(title)
|
||||
ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes
|
||||
return cls._ensure_name_is_unique(ideal_name,
|
||||
existing_name=existing_name,
|
||||
append_type=append_type)
|
||||
|
||||
@staticmethod
|
||||
def _ensure_name_is_unique(ideal_name, existing_name=None,
|
||||
append_type='number-sequence'):
|
||||
'''
|
||||
Returns a dataset name based on the ideal_name, only it will be
|
||||
guaranteed to be different than all the other datasets, by adding a
|
||||
number on the end if necessary.
|
||||
|
||||
If generating a new name because the title of the dataset has changed,
|
||||
specify the existing name, in case the name doesn't need to change
|
||||
after all.
|
||||
|
||||
The maximum dataset name length is taken account of.
|
||||
|
||||
:param ideal_name: the desired name for the dataset, if its not already
|
||||
been taken (usually derived by munging the dataset
|
||||
title)
|
||||
:type ideal_name: string
|
||||
:param existing_name: the current name of the dataset - only specify
|
||||
this if the dataset exists
|
||||
:type existing_name: string
|
||||
:param append_type: the type of characters to add to make it unique -
|
||||
either 'number-sequence' or 'random-hex'.
|
||||
:type append_type: string
|
||||
'''
|
||||
ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
|
||||
if existing_name == ideal_name:
|
||||
return ideal_name
|
||||
if append_type == 'number-sequence':
|
||||
MAX_NUMBER_APPENDED = 999
|
||||
APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
|
||||
elif append_type == 'random-hex':
|
||||
APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations
|
||||
else:
|
||||
return name
|
||||
raise NotImplementedError('append_type cannot be %s' % append_type)
|
||||
# Find out which package names have been taken. Restrict it to names
|
||||
# derived from the ideal name plus and numbers added
|
||||
like_q = u'%s%%' % \
|
||||
ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
|
||||
name_results = Session.query(Package.name)\
|
||||
.filter(Package.name.ilike(like_q))\
|
||||
.all()
|
||||
taken = set([name_result[0] for name_result in name_results])
|
||||
if existing_name and existing_name in taken:
|
||||
taken.remove(existing_name)
|
||||
if ideal_name not in taken:
|
||||
# great, the ideal name is available
|
||||
return ideal_name
|
||||
elif existing_name and existing_name.startswith(ideal_name):
|
||||
# the ideal name is not available, but its an existing dataset with
|
||||
# a name based on the ideal one, so there's no point changing it to
|
||||
# a different number
|
||||
return existing_name
|
||||
elif append_type == 'number-sequence':
|
||||
# find the next available number
|
||||
counter = 1
|
||||
while counter <= MAX_NUMBER_APPENDED:
|
||||
candidate_name = \
|
||||
ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
|
||||
str(counter)
|
||||
if candidate_name not in taken:
|
||||
return candidate_name
|
||||
counter = counter + 1
|
||||
return None
|
||||
elif append_type == 'random-hex':
|
||||
return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
|
||||
str(uuid.uuid4())[:APPEND_MAX_CHARS]
|
||||
|
||||
|
||||
def _save_gather_error(self, message, job):
|
||||
|
|
Loading…
Reference in New Issue