Merge branch 'unique-names-improved'
This commit is contained in:
commit
d950b13400
|
@ -8,7 +8,7 @@ from pylons import config
|
|||
|
||||
from ckan import plugins as p
|
||||
from ckan import model
|
||||
from ckan.model import Session, Package
|
||||
from ckan.model import Session, Package, PACKAGE_NAME_MAX_LENGTH
|
||||
from ckan.logic import ValidationError, NotFound, get_action
|
||||
|
||||
from ckan.logic.schema import default_create_package_schema
|
||||
|
@ -41,21 +41,100 @@ class HarvesterBase(SingletonPlugin):
|
|||
|
||||
_user_name = None
|
||||
|
||||
def _gen_new_name(self, title):
|
||||
@classmethod
|
||||
def _gen_new_name(cls, title, existing_name=None,
|
||||
append_type='number-sequence'):
|
||||
'''
|
||||
Creates a URL friendly name from a title
|
||||
Returns a 'name' for the dataset (URL friendly), based on the title.
|
||||
|
||||
If the name already exists, it will add some random characters at the end
|
||||
If the ideal name is already used, it will append a number to it to
|
||||
ensure it is unique.
|
||||
|
||||
If generating a new name because the title of the dataset has changed,
|
||||
specify the existing name, in case the name doesn't need to change
|
||||
after all.
|
||||
|
||||
:param existing_name: the current name of the dataset - only specify
|
||||
this if the dataset exists
|
||||
:type existing_name: string
|
||||
:param append_type: the type of characters to add to make it unique -
|
||||
either 'number-sequence' or 'random-hex'.
|
||||
:type append_type: string
|
||||
'''
|
||||
|
||||
name = munge_title_to_name(title).replace('_', '-')
|
||||
while '--' in name:
|
||||
name = name.replace('--', '-')
|
||||
pkg_obj = Session.query(Package).filter(Package.name == name).first()
|
||||
if pkg_obj:
|
||||
return name + str(uuid.uuid4())[:5]
|
||||
ideal_name = munge_title_to_name(title)
|
||||
ideal_name = re.sub('-+', '-', ideal_name) # collapse multiple dashes
|
||||
return cls._ensure_name_is_unique(ideal_name,
|
||||
existing_name=existing_name,
|
||||
append_type=append_type)
|
||||
|
||||
@staticmethod
|
||||
def _ensure_name_is_unique(ideal_name, existing_name=None,
|
||||
append_type='number-sequence'):
|
||||
'''
|
||||
Returns a dataset name based on the ideal_name, only it will be
|
||||
guaranteed to be different than all the other datasets, by adding a
|
||||
number on the end if necessary.
|
||||
|
||||
If generating a new name because the title of the dataset has changed,
|
||||
specify the existing name, in case the name doesn't need to change
|
||||
after all.
|
||||
|
||||
The maximum dataset name length is taken account of.
|
||||
|
||||
:param ideal_name: the desired name for the dataset, if its not already
|
||||
been taken (usually derived by munging the dataset
|
||||
title)
|
||||
:type ideal_name: string
|
||||
:param existing_name: the current name of the dataset - only specify
|
||||
this if the dataset exists
|
||||
:type existing_name: string
|
||||
:param append_type: the type of characters to add to make it unique -
|
||||
either 'number-sequence' or 'random-hex'.
|
||||
:type append_type: string
|
||||
'''
|
||||
ideal_name = ideal_name[:PACKAGE_NAME_MAX_LENGTH]
|
||||
if existing_name == ideal_name:
|
||||
return ideal_name
|
||||
if append_type == 'number-sequence':
|
||||
MAX_NUMBER_APPENDED = 999
|
||||
APPEND_MAX_CHARS = len(str(MAX_NUMBER_APPENDED))
|
||||
elif append_type == 'random-hex':
|
||||
APPEND_MAX_CHARS = 5 # 16^5 = 1 million combinations
|
||||
else:
|
||||
return name
|
||||
raise NotImplementedError('append_type cannot be %s' % append_type)
|
||||
# Find out which package names have been taken. Restrict it to names
|
||||
# derived from the ideal name plus and numbers added
|
||||
like_q = u'%s%%' % \
|
||||
ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS]
|
||||
name_results = Session.query(Package.name)\
|
||||
.filter(Package.name.ilike(like_q))\
|
||||
.all()
|
||||
taken = set([name_result[0] for name_result in name_results])
|
||||
if existing_name and existing_name in taken:
|
||||
taken.remove(existing_name)
|
||||
if ideal_name not in taken:
|
||||
# great, the ideal name is available
|
||||
return ideal_name
|
||||
elif existing_name and existing_name.startswith(ideal_name):
|
||||
# the ideal name is not available, but its an existing dataset with
|
||||
# a name based on the ideal one, so there's no point changing it to
|
||||
# a different number
|
||||
return existing_name
|
||||
elif append_type == 'number-sequence':
|
||||
# find the next available number
|
||||
counter = 1
|
||||
while counter <= MAX_NUMBER_APPENDED:
|
||||
candidate_name = \
|
||||
ideal_name[:PACKAGE_NAME_MAX_LENGTH-len(str(counter))] + \
|
||||
str(counter)
|
||||
if candidate_name not in taken:
|
||||
return candidate_name
|
||||
counter = counter + 1
|
||||
return None
|
||||
elif append_type == 'random-hex':
|
||||
return ideal_name[:PACKAGE_NAME_MAX_LENGTH-APPEND_MAX_CHARS] + \
|
||||
str(uuid.uuid4())[:APPEND_MAX_CHARS]
|
||||
|
||||
|
||||
def _save_gather_error(self, message, job):
|
||||
|
|
|
@ -0,0 +1,96 @@
|
|||
import re
|
||||
|
||||
from nose.tools import assert_equal
|
||||
|
||||
from ckanext.harvest.harvesters.base import HarvesterBase
|
||||
try:
|
||||
from ckan.tests import helpers
|
||||
from ckan.tests import factories
|
||||
except ImportError:
|
||||
from ckan.new_tests import helpers
|
||||
from ckan.new_tests import factories
|
||||
|
||||
|
||||
_ensure_name_is_unique = HarvesterBase._ensure_name_is_unique
|
||||
|
||||
|
||||
class TestGenNewName(object):
|
||||
@classmethod
|
||||
def setup_class(cls):
|
||||
helpers.reset_db()
|
||||
|
||||
def test_basic(self):
|
||||
assert_equal(HarvesterBase._gen_new_name('Trees'), 'trees')
|
||||
|
||||
def test_munge(self):
|
||||
assert_equal(
|
||||
HarvesterBase._gen_new_name('Trees and branches - survey.'),
|
||||
'trees-and-branches-survey')
|
||||
|
||||
|
||||
class TestEnsureNameIsUnique(object):
|
||||
def setup(self):
|
||||
helpers.reset_db()
|
||||
|
||||
def test_no_existing_datasets(self):
|
||||
factories.Dataset(name='unrelated')
|
||||
assert_equal(_ensure_name_is_unique('trees'), 'trees')
|
||||
|
||||
def test_existing_dataset(self):
|
||||
factories.Dataset(name='trees')
|
||||
assert_equal(_ensure_name_is_unique('trees'), 'trees1')
|
||||
|
||||
def test_two_existing_datasets(self):
|
||||
factories.Dataset(name='trees')
|
||||
factories.Dataset(name='trees1')
|
||||
assert_equal(_ensure_name_is_unique('trees'), 'trees2')
|
||||
|
||||
def test_no_existing_datasets_and_long_name(self):
|
||||
assert_equal(_ensure_name_is_unique('x'*101), 'x'*100)
|
||||
|
||||
def test_existing_dataset_and_long_name(self):
|
||||
# because PACKAGE_NAME_MAX_LENGTH = 100
|
||||
factories.Dataset(name='x'*100)
|
||||
assert_equal(_ensure_name_is_unique('x'*101), 'x'*99 + '1')
|
||||
|
||||
def test_update_dataset_with_new_name(self):
|
||||
factories.Dataset(name='trees1')
|
||||
assert_equal(_ensure_name_is_unique('tree', existing_name='trees1'),
|
||||
'tree')
|
||||
|
||||
def test_update_dataset_but_with_same_name(self):
|
||||
# this can happen if you remove a trailing space from the title - the
|
||||
# harvester sees the title changed and thinks it should have a new
|
||||
# name, but clearly it can reuse its existing one
|
||||
factories.Dataset(name='trees')
|
||||
factories.Dataset(name='trees1')
|
||||
assert_equal(_ensure_name_is_unique('trees', existing_name='trees'),
|
||||
'trees')
|
||||
|
||||
def test_update_dataset_to_available_shorter_name(self):
|
||||
# this can be handy when if reharvesting, you got duplicates and
|
||||
# managed to purge one set and through a minor title change you can now
|
||||
# lose the appended number. users don't like unnecessary numbers.
|
||||
factories.Dataset(name='trees1')
|
||||
assert_equal(_ensure_name_is_unique('trees', existing_name='trees1'),
|
||||
'trees')
|
||||
|
||||
def test_update_dataset_but_doesnt_change_to_other_number(self):
|
||||
# there's no point changing one number for another though
|
||||
factories.Dataset(name='trees')
|
||||
factories.Dataset(name='trees2')
|
||||
assert_equal(_ensure_name_is_unique('trees', existing_name='trees2'),
|
||||
'trees2')
|
||||
|
||||
def test_update_dataset_with_new_name_with_numbers(self):
|
||||
factories.Dataset(name='trees')
|
||||
factories.Dataset(name='trees2')
|
||||
factories.Dataset(name='frogs')
|
||||
assert_equal(_ensure_name_is_unique('frogs', existing_name='trees2'),
|
||||
'frogs1')
|
||||
|
||||
def test_existing_dataset_appending_hex(self):
|
||||
factories.Dataset(name='trees')
|
||||
name = _ensure_name_is_unique('trees', append_type='random-hex')
|
||||
# e.g. 'trees0b53f'
|
||||
assert re.match('trees[\da-f]{5}', name)
|
Loading…
Reference in New Issue