Merge branch 'master' of github.com:ckan/ckanext-harvest into 180-fix-modified-date

This commit is contained in:
David Read 2015-11-04 09:35:23 +00:00
commit 46e7e182f5
5 changed files with 306 additions and 156 deletions

View File

@ -148,15 +148,15 @@ def _check_for_existing_jobs(context, source_id):
return exist return exist
def harvest_object_create(context, data_dict): def harvest_object_create(context, data_dict):
""" Create a new harvest object ''' Create a new harvest object
:type guid: string (optional) :type guid: string (optional)
:type content: string (optional) :type content: string (optional)
:type job_id: string :type job_id: string
:type source_id: string (optional) :type source_id: string (optional)
:type package_id: string (optional) :type package_id: string (optional)
:type extras: dict (optional) :type extras: dict (optional)
""" '''
check_access('harvest_object_create', context, data_dict) check_access('harvest_object_create', context, data_dict)
data, errors = _validate(data_dict, harvest_object_create_schema(), context) data, errors = _validate(data_dict, harvest_object_create_schema(), context)

View File

@ -2,12 +2,11 @@ import logging
from ckan import plugins as p from ckan import plugins as p
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def harvest_source_delete(context, data_dict): def harvest_source_delete(context, data_dict):
''' '''Deletes an existing harvest source
Deletes an existing harvest source
This method just proxies the request to package_delete, This method just proxies the request to package_delete,
which will delete the actual harvest type dataset and the which will delete the actual harvest type dataset and the
@ -15,10 +14,6 @@ def harvest_source_delete(context, data_dict):
:param id: the name or id of the harvest source to delete :param id: the name or id of the harvest source to delete
:type id: string :type id: string
:returns: the newly created harvest source
:rtype: dictionary
''' '''
log.info('Deleting harvest source: %r', data_dict) log.info('Deleting harvest source: %r', data_dict)
@ -28,7 +23,8 @@ def harvest_source_delete(context, data_dict):
if context.get('clear_source', False): if context.get('clear_source', False):
# We need the id, the name won't work # We need the id. The name won't work.
package_dict = p.toolkit.get_action('package_show')(context, data_dict) package_dict = p.toolkit.get_action('package_show')(context, data_dict)
p.toolkit.get_action('harvest_source_clear')(context, {'id': package_dict['id']}) p.toolkit.get_action('harvest_source_clear')(
context, {'id': package_dict['id']})

View File

@ -29,12 +29,13 @@ from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
from ckanext.harvest.logic import HarvestJobExists, NoNewHarvestJobError from ckanext.harvest.logic import HarvestJobExists, NoNewHarvestJobError
from ckanext.harvest.logic.dictization import harvest_job_dictize from ckanext.harvest.logic.dictization import harvest_job_dictize
from ckanext.harvest.logic.action.get import harvest_source_show, harvest_job_list, _get_sources_for_user from ckanext.harvest.logic.action.get import (
harvest_source_show, harvest_job_list, _get_sources_for_user)
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def harvest_source_update(context,data_dict):
def harvest_source_update(context, data_dict):
''' '''
Updates an existing harvest source Updates an existing harvest source
@ -69,10 +70,8 @@ def harvest_source_update(context,data_dict):
type. Should be a serialized as JSON. (optional) type. Should be a serialized as JSON. (optional)
:type config: string :type config: string
:returns: the newly created harvest source :returns: the newly created harvest source
:rtype: dictionary :rtype: dictionary
''' '''
log.info('Updating harvest source: %r', data_dict) log.info('Updating harvest source: %r', data_dict)
@ -92,11 +91,11 @@ def harvest_source_clear(context, data_dict):
:param id: the id of the harvest source to clear :param id: the id of the harvest source to clear
:type id: string :type id: string
''' '''
check_access('harvest_source_clear',context,data_dict)
harvest_source_id = data_dict.get('id', None) check_access('harvest_source_clear', context, data_dict)
harvest_source_id = data_dict.get('id')
source = HarvestSource.get(harvest_source_id) source = HarvestSource.get(harvest_source_id)
if not source: if not source:
@ -110,68 +109,96 @@ def harvest_source_clear(context, data_dict):
model = context['model'] model = context['model']
sql = "select id from related where id in (select related_id from related_dataset where dataset_id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}'));".format(harvest_source_id=harvest_source_id) sql = '''select id from related where id in (
select related_id from related_dataset where dataset_id in (
select package_id from harvest_object
where harvest_source_id = '{harvest_source_id}'));'''.format(
harvest_source_id=harvest_source_id)
result = model.Session.execute(sql) result = model.Session.execute(sql)
ids = [] ids = []
for row in result: for row in result:
ids.append(row[0]) ids.append(row[0])
related_ids = "('" + "','".join(ids) + "')" related_ids = "('" + "','".join(ids) + "')"
sql = '''begin; sql = '''begin;
update package set state = 'to_delete' where id in (select package_id from harvest_object where harvest_source_id = '{harvest_source_id}');'''.format( update package set state = 'to_delete' where id in (
select package_id from harvest_object
where harvest_source_id = '{harvest_source_id}');'''.format(
harvest_source_id=harvest_source_id) harvest_source_id=harvest_source_id)
# CKAN-2.3 or above: delete resource views, resource revisions & resources # CKAN-2.3 or above: delete resource views, resource revisions & resources
if toolkit.check_ckan_version(min_version='2.3'): if toolkit.check_ckan_version(min_version='2.3'):
sql += ''' sql += '''
delete from resource_view where resource_id in (select id from resource where package_id in (select id from package where state = 'to_delete' )); delete from resource_view where resource_id in (
delete from resource_revision where package_id in (select id from package where state = 'to_delete' ); select id from resource where package_id in (
delete from resource where package_id in (select id from package where state = 'to_delete' ); select id from package where state = 'to_delete'));
delete from resource_revision where package_id in (
select id from package where state = 'to_delete');
delete from resource where package_id in (
select id from package where state = 'to_delete');
''' '''
# Backwards-compatibility: support ResourceGroup (pre-CKAN-2.3) # Backwards-compatibility: support ResourceGroup (pre-CKAN-2.3)
else: else:
sql += ''' sql += '''
delete from resource_revision where resource_group_id in delete from resource_revision where resource_group_id in (
(select id from resource_group where package_id in select id from resource_group where package_id in (
(select id from package where state = 'to_delete')); select id from package where state = 'to_delete'));
delete from resource where resource_group_id in delete from resource where resource_group_id in (
(select id from resource_group where package_id in select id from resource_group where package_id in (
(select id from package where state = 'to_delete')); select id from package where state = 'to_delete'));
delete from resource_group_revision where package_id in delete from resource_group_revision where package_id in (
(select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from resource_group where package_id in delete from resource_group where package_id in (
(select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
''' '''
# CKAN pre-2.5: authz models were removed in migration 078 # CKAN pre-2.5: authz models were removed in migration 078
if toolkit.check_ckan_version(max_version='2.4.99'): if toolkit.check_ckan_version(max_version='2.4.99'):
sql += ''' sql += '''
delete from package_role where package_id in delete from package_role where package_id in (
(select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from user_object_role where id not in delete from user_object_role where id not in (
(select user_object_role_id from package_role) and context = 'Package'; select user_object_role_id from package_role)
and context = 'Package';
''' '''
sql += ''' sql += '''
delete from harvest_object_error where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); delete from harvest_object_error where harvest_object_id in (
delete from harvest_object_extra where harvest_object_id in (select id from harvest_object where harvest_source_id = '{harvest_source_id}'); select id from harvest_object
where harvest_source_id = '{harvest_source_id}');
delete from harvest_object_extra where harvest_object_id in (
select id from harvest_object
where harvest_source_id = '{harvest_source_id}');
delete from harvest_object where harvest_source_id = '{harvest_source_id}'; delete from harvest_object where harvest_source_id = '{harvest_source_id}';
delete from harvest_gather_error where harvest_job_id in (select id from harvest_job where source_id = '{harvest_source_id}'); delete from harvest_gather_error where harvest_job_id in (
select id from harvest_job where source_id = '{harvest_source_id}');
delete from harvest_job where source_id = '{harvest_source_id}'; delete from harvest_job where source_id = '{harvest_source_id}';
delete from package_tag_revision where package_id in (select id from package where state = 'to_delete'); delete from package_tag_revision where package_id in (
delete from member_revision where table_id in (select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from package_extra_revision where package_id in (select id from package where state = 'to_delete'); delete from member_revision where table_id in (
delete from package_revision where id in (select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from package_tag where package_id in (select id from package where state = 'to_delete'); delete from package_extra_revision where package_id in (
delete from package_extra where package_id in (select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from package_relationship_revision where subject_package_id in (select id from package where state = 'to_delete'); delete from package_revision where id in (
delete from package_relationship_revision where object_package_id in (select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from package_relationship where subject_package_id in (select id from package where state = 'to_delete'); delete from package_tag where package_id in (
delete from package_relationship where object_package_id in (select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from member where table_id in (select id from package where state = 'to_delete'); delete from package_extra where package_id in (
delete from related_dataset where dataset_id in (select id from package where state = 'to_delete'); select id from package where state = 'to_delete');
delete from package_relationship_revision where subject_package_id in (
select id from package where state = 'to_delete');
delete from package_relationship_revision where object_package_id in (
select id from package where state = 'to_delete');
delete from package_relationship where subject_package_id in (
select id from package where state = 'to_delete');
delete from package_relationship where object_package_id in (
select id from package where state = 'to_delete');
delete from member where table_id in (
select id from package where state = 'to_delete');
delete from related_dataset where dataset_id in (
select id from package where state = 'to_delete');
delete from related where id in {related_ids}; delete from related where id in {related_ids};
delete from package where id in (select id from package where state = 'to_delete'); delete from package where id in (
select id from package where state = 'to_delete');
commit; commit;
'''.format( '''.format(
harvest_source_id=harvest_source_id, related_ids=related_ids) harvest_source_id=harvest_source_id, related_ids=related_ids)
@ -183,7 +210,8 @@ def harvest_source_clear(context, data_dict):
return {'id': harvest_source_id} return {'id': harvest_source_id}
def harvest_source_index_clear(context,data_dict):
def harvest_source_index_clear(context, data_dict):
''' '''
Clears all datasets, jobs and objects related to a harvest source, but Clears all datasets, jobs and objects related to a harvest source, but
keeps the source itself. This is useful to clean history of long running keeps the source itself. This is useful to clean history of long running
@ -193,8 +221,8 @@ def harvest_source_index_clear(context,data_dict):
:type id: string :type id: string
''' '''
check_access('harvest_source_clear',context,data_dict) check_access('harvest_source_clear', context, data_dict)
harvest_source_id = data_dict.get('id',None) harvest_source_id = data_dict.get('id')
source = HarvestSource.get(harvest_source_id) source = HarvestSource.get(harvest_source_id)
if not source: if not source:
@ -204,8 +232,8 @@ def harvest_source_index_clear(context,data_dict):
harvest_source_id = source.id harvest_source_id = source.id
conn = make_connection() conn = make_connection()
query = ''' +%s:"%s" +site_id:"%s" ''' % ('harvest_source_id', harvest_source_id, query = ''' +%s:"%s" +site_id:"%s" ''' % (
config.get('ckan.site_id')) 'harvest_source_id', harvest_source_id, config.get('ckan.site_id'))
try: try:
conn.delete_query(query) conn.delete_query(query)
if asbool(config.get('ckan.search.solr_commit', 'true')): if asbool(config.get('ckan.search.solr_commit', 'true')):
@ -240,17 +268,17 @@ def harvest_objects_import(context, data_dict):
:type package_id: string :type package_id: string
''' '''
log.info('Harvest objects import: %r', data_dict) log.info('Harvest objects import: %r', data_dict)
check_access('harvest_objects_import',context,data_dict) check_access('harvest_objects_import', context, data_dict)
model = context['model'] model = context['model']
session = context['session'] session = context['session']
source_id = data_dict.get('source_id',None) source_id = data_dict.get('source_id')
harvest_object_id = data_dict.get('harvest_object_id',None) harvest_object_id = data_dict.get('harvest_object_id')
package_id_or_name = data_dict.get('package_id',None) package_id_or_name = data_dict.get('package_id')
segments = context.get('segments',None) segments = context.get('segments')
join_datasets = context.get('join_datasets',True) join_datasets = context.get('join_datasets', True)
if source_id: if source_id:
source = HarvestSource.get(source_id) source = HarvestSource.get(source_id)
@ -262,43 +290,48 @@ def harvest_objects_import(context, data_dict):
log.warn('Harvest source %s is not active.', source_id) log.warn('Harvest source %s is not active.', source_id)
raise Exception('This harvest source is not active') raise Exception('This harvest source is not active')
last_objects_ids = session.query(HarvestObject.id) \ last_objects_ids = \
.join(HarvestSource) \ session.query(HarvestObject.id) \
.filter(HarvestObject.source==source) \ .join(HarvestSource) \
.filter(HarvestObject.current==True) .filter(HarvestObject.source == source) \
.filter(HarvestObject.current == True)
elif harvest_object_id: elif harvest_object_id:
last_objects_ids = session.query(HarvestObject.id) \ last_objects_ids = \
.filter(HarvestObject.id==harvest_object_id) session.query(HarvestObject.id) \
.filter(HarvestObject.id == harvest_object_id)
elif package_id_or_name: elif package_id_or_name:
last_objects_ids = session.query(HarvestObject.id) \ last_objects_ids = \
.join(Package) \ session.query(HarvestObject.id) \
.filter(HarvestObject.current==True) \ .join(Package) \
.filter(Package.state==u'active') \ .filter(HarvestObject.current == True) \
.filter(or_(Package.id==package_id_or_name, .filter(Package.state == u'active') \
Package.name==package_id_or_name)) .filter(or_(Package.id == package_id_or_name,
Package.name == package_id_or_name))
join_datasets = False join_datasets = False
else: else:
last_objects_ids = session.query(HarvestObject.id) \ last_objects_ids = \
.filter(HarvestObject.current==True) session.query(HarvestObject.id) \
.filter(HarvestObject.current == True)
if join_datasets: if join_datasets:
last_objects_ids = last_objects_ids.join(Package) \ last_objects_ids = last_objects_ids.join(Package) \
.filter(Package.state==u'active') .filter(Package.state == u'active')
last_objects_ids = last_objects_ids.all() last_objects_ids = last_objects_ids.all()
last_objects_count = 0 last_objects_count = 0
for obj_id in last_objects_ids: for obj_id in last_objects_ids:
if segments and str(hashlib.md5(obj_id[0]).hexdigest())[0] not in segments: if segments and \
str(hashlib.md5(obj_id[0]).hexdigest())[0] not in segments:
continue continue
obj = session.query(HarvestObject).get(obj_id) obj = session.query(HarvestObject).get(obj_id)
for harvester in PluginImplementations(IHarvester): for harvester in PluginImplementations(IHarvester):
if harvester.info()['name'] == obj.source.type: if harvester.info()['name'] == obj.source.type:
if hasattr(harvester,'force_import'): if hasattr(harvester, 'force_import'):
harvester.force_import = True harvester.force_import = True
harvester.import_stage(obj) harvester.import_stage(obj)
break break
@ -306,7 +339,8 @@ def harvest_objects_import(context, data_dict):
log.info('Harvest objects imported: %s', last_objects_count) log.info('Harvest objects imported: %s', last_objects_count)
return last_objects_count return last_objects_count
def _caluclate_next_run(frequency):
def _calculate_next_run(frequency):
now = datetime.datetime.utcnow() now = datetime.datetime.utcnow()
if frequency == 'ALWAYS': if frequency == 'ALWAYS':
@ -318,7 +352,7 @@ def _caluclate_next_run(frequency):
if frequency == 'DAILY': if frequency == 'DAILY':
return now + datetime.timedelta(days=1) return now + datetime.timedelta(days=1)
if frequency == 'MONTHLY': if frequency == 'MONTHLY':
if now.month in (4,6,9,11): if now.month in (4, 6, 9, 11):
days = 30 days = 30
elif now.month == 2: elif now.month == 2:
if now.year % 4 == 0: if now.year % 4 == 0:
@ -341,19 +375,20 @@ def _make_scheduled_jobs(context, data_dict):
data_dict = {'source_id': source.id} data_dict = {'source_id': source.id}
try: try:
get_action('harvest_job_create')(context, data_dict) get_action('harvest_job_create')(context, data_dict)
except HarvestJobExists, e: except HarvestJobExists:
log.info('Trying to rerun job for %s skipping' % source.id) log.info('Trying to rerun job for %s skipping' % source.id)
source.next_run = _caluclate_next_run(source.frequency) source.next_run = _calculate_next_run(source.frequency)
source.save() source.save()
def harvest_jobs_run(context,data_dict):
def harvest_jobs_run(context, data_dict):
log.info('Harvest job run: %r', data_dict) log.info('Harvest job run: %r', data_dict)
check_access('harvest_jobs_run',context,data_dict) check_access('harvest_jobs_run', context, data_dict)
session = context['session'] session = context['session']
source_id = data_dict.get('source_id',None) source_id = data_dict.get('source_id')
if not source_id: if not source_id:
_make_scheduled_jobs(context, data_dict) _make_scheduled_jobs(context, data_dict)
@ -361,38 +396,41 @@ def harvest_jobs_run(context,data_dict):
context['return_objects'] = False context['return_objects'] = False
# Flag finished jobs as such # Flag finished jobs as such
jobs = harvest_job_list(context,{'source_id':source_id,'status':u'Running'}) jobs = harvest_job_list(
context, {'source_id': source_id, 'status': u'Running'})
if len(jobs): if len(jobs):
for job in jobs: for job in jobs:
if job['gather_finished']: if job['gather_finished']:
objects = session.query(HarvestObject.id) \ objects = \
.filter(HarvestObject.harvest_job_id==job['id']) \ session.query(HarvestObject.id) \
.filter(and_((HarvestObject.state!=u'COMPLETE'), .filter(HarvestObject.harvest_job_id == job['id']) \
(HarvestObject.state!=u'ERROR'))) \ .filter(and_((HarvestObject.state != u'COMPLETE'),
.order_by(HarvestObject.import_finished.desc()) (HarvestObject.state != u'ERROR'))) \
.order_by(HarvestObject.import_finished.desc())
if objects.count() == 0: if objects.count() == 0:
job_obj = HarvestJob.get(job['id']) job_obj = HarvestJob.get(job['id'])
job_obj.status = u'Finished' job_obj.status = u'Finished'
last_object = session.query(HarvestObject) \ last_object = session.query(HarvestObject) \
.filter(HarvestObject.harvest_job_id==job['id']) \ .filter(HarvestObject.harvest_job_id == job['id']) \
.filter(HarvestObject.import_finished!=None) \ .filter(HarvestObject.import_finished != None) \
.order_by(HarvestObject.import_finished.desc()) \ .order_by(HarvestObject.import_finished.desc()) \
.first() .first()
if last_object: if last_object:
job_obj.finished = last_object.import_finished job_obj.finished = last_object.import_finished
job_obj.save() job_obj.save()
# Reindex the harvest source dataset so it has the latest # Reindex the harvest source dataset so it has the latest
# status # status
get_action('harvest_source_reindex')(context, get_action('harvest_source_reindex')(
{'id': job_obj.source.id}) context, {'id': job_obj.source.id})
# resubmit old redis tasks # resubmit old redis tasks
resubmit_jobs() resubmit_jobs()
# Check if there are pending harvest jobs # Check if there are pending harvest jobs
jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'}) jobs = harvest_job_list(
context, {'source_id': source_id, 'status': u'New'})
if len(jobs) == 0: if len(jobs) == 0:
log.info('No new harvest jobs.') log.info('No new harvest jobs.')
raise NoNewHarvestJobError('There are no new harvesting jobs') raise NoNewHarvestJobError('There are no new harvesting jobs')
@ -402,7 +440,7 @@ def harvest_jobs_run(context,data_dict):
sent_jobs = [] sent_jobs = []
for job in jobs: for job in jobs:
context['detailed'] = False context['detailed'] = False
source = harvest_source_show(context,{'id':job['source_id']}) source = harvest_source_show(context, {'id': job['source_id']})
if source['active']: if source['active']:
job_obj = HarvestJob.get(job['id']) job_obj = HarvestJob.get(job['id'])
job_obj.status = job['status'] = u'Running' job_obj.status = job['status'] = u'Running'
@ -431,7 +469,7 @@ def harvest_job_abort(context, data_dict):
model = context['model'] model = context['model']
source_id = data_dict.get('source_id', None) source_id = data_dict.get('source_id')
source = harvest_source_show(context, {'id': source_id}) source = harvest_source_show(context, {'id': source_id})
# HarvestJob set status to 'Finished' # HarvestJob set status to 'Finished'
@ -484,20 +522,22 @@ def harvest_sources_reindex(context, data_dict):
model = context['model'] model = context['model']
packages = model.Session.query(model.Package) \ packages = model.Session.query(model.Package) \
.filter(model.Package.type==DATASET_TYPE_NAME) \ .filter(model.Package.type == DATASET_TYPE_NAME) \
.filter(model.Package.state==u'active') \ .filter(model.Package.state == u'active') \
.all() .all()
package_index = PackageSearchIndex() package_index = PackageSearchIndex()
reindex_context = {'defer_commit': True} reindex_context = {'defer_commit': True}
for package in packages: for package in packages:
get_action('harvest_source_reindex')(reindex_context, {'id': package.id}) get_action('harvest_source_reindex')(
reindex_context, {'id': package.id})
package_index.commit() package_index.commit()
return True return True
@logic.side_effect_free @logic.side_effect_free
def harvest_source_reindex(context, data_dict): def harvest_source_reindex(context, data_dict):
'''Reindex a single harvest source''' '''Reindex a single harvest source'''
@ -508,8 +548,8 @@ def harvest_source_reindex(context, data_dict):
if 'extras_as_string'in context: if 'extras_as_string'in context:
del context['extras_as_string'] del context['extras_as_string']
context.update({'ignore_auth': True}) context.update({'ignore_auth': True})
package_dict = logic.get_action('harvest_source_show')(context, package_dict = logic.get_action('harvest_source_show')(
{'id': harvest_source_id}) context, {'id': harvest_source_id})
log.debug('Updating search index for harvest source: {0}'.format( log.debug('Updating search index for harvest source: {0}'.format(
package_dict.get('name') or harvest_source_id)) package_dict.get('name') or harvest_source_id))

View File

@ -14,22 +14,25 @@ from ckan.lib.navl.validators import keep_extras
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
def harvest_source_id_exists(value, context): def harvest_source_id_exists(value, context):
result = HarvestSource.get(value,None) result = HarvestSource.get(value)
if not result: if not result:
raise Invalid('Harvest Source with id %r does not exist.' % str(value)) raise Invalid('Harvest Source with id %r does not exist.' % str(value))
return value return value
def harvest_job_exists(value, context): def harvest_job_exists(value, context):
"""Check if a harvest job exists and returns the model if it does""" '''Check if a harvest job exists and returns the model if it does'''
result = HarvestJob.get(value, None) result = HarvestJob.get(value)
if not result: if not result:
raise Invalid('Harvest Job with id %r does not exist.' % str(value)) raise Invalid('Harvest Job with id %r does not exist.' % str(value))
return result return result
def _normalize_url(url): def _normalize_url(url):
o = urlparse.urlparse(url) o = urlparse.urlparse(url)
@ -48,60 +51,81 @@ def _normalize_url(url):
path = o.path.rstrip('/') path = o.path.rstrip('/')
check_url = urlparse.urlunparse(( check_url = urlparse.urlunparse((
o.scheme, o.scheme,
netloc, netloc,
path, path,
None,None,None)) None, None, None))
return check_url return check_url
def harvest_source_url_validator(key,data,errors,context):
package = context.get("package") def harvest_source_url_validator(key, data, errors, context):
'''Validate the provided harvest source URL
Checks that the URL & config combination are unique to this HarvestSource.
'''
package = context.get('package')
if package: if package:
package_id = package.id package_id = package.id
else: else:
package_id = data.get(key[:-1] + ("id",)) package_id = data.get(key[:-1] + ('id',))
try:
new_config = data.get(key[:-1] + ('config',))
except:
new_config = None
new_url = _normalize_url(data[key]) new_url = _normalize_url(data[key])
#pkg_id = data.get(('id',),'')
q = model.Session.query(model.Package.url, model.Package.state) \ q = model.Session.query(model.Package.id, model.Package.url) \
.filter(model.Package.type==DATASET_TYPE_NAME) .filter(model.Package.type == DATASET_TYPE_NAME)
if package_id: if package_id:
# When editing a source we need to avoid its own URL # When editing a source we need to avoid its own URL
q = q.filter(model.Package.id!=package_id) q = q.filter(model.Package.id != package_id)
existing_sources = q.all() existing_sources = q.all()
for url, state in existing_sources: for id_, url in existing_sources:
url = _normalize_url(url) url = _normalize_url(url)
if url == new_url: conf = model.Session.query(HarvestSource.config).filter(
raise Invalid('There already is a Harvest Source for this URL: %s' % data[key]) HarvestSource.id == id_).first()
if conf:
conf = conf[0]
else:
conf = None
if url == new_url and conf == new_config:
raise Invalid('There already is a Harvest Source for this URL (& '
'config): url=%s config=%s' % (new_url, new_config))
return data[key] return data[key]
def harvest_source_type_exists(value,context):
#TODO: use new description interface def harvest_source_type_exists(value, context):
# TODO: use new description interface
# Get all the registered harvester types # Get all the registered harvester types
available_types = [] available_types = []
for harvester in PluginImplementations(IHarvester): for harvester in PluginImplementations(IHarvester):
info = harvester.info() info = harvester.info()
if not info or 'name' not in info: if not info or 'name' not in info:
log.error('Harvester %r does not provide the harvester name in the info response' % str(harvester)) log.error('Harvester %s does not provide the harvester name in '
'the info response' % harvester)
continue continue
available_types.append(info['name']) available_types.append(info['name'])
if not value in available_types: if not value in available_types:
raise Invalid('Unknown harvester type: %s. Have you registered a harvester for this type?' % value) raise Invalid('Unknown harvester type: %s. Have you registered a '
'harvester for this type?' % value)
return value return value
def harvest_source_config_validator(key,data,errors,context):
harvester_type = data.get(('source_type',),'') def harvest_source_config_validator(key, data, errors, context):
harvester_type = data.get(('source_type',), '')
for harvester in PluginImplementations(IHarvester): for harvester in PluginImplementations(IHarvester):
info = harvester.info() info = harvester.info()
if info['name'] == harvester_type: if info['name'] == harvester_type:
@ -109,21 +133,24 @@ def harvest_source_config_validator(key,data,errors,context):
try: try:
return harvester.validate_config(data[key]) return harvester.validate_config(data[key])
except Exception, e: except Exception, e:
raise Invalid('Error parsing the configuration options: %s' % str(e)) raise Invalid('Error parsing the configuration options: %s'
% e)
else: else:
return data[key] return data[key]
def keep_not_empty_extras(key, data, errors, context): def keep_not_empty_extras(key, data, errors, context):
extras = data.pop(key, {}) extras = data.pop(key, {})
for extras_key, value in extras.iteritems(): for extras_key, value in extras.iteritems():
if value: if value:
data[key[:-1] + (extras_key,)] = value data[key[:-1] + (extras_key,)] = value
def harvest_source_extra_validator(key,data,errors,context):
harvester_type = data.get(('source_type',),'')
#gather all extra fields to use as whitelist of what def harvest_source_extra_validator(key, data, errors, context):
#can be added to top level data_dict harvester_type = data.get(('source_type',), '')
# gather all extra fields to use as whitelist of what
# can be added to top level data_dict
all_extra_fields = set() all_extra_fields = set()
for harvester in PluginImplementations(IHarvester): for harvester in PluginImplementations(IHarvester):
if not hasattr(harvester, 'extra_schema'): if not hasattr(harvester, 'extra_schema'):
@ -142,7 +169,7 @@ def harvest_source_extra_validator(key,data,errors,context):
extra_data, extra_errors = validate(data.get(key, {}), extra_schema) extra_data, extra_errors = validate(data.get(key, {}), extra_schema)
for key in extra_data.keys(): for key in extra_data.keys():
#only allow keys that appear in at least one harvester # only allow keys that appear in at least one harvester
if key not in all_extra_fields: if key not in all_extra_fields:
extra_data.pop(key) extra_data.pop(key)
@ -152,8 +179,8 @@ def harvest_source_extra_validator(key,data,errors,context):
for key, value in extra_errors.iteritems(): for key, value in extra_errors.iteritems():
errors[(key,)] = value errors[(key,)] = value
## need to get config out of extras as __extra runs # need to get config out of extras as __extra runs
## after rest of validation # after rest of validation
package_extras = data.get(('extras',), []) package_extras = data.get(('extras',), [])
for num, extra in enumerate(list(package_extras)): for num, extra in enumerate(list(package_extras)):
@ -177,21 +204,24 @@ def harvest_source_extra_validator(key,data,errors,context):
if package_extras: if package_extras:
data[('extras',)] = package_extras data[('extras',)] = package_extras
def harvest_source_convert_from_config(key, data, errors, context):
    '''Expand the JSON config string stored under ``key`` into
    individual top-level entries of the flattened data dict.

    Does nothing when the config value is empty or missing.
    '''
    config = data[key]
    if config:
        config_dict = json.loads(config)
        # distinct loop variable so the ``key`` parameter is not shadowed
        # (the original rebound ``key`` inside the loop); .items() works
        # on both Python 2 and 3
        for config_key, value in config_dict.items():
            data[(config_key,)] = value
def harvest_source_active_validator(value, context):
    '''Coerce the ``active`` field to a boolean.

    A string counts as active only when it equals 'true'
    (case-insensitive); any non-string value is passed through
    ``bool()``.
    '''
    if isinstance(value, basestring):
        # only the literal string 'true' (any case) is truthy here
        return value.lower() == 'true'
    return bool(value)
def harvest_source_frequency_exists(value): def harvest_source_frequency_exists(value):
if value == '': if value == '':
value = 'MANUAL' value = 'MANUAL'
@ -205,6 +235,7 @@ def dataset_type_exists(value):
value = DATASET_TYPE_NAME value = DATASET_TYPE_NAME
return value return value
def harvest_object_extras_validator(value, context): def harvest_object_extras_validator(value, context):
if not isinstance(value, dict): if not isinstance(value, dict):
raise Invalid('extras must be a dict') raise Invalid('extras must be a dict')

View File

@ -2,6 +2,7 @@ import json
import copy import copy
import factories import factories
import unittest import unittest
from nose.tools import assert_equal, assert_raises
try: try:
from ckan.tests import factories as ckan_factories from ckan.tests import factories as ckan_factories
@ -12,6 +13,7 @@ except ImportError:
from ckan import plugins as p from ckan import plugins as p
from ckan.plugins import toolkit from ckan.plugins import toolkit
from ckan import model from ckan import model
import ckan.lib.search as search
from ckanext.harvest.interfaces import IHarvester from ckanext.harvest.interfaces import IHarvester
import ckanext.harvest.model as harvest_model import ckanext.harvest.model as harvest_model
@ -127,6 +129,32 @@ class FunctionalTestBaseWithoutClearBetweenTests(object):
pass pass
# Canonical harvest-source fixture shared by the action tests below.
# NOTE(review): this is a module-level mutable dict -- tests that modify
# it (e.g. changing 'name' or 'url') should work on a copy to avoid
# leaking state between tests.
SOURCE_DICT = {
    "url": "http://test.action.com",
    "name": "test-source-action",
    "title": "Test source action",
    "notes": "Test source action desc",
    "source_type": "test-for-action",
    "frequency": "MANUAL",
    "config": json.dumps({"custom_option": ["a", "b"]})
}
class ActionBase(object):
    '''Base class for harvest action tests: loads the test harvester
    plugin for the lifetime of the class and resets the database
    before each test method.
    '''
    @classmethod
    def setup_class(cls):
        # Guard against the plugin already having been loaded by a
        # previous test class in the same process.
        if not p.plugin_loaded('test_action_harvester'):
            p.load('test_action_harvester')

    def setup(self):
        # Start every test from a clean CKAN database with the
        # harvest tables created.
        reset_db()
        harvest_model.setup()

    @classmethod
    def teardown_class(cls):
        p.unload('test_action_harvester')
class HarvestSourceActionBase(FunctionalTestBaseWithoutClearBetweenTests): class HarvestSourceActionBase(FunctionalTestBaseWithoutClearBetweenTests):
@classmethod @classmethod
@ -136,15 +164,7 @@ class HarvestSourceActionBase(FunctionalTestBaseWithoutClearBetweenTests):
cls.sysadmin = ckan_factories.Sysadmin() cls.sysadmin = ckan_factories.Sysadmin()
cls.default_source_dict = { cls.default_source_dict = SOURCE_DICT
"url": "http://test.action.com",
"name": "test-source-action",
"title": "Test source action",
"notes": "Test source action desc",
"source_type": "test-for-action",
"frequency": "MANUAL",
"config": json.dumps({"custom_option": ["a", "b"]})
}
if not p.plugin_loaded('test_action_harvester'): if not p.plugin_loaded('test_action_harvester'):
p.load('test_action_harvester') p.load('test_action_harvester')
@ -287,6 +307,69 @@ class TestHarvestSourceActionUpdate(HarvestSourceActionBase):
assert source.type == source_dict['source_type'] assert source.type == source_dict['source_type']
class TestActions(ActionBase):
    '''Tests for assorted harvest action functions.'''

    def test_harvest_source_clear(self):
        '''harvest_source_clear deletes the source's jobs, objects and
        harvested packages, but keeps the source itself.'''
        source = factories.HarvestSourceObj(**SOURCE_DICT)
        job = factories.HarvestJobObj(source=source)
        dataset = ckan_factories.Dataset()
        object_ = factories.HarvestObjectObj(job=job, source=source,
                                             package_id=dataset['id'])

        context = {'model': model, 'session': model.Session,
                   'ignore_auth': True, 'user': ''}
        result = toolkit.get_action('harvest_source_clear')(
            context, {'id': source.id})

        assert_equal(result, {'id': source.id})
        source = harvest_model.HarvestSource.get(source.id)
        assert source
        assert_equal(harvest_model.HarvestJob.get(job.id), None)
        assert_equal(harvest_model.HarvestObject.get(object_.id), None)
        assert_equal(model.Package.get(dataset['id']), None)

    def test_harvest_source_create_twice_with_unique_url(self):
        # don't use factory because it looks for the existing source
        # deep-copy the shared fixture so the mutations below don't
        # leak into other tests (it was previously mutated in place)
        data_dict = copy.deepcopy(SOURCE_DICT)
        site_user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {})['name']

        toolkit.get_action('harvest_source_create')(
            {'user': site_user}, data_dict)

        data_dict['name'] = 'another-source1'
        data_dict['url'] = 'http://another-url'
        toolkit.get_action('harvest_source_create')(
            {'user': site_user}, data_dict)

    def test_harvest_source_create_twice_with_same_url(self):
        # don't use factory because it looks for the existing source
        # deep-copy so the 'name' change below doesn't leak into
        # other tests via the shared module-level fixture
        data_dict = copy.deepcopy(SOURCE_DICT)
        site_user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {})['name']

        toolkit.get_action('harvest_source_create')(
            {'user': site_user}, data_dict)

        # same url with a different name must be rejected
        data_dict['name'] = 'another-source2'
        assert_raises(toolkit.ValidationError,
                      toolkit.get_action('harvest_source_create'),
                      {'user': site_user}, data_dict)

    def test_harvest_source_create_twice_with_unique_url_and_config(self):
        # don't use factory because it looks for the existing source
        # deep-copy so mutations don't leak into other tests
        data_dict = copy.deepcopy(SOURCE_DICT)
        site_user = toolkit.get_action('get_site_user')(
            {'model': model, 'ignore_auth': True}, {})['name']

        toolkit.get_action('harvest_source_create')(
            {'user': site_user}, data_dict)

        data_dict['name'] = 'another-source3'
        data_dict['config'] = '{"something": "new"}'
        toolkit.get_action('harvest_source_create')(
            {'user': site_user}, data_dict)
class TestHarvestObject(unittest.TestCase): class TestHarvestObject(unittest.TestCase):
@classmethod @classmethod
def setup_class(cls): def setup_class(cls):