harvester-d4science/ckanext/harvest/logic/action/update.py
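
'''
Update-related action functions for the harvesting extension: updating a
harvest source, re-running the import stage over already-fetched harvest
objects, and sending pending harvest jobs to the gather queue.
'''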


import hashlib
import logging

from ckan.plugins import PluginImplementations
from ckanext.harvest.interfaces import IHarvester

from ckan.model import Package
from ckan.logic import NotFound, ValidationError, check_access
from ckan.lib.navl.dictization_functions import validate

from ckanext.harvest.queue import get_gather_publisher
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
from ckanext.harvest.logic.schema import default_harvest_source_schema
from ckanext.harvest.logic.dictization import (harvest_source_dictize,
                                               harvest_object_dictize)
from ckanext.harvest.logic.action.create import _error_summary
from ckanext.harvest.logic.action.get import harvest_source_show, harvest_job_list

log = logging.getLogger(__name__)


def harvest_source_update(context, data_dict):
    check_access('harvest_source_update', context, data_dict)

    model = context['model']
    session = context['session']

    source_id = data_dict.get('id')
    schema = context.get('schema') or default_harvest_source_schema()

    log.info('Harvest source %s update: %r', source_id, data_dict)
    source = HarvestSource.get(source_id)
    if not source:
        log.error('Harvest source %s does not exist', source_id)
        raise NotFound('Harvest source %s does not exist' % source_id)

    data, errors = validate(data_dict, schema)
    if errors:
        session.rollback()
        raise ValidationError(errors, _error_summary(errors))
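
    # Update any of the regular fields that were supplied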
    fields = ['url', 'title', 'type', 'description', 'user_id', 'publisher_id']
    for f in fields:
        if f in data and data[f] is not None:
            setattr(source, f, data[f])
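
    # Checked against the raw data_dict so that falsy values
    # (e.g. active=False or an empty config) are still applied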
    if 'active' in data_dict:
        source.active = data['active']
    if 'config' in data_dict:
        source.config = data['config']

    source.save()

    # Abort any pending jobs
    if not source.active:
        jobs = HarvestJob.filter(source=source, status=u'New')
        log.info('Harvest source %s not active, so aborting %i outstanding jobs',
                 source_id, jobs.count())
        if jobs:
            for job in jobs:
                job.status = u'Aborted'
                job.save()

    return harvest_source_dictize(source, context)
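
# A minimal usage sketch (illustrative only; assumes an action context with
# 'model', 'session' and 'user' keys and an existing source id):
#
#   from ckan.logic import get_action
#   updated = get_action('harvest_source_update')(context, {
#       'id': source_id,
#       'url': 'http://example.com/harvest',
#       'active': True,
#   })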


def harvest_objects_import(context, data_dict):
    '''
    Reimports the current harvest objects.

    It performs the import stage with the last fetched objects, optionally
    belonging to a certain source.

    Please note that no objects will be fetched from the remote server: the
    call only affects the last fetched objects already present in the
    database.
    '''
    log.info('Harvest objects import: %r', data_dict)
    check_access('harvest_objects_import', context, data_dict)

    model = context['model']
    session = context['session']
    source_id = data_dict.get('source_id', None)
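
    # 'segments' optionally restricts the import to objects whose md5(id)
    # hex digest starts with one of the given characters (see the loop below)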
    segments = context.get('segments', None)
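
    # When True (the default), only consider objects whose dataset is still
    # active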
    join_datasets = context.get('join_datasets', True)

    if source_id:
        source = HarvestSource.get(source_id)
        if not source:
            log.error('Harvest source %s does not exist', source_id)
            raise NotFound('Harvest source %s does not exist' % source_id)

        if not source.active:
            log.warn('Harvest source %s is not active.', source_id)
            raise Exception('This harvest source is not active')

        last_objects_ids = session.query(HarvestObject.id) \
                                  .join(HarvestSource) \
                                  .filter(HarvestObject.source == source) \
                                  .filter(HarvestObject.current == True)
    else:
        last_objects_ids = session.query(HarvestObject.id) \
                                  .filter(HarvestObject.current == True)

    if join_datasets:
        last_objects_ids = last_objects_ids.join(Package) \
                                           .filter(Package.state == u'active')

    last_objects_ids = last_objects_ids.all()
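
    # Re-run the import stage for each object. With 'segments', work can be
    # partitioned by the first hex digit of md5(object id), i.e. 16 buckets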
    last_objects = []
    for obj_id in last_objects_ids:
        if segments and str(hashlib.md5(obj_id[0]).hexdigest())[0] not in segments:
            continue

        obj = session.query(HarvestObject).get(obj_id)
        for harvester in PluginImplementations(IHarvester):
            if harvester.info()['name'] == obj.source.type:
                if hasattr(harvester, 'force_import'):
                    harvester.force_import = True
                harvester.import_stage(obj)
                break
        last_objects.append(harvest_object_dictize(obj, context))

    log.info('Harvest objects imported: %s', len(last_objects))
    return last_objects
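
# A minimal usage sketch (illustrative only): reimport only the first half of
# the 16 md5-keyed segments for one source:
#
#   from ckan.logic import get_action
#   context['segments'] = '01234567'
#   imported = get_action('harvest_objects_import')(context,
#                                                   {'source_id': source_id})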


def harvest_jobs_run(context, data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run', context, data_dict)

    source_id = data_dict.get('source_id', None)

    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context, {'source_id': source_id, 'status': u'New'})
    if len(jobs) == 0:
        log.info('No new harvest jobs.')
        raise Exception('There are no new harvesting jobs')

    # Send each job belonging to an active source to the gather queue
    publisher = get_gather_publisher()
    sent_jobs = []
    for job in jobs:
        context['detailed'] = False
        source = harvest_source_show(context, {'id': job['source']})
        if source['active']:
            publisher.send({'harvest_job_id': job['id']})
            log.info('Sent job %s to the gather queue', job['id'])
            sent_jobs.append(job)

    publisher.close()
    return sent_jobs
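
# A minimal usage sketch (illustrative only): push all pending jobs to the
# gather queue, typically from a scheduled task or CLI command:
#
#   from ckan.logic import get_action
#   sent = get_action('harvest_jobs_run')(context, {'source_id': None})
#   # each sent job has been published to the queue as {'harvest_job_id': ...}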