add jobs at certain frequencies

This commit is contained in:
kindly 2012-10-29 17:15:02 +00:00
parent 9fc0ae9937
commit 2529a17304
7 changed files with 87 additions and 9 deletions

View File

@ -15,7 +15,7 @@ class Harvester(CkanCommand):
harvester initdb harvester initdb
- Creates the necessary tables in the database - Creates the necessary tables in the database
harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}] harvester source {url} {type} [{config}] [{active}] [{user-id}] [{publisher-id}] [{frequency}]
- create new harvest source - create new harvest source
harvester rmsource {id} harvester rmsource {id}
@ -64,7 +64,7 @@ class Harvester(CkanCommand):
summary = __doc__.split('\n')[0] summary = __doc__.split('\n')[0]
usage = __doc__ usage = __doc__
max_args = 6 max_args = 8
min_args = 0 min_args = 0
def __init__(self,name): def __init__(self,name):
@ -169,11 +169,18 @@ class Harvester(CkanCommand):
publisher_id = unicode(self.args[6]) publisher_id = unicode(self.args[6])
else: else:
publisher_id = u'' publisher_id = u''
if len(self.args) >= 8:
frequency = unicode(self.args[7])
if not frequency:
frequency = None
else:
frequency = None
try: try:
data_dict = { data_dict = {
'url':url, 'url':url,
'type':type, 'type':type,
'config':config, 'config':config,
'frequency':frequency,
'active':active, 'active':active,
'user_id':user_id, 'user_id':user_id,
'publisher_id':publisher_id} 'publisher_id':publisher_id}
@ -186,9 +193,11 @@ class Harvester(CkanCommand):
sources = get_action('harvest_source_list')(context,{}) sources = get_action('harvest_source_list')(context,{})
self.print_there_are('harvest source', sources) self.print_there_are('harvest source', sources)
# Create a harvest job for the new source # Create a harvest job for the new source if not regular job.
get_action('harvest_job_create')(context,{'source_id':source['id']}) if not data_dict['frequency']:
print 'A new Harvest Job for this source has also been created' get_action('harvest_job_create')(context,{'source_id':source['id']})
print 'A new Harvest Job for this source has also been created'
except ValidationError,e: except ValidationError,e:
print 'An error occurred:' print 'An error occurred:'
print str(e.error_dict) print str(e.error_dict)
@ -278,6 +287,7 @@ class Harvester(CkanCommand):
print ' active: %s' % source['active'] print ' active: %s' % source['active']
print ' user: %s' % source['user_id'] print ' user: %s' % source['user_id']
print 'publisher: %s' % source['publisher_id'] print 'publisher: %s' % source['publisher_id']
print 'frequency: %s' % source['frequency']
print ' jobs: %s' % source['status']['job_count'] print ' jobs: %s' % source['status']['job_count']
print '' print ''

View File

@ -5,3 +5,5 @@ except ImportError:
import pkgutil import pkgutil
__path__ = pkgutil.extend_path(__path__, __name__) __path__ = pkgutil.extend_path(__path__, __name__)
class HarvestJobExists(Exception):
pass

View File

@ -2,6 +2,7 @@ import re
import logging import logging
from ckan.logic import NotFound, ValidationError, check_access from ckan.logic import NotFound, ValidationError, check_access
from ckanext.harvest.logic import HarvestJobExists
from ckan.lib.navl.dictization_functions import validate from ckan.lib.navl.dictization_functions import validate
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
@ -32,7 +33,8 @@ def harvest_source_create(context,data_dict):
source.url = data['url'].strip() source.url = data['url'].strip()
source.type = data['type'] source.type = data['type']
opt = ['active','title','description','user_id','publisher_id','config'] opt = ['active','title','description','user_id',
'publisher_id','config', 'frequency']
for o in opt: for o in opt:
if o in data and data[o] is not None: if o in data and data[o] is not None:
source.__setattr__(o,data[o]) source.__setattr__(o,data[o])
@ -45,6 +47,7 @@ def harvest_source_create(context,data_dict):
return harvest_source_dictize(source,context) return harvest_source_dictize(source,context)
def harvest_job_create(context,data_dict): def harvest_job_create(context,data_dict):
log.info('Harvest job create: %r', data_dict) log.info('Harvest job create: %r', data_dict)
check_access('harvest_job_create',context,data_dict) check_access('harvest_job_create',context,data_dict)
@ -70,7 +73,7 @@ def harvest_job_create(context,data_dict):
exists = harvest_job_list(context,data_dict) exists = harvest_job_list(context,data_dict)
if len(exists): if len(exists):
log.warn('There is already an unrun job %r for this source %s', exists, source_id) log.warn('There is already an unrun job %r for this source %s', exists, source_id)
raise Exception('There already is an unrun job for this source') raise HarvestJobExists('There already is an unrun job for this source')
job = HarvestJob() job = HarvestJob()
job.source = source job.source = source

View File

@ -2,6 +2,7 @@ import logging
from sqlalchemy import or_ from sqlalchemy import or_
from ckan.authz import Authorizer from ckan.authz import Authorizer
from ckan.model import User from ckan.model import User
import datetime
from ckan.plugins import PluginImplementations from ckan.plugins import PluginImplementations
from ckanext.harvest.interfaces import IHarvester from ckanext.harvest.interfaces import IHarvester
@ -153,6 +154,7 @@ def _get_sources_for_user(context,data_dict):
user = context.get('user','') user = context.get('user','')
only_active = data_dict.get('only_active',False) only_active = data_dict.get('only_active',False)
only_to_run = data_dict.get('only_to_run',False)
query = session.query(HarvestSource) \ query = session.query(HarvestSource) \
.order_by(HarvestSource.created.desc()) .order_by(HarvestSource.created.desc())
@ -160,6 +162,14 @@ def _get_sources_for_user(context,data_dict):
if only_active: if only_active:
query = query.filter(HarvestSource.active==True) \ query = query.filter(HarvestSource.active==True) \
if only_to_run:
query = query.filter(or_(HarvestSource.frequency!=None,
HarvestSource.frequency!='')
)
query = query.filter(or_(HarvestSource.next_run<=datetime.datetime.utcnow(),
HarvestSource.next_run==None)
)
# Sysadmins will get all sources # Sysadmins will get all sources
if not Authorizer().is_sysadmin(user): if not Authorizer().is_sysadmin(user):
# This only applies to a non sysadmin user when using the # This only applies to a non sysadmin user when using the

View File

@ -1,8 +1,10 @@
import hashlib import hashlib
import logging import logging
import datetime
from ckan.plugins import PluginImplementations from ckan.plugins import PluginImplementations
from ckan.logic import get_action
from ckanext.harvest.interfaces import IHarvester from ckanext.harvest.interfaces import IHarvester
from ckan.model import Package from ckan.model import Package
@ -14,10 +16,11 @@ from ckanext.harvest.queue import get_gather_publisher
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
from ckanext.harvest.logic.schema import default_harvest_source_schema from ckanext.harvest.logic.schema import default_harvest_source_schema
from ckanext.harvest.logic import HarvestJobExists
from ckanext.harvest.logic.dictization import (harvest_source_dictize,harvest_object_dictize) from ckanext.harvest.logic.dictization import (harvest_source_dictize,harvest_object_dictize)
from ckanext.harvest.logic.action.create import _error_summary from ckanext.harvest.logic.action.create import _error_summary
from ckanext.harvest.logic.action.get import harvest_source_show,harvest_job_list from ckanext.harvest.logic.action.get import harvest_source_show, harvest_job_list, _get_sources_for_user
log = logging.getLogger(__name__) log = logging.getLogger(__name__)
@ -132,12 +135,56 @@ def harvest_objects_import(context,data_dict):
log.info('Harvest objects imported: %s', last_objects_count) log.info('Harvest objects imported: %s', last_objects_count)
return last_objects_count return last_objects_count
def _caluclate_next_run(frequency):
now = datetime.datetime.utcnow()
if frequency == 'ALWAYS':
return now
if frequency == 'WEEKLY':
return now + datetime.timedelta(weeks=1)
if frequency == 'BIWEEKLY':
return now + datetime.timedelta(weeks=2)
if frequency == 'DAILY':
return now + datetime.timedelta(days=1)
if frequency == 'MONTHLY':
if now.month in (4,6,9,11):
days = 30
elif now.month == 2:
if now.year % 4 == 0:
days = 29
else:
days = 28
else:
days = 31
return now + datetime.timedelta(days=days)
raise Exception('Frequency {freq} not recognised'.format(freq=frequency))
def _make_scheduled_jobs(context, data_dict):
data_dict = {'only_to_run': True,
'only_active': True}
sources = _get_sources_for_user(context, data_dict)
for source in sources:
data_dict = {'source_id': source.id}
try:
get_action('harvest_job_create')(context, data_dict)
except HarvestJobExists, e:
log.info('Trying to rerun job for %s skipping' % source.id)
source.next_run = _caluclate_next_run(source.frequency)
source.save()
def harvest_jobs_run(context,data_dict): def harvest_jobs_run(context,data_dict):
log.info('Harvest job run: %r', data_dict) log.info('Harvest job run: %r', data_dict)
check_access('harvest_jobs_run',context,data_dict) check_access('harvest_jobs_run',context,data_dict)
source_id = data_dict.get('source_id',None) source_id = data_dict.get('source_id',None)
if not source_id:
_make_scheduled_jobs(context, data_dict)
# Check if there are pending harvest jobs # Check if there are pending harvest jobs
jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'}) jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'})
if len(jobs) == 0: if len(jobs) == 0:

View File

@ -11,7 +11,8 @@ from ckanext.harvest.logic.validators import (harvest_source_id_exists,
harvest_source_url_validator, harvest_source_url_validator,
harvest_source_type_exists, harvest_source_type_exists,
harvest_source_config_validator, harvest_source_config_validator,
harvest_source_active_validator,) harvest_source_active_validator,
harvest_source_frequency_exists)
def default_harvest_source_schema(): def default_harvest_source_schema():
@ -21,6 +22,7 @@ def default_harvest_source_schema():
'type': [not_empty, unicode, harvest_source_type_exists], 'type': [not_empty, unicode, harvest_source_type_exists],
'title': [ignore_missing,unicode], 'title': [ignore_missing,unicode],
'description': [ignore_missing,unicode], 'description': [ignore_missing,unicode],
'frequency': [ignore_missing,unicode, harvest_source_frequency_exists],
'active': [ignore_missing,harvest_source_active_validator], 'active': [ignore_missing,harvest_source_active_validator],
'user_id': [ignore_missing,unicode], 'user_id': [ignore_missing,unicode],
'config': [ignore_missing,harvest_source_config_validator] 'config': [ignore_missing,harvest_source_config_validator]

View File

@ -99,3 +99,7 @@ def harvest_source_active_validator(value,context):
return False return False
return bool(value) return bool(value)
def harvest_source_frequency_exists(value):
if value.upper() not in ['MONTHLY','ALWAYS','WEEKLY','BIWEEKLY','DAILY']:
raise Invalid('Frequency %s not recognised' % value)
return value.upper()