add jobs at certain frequencies

2012-10-29 17:15:02 +00:00 · 2012-10-29 17:15:02 +00:00 · 2529a17304
parent 9fc0ae9937
commit 2529a17304
7 changed files with 87 additions and 9 deletions
--- a/ckanext/harvest/commands/harvester.py
+++ b/ckanext/harvest/commands/harvester.py
@ -15,7 +15,7 @@ class Harvester(CkanCommand):
      harvester initdb
        - Creates the necessary tables in the database

-      harvester source {url} {type} [{active}] [{user-id}] [{publisher-id}]
+      harvester source {url} {type} [{config}] [{active}] [{user-id}] [{publisher-id}] [{frequency}]
        - create new harvest source

      harvester rmsource {id}
@ -64,7 +64,7 @@ class Harvester(CkanCommand):

    summary = __doc__.split('\n')[0]
    usage = __doc__
-    max_args = 6
+    max_args = 8
    min_args = 0

    def __init__(self,name):
@ -169,11 +169,18 @@ class Harvester(CkanCommand):
            publisher_id = unicode(self.args[6])
        else:
            publisher_id = u''
+        if len(self.args) >= 8:
+            frequency = unicode(self.args[7])
+            if not frequency:
+                frequency = None
+        else:
+            frequency = None
        try:
            data_dict = {
                    'url':url,
                    'type':type,
                    'config':config,
+                    'frequency':frequency,
                    'active':active,
                    'user_id':user_id,
                    'publisher_id':publisher_id}
@ -186,9 +193,11 @@ class Harvester(CkanCommand):
            sources = get_action('harvest_source_list')(context,{})
            self.print_there_are('harvest source', sources)

-            # Create a harvest job for the new source
-            get_action('harvest_job_create')(context,{'source_id':source['id']})
-            print 'A new Harvest Job for this source has also been created'
+            # Create a harvest job for the new source if not regular job.
+            if not data_dict['frequency']:
+                get_action('harvest_job_create')(context,{'source_id':source['id']})
+                print 'A new Harvest Job for this source has also been created'
+
        except ValidationError,e:
           print 'An error occurred:'
           print str(e.error_dict)
@ -278,6 +287,7 @@ class Harvester(CkanCommand):
        print '   active: %s' % source['active']
        print '     user: %s' % source['user_id']
        print 'publisher: %s' % source['publisher_id']
+        print 'frequency: %s' % source['frequency']
        print '     jobs: %s' % source['status']['job_count']
        print ''

--- a/ckanext/harvest/logic/__init__.py
+++ b/ckanext/harvest/logic/__init__.py
@ -5,3 +5,5 @@ except ImportError:
    import pkgutil
    __path__ = pkgutil.extend_path(__path__, __name__)

+class HarvestJobExists(Exception):
+    '''Raised when a harvest job is requested for a source that already
+    has an unrun job.'''
--- a/ckanext/harvest/logic/action/create.py
+++ b/ckanext/harvest/logic/action/create.py
@ -2,6 +2,7 @@ import re
 import logging

 from ckan.logic import NotFound, ValidationError, check_access
+from ckanext.harvest.logic import HarvestJobExists
 from ckan.lib.navl.dictization_functions import validate

 from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
@ -32,7 +33,8 @@ def harvest_source_create(context,data_dict):
    source.url = data['url'].strip()
    source.type = data['type']

-    opt = ['active','title','description','user_id','publisher_id','config']
+    opt = ['active','title','description','user_id',
+           'publisher_id','config', 'frequency']
    for o in opt:
        if o in data and data[o] is not None:
            source.__setattr__(o,data[o])
@ -45,6 +47,7 @@ def harvest_source_create(context,data_dict):

    return harvest_source_dictize(source,context)

+
 def harvest_job_create(context,data_dict):
    log.info('Harvest job create: %r', data_dict)
    check_access('harvest_job_create',context,data_dict)
@ -70,7 +73,7 @@ def harvest_job_create(context,data_dict):
    exists = harvest_job_list(context,data_dict)
    if len(exists):
        log.warn('There is already an unrun job %r for this source %s', exists, source_id)
-        raise Exception('There already is an unrun job for this source')
+        raise HarvestJobExists('There already is an unrun job for this source')

    job = HarvestJob()
    job.source = source
--- a/ckanext/harvest/logic/action/get.py
+++ b/ckanext/harvest/logic/action/get.py
@ -2,6 +2,7 @@ import logging
 from sqlalchemy import or_
 from ckan.authz import Authorizer
 from ckan.model import User
+import datetime

 from ckan.plugins import PluginImplementations
 from ckanext.harvest.interfaces import IHarvester
@ -153,6 +154,7 @@ def _get_sources_for_user(context,data_dict):
    user = context.get('user','')

    only_active = data_dict.get('only_active',False)
+    only_to_run = data_dict.get('only_to_run',False)

    query = session.query(HarvestSource) \
                .order_by(HarvestSource.created.desc())
@ -160,6 +162,14 @@ def _get_sources_for_user(context,data_dict):
    if only_active:
        query = query.filter(HarvestSource.active==True) \

+    if only_to_run:
+        query = query.filter(or_(HarvestSource.frequency!=None,
+                                 HarvestSource.frequency!='')
+                            )
+        query = query.filter(or_(HarvestSource.next_run<=datetime.datetime.utcnow(),
+                                 HarvestSource.next_run==None)
+                            )
+
    # Sysadmins will get all sources
    if not Authorizer().is_sysadmin(user):
        # This only applies to a non sysadmin user when using the
--- a/ckanext/harvest/logic/action/update.py
+++ b/ckanext/harvest/logic/action/update.py
@ -1,8 +1,10 @@
 import hashlib

 import logging
+import datetime

 from ckan.plugins import PluginImplementations
+from ckan.logic import get_action
 from ckanext.harvest.interfaces import IHarvester

 from ckan.model import Package
@ -14,10 +16,11 @@ from ckanext.harvest.queue import get_gather_publisher

 from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
 from ckanext.harvest.logic.schema import default_harvest_source_schema
+from ckanext.harvest.logic import HarvestJobExists
 from ckanext.harvest.logic.dictization import (harvest_source_dictize,harvest_object_dictize)

 from ckanext.harvest.logic.action.create import _error_summary
-from ckanext.harvest.logic.action.get import harvest_source_show,harvest_job_list
+from ckanext.harvest.logic.action.get import harvest_source_show, harvest_job_list, _get_sources_for_user


 log = logging.getLogger(__name__)
@ -132,12 +135,56 @@ def harvest_objects_import(context,data_dict):
    log.info('Harvest objects imported: %s', last_objects_count)
    return last_objects_count

+def _caluclate_next_run(frequency, now=None):
+    '''Return the datetime at which a scheduled source should next run.
+
+    :param frequency: one of 'ALWAYS', 'DAILY', 'WEEKLY', 'BIWEEKLY' or
+        'MONTHLY' (upper case).
+    :param now: datetime to count from; defaults to
+        ``datetime.datetime.utcnow()``.
+    :returns: ``now`` itself for 'ALWAYS', otherwise ``now`` plus the
+        interval. 'MONTHLY' adds the number of days in the month of
+        ``now``.
+    :raises Exception: if ``frequency`` is not one of the values above.
+    '''
+    # Local import: keeps this function self-contained without touching the
+    # module-level import block.
+    import calendar
+
+    if now is None:
+        now = datetime.datetime.utcnow()
+
+    if frequency == 'ALWAYS':
+        return now
+
+    fixed_intervals = {
+        'DAILY': datetime.timedelta(days=1),
+        'WEEKLY': datetime.timedelta(weeks=1),
+        'BIWEEKLY': datetime.timedelta(weeks=2),
+    }
+    if frequency in fixed_intervals:
+        return now + fixed_intervals[frequency]
+
+    if frequency == 'MONTHLY':
+        # monthrange() applies the full Gregorian leap-year rule, so century
+        # years such as 2100 correctly get a 28-day February (a plain
+        # ``year % 4`` test does not).
+        days = calendar.monthrange(now.year, now.month)[1]
+        return now + datetime.timedelta(days=days)
+
+    raise Exception('Frequency {freq} not recognised'.format(freq=frequency))
+
+
+def _make_scheduled_jobs(context, data_dict):
+    '''Create harvest jobs for the active, scheduled sources that are due.
+
+    Sources are fetched through ``_get_sources_for_user`` with the
+    ``only_to_run`` and ``only_active`` filters. For each one a job is
+    created through the ``harvest_job_create`` action, and the source's
+    ``next_run`` is then moved forward according to its frequency and saved.
+
+    :param context: action context, passed on unchanged.
+    :param data_dict: accepted for signature compatibility; not used.
+    '''
+    due_sources = _get_sources_for_user(context, {'only_to_run': True,
+                                                  'only_active': True})
+
+    for source in due_sources:
+        # The only_to_run query filter can still return a source whose
+        # frequency is an empty string. Such a source has no schedule, and
+        # passing '' to _caluclate_next_run raises and would abort the whole
+        # run, so skip it before creating anything.
+        if not source.frequency:
+            continue
+
+        try:
+            get_action('harvest_job_create')(context, {'source_id': source.id})
+        except HarvestJobExists:
+            # An unrun job is already queued for this source; next_run is
+            # still advanced below.
+            log.info('Trying to rerun job for %s skipping' % source.id)
+
+        source.next_run = _caluclate_next_run(source.frequency)
+        source.save()
+
 def harvest_jobs_run(context,data_dict):
    log.info('Harvest job run: %r', data_dict)
    check_access('harvest_jobs_run',context,data_dict)

    source_id = data_dict.get('source_id',None)

+    if not source_id:
+        _make_scheduled_jobs(context, data_dict)
+
    # Check if there are pending harvest jobs
    jobs = harvest_job_list(context,{'source_id':source_id,'status':u'New'})
    if len(jobs) == 0:
--- a/ckanext/harvest/logic/schema.py
+++ b/ckanext/harvest/logic/schema.py
@ -11,7 +11,8 @@ from ckanext.harvest.logic.validators import (harvest_source_id_exists,
                                            harvest_source_url_validator,
                                            harvest_source_type_exists,
                                            harvest_source_config_validator,
-                                            harvest_source_active_validator,)
+                                            harvest_source_active_validator,
+                                            harvest_source_frequency_exists)

 def default_harvest_source_schema():

@ -21,6 +22,7 @@ def default_harvest_source_schema():
        'type': [not_empty, unicode, harvest_source_type_exists],
        'title': [ignore_missing,unicode],
        'description': [ignore_missing,unicode],
+        'frequency': [ignore_missing,unicode, harvest_source_frequency_exists],
        'active': [ignore_missing,harvest_source_active_validator],
        'user_id': [ignore_missing,unicode],
        'config': [ignore_missing,harvest_source_config_validator]
--- a/ckanext/harvest/logic/validators.py
+++ b/ckanext/harvest/logic/validators.py
@ -99,3 +99,7 @@ def harvest_source_active_validator(value,context):
            return False
    return bool(value)

+def harvest_source_frequency_exists(value):
+    '''Validate a harvest frequency and return it in upper case.
+
+    The comparison is case-insensitive. Raises ``Invalid`` when the value
+    is not one of the known frequencies.
+    '''
+    normalised = value.upper()
+    known = ('MONTHLY', 'ALWAYS', 'WEEKLY', 'BIWEEKLY', 'DAILY')
+    if normalised in known:
+        return normalised
+    raise Invalid('Frequency %s not recognised' % value)