commit
1ec2af0590
|
@ -11,3 +11,4 @@ development.ini
|
|||
*.sw?
|
||||
*~
|
||||
node_modules
|
||||
*.project
|
||||
|
|
50
README.rst
50
README.rst
|
@ -58,6 +58,7 @@ running a version lower than 2.0.
|
|||
|
||||
ckan.harvest.mq.type = redis
|
||||
|
||||
|
||||
There are a number of configuration options available for the backends. These don't need to
|
||||
be modified at all if you are using the default Redis or RabbitMQ install (step 1). The list
|
||||
below shows the available options and their default values:
|
||||
|
@ -101,6 +102,37 @@ After installation, the harvest source listing should be available under /harves
|
|||
|
||||
http://localhost:5000/harvest
|
||||
|
||||
Database logger configuration (optional)
|
||||
========================================
|
||||
|
||||
1. Logging to the database is disabled by default. If you want your ckan harvest logs
|
||||
to be exposed to the CKAN API you need to properly configure the logger
|
||||
with the following configuration parameter::
|
||||
|
||||
ckan.harvest.log_scope = 0
|
||||
|
||||
* -1 - Do not log in the database - DEFAULT
|
||||
* 0 - Log everything
|
||||
* 1 - model, logic.action, logic.validators, harvesters
|
||||
* 2 - model, logic.action, logic.validators
|
||||
* 3 - model, logic.action
|
||||
* 4 - logic.action
|
||||
* 5 - model
|
||||
* 6 - plugin
|
||||
* 7 - harvesters
|
||||
|
||||
2. Set up the time frame (in days) for the clean-up mechanism with the following config parameter::
|
||||
|
||||
ckan.harvest.log_timeframe = 10
|
||||
|
||||
If no value is present the default is 30 days.
|
||||
|
||||
3. Set up the log level for the database logger::
|
||||
|
||||
ckan.harvest.log_level = info
|
||||
|
||||
If no log level is set the default is ``debug``.
|
||||
|
||||
|
||||
Command line interface
|
||||
======================
|
||||
|
@ -164,6 +196,11 @@ The following operations can be run from the command line using the
|
|||
WARNING: if using Redis, this command purges all data in the current
|
||||
Redis database
|
||||
|
||||
harvester clean_harvest_log
|
||||
- Clean-up mechanism for the harvest log table.
|
||||
You can configure the time frame through the configuration
|
||||
parameter 'ckan.harvest.log_timeframe'. The default time frame is 30 days
|
||||
|
||||
harvester [-j] [-o] [--segments={segments}] import [{source-id}]
|
||||
- perform the import stage with the last fetched objects, for a certain
|
||||
source or a single harvest object. Please note that no objects will
|
||||
|
@ -692,6 +729,19 @@ following steps with the one you are using.
|
|||
You can of course modify this periodicity, this `Wikipedia page <http://en.wikipedia.org/wiki/Cron#CRON_expression>`_
|
||||
has a good overview of the crontab syntax.
|
||||
|
||||
5. In order to set up the clean-up mechanism for the harvest log one more cron job needs to be scheduled::
|
||||
|
||||
sudo crontab -e -u ckan
|
||||
|
||||
Paste this line into your crontab, again replacing the paths to paster and
|
||||
the ini file with yours::
|
||||
|
||||
# m h dom mon dow command
|
||||
0 5 * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-harvest harvester clean_harvest_log --config=/etc/ckan/std/std.ini
|
||||
|
||||
This particular example will perform clean-up each day at 05 AM.
|
||||
You can tweak the value according to your needs.
|
||||
|
||||
Tests
|
||||
=====
|
||||
|
||||
|
|
|
@ -3,6 +3,7 @@ from pprint import pprint
|
|||
|
||||
from ckan import model
|
||||
from ckan.logic import get_action, ValidationError
|
||||
from ckan.plugins import toolkit
|
||||
|
||||
from ckan.lib.cli import CkanCommand
|
||||
|
||||
|
@ -66,6 +67,11 @@ class Harvester(CkanCommand):
|
|||
- removes all jobs from fetch and gather queue
|
||||
WARNING: if using Redis, this command purges all data in the current
|
||||
Redis database
|
||||
|
||||
harvester clean_harvest_log
|
||||
- Clean-up mechanism for the harvest log table.
|
||||
You can configure the time frame through the configuration
|
||||
parameter `ckan.harvest.log_timeframe`. The default time frame is 30 days
|
||||
|
||||
harvester [-j] [-o|-g|-p {id/guid}] [--segments={segments}] import [{source-id}]
|
||||
- perform the import stage with the last fetched objects, for a certain
|
||||
|
@ -192,6 +198,8 @@ class Harvester(CkanCommand):
|
|||
pprint(harvesters_info)
|
||||
elif cmd == 'reindex':
|
||||
self.reindex()
|
||||
elif cmd == 'clean_harvest_log':
|
||||
self.clean_harvest_log()
|
||||
else:
|
||||
print 'Command %s not recognized' % cmd
|
||||
|
||||
|
@ -513,3 +521,14 @@ class Harvester(CkanCommand):
|
|||
def is_singular(self, sequence):
|
||||
return len(sequence) == 1
|
||||
|
||||
def clean_harvest_log(self):
    """Remove harvest log rows older than the configured time frame."""
    from datetime import datetime, timedelta
    from pylons import config
    from ckanext.harvest.model import clean_harvest_log

    # Retention window in days (config: ckan.harvest.log_timeframe).
    days_to_keep = toolkit.asint(config.get('ckan.harvest.log_timeframe', 30))
    cutoff = datetime.utcnow() - timedelta(days=days_to_keep)

    # Delete every log entry created before the cutoff date.
    clean_harvest_log(condition=cutoff)
|
|
@ -0,0 +1,16 @@
|
|||
from logging import Handler, NOTSET
|
||||
|
||||
from ckanext.harvest.model import HarvestLog
|
||||
|
||||
class DBLogHandler(Handler):
    """Logging handler that persists each log record as a HarvestLog row.

    Attach instances of this handler to the ``ckanext.harvest`` loggers to
    expose harvest log messages through the CKAN API.
    """

    def __init__(self, level=NOTSET):
        super(DBLogHandler, self).__init__(level=level)

    def emit(self, record):
        # Store the formatted message together with its level name
        # (e.g. 'INFO'); HarvestLog.save() commits the row.
        try:
            obj = HarvestLog(level=record.levelname,
                             content=self.format(record))
            obj.save()
        except Exception:
            # A logging handler must never propagate errors into the
            # application; delegate to the standard logging error machinery
            # instead of silently discarding the failure.
            self.handleError(record)
|
|
@ -1,4 +1,5 @@
|
|||
import logging
|
||||
from itertools import groupby
|
||||
from sqlalchemy import or_
|
||||
from ckan.model import User
|
||||
import datetime
|
||||
|
@ -12,10 +13,11 @@ from ckan.logic import NotFound, check_access, side_effect_free
|
|||
|
||||
from ckanext.harvest import model as harvest_model
|
||||
|
||||
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
|
||||
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject, HarvestLog)
|
||||
from ckanext.harvest.logic.dictization import (harvest_source_dictize,
|
||||
harvest_job_dictize,
|
||||
harvest_object_dictize)
|
||||
harvest_object_dictize,
|
||||
harvest_log_dictize)
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
@ -310,6 +312,42 @@ def harvesters_info_show(context,data_dict):
|
|||
|
||||
return available_harvesters
|
||||
|
||||
@side_effect_free
def harvest_log_list(context, data_dict):
    '''Returns a list of harvester log entries.

    :param per_page: number of logs to be shown default: 100
    :param offset: use with ``per_page`` default: 0
    :param level: filter log entries by level (debug, info, warning, error, critical)
    '''
    check_access('harvest_log_list', context, data_dict)

    model = context['model']
    session = context['session']

    # Pagination parameters fall back to their defaults on invalid input.
    def _int_param(key, default):
        try:
            return int(data_dict.get(key, default))
        except ValueError:
            return default

    per_page = _int_param('per_page', 100)
    offset = _int_param('offset', 0)
    level = data_dict.get('level', None)

    query = session.query(HarvestLog)
    if level is not None:
        # Levels are stored upper-cased (e.g. 'INFO'); accept any case.
        query = query.filter(HarvestLog.level == level.upper())

    # Newest entries first.
    query = query.order_by(HarvestLog.created.desc())
    records = query.offset(offset).limit(per_page).all()

    return [harvest_log_dictize(record, context) for record in records]
|
||||
|
||||
def _get_sources_for_user(context,data_dict):
|
||||
|
||||
model = context['model']
|
||||
|
|
|
@ -97,6 +97,11 @@ def harvest_object_dictize(obj, context):
|
|||
|
||||
return out
|
||||
|
||||
def harvest_log_dictize(obj, context):
    """Return the dict form of a HarvestLog record, minus its internal id."""
    dictized = obj.as_dict()
    dictized.pop('id')
    return dictized
|
||||
|
||||
def _get_source_status(source, context):
|
||||
'''
|
||||
|
|
|
@ -31,6 +31,7 @@ __all__ = [
|
|||
'HarvestObject', 'harvest_object_table',
|
||||
'HarvestGatherError', 'harvest_gather_error_table',
|
||||
'HarvestObjectError', 'harvest_object_error_table',
|
||||
'HarvestLog', 'harvest_log_table'
|
||||
]
|
||||
|
||||
|
||||
|
@ -40,6 +41,7 @@ harvest_object_table = None
|
|||
harvest_gather_error_table = None
|
||||
harvest_object_error_table = None
|
||||
harvest_object_extra_table = None
|
||||
harvest_log_table = None
|
||||
|
||||
|
||||
def setup():
|
||||
|
@ -62,7 +64,8 @@ def setup():
|
|||
harvest_gather_error_table.create()
|
||||
harvest_object_error_table.create()
|
||||
harvest_object_extra_table.create()
|
||||
|
||||
harvest_log_table.create()
|
||||
|
||||
log.debug('Harvest tables created')
|
||||
else:
|
||||
from ckan.model.meta import engine
|
||||
|
@ -86,6 +89,10 @@ def setup():
|
|||
log.debug('Creating harvest source datasets for %i existing sources', len(sources_to_migrate))
|
||||
sources_to_migrate = [s[0] for s in sources_to_migrate]
|
||||
migrate_v3_create_datasets(sources_to_migrate)
|
||||
|
||||
# Check if harvest_log table exist - needed for existing users
|
||||
if not 'harvest_log' in inspector.get_table_names():
|
||||
harvest_log_table.create()
|
||||
|
||||
# Check if harvest_object has a index
|
||||
index_names = [index['name'] for index in inspector.get_indexes("harvest_object")]
|
||||
|
@ -198,6 +205,11 @@ class HarvestObjectError(HarvestDomainObject):
|
|||
if line else message
|
||||
log.debug(log_message)
|
||||
|
||||
class HarvestLog(HarvestDomainObject):
|
||||
'''HarvestLog objects are created each time something is logged
|
||||
using python's standard logging module
|
||||
'''
|
||||
pass
|
||||
|
||||
def harvest_object_before_insert_listener(mapper,connection,target):
|
||||
'''
|
||||
|
@ -219,6 +231,7 @@ def define_harvester_tables():
|
|||
global harvest_object_extra_table
|
||||
global harvest_gather_error_table
|
||||
global harvest_object_error_table
|
||||
global harvest_log_table
|
||||
|
||||
harvest_source_table = Table('harvest_source', metadata,
|
||||
Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
|
||||
|
@ -300,6 +313,13 @@ def define_harvester_tables():
|
|||
Column('line', types.Integer),
|
||||
Column('created', types.DateTime, default=datetime.datetime.utcnow),
|
||||
)
|
||||
# Harvest Log table
|
||||
harvest_log_table = Table('harvest_log', metadata,
|
||||
Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
|
||||
Column('content', types.UnicodeText, nullable=False),
|
||||
Column('level', types.Enum('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL', name='log_level')),
|
||||
Column('created', types.DateTime, default=datetime.datetime.utcnow),
|
||||
)
|
||||
|
||||
mapper(
|
||||
HarvestSource,
|
||||
|
@ -374,6 +394,11 @@ def define_harvester_tables():
|
|||
),
|
||||
},
|
||||
)
|
||||
|
||||
mapper(
|
||||
HarvestLog,
|
||||
harvest_log_table,
|
||||
)
|
||||
|
||||
event.listen(HarvestObject, 'before_insert', harvest_object_before_insert_listener)
|
||||
|
||||
|
@ -553,3 +578,14 @@ def migrate_v3_create_datasets(source_ids=None):
|
|||
log.info('Created new package for source {0} ({1})'.format(source.id, source.url))
|
||||
except logic.ValidationError,e:
|
||||
log.error('Validation Error: %s' % str(e.error_summary))
|
||||
|
||||
def clean_harvest_log(condition):
    """Delete all HarvestLog rows created on or before ``condition``.

    :param condition: datetime cut-off; entries with ``created`` at or
        before this moment are removed.
    """
    Session.query(HarvestLog).filter(HarvestLog.created <= condition)\
        .delete(synchronize_session=False)
    try:
        Session.commit()
    except InvalidRequestError:
        Session.rollback()
        log.error('An error occurred while trying to clean-up the harvest log table')
    else:
        # Only report success when the commit actually went through;
        # the original logged this even after a rollback.
        log.info('Harvest log table clean-up finished successfully')
|
||||
|
|
|
@ -19,6 +19,7 @@ from ckanext.harvest import logic as harvest_logic
|
|||
|
||||
from ckanext.harvest.model import setup as model_setup
|
||||
from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
|
||||
from ckanext.harvest.log import DBLogHandler
|
||||
|
||||
|
||||
|
||||
|
@ -217,6 +218,9 @@ class Harvest(p.SingletonPlugin, DefaultDatasetForm, DefaultTranslation):
|
|||
|
||||
# Setup harvest model
|
||||
model_setup()
|
||||
|
||||
# Configure database logger
|
||||
_configure_db_logger(config)
|
||||
|
||||
self.startup = False
|
||||
|
||||
|
@ -463,3 +467,57 @@ def _delete_harvest_source_object(context, data_dict):
|
|||
log.debug('Harvest source %s deleted', source_id)
|
||||
|
||||
return source
|
||||
|
||||
def _configure_db_logger(config):
    """Attach DBLogHandler instances according to ``ckan.harvest.log_scope``.

    Scope values:
      -1  do not log to the database (default)
       0  log everything
       1  model, logic.action, logic.validators, harvesters
       2  model, logic.action, logic.validators
       3  model, logic.action
       4  logic.action
       5  model
       6  plugin
       7  harvesters
    """
    scope = p.toolkit.asint(config.get('ckan.harvest.log_scope', -1))
    if scope == -1:
        return

    parent_logger = 'ckanext.harvest'
    children = ['plugin', 'model', 'logic.action.create', 'logic.action.delete',
                'logic.action.get', 'logic.action.patch', 'logic.action.update',
                'logic.validators', 'harvesters.base', 'harvesters.ckanharvester']

    # Map each scope value to the slice of child loggers it covers.
    scope_map = {0: children, 1: children[1:], 2: children[1:-2],
                 3: children[1:-3], 4: children[2:-3], 5: children[1:2],
                 6: children[:1], 7: children[-2:]}

    # Resolve the configured log level name - default: DEBUG.
    from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL
    named_levels = {'DEBUG': DEBUG, 'INFO': INFO, 'WARNING': WARNING,
                    'ERROR': ERROR, 'CRITICAL': CRITICAL}
    level = named_levels.get(
        config.get('ckan.harvest.log_level', 'debug').upper(), DEBUG)

    loggers = scope_map.get(scope)

    # Scope 0 additionally attaches a handler on the parent logger.
    # NOTE(review): child records propagate to the parent, so with scope 0
    # each message may be stored twice - confirm this is intended.
    logger = getLogger(parent_logger)
    if scope < 1:
        logger.addHandler(DBLogHandler(level=level))

    # Attach one handler per selected child logger.
    for child_name in loggers:
        child_logger = logger.getChild(child_name)
        child_logger.addHandler(DBLogHandler(level=level))
|
||||
|
|
|
@ -253,7 +253,7 @@ class TestHarvestSourceActionCreate(HarvestSourceActionBase):
|
|||
|
||||
for key in source_dict.keys():
|
||||
assert_equal(source_dict[key], result[key])
|
||||
|
||||
|
||||
# Check that source was actually created
|
||||
source = harvest_model.HarvestSource.get(result['id'])
|
||||
assert_equal(source.url, source_dict['url'])
|
||||
|
@ -502,3 +502,40 @@ class TestHarvestObject(unittest.TestCase):
|
|||
|
||||
self.assertRaises(toolkit.ValidationError, harvest_object_create,
|
||||
context, data_dict)
|
||||
|
||||
|
||||
class TestHarvestDBLog(unittest.TestCase):
    """End-to-end check that harvest actions write rows into harvest_log."""

    @classmethod
    def setup_class(cls):
        # Fresh database with harvest tables for every run of this class.
        reset_db()
        harvest_model.setup()

    def test_harvest_db_logger(self):
        # Creating a source should emit an INFO record into harvest_log.
        source_dict = SOURCE_DICT.copy()
        source_dict['source_type'] = 'test'
        source = factories.HarvestSourceObj(**source_dict)

        expected = 'Harvest source created: %s' % source.id
        entry = harvest_model.Session.query(harvest_model.HarvestLog).\
            filter(harvest_model.HarvestLog.content == expected).first()

        self.assertIsNotNone(entry)
        self.assertEqual(entry.level, 'INFO')

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }

        # The listing action returns entries newest-first.
        listed = toolkit.get_action('harvest_log_list')(context, {})
        self.assertTrue(len(listed) > 0)
        self.assertIn('level', listed[0])
        self.assertIn('content', listed[0])
        self.assertIn('created', listed[0])
        self.assertTrue(listed[0]['created'] > listed[1]['created'])

        # Level filtering and pagination.
        per_page = 1
        listed = toolkit.get_action('harvest_log_list')(
            context, {'level': 'info', 'per_page': per_page})
        self.assertEqual(len(listed), per_page)
        self.assertEqual(listed[0]['level'], 'INFO')
|
|
@ -24,13 +24,13 @@ ckan.legacy_templates = false
|
|||
|
||||
# Logging configuration
|
||||
[loggers]
|
||||
keys = root, ckan, sqlalchemy
|
||||
keys = root, ckan, sqlalchemy, ckan_harvester
|
||||
|
||||
[handlers]
|
||||
keys = console
|
||||
keys = console, dblog
|
||||
|
||||
[formatters]
|
||||
keys = generic
|
||||
keys = generic, dblog
|
||||
|
||||
[logger_root]
|
||||
level = WARN
|
||||
|
@ -41,6 +41,11 @@ qualname = ckan
|
|||
handlers =
|
||||
level = INFO
|
||||
|
||||
[logger_ckan_harvester]
|
||||
qualname = ckanext.harvest
|
||||
handlers = dblog
|
||||
level = DEBUG
|
||||
|
||||
[logger_sqlalchemy]
|
||||
handlers =
|
||||
qualname = sqlalchemy.engine
|
||||
|
@ -52,5 +57,14 @@ args = (sys.stdout,)
|
|||
level = NOTSET
|
||||
formatter = generic
|
||||
|
||||
[handler_dblog]
|
||||
class = ckanext.harvest.log.DBLogHandler
|
||||
args = ()
|
||||
level = DEBUG
|
||||
formatter = dblog
|
||||
|
||||
[formatter_dblog]
|
||||
format = %(message)s
|
||||
|
||||
[formatter_generic]
|
||||
format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s
|
||||
|
|
Loading…
Reference in New Issue