Merge pull request #230 from keitaroinc/logging-module

Logging module
This commit is contained in:
Adrià Mercader 2016-05-17 14:12:03 +01:00
commit 1ec2af0590
10 changed files with 281 additions and 7 deletions

1
.gitignore vendored
View File

@ -11,3 +11,4 @@ development.ini
*.sw?
*~
node_modules
*.project

View File

@ -58,6 +58,7 @@ running a version lower than 2.0.
ckan.harvest.mq.type = redis
There are a number of configuration options available for the backends. These don't need to
be modified at all if you are using the default Redis or RabbitMQ install (step 1). The list
below shows the available options and their default values:
@ -101,6 +102,37 @@ After installation, the harvest source listing should be available under /harves
http://localhost:5000/harvest
Database logger configuration (optional)
========================================
1. Logging to the database is disabled by default. If you want your ckan harvest logs
to be exposed to the CKAN API you need to properly configure the logger
with the following configuration parameter::
ckan.harvest.log_scope = 0
* -1 - Do not log in the database - DEFAULT
* 0 - Log everything
* 1 - model, logic.action, logic.validators, harvesters
* 2 - model, logic.action, logic.validators
* 3 - model, logic.action
* 4 - logic.action
* 5 - model
* 6 - plugin
* 7 - harvesters
2. Set up the time frame (in days) for the clean-up mechanism with the following config parameter::
ckan.harvest.log_timeframe = 10
If no value is present the default is 30 days.
3. Set up the log level for the database logger::
ckan.harvest.log_level = info
If no log level is set the default is ``debug``.
Command line interface
======================
@ -164,6 +196,11 @@ The following operations can be run from the command line using the
WARNING: if using Redis, this command purges all data in the current
Redis database
harvester clean_harvest_log
- Clean-up mechanism for the harvest log table.
You can configure the time frame through the configuration
parameter 'ckan.harvest.log_timeframe'. The default time frame is 30 days
harvester [-j] [-o] [--segments={segments}] import [{source-id}]
- perform the import stage with the last fetched objects, for a certain
source or a single harvest object. Please note that no objects will
@ -692,6 +729,19 @@ following steps with the one you are using.
You can of course modify this periodicity, this `Wikipedia page <http://en.wikipedia.org/wiki/Cron#CRON_expression>`_
has a good overview of the crontab syntax.
5. In order to set up the clean-up mechanism for the harvest log, one more cron job needs to be scheduled::
sudo crontab -e -u ckan
Paste this line into your crontab, again replacing the paths to paster and
the ini file with yours::
# m h dom mon dow command
0 5 * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-harvest harvester clean_harvest_log --config=/etc/ckan/std/std.ini
This particular example will perform clean-up each day at 05 AM.
You can tweak the value according to your needs.
Tests
=====

View File

@ -3,6 +3,7 @@ from pprint import pprint
from ckan import model
from ckan.logic import get_action, ValidationError
from ckan.plugins import toolkit
from ckan.lib.cli import CkanCommand
@ -66,6 +67,11 @@ class Harvester(CkanCommand):
- removes all jobs from fetch and gather queue
WARNING: if using Redis, this command purges all data in the current
Redis database
harvester clean_harvest_log
- Clean-up mechanism for the harvest log table.
You can configure the time frame through the configuration
parameter `ckan.harvest.log_timeframe`. The default time frame is 30 days
harvester [-j] [-o|-g|-p {id/guid}] [--segments={segments}] import [{source-id}]
- perform the import stage with the last fetched objects, for a certain
@ -192,6 +198,8 @@ class Harvester(CkanCommand):
pprint(harvesters_info)
elif cmd == 'reindex':
self.reindex()
elif cmd == 'clean_harvest_log':
self.clean_harvest_log()
else:
print 'Command %s not recognized' % cmd
@ -513,3 +521,14 @@ class Harvester(CkanCommand):
def is_singular(self, sequence):
return len(sequence) == 1
def clean_harvest_log(self):
from datetime import datetime, timedelta
from pylons import config
from ckanext.harvest.model import clean_harvest_log
# Log time frame - in days
log_timeframe = toolkit.asint(config.get('ckan.harvest.log_timeframe', 30))
condition = datetime.utcnow() - timedelta(days=log_timeframe)
# Delete logs older then the given date
clean_harvest_log(condition=condition)

16
ckanext/harvest/log.py Normal file
View File

@ -0,0 +1,16 @@
from logging import Handler, NOTSET
from ckanext.harvest.model import HarvestLog
class DBLogHandler(Handler):
    """Logging handler that stores each record as a HarvestLog row."""

    def __init__(self, level=NOTSET):
        super(DBLogHandler, self).__init__(level=level)

    def emit(self, record):
        """Persist the formatted record to the harvest_log table.

        Any failure is swallowed on purpose: a logging handler must
        never raise back into application code.
        """
        try:
            entry = HarvestLog(level=record.levelname,
                               content=self.format(record))
            entry.save()
        except Exception:
            pass

View File

@ -1,4 +1,5 @@
import logging
from itertools import groupby
from sqlalchemy import or_
from ckan.model import User
import datetime
@ -12,10 +13,11 @@ from ckan.logic import NotFound, check_access, side_effect_free
from ckanext.harvest import model as harvest_model
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject)
from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject, HarvestLog)
from ckanext.harvest.logic.dictization import (harvest_source_dictize,
harvest_job_dictize,
harvest_object_dictize)
harvest_object_dictize,
harvest_log_dictize)
log = logging.getLogger(__name__)
@ -310,6 +312,42 @@ def harvesters_info_show(context,data_dict):
return available_harvesters
@side_effect_free
def harvest_log_list(context, data_dict):
    '''Returns a list of harvester log entries, newest first.

    :param per_page: number of logs to be shown (default: 100)
    :param offset: use with ``per_page`` (default: 0)
    :param level: filter log entries by level
        (debug, info, warning, error, critical)
    '''
    check_access('harvest_log_list', context, data_dict)

    session = context['session']

    # Fall back to the defaults when the paging params are not valid ints.
    try:
        per_page = int(data_dict.get('per_page', 100))
    except ValueError:
        per_page = 100
    try:
        offset = int(data_dict.get('offset', 0))
    except ValueError:
        offset = 0

    level = data_dict.get('level', None)

    query = session.query(HarvestLog)
    if level is not None:
        # Levels are stored upper-cased (e.g. INFO); accept any case.
        query = query.filter(HarvestLog.level == level.upper())
    query = query.order_by(HarvestLog.created.desc())

    logs = query.offset(offset).limit(per_page).all()
    return [harvest_log_dictize(obj, context) for obj in logs]
def _get_sources_for_user(context,data_dict):
model = context['model']

View File

@ -97,6 +97,11 @@ def harvest_object_dictize(obj, context):
return out
def harvest_log_dictize(obj, context):
    """Return a dict view of a HarvestLog object without its internal id."""
    out = obj.as_dict()
    out.pop('id')
    return out
def _get_source_status(source, context):
'''

View File

@ -31,6 +31,7 @@ __all__ = [
'HarvestObject', 'harvest_object_table',
'HarvestGatherError', 'harvest_gather_error_table',
'HarvestObjectError', 'harvest_object_error_table',
'HarvestLog', 'harvest_log_table'
]
@ -40,6 +41,7 @@ harvest_object_table = None
harvest_gather_error_table = None
harvest_object_error_table = None
harvest_object_extra_table = None
harvest_log_table = None
def setup():
@ -62,7 +64,8 @@ def setup():
harvest_gather_error_table.create()
harvest_object_error_table.create()
harvest_object_extra_table.create()
harvest_log_table.create()
log.debug('Harvest tables created')
else:
from ckan.model.meta import engine
@ -86,6 +89,10 @@ def setup():
log.debug('Creating harvest source datasets for %i existing sources', len(sources_to_migrate))
sources_to_migrate = [s[0] for s in sources_to_migrate]
migrate_v3_create_datasets(sources_to_migrate)
# Check if harvest_log table exist - needed for existing users
if not 'harvest_log' in inspector.get_table_names():
harvest_log_table.create()
# Check if harvest_object has a index
index_names = [index['name'] for index in inspector.get_indexes("harvest_object")]
@ -198,6 +205,11 @@ class HarvestObjectError(HarvestDomainObject):
if line else message
log.debug(log_message)
class HarvestLog(HarvestDomainObject):
    '''A single log record persisted to the database.

    Rows are created each time something is logged using Python's
    standard logging module (via the DB log handler).
    '''
def harvest_object_before_insert_listener(mapper,connection,target):
'''
@ -219,6 +231,7 @@ def define_harvester_tables():
global harvest_object_extra_table
global harvest_gather_error_table
global harvest_object_error_table
global harvest_log_table
harvest_source_table = Table('harvest_source', metadata,
Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
@ -300,6 +313,13 @@ def define_harvester_tables():
Column('line', types.Integer),
Column('created', types.DateTime, default=datetime.datetime.utcnow),
)
# Harvest Log table
harvest_log_table = Table('harvest_log', metadata,
Column('id', types.UnicodeText, primary_key=True, default=make_uuid),
Column('content', types.UnicodeText, nullable=False),
Column('level', types.Enum('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL', name='log_level')),
Column('created', types.DateTime, default=datetime.datetime.utcnow),
)
mapper(
HarvestSource,
@ -374,6 +394,11 @@ def define_harvester_tables():
),
},
)
mapper(
HarvestLog,
harvest_log_table,
)
event.listen(HarvestObject, 'before_insert', harvest_object_before_insert_listener)
@ -553,3 +578,14 @@ def migrate_v3_create_datasets(source_ids=None):
log.info('Created new package for source {0} ({1})'.format(source.id, source.url))
except logic.ValidationError,e:
log.error('Validation Error: %s' % str(e.error_summary))
def clean_harvest_log(condition):
    '''Delete harvest log entries created on or before ``condition``.

    :param condition: datetime cutoff; rows whose ``created`` timestamp
        is <= this value are removed.
    '''
    # NOTE(review): InvalidRequestError must be imported from
    # sqlalchemy.exc at module level - confirm it is, otherwise the
    # except clause below raises NameError instead of rolling back.
    Session.query(HarvestLog).filter(HarvestLog.created <= condition)\
           .delete(synchronize_session=False)
    try:
        Session.commit()
    except InvalidRequestError:
        Session.rollback()
        log.error('An error occurred while trying to clean-up the harvest log table')
    else:
        # Only report success when the commit actually went through;
        # previously this message was logged even after a rollback.
        log.info('Harvest log table clean-up finished successfully')

View File

@ -19,6 +19,7 @@ from ckanext.harvest import logic as harvest_logic
from ckanext.harvest.model import setup as model_setup
from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject
from ckanext.harvest.log import DBLogHandler
@ -217,6 +218,9 @@ class Harvest(p.SingletonPlugin, DefaultDatasetForm, DefaultTranslation):
# Setup harvest model
model_setup()
# Configure database logger
_configure_db_logger(config)
self.startup = False
@ -463,3 +467,57 @@ def _delete_harvest_source_object(context, data_dict):
log.debug('Harvest source %s deleted', source_id)
return source
def _configure_db_logger(config):
    '''Attach DBLogHandler instances to harvest loggers based on config.

    ``ckan.harvest.log_scope`` selects which loggers get a database
    handler:

      -1 - do not log to the database (DEFAULT)
       0 - log everything
       1 - model, logic.action, logic.validators, harvesters
       2 - model, logic.action, logic.validators
       3 - model, logic.action
       4 - logic.action
       5 - model
       6 - plugin
       7 - harvesters

    ``ckan.harvest.log_level`` sets the handler level (default: debug).
    '''
    scope = p.toolkit.asint(config.get('ckan.harvest.log_scope', -1))
    if scope == -1:
        return

    parent_logger = 'ckanext.harvest'
    children = ['plugin', 'model', 'logic.action.create', 'logic.action.delete',
                'logic.action.get', 'logic.action.patch', 'logic.action.update',
                'logic.validators', 'harvesters.base', 'harvesters.ckanharvester']

    children_ = {0: children, 1: children[1:], 2: children[1:-2],
                 3: children[1:-3], 4: children[2:-3], 5: children[1:2],
                 6: children[:1], 7: children[-2:]}

    # Get log level from config param - default: DEBUG
    from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL
    level_map = {'DEBUG': DEBUG, 'INFO': INFO, 'WARNING': WARNING,
                 'ERROR': ERROR, 'CRITICAL': CRITICAL}
    level = level_map.get(
        config.get('ckan.harvest.log_level', 'debug').upper(), DEBUG)

    # Unknown scope values are ignored instead of crashing at startup
    # (previously children_.get() could return None and the loop below
    # raised TypeError).
    loggers = children_.get(scope, [])

    # Get root harvest logger and, for scope 0, give it a handler too.
    # NOTE(review): with scope 0 child records also propagate up to this
    # handler, so entries may be stored twice - confirm this is intended.
    logger = getLogger(parent_logger)
    if scope < 1:
        logger.addHandler(DBLogHandler(level=level))

    # Set a db handler on each selected child logger.
    for child_name in loggers:
        child_logger = logger.getChild(child_name)
        child_logger.addHandler(DBLogHandler(level=level))

View File

@ -253,7 +253,7 @@ class TestHarvestSourceActionCreate(HarvestSourceActionBase):
for key in source_dict.keys():
assert_equal(source_dict[key], result[key])
# Check that source was actually created
source = harvest_model.HarvestSource.get(result['id'])
assert_equal(source.url, source_dict['url'])
@ -502,3 +502,40 @@ class TestHarvestObject(unittest.TestCase):
self.assertRaises(toolkit.ValidationError, harvest_object_create,
context, data_dict)
class TestHarvestDBLog(unittest.TestCase):
    @classmethod
    def setup_class(cls):
        reset_db()
        harvest_model.setup()

    def test_harvest_db_logger(self):
        # Creating a source should write an INFO entry to harvest_log.
        source_dict = SOURCE_DICT.copy()
        source_dict['source_type'] = 'test'
        source = factories.HarvestSourceObj(**source_dict)

        expected = 'Harvest source created: %s' % source.id
        entry = harvest_model.Session.query(harvest_model.HarvestLog).\
            filter(harvest_model.HarvestLog.content == expected).first()
        self.assertIsNotNone(entry)
        self.assertEqual(entry.level, 'INFO')

        context = {
            'model': model,
            'session': model.Session,
            'ignore_auth': True,
        }

        # The action returns entries newest first.
        logs = toolkit.get_action('harvest_log_list')(context, {})
        self.assertTrue(len(logs) > 0)
        self.assertIn('level', logs[0])
        self.assertIn('content', logs[0])
        self.assertIn('created', logs[0])
        self.assertTrue(logs[0]['created'] > logs[1]['created'])

        # Filtering by level and paging are honoured.
        per_page = 1
        logs = toolkit.get_action('harvest_log_list')(
            context, {'level': 'info', 'per_page': per_page})
        self.assertEqual(len(logs), per_page)
        self.assertEqual(logs[0]['level'], 'INFO')

View File

@ -24,13 +24,13 @@ ckan.legacy_templates = false
# Logging configuration
[loggers]
keys = root, ckan, sqlalchemy
keys = root, ckan, sqlalchemy, ckan_harvester
[handlers]
keys = console
keys = console, dblog
[formatters]
keys = generic
keys = generic, dblog
[logger_root]
level = WARN
@ -41,6 +41,11 @@ qualname = ckan
handlers =
level = INFO
[logger_ckan_harvester]
qualname = ckanext.harvest
handlers = dblog
level = DEBUG
[logger_sqlalchemy]
handlers =
qualname = sqlalchemy.engine
@ -52,5 +57,14 @@ args = (sys.stdout,)
level = NOTSET
formatter = generic
[handler_dblog]
class = ckanext.harvest.log.DBLogHandler
args = ()
level = DEBUG
formatter = dblog
[formatter_dblog]
format = %(message)s
[formatter_generic]
format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s