From a79ad2e325ca2eb0bcf89ee28eea60b408a0d51b Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Tue, 5 Apr 2016 20:21:04 +0200 Subject: [PATCH 01/17] Added basic DBLogHandler --- .gitignore | 1 + ckanext/harvest/log.py | 16 ++++++++++++++++ ckanext/harvest/model/__init__.py | 21 +++++++++++++++++++++ 3 files changed, 38 insertions(+) create mode 100644 ckanext/harvest/log.py diff --git a/.gitignore b/.gitignore index 62ff3fd..6915513 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ development.ini *.sw? *~ node_modules +*.project diff --git a/ckanext/harvest/log.py b/ckanext/harvest/log.py new file mode 100644 index 0000000..5f05a28 --- /dev/null +++ b/ckanext/harvest/log.py @@ -0,0 +1,16 @@ +from logging import Handler + +from ckanext.harvest.model import HarvestLog + +class DBLogHandler(Handler, object): + def __init__(self): + super(DBLogHandler,self).__init__() + + def emit(self, record): + try: + level = record.levelname + msg = self.format(record) + obj = HarvestLog(level=level, content=msg) + obj.save() + except: + pass \ No newline at end of file diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index a87eb58..70fce78 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -30,6 +30,7 @@ __all__ = [ 'HarvestObject', 'harvest_object_table', 'HarvestGatherError', 'harvest_gather_error_table', 'HarvestObjectError', 'harvest_object_error_table', + 'HarvestLog', 'harvest_log_table' ] @@ -39,6 +40,7 @@ harvest_object_table = None harvest_gather_error_table = None harvest_object_error_table = None harvest_object_extra_table = None +harvest_log_table = None def setup(): @@ -61,6 +63,7 @@ def setup(): harvest_gather_error_table.create() harvest_object_error_table.create() harvest_object_extra_table.create() + harvest_log_table.create() log.debug('Harvest tables created') else: @@ -191,6 +194,11 @@ class HarvestObjectError(HarvestDomainObject): if line else message log.debug(log_message) 
+class HarvestLog(HarvestDomainObject): + '''HarvestLog objects are created each time something is logged + using python's standard logging module + ''' + pass def harvest_object_before_insert_listener(mapper,connection,target): ''' @@ -212,6 +220,7 @@ def define_harvester_tables(): global harvest_object_extra_table global harvest_gather_error_table global harvest_object_error_table + global harvest_log_table harvest_source_table = Table('harvest_source', metadata, Column('id', types.UnicodeText, primary_key=True, default=make_uuid), @@ -292,6 +301,13 @@ def define_harvester_tables(): Column('line', types.Integer), Column('created', types.DateTime, default=datetime.datetime.utcnow), ) + # Harvest Log table + harvest_log_table = Table('harvest_log', metadata, + Column('id', types.UnicodeText, primary_key=True, default=make_uuid), + Column('content', types.UnicodeText, nullable=False), + Column('level', types.Enum('DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL', name='log_level')), + Column('created', types.DateTime, default=datetime.datetime.utcnow), + ) mapper( HarvestSource, @@ -366,6 +382,11 @@ def define_harvester_tables(): ), }, ) + + mapper( + HarvestLog, + harvest_log_table, + ) event.listen(HarvestObject, 'before_insert', harvest_object_before_insert_listener) From 97cd64b1724fec4770b84e4563be5c8670d5594f Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Tue, 5 Apr 2016 23:53:14 +0200 Subject: [PATCH 02/17] Added harvest_log_list get action --- ckanext/harvest/logic/action/get.py | 38 ++++++++++++++++++++++++++-- ckanext/harvest/logic/dictization.py | 2 ++ 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py index 77a2de6..773b6c3 100644 --- a/ckanext/harvest/logic/action/get.py +++ b/ckanext/harvest/logic/action/get.py @@ -1,4 +1,5 @@ import logging +from itertools import groupby from sqlalchemy import or_ from ckan.model import User import datetime @@ -12,10 +13,11 @@ 
from ckan.logic import NotFound, check_access, side_effect_free from ckanext.harvest import model as harvest_model -from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject) +from ckanext.harvest.model import (HarvestSource, HarvestJob, HarvestObject, HarvestLog) from ckanext.harvest.logic.dictization import (harvest_source_dictize, harvest_job_dictize, - harvest_object_dictize) + harvest_object_dictize, + harvest_log_dictize) log = logging.getLogger(__name__) @@ -310,6 +312,38 @@ def harvesters_info_show(context,data_dict): return available_harvesters +@side_effect_free +def harvest_log_list(context,data_dict): + '''Returns a list of harvester log entries grouped by level. + + :param per_page: number of logs to be shown default: 100 + :param offset: use with ``per_page`` default: 0 + :param level: filter log entries by level(debug, info, warning, error, critical) + ''' + + check_access('harvest_log_list', context, data_dict) + + model = context['model'] + session = context['session'] + + per_page = data_dict.get('per_page', 100) + offset = data_dict.get('offset', 0) + level = data_dict.get('level', False) + + query = session.query(HarvestLog) + + if level: + query = query.filter(HarvestLog.level==level.upper()) + + query = query.order_by(HarvestLog.level.desc(), HarvestLog.created.desc()) + logs = query.offset(offset).limit(per_page).all() + + out = dict() + for k, g in groupby(logs, lambda l: l.level): + out.update({k: [harvest_log_dictize(obj, context) for obj in g]}) + + return out + def _get_sources_for_user(context,data_dict): model = context['model'] diff --git a/ckanext/harvest/logic/dictization.py b/ckanext/harvest/logic/dictization.py index 9c102ee..1e0a53f 100644 --- a/ckanext/harvest/logic/dictization.py +++ b/ckanext/harvest/logic/dictization.py @@ -97,6 +97,8 @@ def harvest_object_dictize(obj, context): return out +def harvest_log_dictize(obj, context): + return obj.as_dict() def _get_source_status(source, context): ''' From 
21c81fb1497487be64f0820c83250db29236d6be Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Thu, 7 Apr 2016 00:17:07 +0200 Subject: [PATCH 03/17] Added database logger params in the config file --- test-core.ini | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/test-core.ini b/test-core.ini index bc0402d..b736803 100644 --- a/test-core.ini +++ b/test-core.ini @@ -24,13 +24,13 @@ ckan.legacy_templates = false # Logging configuration [loggers] -keys = root, ckan, sqlalchemy +keys = root, ckan, sqlalchemy, ckan_harvester [handlers] -keys = console +keys = console, dblog [formatters] -keys = generic +keys = generic, dblog [logger_root] level = WARN @@ -41,6 +41,11 @@ qualname = ckan handlers = level = INFO +[logger_ckan_harvester] +qualname = ckanext.harvest +handlers = dblog +level = DEBUG + [logger_sqlalchemy] handlers = qualname = sqlalchemy.engine @@ -52,5 +57,14 @@ args = (sys.stdout,) level = NOTSET formatter = generic +[handler_dblog] +class = ckanext.harvest.log.DBLogHandler +args = () +level = DEBUG +formatter = dblog + +[formatter_dblog] +format = %(message)s + [formatter_generic] format = %(asctime)s %(levelname)-5.5s [%(name)s] %(message)s From 3665889f272afb7694a6eeb8822b8166081f949a Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Mon, 11 Apr 2016 19:42:09 +0200 Subject: [PATCH 04/17] Updated README.rst with configuration options for the logger --- README.rst | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/README.rst b/README.rst index c033855..cfab874 100644 --- a/README.rst +++ b/README.rst @@ -85,6 +85,33 @@ config option (or ``default``) will be used to namespace the relevant things: * On Redis, it will namespace the keys used, so only the relevant instance gets them, eg ``site1:harvest_job_id``, ``site1:harvest_object__id:804f114a-8f68-4e7c-b124-3eb00f66202f`` +7. 
If you want your ckan harvest logs to be exposed to the ckan API you need to add the + following configuration options in your ckan configuriation file: + + [loggers] + keys = ckan_harvester + + [handlers] + keys = dblog + + [formatters] + keys = dblog + + [logger_ckan_harvester] + qualname = ckanext.harvest + handlers = dblog + level = DEBUG + + [handler_dblog] + class = ckanext.harvest.log.DBLogHandler + args = () + level = DEBUG + formatter = dblog + + [formatter_dblog] + format = %(message)s + + If you are having troubles configuring ckan logger please refer to ``test-core.ini`` Configuration ============= From 17804b902b5c148bf15a38dc4e6dbb5c6badcd5c Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Mon, 11 Apr 2016 21:01:11 +0200 Subject: [PATCH 05/17] spacing fixes --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index cfab874..0c4db39 100644 --- a/README.rst +++ b/README.rst @@ -89,26 +89,32 @@ config option (or ``default``) will be used to namespace the relevant things: following configuration options in your ckan configuriation file: [loggers] + keys = ckan_harvester [handlers] + keys = dblog [formatters] + keys = dblog [logger_ckan_harvester] + qualname = ckanext.harvest handlers = dblog level = DEBUG [handler_dblog] + class = ckanext.harvest.log.DBLogHandler args = () level = DEBUG formatter = dblog [formatter_dblog] + format = %(message)s If you are having troubles configuring ckan logger please refer to ``test-core.ini`` From 30388f04e790a0aa837e5b33096693e86d62dd2d Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Mon, 11 Apr 2016 21:03:40 +0200 Subject: [PATCH 06/17] spacing fix --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 0c4db39..5caf037 100644 --- a/README.rst +++ b/README.rst @@ -86,7 +86,7 @@ config option (or ``default``) will be used to namespace the relevant things: ``site1:harvest_job_id``, 
``site1:harvest_object__id:804f114a-8f68-4e7c-b124-3eb00f66202f`` 7. If you want your ckan harvest logs to be exposed to the ckan API you need to add the - following configuration options in your ckan configuriation file: + following configuration options in your ckan configuriation file:: [loggers] From a1968e4c6376ff4c20dcbe54057e7ced808b8545 Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Tue, 12 Apr 2016 19:28:43 +0200 Subject: [PATCH 07/17] Check if harvest_log table is populated on source creation --- ckanext/harvest/tests/test_action.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index e924611..1583c14 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -253,12 +253,15 @@ class TestHarvestSourceActionCreate(HarvestSourceActionBase): for key in source_dict.keys(): assert_equal(source_dict[key], result[key]) - + # Check that source was actually created source = harvest_model.HarvestSource.get(result['id']) assert_equal(source.url, source_dict['url']) assert_equal(source.type, source_dict['source_type']) + # New source is created so the harvest_log table should be populated + assert model.Session.query(harvest_model.HarvestLog).count() > 0 + # Trying to create a source with the same URL fails source_dict = self._get_source_dict() source_dict['name'] = 'test-source-action-new' From 3d519ce0b269deceef4727a33200998d3a387cbe Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Mon, 25 Apr 2016 19:53:49 +0200 Subject: [PATCH 08/17] Partial fixes --- ckanext/harvest/log.py | 2 +- ckanext/harvest/logic/action/get.py | 26 +++++++++++++++----------- ckanext/harvest/logic/dictization.py | 5 ++++- ckanext/harvest/tests/test_action.py | 3 --- 4 files changed, 20 insertions(+), 16 deletions(-) diff --git a/ckanext/harvest/log.py b/ckanext/harvest/log.py index 5f05a28..98f3abf 100644 --- a/ckanext/harvest/log.py +++ 
b/ckanext/harvest/log.py @@ -12,5 +12,5 @@ class DBLogHandler(Handler, object): msg = self.format(record) obj = HarvestLog(level=level, content=msg) obj.save() - except: + except Exception as exc: pass \ No newline at end of file diff --git a/ckanext/harvest/logic/action/get.py b/ckanext/harvest/logic/action/get.py index 773b6c3..dc827fc 100644 --- a/ckanext/harvest/logic/action/get.py +++ b/ckanext/harvest/logic/action/get.py @@ -314,7 +314,7 @@ def harvesters_info_show(context,data_dict): @side_effect_free def harvest_log_list(context,data_dict): - '''Returns a list of harvester log entries grouped by level. + '''Returns a list of harvester log entries. :param per_page: number of logs to be shown default: 100 :param offset: use with ``per_page`` default: 0 @@ -326,22 +326,26 @@ def harvest_log_list(context,data_dict): model = context['model'] session = context['session'] - per_page = data_dict.get('per_page', 100) - offset = data_dict.get('offset', 0) - level = data_dict.get('level', False) + try: + per_page = int(data_dict.get('per_page', 100)) + except ValueError: + per_page = 100 + try: + offset = int(data_dict.get('offset', 0)) + except ValueError: + offset = 0 + + level = data_dict.get('level', None) query = session.query(HarvestLog) - if level: + if level is not None: query = query.filter(HarvestLog.level==level.upper()) - query = query.order_by(HarvestLog.level.desc(), HarvestLog.created.desc()) + query = query.order_by(HarvestLog.created.desc()) logs = query.offset(offset).limit(per_page).all() - - out = dict() - for k, g in groupby(logs, lambda l: l.level): - out.update({k: [harvest_log_dictize(obj, context) for obj in g]}) - + + out = [harvest_log_dictize(obj, context) for obj in logs] return out def _get_sources_for_user(context,data_dict): diff --git a/ckanext/harvest/logic/dictization.py b/ckanext/harvest/logic/dictization.py index 1e0a53f..877ea19 100644 --- a/ckanext/harvest/logic/dictization.py +++ b/ckanext/harvest/logic/dictization.py @@ -98,7 
+98,10 @@ def harvest_object_dictize(obj, context): return out def harvest_log_dictize(obj, context): - return obj.as_dict() + out = obj.as_dict() + del out['id'] + + return out def _get_source_status(source, context): ''' diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index 1583c14..c2b1524 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -259,9 +259,6 @@ class TestHarvestSourceActionCreate(HarvestSourceActionBase): assert_equal(source.url, source_dict['url']) assert_equal(source.type, source_dict['source_type']) - # New source is created so the harvest_log table should be populated - assert model.Session.query(harvest_model.HarvestLog).count() > 0 - # Trying to create a source with the same URL fails source_dict = self._get_source_dict() source_dict['name'] = 'test-source-action-new' From 009cc57e093df8af682e0053865239367f862dcc Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Fri, 6 May 2016 18:44:02 +0200 Subject: [PATCH 09/17] Added clean-up mechanism for the harvest log --- ckanext/harvest/commands/harvester.py | 21 ++++++++++++++++++++- ckanext/harvest/model/__init__.py | 11 +++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 39be5ce..3f80ffd 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -3,6 +3,7 @@ from pprint import pprint from ckan import model from ckan.logic import get_action, ValidationError +from ckan.plugins import toolkit from ckan.lib.cli import CkanCommand @@ -66,6 +67,11 @@ class Harvester(CkanCommand): - removes all jobs from fetch and gather queue WARNING: if using Redis, this command purges all data in the current Redis database + + harvester clean_harvest_log + - Clean-up mechanism for the harvest log table. 
+ You can configure the time frame through the configuration + parameter `ckan.harvest.log_timeframe`. The default time frame is 30 days harvester [-j] [-o|-g|-p {id/guid}] [--segments={segments}] import [{source-id}] - perform the import stage with the last fetched objects, for a certain @@ -87,7 +93,7 @@ class Harvester(CkanCommand): harvester job-all - create new harvest jobs for all active sources. - +https://www.facebook.com/ harvester reindex - reindexes the harvest source datasets @@ -192,6 +198,8 @@ class Harvester(CkanCommand): pprint(harvesters_info) elif cmd == 'reindex': self.reindex() + elif cmd == 'clean_harvest_log': + self.clean_harvest_log() else: print 'Command %s not recognized' % cmd @@ -513,3 +521,14 @@ class Harvester(CkanCommand): def is_singular(self, sequence): return len(sequence) == 1 + def clean_harvest_log(self): + from datetime import datetime, timedelta + from pylons import config + from ckanext.harvest.model import clean_harvest_log + + # Log time frame - in days + log_timeframe = toolkit.asint(config.get('ckan.harvest.log_timeframe', 30)) + condition = datetime.utcnow() - timedelta(days=log_timeframe) + + # Delete logs older then the given date + clean_harvest_log(condition=condition) \ No newline at end of file diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index 70fce78..e0d4698 100644 --- a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -566,3 +566,14 @@ def migrate_v3_create_datasets(source_ids=None): log.info('Created new package for source {0} ({1})'.format(source.id, source.url)) except logic.ValidationError,e: log.error('Validation Error: %s' % str(e.error_summary)) + +def clean_harvest_log(condition): + Session.query(HarvestLog).filter(HarvestLog.created <= condition)\ + .delete(synchronize_session=False) + try: + Session.commit() + except InvalidRequestError: + Session.rollback() + log.error('An error occurred while trying to clean-up the harvest log table') 
+ + log.info('Harvest log table clean-up finished successfully') From dc4abdcbd879e4d7c559552fd8b74f251f166317 Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Fri, 6 May 2016 18:54:57 +0200 Subject: [PATCH 10/17] README updates --- README.rst | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/README.rst b/README.rst index 5caf037..30b4c27 100644 --- a/README.rst +++ b/README.rst @@ -725,6 +725,19 @@ following steps with the one you are using. You can of course modify this periodicity, this `Wikipedia page `_ has a good overview of the crontab syntax. +5. In order to setup clean-up mechanism for the harvest log one more cron job needs to be scheduled:: + + sudo crontab -e -u ckan + + Paste this line into your crontab, again replacing the paths to paster and + the ini file with yours:: + + # m h dom mon dow command + 0 5 * * * /usr/lib/ckan/default/bin/paster --plugin=ckanext-harvest harvester clean_harvest_log --config=/etc/ckan/std/std.ini + + This particular example will perform clean-up each day at 05 AM. + You can tweak the value according to your needs. + Tests ===== From fe3e92bffea36a59cfd328c1e1acf413f2af1f64 Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Fri, 6 May 2016 18:59:16 +0200 Subject: [PATCH 11/17] README updates --- README.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/README.rst b/README.rst index 30b4c27..bb38ef8 100644 --- a/README.rst +++ b/README.rst @@ -197,6 +197,11 @@ The following operations can be run from the command line using the WARNING: if using Redis, this command purges all data in the current Redis database + harvester clean_harvest_log + - Clean-up mechanism for the harvest log table. + You can configure the time frame through the configuration + parameter 'ckan.harvest.log_timeframe'. The default time frame is 30 days + harvester [-j] [-o] [--segments={segments}] import [{source-id}] - perform the import stage with the last fetched objects, for a certain source or a single harvest object. 
Please note that no objects will From cc86f4062a779b101e66b411d8269357557da1b9 Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Fri, 6 May 2016 19:07:03 +0200 Subject: [PATCH 12/17] README updates --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index bb38ef8..07e6ef8 100644 --- a/README.rst +++ b/README.rst @@ -58,6 +58,12 @@ running a version lower than 2.0. ckan.harvest.mq.type = redis +7. Setup time frame(in days) for the clean-up mechanism with the following config parameter:: + + ckan.harvest.log_timeframe = 10 + + If no value is present the default is 30 days. + There are a number of configuration options available for the backends. These don't need to be modified at all if you are using the default Redis or RabbitMQ install (step 1). The list below shows the available options and their default values: From 0be2c868cb2f5a50d3c469399627566205bd001f Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Wed, 11 May 2016 13:29:53 +0200 Subject: [PATCH 13/17] README updates DBLogHandler updates Added harvest_log table migration for existing users Implemented database log scoping --- README.rst | 81 ++++++++++++++++----------- ckanext/harvest/commands/harvester.py | 4 +- ckanext/harvest/log.py | 8 +-- ckanext/harvest/model/__init__.py | 6 +- ckanext/harvest/plugin.py | 58 +++++++++++++++++++ 5 files changed, 116 insertions(+), 41 deletions(-) diff --git a/README.rst b/README.rst index 07e6ef8..329befa 100644 --- a/README.rst +++ b/README.rst @@ -58,7 +58,53 @@ running a version lower than 2.0. ckan.harvest.mq.type = redis -7. Setup time frame(in days) for the clean-up mechanism with the following config parameter:: +7. If you want your ckan harvest logs to be exposed to the ckan API you need to properly + configure the logger. The default configuration logs everything to the database with + log level ``DEBUG``. 
If you want to modify the database logger configure the following + parameter:: + + ``ckan.harvest.log_scope = 0`` + + * Log scope settings: + + - ``-1`` Do not log to the database + - ``0`` Log everything - Default + - ``1`` model, logic.action, logic.validators, harvesters + - ``2`` model, logic.action, logic.validators + - ``3`` model, logic.action + - ``4`` logic.action + - ``5`` model + - ``6`` plugin + - ``7`` harvesters + + Additionally you can configure the logger in the following way:: + + [loggers] + keys = ckan_harvester + + [handlers] + keys = dblog + + [formatters] + keys = dblog + + [logger_ckan_harvester] + qualname = ckanext.harvest + handlers = dblog + level = DEBUG + + [handler_dblog] + class = ckanext.harvest.log.DBLogHandler + args = () + level = DEBUG + formatter = dblog + + [formatter_dblog] + format = %(message)s + + If you are having troubles configuring ckan logger please refer to ``test-core.ini`` + +8. Setup time frame(in days) for the clean-up mechanism with the following config parameter:: ckan.harvest.log_timeframe = 10 @@ -91,39 +137,6 @@ config option (or ``default``) will be used to namespace the relevant things: * On Redis, it will namespace the keys used, so only the relevant instance gets them, eg ``site1:harvest_job_id``, ``site1:harvest_object__id:804f114a-8f68-4e7c-b124-3eb00f66202f`` -7. 
If you want your ckan harvest logs to be exposed to the ckan API you need to add the - following configuration options in your ckan configuriation file:: - - [loggers] - - keys = ckan_harvester - - [handlers] - - keys = dblog - - [formatters] - - keys = dblog - - [logger_ckan_harvester] - - qualname = ckanext.harvest - handlers = dblog - level = DEBUG - - [handler_dblog] - - class = ckanext.harvest.log.DBLogHandler - args = () - level = DEBUG - formatter = dblog - - [formatter_dblog] - - format = %(message)s - - If you are having troubles configuring ckan logger please refer to ``test-core.ini`` Configuration ============= diff --git a/ckanext/harvest/commands/harvester.py b/ckanext/harvest/commands/harvester.py index 3f80ffd..f59102c 100644 --- a/ckanext/harvest/commands/harvester.py +++ b/ckanext/harvest/commands/harvester.py @@ -93,7 +93,7 @@ class Harvester(CkanCommand): harvester job-all - create new harvest jobs for all active sources. -https://www.facebook.com/ + harvester reindex - reindexes the harvest source datasets @@ -531,4 +531,4 @@ https://www.facebook.com/ condition = datetime.utcnow() - timedelta(days=log_timeframe) # Delete logs older then the given date - clean_harvest_log(condition=condition) \ No newline at end of file + clean_harvest_log(condition=condition) \ No newline at end of file diff --git a/ckanext/harvest/log.py b/ckanext/harvest/log.py index 98f3abf..cb1b242 100644 --- a/ckanext/harvest/log.py +++ b/ckanext/harvest/log.py @@ -1,10 +1,10 @@ -from logging import Handler +from logging import Handler, NOTSET from ckanext.harvest.model import HarvestLog -class DBLogHandler(Handler, object): - def __init__(self): - super(DBLogHandler,self).__init__() +class DBLogHandler(Handler): + def __init__(self, level=NOTSET): + super(DBLogHandler,self).__init__(level=level) def emit(self, record): try: diff --git a/ckanext/harvest/model/__init__.py b/ckanext/harvest/model/__init__.py index e0d4698..fe48c78 100644 --- 
a/ckanext/harvest/model/__init__.py +++ b/ckanext/harvest/model/__init__.py @@ -64,7 +64,7 @@ def setup(): harvest_object_error_table.create() harvest_object_extra_table.create() harvest_log_table.create() - + log.debug('Harvest tables created') else: from ckan.model.meta import engine @@ -88,6 +88,10 @@ def setup(): log.debug('Creating harvest source datasets for %i existing sources', len(sources_to_migrate)) sources_to_migrate = [s[0] for s in sources_to_migrate] migrate_v3_create_datasets(sources_to_migrate) + + # Check if harvest_log table exist - needed for existing users + if not 'harvest_log' in inspector.get_table_names(): + harvest_log_table.create() class HarvestError(Exception): diff --git a/ckanext/harvest/plugin.py b/ckanext/harvest/plugin.py index 4db7cfb..27bb857 100644 --- a/ckanext/harvest/plugin.py +++ b/ckanext/harvest/plugin.py @@ -19,6 +19,7 @@ from ckanext.harvest import logic as harvest_logic from ckanext.harvest.model import setup as model_setup from ckanext.harvest.model import HarvestSource, HarvestJob, HarvestObject +from ckanext.harvest.log import DBLogHandler @@ -217,6 +218,9 @@ class Harvest(p.SingletonPlugin, DefaultDatasetForm, DefaultTranslation): # Setup harvest model model_setup() + + # Configure logger + _configure_logger(config) self.startup = False @@ -463,3 +467,57 @@ def _delete_harvest_source_object(context, data_dict): log.debug('Harvest source %s deleted', source_id) return source + +def _configure_logger(config): + # Log scope + # + # -1 - do not log to the database + # 0 - log everything + # 1 - model, logic.action, logic.validators, harvesters + # 2 - model, logic.action, logic.validators + # 3 - model, logic.action + # 4 - logic.action + # 5 - model + # 6 - plugin + # 7 - harvesters + # + scope = p.toolkit.asint(config.get('ckan.harvest.log_scope', 0)) + if scope == -1: + return + + parent_logger = 'ckanext.harvest' + children = ['plugin', 'model', 'logic.action.create', 'logic.action.delete', + 'logic.action.get', 
'logic.action.patch', 'action.update', + 'logic.validators', 'harvesters.base', 'harvesters.ckanharvester'] + + children_ = {0: children, 1: children[1:], 2: children[1:-2], + 3: children[1:-3], 4: children[2:-3], 5: children[1:2], + 6: children[:1], 7: children[-2:]} + + # Get log level from config param - default: DEBUG + from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL + level_ = config.get('ckan.harvest.log_level', 'debug').upper() + if level_ == 'DEBUG': + level_ = DEBUG + elif level_ == 'INFO': + level_ = INFO + elif level_ == 'WARNING': + level_ = WARNING + elif level_ == 'ERROR': + level_ = ERROR + elif level_ == 'CRITICAL': + level_ = CRITICAL + else: + level_ = DEBUG + + loggers = children_.get(scope) + + # Get root logger and set db handler + logger = getLogger(parent_logger) + if scope < 1: + logger.addHandler(DBLogHandler(level=level_)) + + # Set db handler to all child loggers + for _ in loggers: + child_logger = logger.getChild(_) + child_logger.addHandler(DBLogHandler(level=level_)) From ceb8d26aa041723aa38667cc57013925d64dc37b Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Wed, 11 May 2016 17:26:40 +0200 Subject: [PATCH 14/17] Spacing fixes in README.rst --- README.rst | 68 ++++++++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/README.rst b/README.rst index 329befa..4eb9d67 100644 --- a/README.rst +++ b/README.rst @@ -58,51 +58,49 @@ running a version lower than 2.0. ckan.harvest.mq.type = redis -7. If you want your ckan harvest logs to be exposed to the ckan API you need to properly - configure the logger. The default configuration logs everything to the database with - log level ``DEBUG``. If you want to modify the database logger configure the following - parameter:: +7. If you want your ckan harvest logs to be exposed to the CKAN API you need to properly + configure the logger. The default configuration logs everything in the database with + log level ``DEBUG``. 
If you want to modify the default logging mechanism set the following + parameter in your configuration file:: - ``ckan.harvest.log_scope = 0`` + ckan.harvest.log_scope = 0 - * Log scope settings: + * -1 - Do not log in the database + * 0 - Log everything - DEFAULT + * 1 - model, logic.action, logic.validators, harvesters + * 2 - model, logic.action, logic.validators + * 3 - model, logic.action + * 4 - logic.action + * 5 - model + * 6 - plugin + * 7 - harvesters - - ``-1`` Do not log to the database - - ``0`` Log everything - Default - - ``1`` model, logic.action, logic.validators, harvesters - - ``2`` model, logic.action, logic.validators - - ``3`` model, logic.action - - ``4`` logic.action - - ``5`` model - - ``6`` plugin - - ``7`` harvesters +Additionally you can configure the logger the following way:: - Additionally you can configure the logger in the following way:: + [loggers] + keys = ckan_harvester - [loggers] - keys = ckan_harvester + [handlers] + keys = dblog - [handlers] - keys = dblog + [formatters] + keys = dblog - [formatters] - keys = dblog + [logger_ckan_harvester] + qualname = ckanext.harvest + handlers = dblog + level = DEBUG - [logger_ckan_harvester] - qualname = ckanext.harvest - handlers = dblog - level = DEBUG + [handler_dblog] + class = ckanext.harvest.log.DBLogHandler + args = () + level = DEBUG + formatter = dblog - [handler_dblog] - class = ckanext.harvest.log.DBLogHandler - args = () - level = DEBUG - formatter = dblog + [formatter_dblog] + format = %(message)s - [formatter_dblog] - format = %(message)s - - If you are having troubles configuring ckan logger please refer to ``test-core.ini`` +If you are having troubles configuring harvest logger please refer to ``test-core.ini`` 8. 
Setup time frame(in days) for the clean-up mechanism with the following config parameter:: From a3026f915a6dbb8f65c2be070f7e99371e814aba Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Thu, 12 May 2016 09:46:33 +0200 Subject: [PATCH 15/17] README updates --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index 4eb9d67..50eea28 100644 --- a/README.rst +++ b/README.rst @@ -108,6 +108,12 @@ If you are having troubles configuring harvest logger please refer to ``test-cor If no value is present the default is 30 days. +9. Setup log level for the database logger:: + + ckan.harvest.log_level = info + + If no log level is set the default is ``debug``. + There are a number of configuration options available for the backends. These don't need to be modified at all if you are using the default Redis or RabbitMQ install (step 1). The list below shows the available options and their default values: From cc6cb3e3892714c8b4c574c024fc2e7243834f40 Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Mon, 16 May 2016 13:15:12 +0200 Subject: [PATCH 16/17] Changed default config params for the database logger Added database logger test case README updates --- README.rst | 113 ++++++++++++++------------- ckanext/harvest/plugin.py | 38 ++++----- ckanext/harvest/tests/test_action.py | 37 +++++++++ 3 files changed, 114 insertions(+), 74 deletions(-) diff --git a/README.rst b/README.rst index 50eea28..79a70b8 100644 --- a/README.rst +++ b/README.rst @@ -58,61 +58,6 @@ running a version lower than 2.0. ckan.harvest.mq.type = redis -7. If you want your ckan harvest logs to be exposed to the CKAN API you need to properly - configure the logger. The default configuration logs everything in the database with - log level ``DEBUG``. 
If you want to modify the default logging mechanism set the following - parameter in your configuration file:: - - ckan.harvest.log_scope = 0 - - * -1 - Do not log in the database - * 0 - Log everything - DEFAULT - * 1 - model, logic.action, logic.validators, harvesters - * 2 - model, logic.action, logic.validators - * 3 - model, logic.action - * 4 - logic.action - * 5 - model - * 6 - plugin - * 7 - harvesters - -Additionally you can configure the logger the following way:: - - [loggers] - keys = ckan_harvester - - [handlers] - keys = dblog - - [formatters] - keys = dblog - - [logger_ckan_harvester] - qualname = ckanext.harvest - handlers = dblog - level = DEBUG - - [handler_dblog] - class = ckanext.harvest.log.DBLogHandler - args = () - level = DEBUG - formatter = dblog - - [formatter_dblog] - format = %(message)s - -If you are having troubles configuring harvest logger please refer to ``test-core.ini`` - -8. Setup time frame(in days) for the clean-up mechanism with the following config parameter:: - - ckan.harvest.log_timeframe = 10 - - If no value is present the default is 30 days. - -9. Setup log level for the database logger:: - - ckan.harvest.log_level = info - - If no log level is set the default is ``debug``. There are a number of configuration options available for the backends. These don't need to be modified at all if you are using the default Redis or RabbitMQ install (step 1). The list @@ -157,6 +102,64 @@ After installation, the harvest source listing should be available under /harves http://localhost:5000/harvest +Database logger configuration(optional) +======================================= + +1. Logging to the database is disabled by default. 
If you want your ckan harvest logs + to be exposed to the CKAN API you need to properly configure the logger + with the following configuration parameter:: + + ckan.harvest.log_scope = 0 + + * -1 - Do not log in the database - DEFAULT + * 0 - Log everything + * 1 - model, logic.action, logic.validators, harvesters + * 2 - model, logic.action, logic.validators + * 3 - model, logic.action + * 4 - logic.action + * 5 - model + * 6 - plugin + * 7 - harvesters + +Additionally you can configure the logger the following way:: + + [loggers] + keys = ckan_harvester + + [handlers] + keys = dblog + + [formatters] + keys = dblog + + [logger_ckan_harvester] + qualname = ckanext.harvest + handlers = dblog + level = DEBUG + + [handler_dblog] + class = ckanext.harvest.log.DBLogHandler + args = () + level = DEBUG + formatter = dblog + + [formatter_dblog] + format = %(message)s + +If you are having troubles configuring harvest logger please refer to ``test-core.ini`` + +2. Setup time frame(in days) for the clean-up mechanism with the following config parameter:: + + ckan.harvest.log_timeframe = 10 + + If no value is present, the default is 30 days. + +3. Set the log level for the database logger:: + + ckan.harvest.log_level = info + + If no log level is set, the default is ``debug``.
+ Command line interface ====================== diff --git a/ckanext/harvest/plugin.py b/ckanext/harvest/plugin.py index 27bb857..55af1c7 100644 --- a/ckanext/harvest/plugin.py +++ b/ckanext/harvest/plugin.py @@ -219,8 +219,8 @@ class Harvest(p.SingletonPlugin, DefaultDatasetForm, DefaultTranslation): # Setup harvest model model_setup() - # Configure logger - _configure_logger(config) + # Configure database logger + _configure_db_logger(config) self.startup = False @@ -468,7 +468,7 @@ def _delete_harvest_source_object(context, data_dict): return source -def _configure_logger(config): +def _configure_db_logger(config): # Log scope # # -1 - do not log to the database @@ -481,13 +481,13 @@ def _configure_logger(config): # 6 - plugin # 7 - harvesters # - scope = p.toolkit.asint(config.get('ckan.harvest.log_scope', 0)) + scope = p.toolkit.asint(config.get('ckan.harvest.log_scope', -1)) if scope == -1: return parent_logger = 'ckanext.harvest' children = ['plugin', 'model', 'logic.action.create', 'logic.action.delete', - 'logic.action.get', 'logic.action.patch', 'action.update', + 'logic.action.get', 'logic.action.patch', 'logic.action.update', 'logic.validators', 'harvesters.base', 'harvesters.ckanharvester'] children_ = {0: children, 1: children[1:], 2: children[1:-2], @@ -496,28 +496,28 @@ def _configure_logger(config): # Get log level from config param - default: DEBUG from logging import DEBUG, INFO, WARNING, ERROR, CRITICAL - level_ = config.get('ckan.harvest.log_level', 'debug').upper() - if level_ == 'DEBUG': - level_ = DEBUG - elif level_ == 'INFO': - level_ = INFO - elif level_ == 'WARNING': - level_ = WARNING - elif level_ == 'ERROR': - level_ = ERROR - elif level_ == 'CRITICAL': - level_ = CRITICAL + level = config.get('ckan.harvest.log_level', 'debug').upper() + if level == 'DEBUG': + level = DEBUG + elif level == 'INFO': + level = INFO + elif level == 'WARNING': + level = WARNING + elif level == 'ERROR': + level = ERROR + elif level == 'CRITICAL': + level = 
CRITICAL else: - level_ = DEBUG + level = DEBUG loggers = children_.get(scope) # Get root logger and set db handler logger = getLogger(parent_logger) if scope < 1: - logger.addHandler(DBLogHandler(level=level_)) + logger.addHandler(DBLogHandler(level=level)) # Set db handler to all child loggers for _ in loggers: child_logger = logger.getChild(_) - child_logger.addHandler(DBLogHandler(level=level_)) + child_logger.addHandler(DBLogHandler(level=level)) diff --git a/ckanext/harvest/tests/test_action.py b/ckanext/harvest/tests/test_action.py index c2b1524..79e91ca 100644 --- a/ckanext/harvest/tests/test_action.py +++ b/ckanext/harvest/tests/test_action.py @@ -502,3 +502,40 @@ class TestHarvestObject(unittest.TestCase): self.assertRaises(toolkit.ValidationError, harvest_object_create, context, data_dict) + + +class TestHarvestDBLog(unittest.TestCase): + @classmethod + def setup_class(cls): + reset_db() + harvest_model.setup() + + def test_harvest_db_logger(self): + # Create source and check if harvest_log table is populated + data_dict = SOURCE_DICT.copy() + data_dict['source_type'] = 'test' + source = factories.HarvestSourceObj(**data_dict) + content = 'Harvest source created: %s' % source.id + log = harvest_model.Session.query(harvest_model.HarvestLog).\ + filter(harvest_model.HarvestLog.content==content).first() + + self.assertIsNotNone(log) + self.assertEqual(log.level, 'INFO') + + context = { + 'model': model, + 'session': model.Session, + 'ignore_auth': True, + } + + data = toolkit.get_action('harvest_log_list')(context, {}) + self.assertTrue(len(data) > 0) + self.assertIn('level', data[0]) + self.assertIn('content', data[0]) + self.assertIn('created', data[0]) + self.assertTrue(data[0]['created'] > data[1]['created']) + + per_page = 1 + data = toolkit.get_action('harvest_log_list')(context, {'level': 'info', 'per_page': per_page}) + self.assertEqual(len(data), per_page) + self.assertEqual(data[0]['level'], 'INFO') \ No newline at end of file From 
9641dc69c2d4da6b0b6d72137d66a322b0ea069e Mon Sep 17 00:00:00 2001 From: Petar Efnushev Date: Tue, 17 May 2016 11:31:38 +0200 Subject: [PATCH 17/17] README updates --- README.rst | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/README.rst b/README.rst index 79a70b8..60f2cd6 100644 --- a/README.rst +++ b/README.rst @@ -121,33 +121,6 @@ Database logger configuration(optional) * 6 - plugin * 7 - harvesters -Additionally you can configure the logger the following way:: - - [loggers] - keys = ckan_harvester - - [handlers] - keys = dblog - - [formatters] - keys = dblog - - [logger_ckan_harvester] - qualname = ckanext.harvest - handlers = dblog - level = DEBUG - - [handler_dblog] - class = ckanext.harvest.log.DBLogHandler - args = () - level = DEBUG - formatter = dblog - - [formatter_dblog] - format = %(message)s - -If you are having troubles configuring harvest logger please refer to ``test-core.ini`` - 2. Setup time frame(in days) for the clean-up mechanism with the following config parameter:: ckan.harvest.log_timeframe = 10