# -*- coding: utf-8 -*- from __future__ import print_function import json import logging import re import sys import xml.etree.ElementTree as etree import ckan.lib.helpers as h import ckan.plugins.toolkit as tk import six from ckan import model from ckantoolkit import _ from six import StringIO from ckanext.harvest.logic import HarvestJobExists, HarvestSourceInactiveError try: # Python 2.7 xml_parser_exception = etree.ParseError except AttributeError: # Python 2.6 from xml.parsers import expat xml_parser_exception = expat.ExpatError log = logging.getLogger(__name__) DATASET_TYPE_NAME = "harvest" ############################################################################### # CLI # ############################################################################### def _admin_user(): context = {"model": model, "session": model.Session, "ignore_auth": True} return tk.get_action("get_site_user")(context, {}) def _print_harvest_sources(sources, output): for source in sources: _print_harvest_source(source, output) def _print_harvest_jobs(jobs, output): for job in jobs: _print_harvest_job(job, output) def _print_harvest_job(job, output): print( ("\tJob id: {}\n" "\t\tstatus: {}\n" "\t\tsource: {}\n" "\t\tobjects: {}").format( job.get("id"), job.get("status"), job.get("source_id"), len(job.get("objects", [])), ), file=output, ) print( "\t\tgather_errors: {0}".format(len(job.get("gather_errors", []))), file=output, ) for error in job.get("gather_errors", []): print("\t\t\t{0}".format(error["message"]), file=output) def _print_harvest_source(source, output): print("Source id: {0}".format(source.get("id")), file=output) if "name" in source: # 'name' is only there if the source comes from the Package print("\tname: {}".format(source["name"]), file=output) data_dict = { "url": source.get("url"), # 'type' if source comes from HarvestSource, 'source_type' if # it comes from the Package "type": source.get("source_type") or source.get("type"), "active": source.get("active", source.get("state") == "active"), "frequency": source.get("frequency"), "jobs": source.get("status").get("job_count"), } print( ("\turl: {url}\n" "\ttype: {type}\n" "\tactive: {active}\n" "\tfrequency: {frequency}\n" "\tjobs: {jobs}\n" "\n").format(**data_dict), file=output, ) def _there_are(what, sequence, condition=""): is_singular = len(sequence) == 1 return "There {0} {1} {2}{3}{4}".format( is_singular and "is" or "are", len(sequence), condition and ("{0} ".format(condition.lower())) or "", what, not is_singular and "s" or "", ) def initdb(): from ckanext.harvest.model import setup as db_setup db_setup() def create_harvest_source( name, url, type, title=None, active=True, owner_org=None, frequency="MANUAL", config=None, ): output = StringIO() data_dict = { "name": name, "url": url, "source_type": type, "title": title, "active": active, "owner_org": owner_org, "frequency": frequency, "config": config, } context = { "model": model, "session": model.Session, "user": _admin_user()["name"], "ignore_auth": True, } source = tk.get_action("harvest_source_create")(context, data_dict) print("Created new harvest source:", file=output) _print_harvest_source(source, output) sources = tk.get_action("harvest_source_list")(context, {}) print(_there_are("harvest source", sources), file=output) # Create a harvest job for the new source if not regular job. if not data_dict["frequency"]: tk.get_action("harvest_job_create")(context, { "source_id": source["id"], "run": True }) print( "A new Harvest Job for this source has also been created", file=output, ) return output.getvalue() def show_harvest_source(source_id_or_name): context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } source = tk.get_action("harvest_source_show")(context, { "id": source_id_or_name }) output = StringIO() _print_harvest_source(source, output) return output.getvalue() def remove_harvest_source(source_id_or_name): context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } source = tk.get_action("harvest_source_show")(context, { "id": source_id_or_name }) tk.get_action("harvest_source_delete")(context, {"id": source["id"]}) def clear_harvest_source(source_id_or_name): context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } source = tk.get_action("harvest_source_show")(context, { "id": source_id_or_name }) tk.get_action("harvest_source_clear")(context, {"id": source["id"]}) def clear_harvest_source_history(source_id, keep_current): context = { "model": model, "user": _admin_user()["name"], "session": model.Session, } if source_id is not None: tk.get_action("harvest_source_job_history_clear")(context, { "id": source_id, "keep_current": keep_current }) return "Cleared job history of harvest source: {0}".format(source_id) else: # Purge queues, because we clean all harvest jobs and # objects in the database. if not keep_current: purge_queues() cleared_sources_dicts = tk.get_action( "harvest_sources_job_history_clear")(context, { "keep_current": keep_current }) return "Cleared job history for all harvest sources: {0} source(s)".format( len(cleared_sources_dicts)) def abort_failed_jobs(job_life_span, include, exclude): context = { "model": model, "user": _admin_user()["name"], "session": model.Session, } result = tk.get_action("harvest_abort_failed_jobs")(context, { "life_span": job_life_span, "include": include, "exclude": exclude }) print(result) def purge_queues(): from ckanext.harvest.queue import purge_queues as purge purge() def list_sources(all): if all: data_dict = {} what = "harvest source" else: data_dict = {"only_active": True} what = "active harvest source" context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } sources = tk.get_action("harvest_source_list")(context, data_dict) output = StringIO() _print_harvest_sources(sources, output) print(_there_are(what, sources), file=output) return output.getvalue() def create_job(source_id_or_name): context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } source = tk.get_action("harvest_source_show")(context, { "id": source_id_or_name }) context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } job = tk.get_action("harvest_job_create")(context, { "source_id": source["id"], "run": True }) output = StringIO() _print_harvest_job(job, output) jobs = tk.get_action("harvest_job_list")(context, {"status": u"New"}) print(_there_are("harvest job", jobs, condition=u"New"), file=output) return output.getvalue() def list_jobs(): context = { "model": model, "user": _admin_user()["name"], "session": model.Session, } jobs = tk.get_action("harvest_job_list")(context, {}) output = StringIO() _print_harvest_jobs(jobs, output) print(_there_are(what="harvest job", sequence=jobs), file=output) return output.getvalue() def abort_job(job_or_source_id_or_name): context = { "model": model, "user": _admin_user()["name"], "session": model.Session, } job = tk.get_action("harvest_job_abort")(context, { "id": job_or_source_id_or_name }) return "Job status: {0}".format(job["status"]) def gather_consumer(): import logging from ckanext.harvest.queue import ( get_gather_consumer, gather_callback, get_gather_queue_name, ) logging.getLogger("amqplib").setLevel(logging.INFO) consumer = get_gather_consumer() for method, header, body in consumer.consume( queue=get_gather_queue_name()): gather_callback(consumer, method, header, body) def fetch_consumer(): import logging logging.getLogger("amqplib").setLevel(logging.INFO) from ckanext.harvest.queue import ( get_fetch_consumer, fetch_callback, get_fetch_queue_name, ) consumer = get_fetch_consumer() for method, header, body in consumer.consume(queue=get_fetch_queue_name()): fetch_callback(consumer, method, header, body) def run_harvester(): context = { "model": model, "user": _admin_user()["name"], "session": model.Session, } tk.get_action("harvest_jobs_run")(context, {}) def run_test_harvester(source_id_or_name, force_import): from ckanext.harvest import queue from ckanext.harvest.tests import lib from ckanext.harvest.logic import HarvestJobExists from ckanext.harvest.model import HarvestJob context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } source = tk.get_action("harvest_source_show")(context, { "id": source_id_or_name }) # Determine the job try: job_dict = tk.get_action("harvest_job_create")( context, { "source_id": source["id"] }) except HarvestJobExists: running_jobs = tk.get_action("harvest_job_list")( context, { "source_id": source["id"], "status": "Running" }) if running_jobs: print('\nSource "{0}" apparently has a "Running" job:\n{1}'.format( source.get("name") or source["id"], running_jobs)) resp = six.moves.input("Abort it? (y/n)") if not resp.lower().startswith("y"): sys.exit(1) job_dict = tk.get_action("harvest_job_abort")( context, { "source_id": source["id"] }) else: print("Reusing existing harvest job") jobs = tk.get_action("harvest_job_list")(context, { "source_id": source["id"], "status": "New" }) assert (len(jobs) == 1 ), 'Multiple "New" jobs for this source! {0}'.format(jobs) job_dict = jobs[0] job_obj = HarvestJob.get(job_dict["id"]) if force_import: job_obj.force_import = force_import harvester = queue.get_harvester(source["source_type"]) assert harvester, "No harvester found for type: {0}".format( source["source_type"]) lib.run_harvest_job(job_obj, harvester) def import_stage( source_id_or_name, no_join_datasets, harvest_object_id, guid, package_id, segments, ): if source_id_or_name: context = { "model": model, "session": model.Session, "user": _admin_user()["name"], } source = tk.get_action("harvest_source_show")(context, { "id": source_id_or_name }) source_id = source["id"] else: source_id = None context = { "model": model, "session": model.Session, "user": _admin_user()["name"], "join_datasets": not no_join_datasets, "segments": segments, } objs_count = tk.get_action("harvest_objects_import")( context, { "source_id": source_id, "harvest_object_id": harvest_object_id, "package_id": package_id, "guid": guid, }, ) print("{0} objects reimported".format(objs_count)) def job_all(): context = { "model": model, "user": _admin_user()["name"], "session": model.Session, } jobs = tk.get_action("harvest_job_create_all")(context, {}) return "Created {0} new harvest jobs".format(len(jobs)) def reindex(): context = {"model": model, "user": _admin_user()["name"]} tk.get_action("harvest_sources_reindex")(context, {}) def clean_harvest_log(): from datetime import datetime, timedelta from ckantoolkit import config from ckanext.harvest.model import clean_harvest_log # Log time frame - in days log_timeframe = tk.asint(config.get("ckan.harvest.log_timeframe", 30)) condition = datetime.utcnow() - timedelta(days=log_timeframe) # Delete logs older then the given date clean_harvest_log(condition=condition) def harvesters_info(): harvesters_info = tk.get_action("harvesters_info_show")() return harvesters_info ############################################################################### # Controller # ############################################################################### def _not_auth_message(): return _('Not authorized to see this page') def _get_source_for_job(source_id): try: context = {'model': model, 'user': tk.c.user} source_dict = tk.get_action('harvest_source_show')(context, { 'id': source_id }) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except Exception as e: msg = 'An error occurred: [%s]' % str(e) return tk.abort(500, msg) return source_dict def admin_view(id): try: context = {'model': model, 'user': tk.c.user} tk.check_access('harvest_source_update', context, {'id': id}) harvest_source = tk.get_action('harvest_source_show')(context, { 'id': id }) return tk.render('source/admin.html', extra_vars={'harvest_source': harvest_source}) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) def job_show_last_view(source): source_dict = _get_source_for_job(source) if not source_dict['status']['last_job']: return tk.abort(404, _('No jobs yet for this source')) return job_show_view( source_dict['status']['last_job']['id'], source_dict=source_dict, is_last=True, ) def job_show_view(id, source_dict=False, is_last=False): try: context = {'model': model, 'user': tk.c.user} job = tk.get_action('harvest_job_show')(context, {'id': id}) job_report = tk.get_action('harvest_job_report')(context, {'id': id}) if not source_dict: source_dict = tk.get_action('harvest_source_show')( context, { 'id': job['source_id'] }) return tk.render( 'source/job/read.html', extra_vars={ 'harvest_source': source_dict, 'job': job, 'job_report': job_report, 'is_last_job': is_last, }, ) except tk.ObjectNotFound: return tk.abort(404, _('Harvest job not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except Exception as e: msg = 'An error occurred: [%s]' % str(e) return tk.abort(500, msg) def job_list_view(source): try: context = {'model': model, 'user': tk.c.user} harvest_source = tk.get_action('harvest_source_show')(context, { 'id': source }) jobs = tk.get_action('harvest_job_list')( context, { 'source_id': harvest_source['id'] }) return tk.render( 'source/job/list.html', extra_vars={ 'harvest_source': harvest_source, 'jobs': jobs }, ) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except Exception as e: msg = 'An error occurred: [%s]' % str(e) return tk.abort(500, msg) def about_view(id): try: context = {'model': model, 'user': tk.c.user} harvest_source = tk.get_action('harvest_source_show')(context, { 'id': id }) return tk.render('source/about.html', extra_vars={'harvest_source': harvest_source}) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) def job_abort_view(source, id): try: context = {'model': model, 'user': tk.c.user} tk.get_action('harvest_job_abort')(context, {'id': id}) h.flash_success(_('Harvest job stopped')) except tk.ObjectNotFound: return tk.abort(404, _('Harvest job not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except Exception as e: msg = 'An error occurred: [%s]' % str(e) return tk.abort(500, msg) return h.redirect_to( h.url_for('{0}_admin'.format(DATASET_TYPE_NAME), id=source)) def refresh_view(id): try: context = {'model': model, 'user': tk.c.user, 'session': model.Session} tk.get_action('harvest_job_create')(context, { 'source_id': id, 'run': True }) h.flash_success( _('Harvest will start shortly. Refresh this page for updates.')) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except HarvestSourceInactiveError: h.flash_error( _('Cannot create new harvest jobs on inactive ' 'sources. First, please change the source status ' 'to "active".')) except HarvestJobExists: h.flash_notice( _('A harvest job has already been scheduled for ' 'this source')) except Exception as e: msg = 'An error occurred: [%s]' % str(e) h.flash_error(msg) return h.redirect_to( h.url_for('{0}_admin'.format(DATASET_TYPE_NAME), id=id)) def clear_view(id): try: context = {'model': model, 'user': tk.c.user, 'session': model.Session} tk.get_action('harvest_source_clear')(context, {'id': id}) h.flash_success(_('Harvest source cleared')) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except Exception as e: msg = 'An error occurred: [%s]' % str(e) h.flash_error(msg) return h.redirect_to( h.url_for('{0}_admin'.format(DATASET_TYPE_NAME), id=id)) def delete_view(id): try: context = {'model': model, 'user': tk.c.user} context['clear_source'] = tk.request.params.get('clear', '').lower() in ( u'true', u'1', ) tk.get_action('harvest_source_delete')(context, {'id': id}) if context['clear_source']: h.flash_success(_('Harvesting source successfully cleared')) else: h.flash_success(_('Harvesting source successfully inactivated')) return h.redirect_to( h.url_for('{0}_admin'.format(DATASET_TYPE_NAME), id=id)) except tk.ObjectNotFound: return tk.abort(404, _('Harvest source not found')) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) def object_show_view(id, ref_type, response): try: context = {'model': model, 'user': tk.c.user} if ref_type == 'object': obj = tk.get_action('harvest_object_show')(context, {'id': id}) elif ref_type == 'dataset': obj = tk.get_action('harvest_object_show')(context, { 'dataset_id': id }) # Check content type. It will probably be either XML or JSON try: if obj['content']: content = obj['content'] elif 'original_document' in obj['extras']: content = obj['extras']['original_document'] else: return tk.abort(404, _('No content found')) try: etree.fromstring(re.sub(r'<\?xml(.*)\?>', '', content)) except UnicodeEncodeError: etree.fromstring( re.sub(r'<\?xml(.*)\?>', '', content.encode('utf-8'))) response.content_type = 'application/xml; charset=utf-8' if '\n' + content except xml_parser_exception: try: json.loads(obj['content']) response.content_type = 'application/json; charset=utf-8' except ValueError: # Just return whatever it is pass response.headers['Content-Length'] = len(content) return (response, six.ensure_str(content)) except tk.ObjectNotFound as e: return tk.abort(404, _(str(e))) except tk.NotAuthorized: return tk.abort(401, _not_auth_message()) except Exception as e: msg = 'An error occurred: [%s]' % str(e) return tk.abort(500, msg)