2011-04-06 13:45:00 +02:00
|
|
|
import logging
|
2011-04-07 17:59:11 +02:00
|
|
|
import datetime
|
2012-10-24 01:34:32 +02:00
|
|
|
import json
|
2011-04-06 13:45:00 +02:00
|
|
|
|
2012-10-24 01:34:32 +02:00
|
|
|
import pika
|
2011-04-06 13:45:00 +02:00
|
|
|
|
|
|
|
from ckan.lib.base import config
|
|
|
|
from ckan.plugins import PluginImplementations
|
2012-12-18 00:50:26 +01:00
|
|
|
from ckan import model
|
2011-04-06 13:45:00 +02:00
|
|
|
|
2011-04-15 17:12:47 +02:00
|
|
|
from ckanext.harvest.model import HarvestJob, HarvestObject,HarvestGatherError
|
2011-04-06 13:45:00 +02:00
|
|
|
from ckanext.harvest.interfaces import IHarvester
|
|
|
|
|
2012-04-10 21:10:17 +02:00
|
|
|
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Fail at import time if this logger has been disabled.
# NOTE(review): presumably guards against a logging configuration that
# disables already-created loggers -- confirm.
assert not log.disabled

# Names exported by a star-import of this module.
__all__ = ['get_gather_publisher', 'get_gather_consumer',
           'get_fetch_publisher', 'get_fetch_consumer',
           'get_harvester']
|
2011-04-06 13:45:00 +02:00
|
|
|
|
2011-04-07 17:59:11 +02:00
|
|
|
# Fallback values used when the matching ``ckan.harvest.mq.*`` option is
# absent from the CKAN config.
PORT = 5672  # AMQP port (``ckan.harvest.mq.port``)
USERID = 'guest'  # AMQP user (``ckan.harvest.mq.user_id``)
PASSWORD = 'guest'  # AMQP password (``ckan.harvest.mq.password``)
HOSTNAME = 'localhost'  # broker host, used by both the AMQP and Redis backends
VIRTUAL_HOST = '/'  # AMQP virtual host (``ckan.harvest.mq.virtual_host``)
MQ_TYPE = 'amqp'  # queue backend (``ckan.harvest.mq.type``): 'amqp' or 'redis'
REDIS_PORT = 6379  # Redis port (``ckan.harvest.mq.port`` when backend is redis)
REDIS_DB = 0  # Redis database index (``ckan.harvest.mq.redis_db``)

# settings for AMQP
# NOTE(review): EXCHANGE_TYPE is not referenced by the code visible in this
# file -- confirm whether it is still needed.
EXCHANGE_TYPE = 'direct'
EXCHANGE_NAME = 'ckan.harvest'
|
|
|
|
|
2012-10-30 18:13:39 +01:00
|
|
|
def get_connection():
    '''Return a connection for the configured message queue backend.

    The backend is read from the ``ckan.harvest.mq.type`` config option
    (default: ``MQ_TYPE``).

    :returns: whatever ``get_connection_amqp()`` or
        ``get_connection_redis()`` returns
    :raises Exception: if the configured backend is not recognised
    '''
    mq_type = config.get('ckan.harvest.mq.type', MQ_TYPE)

    if mq_type == 'redis':
        return get_connection_redis()

    # "ampq" is still accepted for compatibility with an old typo
    if mq_type == 'amqp' or mq_type == 'ampq':
        return get_connection_amqp()

    raise Exception('not a valid queue type %s' % mq_type)
|
|
|
|
|
2013-09-19 11:43:03 +02:00
|
|
|
def get_connection_amqp():
    '''Open a blocking pika connection to the AMQP broker.

    Connection details come from the ``ckan.harvest.mq.port``,
    ``ckan.harvest.mq.user_id``, ``ckan.harvest.mq.password``,
    ``ckan.harvest.mq.hostname`` and ``ckan.harvest.mq.virtual_host`` config
    options, each falling back to the module-level default.

    :returns: a ``pika.BlockingConnection``
    '''
    try:
        port = int(config.get('ckan.harvest.mq.port', PORT))
    except (TypeError, ValueError):
        # Unparseable (or None) port setting: fall back to the default port
        port = PORT
    userid = config.get('ckan.harvest.mq.user_id', USERID)
    password = config.get('ckan.harvest.mq.password', PASSWORD)
    hostname = config.get('ckan.harvest.mq.hostname', HOSTNAME)
    virtual_host = config.get('ckan.harvest.mq.virtual_host', VIRTUAL_HOST)

    credentials = pika.PlainCredentials(userid, password)
    parameters = pika.ConnectionParameters(host=hostname,
                                           port=port,
                                           virtual_host=virtual_host,
                                           credentials=credentials,
                                           frame_max=10000)
    # Log the individual, non-secret settings instead of dumping
    # ``parameters.__dict__``: that relied on pika internals and could put
    # the credentials object into the log output. Arguments are passed
    # separately so formatting only happens when DEBUG is enabled.
    log.debug('pika connection using host=%s port=%s virtual_host=%s user=%s',
              hostname, port, virtual_host, userid)

    return pika.BlockingConnection(parameters)
|
|
|
|
|
2013-04-21 18:04:57 +02:00
|
|
|
def get_connection_redis():
    '''Build a ``redis.StrictRedis`` client from the CKAN config.

    Reads ``ckan.harvest.mq.hostname``, ``ckan.harvest.mq.port`` and
    ``ckan.harvest.mq.redis_db``, falling back to ``HOSTNAME``,
    ``REDIS_PORT`` and ``REDIS_DB`` respectively.
    '''
    # Imported here so the redis package is only needed when this backend
    # is actually used.
    import redis

    host = config.get('ckan.harvest.mq.hostname', HOSTNAME)
    port = int(config.get('ckan.harvest.mq.port', REDIS_PORT))
    db = int(config.get('ckan.harvest.mq.redis_db', REDIS_DB))
    return redis.StrictRedis(host=host, port=port, db=db)
|
|
|
|
|
2015-06-01 18:54:22 +02:00
|
|
|
|
|
|
|
def get_gather_queue_name():
    '''Return the gather queue name for this site.

    The name embeds the ``ckan.site_id`` config option ('default' when the
    option is not set).
    '''
    site_id = config.get('ckan.site_id', 'default')
    return 'ckan.harvest.{0}.gather'.format(site_id)
|
|
|
|
|
|
|
|
|
|
|
|
def get_fetch_queue_name():
    '''Return the fetch queue name for this site.

    The name embeds the ``ckan.site_id`` config option ('default' when the
    option is not set).
    '''
    site_id = config.get('ckan.site_id', 'default')
    return 'ckan.harvest.{0}.fetch'.format(site_id)
|
|
|
|
|
|
|
|
|
2012-11-07 10:51:25 +01:00
|
|
|
def purge_queues():
    '''Empty the harvest queues of the configured backend.

    * AMQP: purges the gather queue and then the fetch queue.
    * Redis: flushes the whole database the connection points at.
    '''
    mq_type = config.get('ckan.harvest.mq.type', MQ_TYPE)
    connection = get_connection()

    if mq_type in ('amqp', 'ampq'):
        channel = connection.channel()
        # Gather queue first, then the fetch queue
        for name_of_queue in (get_gather_queue_name, get_fetch_queue_name):
            channel.queue_purge(queue=name_of_queue())
            log.info('AMQP queue purged: %s', name_of_queue())
    elif mq_type == 'redis':
        connection.flushdb()
        log.info('Redis database flushed')
|
2013-04-21 18:04:57 +02:00
|
|
|
|
|
|
|
def resubmit_jobs():
    '''Put stale in-progress items back on the Redis queues.

    ``RedisConsumer.consume`` stores a timestamp under
    ``<routing_key>:<id>`` for each message it hands out and ``basic_ack``
    deletes it again. Any such key that is older than the allowed time is
    pushed back onto its queue and the key is removed.

    Does nothing unless ``ckan.harvest.mq.type`` is set to ``redis``.
    '''
    if config.get('ckan.harvest.mq.type') != 'redis':
        return
    redis = get_connection()

    # 3 minutes for fetch and import max
    _resubmit_stale_keys(redis, 'harvest_object_id', 180)
    # 2 hours for a gather
    _resubmit_stale_keys(redis, 'harvest_job_id', 7200)


def _resubmit_stale_keys(redis, routing_key, max_age_seconds):
    '''Requeue every ``<routing_key>:*`` entry older than max_age_seconds.'''
    for key in redis.keys(routing_key + ':*'):
        stamp = redis.get(key)
        if stamp is None:
            # The key was deleted (e.g. acknowledged) between keys() and
            # get(); there is nothing left to resubmit.
            continue
        date_of_key = datetime.datetime.strptime(stamp,
                                                 "%Y-%m-%d %H:%M:%S.%f")
        age = datetime.datetime.now() - date_of_key
        # ``timedelta.seconds`` alone is only the sub-day remainder, so an
        # item stuck for more than 24h could look fresh. Include the days.
        age_in_seconds = age.days * 86400 + age.seconds
        if age_in_seconds > max_age_seconds:
            redis.rpush(routing_key,
                        json.dumps({routing_key: key.split(':')[-1]}))
            redis.delete(key)
|
2012-10-24 01:34:32 +02:00
|
|
|
|
|
|
|
class Publisher(object):
    '''Publishes JSON messages to an AMQP exchange through a pika channel.'''

    def __init__(self, connection, channel, exchange, routing_key):
        self.connection, self.channel = connection, channel
        self.exchange, self.routing_key = exchange, routing_key

    def send(self, body, **kw):
        '''Serialise ``body`` to JSON and publish it.

        Extra keyword arguments are passed on to ``basic_publish``, whose
        return value is returned unchanged.
        '''
        payload = json.dumps(body)
        # delivery_mode 2: make message persistent
        persistent = pika.BasicProperties(delivery_mode=2)
        return self.channel.basic_publish(self.exchange,
                                          self.routing_key,
                                          payload,
                                          properties=persistent,
                                          **kw)

    def close(self):
        '''Close the underlying connection.'''
        self.connection.close()
|
2011-04-06 13:45:00 +02:00
|
|
|
|
2013-04-21 18:04:57 +02:00
|
|
|
class RedisPublisher(object):
    '''Publishes JSON messages by pushing them onto a Redis list.

    The list key is the routing key.
    '''

    def __init__(self, redis, routing_key):
        # Client used by send() for lrem/rpush
        self.redis = redis
        self.routing_key = routing_key

    def send(self, body, **kw):
        '''Serialise ``body`` to JSON and append it to the list.

        Extra keyword arguments are accepted but ignored.
        '''
        serialised = json.dumps(body)
        is_job_queue = self.routing_key == 'harvest_job_id'
        if is_job_queue:
            # remove if already there
            self.redis.lrem(self.routing_key, 0, serialised)
        self.redis.rpush(self.routing_key, serialised)

    def close(self):
        '''No-op: this publisher does not close the Redis client.'''
        return None
|
|
|
|
|
2011-04-06 13:45:00 +02:00
|
|
|
def get_publisher(routing_key):
    '''Return a publisher for ``routing_key`` on the configured backend.

    * AMQP: declares the durable ``EXCHANGE_NAME`` exchange on a new channel
      and returns a ``Publisher``.
    * Redis: returns a ``RedisPublisher``.

    Returns None for any other backend value.
    '''
    connection = get_connection()
    mq_type = config.get('ckan.harvest.mq.type', MQ_TYPE)

    if mq_type == 'redis':
        return RedisPublisher(connection, routing_key)

    if mq_type not in ('amqp', 'ampq'):
        return None

    amqp_channel = connection.channel()
    amqp_channel.exchange_declare(exchange=EXCHANGE_NAME, durable=True)
    return Publisher(connection,
                     amqp_channel,
                     EXCHANGE_NAME,
                     routing_key=routing_key)
|
|
|
|
|
|
|
|
|
|
|
|
class FakeMethod(object):
    '''Stand-in for the method object AMQP hands to consumer callbacks.

    It only carries ``delivery_tag``, which is set to the message itself.
    '''

    def __init__(self, message):
        self.delivery_tag = message
|
|
|
|
|
|
|
|
class RedisConsumer(object):
    '''Consumer over a Redis list, with methods named like an AMQP channel.

    Messages are popped from the list stored under ``routing_key``.
    '''

    def __init__(self, redis, routing_key):
        self.redis = redis
        self.routing_key = routing_key

    def consume(self, queue):
        '''Yield ``(FakeMethod, self, body)`` tuples forever.

        Blocks until a message is available. Before a message is yielded,
        the current time is stored under its persistence key. ``queue`` is
        not used.
        '''
        while True:
            _list_name, body = self.redis.blpop(self.routing_key)
            marker_key = self.persistance_key(body)
            self.redis.set(marker_key, str(datetime.datetime.now()))
            yield (FakeMethod(body), self, body)

    def persistance_key(self, message):
        '''Return ``<routing_key>:<id>`` for a JSON-encoded message.'''
        payload = json.loads(message)
        prefix = self.routing_key + ':'
        return prefix + payload[self.routing_key]

    def basic_ack(self, message):
        '''Acknowledge a message by deleting its persistence key.'''
        marker_key = self.persistance_key(message)
        self.redis.delete(marker_key)

    def queue_purge(self, queue):
        '''Flush the whole Redis database; ``queue`` is not used.'''
        self.redis.flushdb()

    def basic_get(self, queue):
        '''Pop one message without blocking; ``queue`` is not used.

        Returns ``(FakeMethod, self, body)`` with whatever ``lpop`` returned
        as the body.
        '''
        popped = self.redis.lpop(self.routing_key)
        return (FakeMethod(popped), self, popped)
|
2012-10-24 01:34:32 +02:00
|
|
|
|
2011-04-06 13:45:00 +02:00
|
|
|
def get_consumer(queue_name, routing_key):
    '''Return a consumer for the configured backend.

    * AMQP: declares the durable exchange and queue, binds the queue to the
      exchange with ``routing_key`` and returns the channel.
    * Redis: returns a ``RedisConsumer`` (``queue_name`` is not used).

    Returns None for any other backend value.
    '''
    connection = get_connection()
    mq_type = config.get('ckan.harvest.mq.type', MQ_TYPE)

    if mq_type == 'redis':
        return RedisConsumer(connection, routing_key)

    if mq_type not in ('amqp', 'ampq'):
        return None

    amqp_channel = connection.channel()
    amqp_channel.exchange_declare(exchange=EXCHANGE_NAME, durable=True)
    amqp_channel.queue_declare(queue=queue_name, durable=True)
    amqp_channel.queue_bind(queue=queue_name,
                            exchange=EXCHANGE_NAME,
                            routing_key=routing_key)
    return amqp_channel
|
2011-04-06 13:45:00 +02:00
|
|
|
|
2011-04-07 17:59:11 +02:00
|
|
|
|
2012-10-24 01:34:32 +02:00
|
|
|
def gather_callback(channel, method, header, body):
    '''Handle one message from the gather queue.

    Looks up the harvest job named in the message, runs the gather stage of
    the matching harvester and sends every resulting harvest object id to
    the fetch queue. The message is acknowledged on every path, and the
    fetch publisher opened here is closed on every path.

    :param channel: consumer the message came from; must offer
        ``basic_ack(delivery_tag)``
    :param method: object carrying ``delivery_tag``
    :param header: not used
    :param body: JSON string expected to contain ``harvest_job_id``
    :returns: False when the message is dropped early (no job id, unknown
        job, gather stage returned a non-list or an empty list); None
        otherwise
    :raises: whatever ``gather_stage`` raises (re-raised after the message
        has been acknowledged)
    '''
    try:
        job_id = json.loads(body)['harvest_job_id']
    except KeyError:
        log.error('No harvest job id received')
        channel.basic_ack(method.delivery_tag)
        return False
    log.debug('Received harvest job id: %s', job_id)

    # Get a publisher for the fetch queue
    publisher = get_fetch_publisher()

    job = HarvestJob.get(job_id)

    if not job:
        log.error('Harvest job does not exist: %s', job_id)
        # Do not leak the publisher's connection on this early exit
        publisher.close()
        channel.basic_ack(method.delivery_tag)
        return False

    # Send the harvest job to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    harvester = get_harvester(job.source.type)

    if harvester:
        try:
            harvest_object_ids = gather_stage(harvester, job)
        except (Exception, KeyboardInterrupt):
            # Acknowledge first so the failing message is not redelivered,
            # then release the publisher before propagating the error.
            channel.basic_ack(method.delivery_tag)
            publisher.close()
            raise

        if not isinstance(harvest_object_ids, list):
            log.error('Gather stage failed')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        if len(harvest_object_ids) == 0:
            log.info('No harvest objects to fetch')
            publisher.close()
            channel.basic_ack(method.delivery_tag)
            return False

        log.debug('Received from plugin gather_stage: {0} objects (first: {1} last: {2})'.format(
                  len(harvest_object_ids), harvest_object_ids[:1], harvest_object_ids[-1:]))
        for harvest_object_id in harvest_object_ids:
            # Send the id to the fetch queue
            publisher.send({'harvest_object_id': harvest_object_id})
        log.debug('Sent {0} objects to the fetch queue'.format(len(harvest_object_ids)))

    else:
        # This can occur if you:
        # * remove a harvester and it still has sources that are then refreshed
        # * add a new harvester and restart CKAN but not the gather queue.
        msg = 'System error - No harvester could be found for source type %s' % job.source.type
        err = HarvestGatherError(message=msg, job=job)
        err.save()
        log.error(msg)

    model.Session.remove()
    publisher.close()
    channel.basic_ack(method.delivery_tag)
|
2011-04-06 13:45:00 +02:00
|
|
|
|
|
|
|
|
2015-10-28 18:51:58 +01:00
|
|
|
def get_harvester(harvest_source_type):
    '''Return the first IHarvester plugin registered for a source type.

    A plugin matches when ``info()['name']`` equals ``harvest_source_type``.
    Returns None when no plugin matches.
    '''
    matching = (plugin for plugin in PluginImplementations(IHarvester)
                if plugin.info()['name'] == harvest_source_type)
    # The generator is lazy, so iteration stops at the first match
    return next(matching, None)
|
|
|
|
|
|
|
|
|
2015-10-21 18:26:57 +02:00
|
|
|
def gather_stage(harvester, job):
    '''Run the harvester's gather_stage for a job and return its result.

    ``job.gather_started`` and ``job.gather_finished`` are stamped (UTC) and
    the job is saved whether or not the harvester succeeds. If the harvester
    raises, every harvest object belonging to the job is deleted and the
    exception is re-raised.

    Kept separate from gather_callback so that tests can call it without
    dealing with queue stuff.
    '''
    job.gather_started = datetime.datetime.utcnow()

    try:
        try:
            gathered_ids = harvester.gather_stage(job)
        except (Exception, KeyboardInterrupt):
            # Throw away whatever the failed gather created for this job
            leftovers = model.Session.query(HarvestObject).filter_by(
                harvest_job_id=job.id
            )
            for leftover in leftovers:
                model.Session.delete(leftover)
            model.Session.commit()
            raise
    finally:
        job.gather_finished = datetime.datetime.utcnow()
        job.save()

    return gathered_ids
|
|
|
|
|
|
|
|
|
2012-10-24 01:34:32 +02:00
|
|
|
def fetch_callback(channel, method, header, body):
    '''Handle one message from the fetch queue.

    Loads the harvest object named in the message, bumps its retry counter
    and runs the fetch and import stages with every harvester whose name
    matches the object's source type. The message is acknowledged on every
    path that returns.

    :returns: False when the message is dropped early (no object id, unknown
        object, or the retry counter has reached 5); None otherwise
    '''
    delivery_tag_of = method

    try:
        object_id = json.loads(body)['harvest_object_id']
        log.info('Received harvest object id: %s' % object_id)
    except KeyError:
        log.error('No harvest object id received')
        channel.basic_ack(delivery_tag_of.delivery_tag)
        return False

    harvest_object = HarvestObject.get(object_id)
    if not harvest_object:
        log.error('Harvest object does not exist: %s' % object_id)
        channel.basic_ack(delivery_tag_of.delivery_tag)
        return False

    harvest_object.retry_times += 1
    harvest_object.save()

    too_many_retries = harvest_object.retry_times >= 5
    if too_many_retries:
        harvest_object.state = "ERROR"
        harvest_object.save()
        log.error('Too many consecutive retries for object {0}'.format(harvest_object.id))
        channel.basic_ack(delivery_tag_of.delivery_tag)
        return False

    # Send the harvest object to the plugins that implement
    # the Harvester interface, only if the source type
    # matches
    for plugin in PluginImplementations(IHarvester):
        if plugin.info()['name'] != harvest_object.source.type:
            continue
        fetch_and_import_stages(plugin, harvest_object)

    model.Session.remove()
    channel.basic_ack(delivery_tag_of.delivery_tag)
|
2011-04-06 13:45:00 +02:00
|
|
|
|
2013-03-19 02:16:43 +01:00
|
|
|
def fetch_and_import_stages(harvester, obj):
    '''Run the fetch stage and, if it succeeds, the import stage.

    ``obj.state`` moves through "FETCH" and "IMPORT" and ends as "COMPLETE"
    or "ERROR"; start/finish timestamps (UTC) are recorded for each stage
    that runs and the object is saved after each step.

    Afterwards ``obj.report_status`` is set, unless it already has a value,
    to 'errored', 'deleted', 'updated' or 'added'.
    '''
    utcnow = datetime.datetime.utcnow

    obj.fetch_started = utcnow()
    obj.state = "FETCH"
    obj.save()
    fetched_ok = harvester.fetch_stage(obj)
    obj.fetch_finished = utcnow()
    obj.save()

    if not fetched_ok:
        obj.state = "ERROR"
        obj.save()
    else:
        # If no errors where found, call the import method
        obj.import_started = utcnow()
        obj.state = "IMPORT"
        obj.save()
        imported_ok = harvester.import_stage(obj)
        obj.import_finished = utcnow()
        obj.state = "COMPLETE" if imported_ok else "ERROR"
        obj.save()

    if obj.report_status:
        # A status has already been set; leave it alone
        return

    if obj.state == 'ERROR':
        status = 'errored'
    elif obj.current == False:
        # Equality (not truthiness) on purpose: None must not count as deleted
        status = 'deleted'
    else:
        # Two or more harvest objects for the package means it existed before
        same_package = (model.Session.query(HarvestObject)
                        .filter_by(package_id=obj.package_id)
                        .limit(2)
                        .all())
        status = 'updated' if len(same_package) == 2 else 'added'

    obj.report_status = status
    obj.save()
|
|
|
|
|
2011-04-06 13:45:00 +02:00
|
|
|
def get_gather_consumer():
    '''Return a consumer for the gather queue.

    Uses the site's gather queue name and the 'harvest_job_id' routing key.
    '''
    queue_name = get_gather_queue_name()
    gather_consumer = get_consumer(queue_name, 'harvest_job_id')
    log.debug('Gather queue consumer registered')
    return gather_consumer
|
|
|
|
|
|
|
|
def get_fetch_consumer():
    '''Return a consumer for the fetch queue.

    Uses the site's fetch queue name and the 'harvest_object_id' routing key.
    '''
    queue_name = get_fetch_queue_name()
    fetch_consumer = get_consumer(queue_name, 'harvest_object_id')
    log.debug('Fetch queue consumer registered')
    return fetch_consumer
|
|
|
|
|
|
|
|
def get_gather_publisher():
    '''Return a publisher that sends with the 'harvest_job_id' routing key.'''
    gather_routing_key = 'harvest_job_id'
    return get_publisher(gather_routing_key)
|
|
|
|
|
|
|
|
def get_fetch_publisher():
    '''Return a publisher that sends with the 'harvest_object_id' routing key.'''
    fetch_routing_key = 'harvest_object_id'
    return get_publisher(fetch_routing_key)
|
|
|
|
|
2011-04-07 17:59:11 +02:00
|
|
|
# Get a publisher for the fetch queue
|
|
|
|
#fetch_publisher = get_fetch_publisher()
|
|
|
|
|