package org.gcube.dataanalysis.executor.job.management;

import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Timer;
import java.util.TimerTask;
import java.util.UUID;

import javax.jms.ExceptionListener;
import javax.jms.JMSException;
import javax.jms.Message;
import javax.jms.MessageListener;

import org.apache.activemq.ActiveMQConnection;
import org.apache.axis.message.addressing.EndpointReferenceType;
import org.gcube.common.core.contexts.GHNContext;
import org.gcube.common.core.informationsystem.client.AtomicCondition;
import org.gcube.common.core.informationsystem.client.ISClient;
import org.gcube.common.core.informationsystem.client.RPDocument;
import org.gcube.common.core.informationsystem.client.queries.WSResourceQuery;
import org.gcube.common.core.scope.GCUBEScope;
import org.gcube.common.scope.api.ScopeProvider;
import org.gcube.contentmanagement.blobstorage.resource.StorageObject;
import org.gcube.contentmanagement.blobstorage.service.IClient;
import org.gcube.contentmanagement.lexicalmatcher.utils.AnalysisLogger;
import org.gcube.contentmanager.storageclient.wrapper.AccessType;
import org.gcube.contentmanager.storageclient.wrapper.MemoryType;
import org.gcube.contentmanager.storageclient.wrapper.StorageClient;
import org.gcube.dataanalysis.ecoengine.utils.Operations;
import org.gcube.dataanalysis.executor.messagequeue.ATTRIBUTE;
import org.gcube.dataanalysis.executor.messagequeue.Consumer;
import org.gcube.dataanalysis.executor.messagequeue.Producer;
import org.gcube.dataanalysis.executor.messagequeue.QCONSTANTS;
import org.gcube.dataanalysis.executor.messagequeue.QueueManager;
import org.gcube.dataanalysis.executor.scripts.ScriptIOWorker;
import org.gcube.vremanagement.executor.stubs.ExecutorCall;
import org.gcube.vremanagement.executor.stubs.TaskCall;
import org.gcube.vremanagement.executor.stubs.TaskProxy;

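// Manages the distributed execution of a script on the "generic-worker" Executor
// plugin instances available in a gCube scope: worker endpoints are discovered
// through the Information System, input files are uploaded to the storage
// service, one message per task is published on an ActiveMQ job queue, and task
// states are tracked on a session-specific response queue until all messages are
// processed or the computation is aborted.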
public class QueueJobManager {

    // broadcast message period
    public static int broadcastTimePeriod = 120000;
    // max silence before computation stops
    public static int maxSilenceTimeBeforeComputationStop = 10800000;
    // max number of retries per computation step
    public static int maxNumberOfComputationRetries = 1;
    // period for controlling a node activity
    public static int computationWatcherTimerPeriod = 120000;
    // max number of messages to put in a queue
    // protected static int maxNumberOfMessages = 20;
    // max number of stages in which the computation is split
    public static int maxNumberOfStages = Integer.MAX_VALUE; // 10;
    // timeout for resending a message
    public static int queueWatcherMaxwaitingTime = QCONSTANTS.refreshStatusTime; // * 5;

    protected int maxFailureTries;
    private static String pluginName = "generic-worker"; // "GenericWorker";

    protected String scope;
    protected GCUBEScope gscope;
    protected String session;

    protected boolean yetstopped;
    protected boolean messagesresent;
    protected float status;
    protected boolean abort;
    protected boolean shutdown;

    protected List<EndpointReferenceType> eprs;
    protected int activeNodes;
    protected int computingNodes;
    protected int numberOfMessages;
    protected int totalNumberOfMessages;
    protected int actualNumberOfNodes;
    protected int totalNumberOfStages;
    public int currentNumberOfStages;

    // files management
    protected List<String> filenames;
    protected List<String> fileurls;

    // queue parameters
    protected String queueName;
    protected String queueResponse;
    protected String queueURL;
    protected String queueUSER;
    protected String queuePWD;
    protected org.gcube.dataanalysis.executor.messagequeue.Consumer consumer;
    protected Producer producer;

    Timer broadcastTimer;
    Timer computationWatcherTimer;
    ComputationTimerWatcher computationWatcher;
    String serviceClass;
    String serviceName;
    String owner;
    String localDir;
    String remoteDir;
    String outputDir;
    String script;
    List<String> arguments;
    String configuration;
    boolean deletefiles;
    StatusListener statuslistener;

    private void resetAllVars() {
        scope = null;
        gscope = null;

        yetstopped = false;
        messagesresent = false;
        status = 0;
        abort = false;
        shutdown = false;

        eprs = null;
        activeNodes = 0;
        computingNodes = 0;
        numberOfMessages = 0;

        actualNumberOfNodes = 0;
        filenames = null;
        fileurls = null;

        queueName = null;
        queueResponse = null;
        queueURL = null;
        queueUSER = null;
        queuePWD = null;
        consumer = null;
        producer = null;
        broadcastTimer = null;
        computationWatcherTimer = null;
        computationWatcher = null;
        serviceClass = null;
        serviceName = null;
        owner = null;
        localDir = null;
        remoteDir = null;
        outputDir = null;
        script = null;
        arguments = null;
        configuration = null;
        deletefiles = false;
        statuslistener = null;
    }

    public int getActiveNodes() {
        return computingNodes;
    }

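    // Progress in [0,1]: the fraction of messages consumed in the current stage,
    // rescaled into the slot this stage occupies among totalNumberOfStages
    // (offset + innerStatus / totalNumberOfStages).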
    public float getStatus() {
        float innerStatus = 0;
        if (totalNumberOfMessages != 0)
            innerStatus = (1f - ((float) numberOfMessages / (float) totalNumberOfMessages));
        if (totalNumberOfStages == 0)
            return innerStatus;
        else {
            float offset = ((float) Math.max(currentNumberOfStages - 1, 0)) / (float) totalNumberOfStages;
            float status = offset + (innerStatus / (float) totalNumberOfStages);
            // AnalysisLogger.getLogger().info("stages: "+totalNumberOfStages+" inner status: "+innerStatus+" currentStage: "+currentNumberOfStages+" status: "+status);
            return status;
        }
    }

    // there is only one node from the client point of view
    public int getNumberOfNodes() {
        if (eprs.size() > 0)
            return 1;
        else
            return 0;
    }

    public void setNumberOfNodes(int newNumberOfNodes) {
        // ignore this setting in this case
    }

    private void init(String scope, int numberOfNodes) throws Exception {
        resetAllVars();
        // init scope variables
        this.scope = scope;
        gscope = GCUBEScope.getScope(scope);
        // introduce a session
        // initialize flags
        shutdown = false;
        yetstopped = false;
        messagesresent = false;
        abort = false;
        // find all the nodes - initialize the eprs
        findNodes(scope);
    }

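    // Illustrative usage only (scope and parameters are hypothetical):
    //
    //   QueueJobManager manager = new QueueJobManager("/gcube/devsec", 4);
    //   boolean done = manager.uploadAndExecuteChunkized(serviceClass, serviceName,
    //       owner, localDir, remoteDir, outputDir, "script.sh", arguments,
    //       configuration, true, false);
    //
    // The call blocks until every task message has been reported as FINISHED or
    // the computation is aborted.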
    public QueueJobManager(String scope, int numberOfNodes) throws Exception {
        init(scope, numberOfNodes);
    }

    public QueueJobManager(String scope, int numberOfNodes, List<EndpointReferenceType> eprs) throws Exception {
        init(scope, numberOfNodes);
        this.eprs = eprs;
    }

    private void setGlobalVars(String serviceClass, String serviceName, String owner, String localDir, String remoteDir, String outputDir, String script, List<String> arguments, String configuration, boolean deletefiles) {
        this.serviceClass = serviceClass;
        this.serviceName = serviceName;
        this.owner = owner;
        this.localDir = localDir;
        this.remoteDir = remoteDir;
        this.outputDir = outputDir;
        this.script = script;
        this.arguments = arguments;
        this.configuration = configuration;
        this.deletefiles = deletefiles;
    }

    private int totalmessages = 0;

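    // Splits the arguments into at most maxNumberOfStages chunks and runs
    // uploadAndExecute on each chunk in sequence, stopping early if a stage aborts.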
    public boolean uploadAndExecuteChunkized(String serviceClass, String serviceName, String owner, String localDir, String remoteDir, String outputDir, String script, List<String> arguments, String configuration, boolean deletefiles, boolean forceUpload) throws Exception {
        long t0 = System.currentTimeMillis();

        int elements = arguments.size();
        /*
         * int div = elements / (maxNumberOfMessages); int rest = elements % (maxNumberOfMessages); if (rest > 0) div++; if (div == 0) { div = 1; }
         */
        session = (("" + UUID.randomUUID()).replace("-", "") + Math.random()).replace(".", "");
        int[] chunkSizes = null;
        // up to 1120 species we don't make stages
        if (elements > maxNumberOfStages)
            chunkSizes = Operations.takeChunks(elements, maxNumberOfStages);
        else {
            chunkSizes = new int[1];
            chunkSizes[0] = elements;
        }
        int allchunks = chunkSizes.length;
        totalNumberOfStages = allchunks;
        currentNumberOfStages = 0;
        int start = 0;
        totalmessages = 0;
        AnalysisLogger.getLogger().info("Starting the computation in " + allchunks + " stages");
        for (int i = 0; i < allchunks; i++) {
            numberOfMessages = totalNumberOfMessages = 0;
            currentNumberOfStages++;
            int end = Math.min(elements, start + chunkSizes[i]);
            AnalysisLogger.getLogger().info("Computing the chunk number " + (i + 1) + " of " + allchunks + " between " + start + " and " + (end - 1));
            List<String> sublist = new ArrayList<String>();
            for (int j = start; j < end; j++)
                sublist.add(arguments.get(j));

            AnalysisLogger.getLogger().info("size sub:" + sublist.size());
            // totalmessages=totalmessages+sublist.size();
            uploadAndExecute(serviceClass, serviceName, owner, localDir, remoteDir, outputDir, script, sublist, configuration, deletefiles, forceUpload);
            if (abort)
                break;
            start = end;
            AnalysisLogger.getLogger().info("Processed chunk number " + (i + 1));
        }

        currentNumberOfStages = totalNumberOfStages;
        AnalysisLogger.getLogger().info("Finished computation on all chunks and messages " + totalmessages);
        AnalysisLogger.getLogger().info("Whole Procedure done in " + (System.currentTimeMillis() - t0) + " ms");
        return (!abort);
    }

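    // Executes a single stage: uploads the input files, starts the generic-worker
    // plugin on all discovered Executor nodes, sends one message per argument and
    // waits for the responses; an aborted stage is retried up to
    // maxNumberOfComputationRetries times.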
    private boolean uploadAndExecute(String serviceClass, String serviceName, String owner, String localDir, String remoteDir, String outputDir, String script, List<String> arguments, String configuration, boolean deletefiles, boolean forceUpload) throws Exception {
        int numberOfRetries = maxNumberOfComputationRetries;
        boolean recompute = true;

        while ((numberOfRetries > 0) && (recompute)) {
            long t0 = System.currentTimeMillis();
            // if (numberOfRetries<maxNumberOfComputationRetries)
            init(scope, 1);

            AnalysisLogger.getLogger().info("Computation Try number " + (maxNumberOfComputationRetries + 1 - numberOfRetries));

            AnalysisLogger.getLogger().info("Contacting " + actualNumberOfNodes + " Nodes");
            // set globals
            setGlobalVars(serviceClass, serviceName, owner, localDir, remoteDir, outputDir, script, arguments, configuration, deletefiles);
            // if not yet uploaded, upload required files
            uploadFilesOnStorage(forceUpload);

            // initializing queue
            setQueueVariables();
            // broadcast a message to all executors for purging previous queues
            // purgeQueues();
            createClientProducer();
            broadcastListenCommandToExecutorNodes();

            maxFailureTries = activeNodes * 1;

            broadcastTimer = new Timer();
            broadcastTimer.schedule(new Broadcaster(), broadcastTimePeriod, broadcastTimePeriod);

            computationWatcherTimer = new Timer();
            computationWatcher = new ComputationTimerWatcher(maxSilenceTimeBeforeComputationStop);
            computationWatcherTimer.schedule(computationWatcher, computationWatcherTimerPeriod, computationWatcherTimerPeriod);

            // send all messages
            sendMessages();
            createClientConsumer();
            // wait for messages
            waitForMessages();

            AnalysisLogger.getLogger().info("Wait for message finished - checking result");
            if (numberOfMessages == 0) {
                AnalysisLogger.getLogger().info("All tasks have correctly finished!");
            }

            /*
             * else{ AnalysisLogger.getLogger().info("Timeout - Warning Some Task is missing!"); for (int k=0;k<finishedChunks.length;k++){ if (finishedChunks[k]==0){ AnalysisLogger.getLogger().info("Sending Again message number " + k); Map<String, Object> inputs = generateInputMessage(filenames, fileurls, outputDir, script, arguments.get(k), k, scope, serviceClass, serviceName, owner, remoteDir, session, configuration, deletefiles); producer.sendMessage(inputs, 0); AnalysisLogger.getLogger().info("Sent Message " + k); } } waitForMessages(); if (numberOfMessages>0){ abort = true; } }
             */

            // deleteRemoteFolder();
            // summary
            AnalysisLogger.getLogger().info("-SUMMARY-");
            for (int i = 0; i < totalNumberOfMessages; i++) {
                if (activeMessages[i])
                    AnalysisLogger.getLogger().info("Error : the Message Number " + i + " Was Never Processed!");
                if (resentMessages[i] > 0) {
                    messagesresent = true;
                    AnalysisLogger.getLogger().info("Warning : the Message Number " + i + " Was resent " + resentMessages[i] + " Times");
                }
            }
            AnalysisLogger.getLogger().info("-SUMMARY END-");

            stop();
            AnalysisLogger.getLogger().info("Stopped");
            AnalysisLogger.getLogger().info("Single Step Procedure done in " + (System.currentTimeMillis() - t0) + " ms");
            activeNodes = 0;
            numberOfRetries--;
            if (abort) {
                recompute = true;
                if (numberOfRetries > 0)
                    Thread.sleep(10000);
            } else
                recompute = false;
        }

        return (!abort);
    }

    public boolean hasResentMessages() {
        return messagesresent;
    }

    public void waitForMessages() throws Exception {
        AnalysisLogger.getLogger().info("Waiting...");
        while ((numberOfMessages > 0) && (!abort)) {
            Thread.sleep(2000);

            // long tcurrent = System.currentTimeMillis();
            // if ((tcurrent - waitTime) > maxwaitingTime) {
            // break;
            // }
        }
        AnalysisLogger.getLogger().info("...Stop - Abort?" + abort);
    }

    public boolean wasAborted() {
        return abort;
    }

    public void purgeQueues() throws Exception {
        AnalysisLogger.getLogger().info("Purging Queue");
        List<WorkerWatcher> tasksProxies = new ArrayList<WorkerWatcher>();
        for (int j = 0; j < actualNumberOfNodes; j++) {
            try {
                contactNodes(tasksProxies, j, queueName, queueUSER, queuePWD, queueURL, queueResponse, session, "true");
            } catch (Exception e) {
                e.printStackTrace();
                AnalysisLogger.getLogger().info("Error in purging queue on node " + j);
            }
        }
        AnalysisLogger.getLogger().info("Queue Purged");
    }

    public void stop() {
        try {
            if (!yetstopped) {
                if (broadcastTimer != null) {
                    AnalysisLogger.getLogger().info("Stopping Broadcaster");
                    broadcastTimer.cancel();
                    broadcastTimer.purge();
                }

                if (computationWatcherTimer != null) {
                    AnalysisLogger.getLogger().info("Stopping Watcher");
                    computationWatcherTimer.cancel();
                    computationWatcherTimer.purge();
                }

                AnalysisLogger.getLogger().info("Purging Status Listener");

                if (statuslistener != null)
                    statuslistener.destroyAllWatchers();

                AnalysisLogger.getLogger().info("Stopping Producer and Consumer");

                try {
                    producer.stop();
                    producer.closeSession();
                } catch (Exception e1) {
                }
                try {
                    consumer.stop();
                    consumer.closeSession();
                } catch (Exception e2) {
                }

                AnalysisLogger.getLogger().info("Purging Remote Queues");
                purgeQueues();

                yetstopped = true;
            }
        } catch (Exception e) {
            e.printStackTrace();
            AnalysisLogger.getLogger().info("Not completely stopped");
        }
    }

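    // Launches the generic-worker plugin on the order-th Executor endpoint,
    // passing the queue coordinates as task inputs; the returned TaskProxy is
    // wrapped in a WorkerWatcher for monitoring.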
    private void contactNodes(List<WorkerWatcher> tasksProxies, int order, String queueName, String queueUSER, String queuePWD, String queueURL, String queueResponse, String session, String purgeQueue) throws Exception {
        // generate the input map according to the arguments
        Map<String, Object> inputs = generateWorkerInput(queueName, queueUSER, queuePWD, queueURL, queueResponse, session, purgeQueue);
        AnalysisLogger.getLogger().info("Inputs " + inputs);
        // take the i-th endpoint of the executor
        EndpointReferenceType selectedEPR = eprs.get(order);
        AnalysisLogger.getLogger().info("Broadcasting to node " + (order + 1) + " on " + selectedEPR.getAddress());
        // run the executor script
        ExecutorCall call = new ExecutorCall(pluginName, gscope);
        call.setEndpointReference(selectedEPR);
        TaskCall task = null;
        AnalysisLogger.getLogger().info("EPR:" + selectedEPR);
        task = call.launch(inputs);
        // AnalysisLogger.getLogger().info("Task EPR:" + task.getEndpointReference());
        TaskProxy proxy = task.getProxy();
        tasksProxies.add(new WorkerWatcher(proxy, AnalysisLogger.getLogger()));
        // AnalysisLogger.getLogger().info("Contacting node " + (order + 1) + " OK on " + selectedEPR);
    }

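    // Queries the gCube Information System for the Executor instances in the
    // scope that expose the generic-worker plugin and collects their endpoints.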
    private int findNodes(String scopeString) throws Exception {
        AnalysisLogger.getLogger().debug("SCOPE:" + scopeString);
        GCUBEScope scope = GCUBEScope.getScope(scopeString);
        ISClient client = GHNContext.getImplementation(ISClient.class);
        WSResourceQuery wsquery = client.getQuery(WSResourceQuery.class);
        wsquery.addAtomicConditions(new AtomicCondition("//gc:ServiceName", "Executor"));
        wsquery.addAtomicConditions(new AtomicCondition("/child::*[local-name()='Task']/name[text()='" + pluginName + "']", pluginName));
        List<RPDocument> listdoc = client.execute(wsquery, scope);
        EndpointReferenceType epr = null;
        eprs = new ArrayList<EndpointReferenceType>();
        int numberOfEP = 0;
        for (RPDocument resource : listdoc) {
            epr = resource.getEndpoint();
            numberOfEP++;
            eprs.add(epr);
        }
        AnalysisLogger.getLogger().info("Found " + numberOfEP + " endpoints");
        // get current number of available nodes
        actualNumberOfNodes = eprs.size();
        return numberOfEP;
    }

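    // Resolves the queue coordinates: a fixed job queue name, a response queue
    // suffixed with the session id, and the broker URL taken from the scope
    // service map, with hard-coded fallbacks when no broker is resolved.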
    private void setQueueVariables() throws Exception {
        queueName = "D4ScienceJob"; // + session;
        queueResponse = queueName + "Response" + session;
        // general scope
        queueURL = gscope.getServiceMap().getEndpoints(GHNContext.MSGBROKER).iterator().next().getAddress().toString();
        // tests on ecosystem
        // TODO: delete this!
        // queueURL = "tcp://ui.grid.research-infrastructures.eu:6166";
        // queueURL = "tcp://message-broker.d4science.research-infrastructures.eu:6166";
        AnalysisLogger.getLogger().info("Queue for the scope: " + queueURL);
        if (queueURL == null) {
            if (scope.startsWith("/gcube"))
                queueURL = "tcp://ui.grid.research-infrastructures.eu:6166";
            else
                queueURL = "tcp://message-broker.d4science.research-infrastructures.eu:6166";
        }
        queueUSER = ActiveMQConnection.DEFAULT_USER;
        queuePWD = ActiveMQConnection.DEFAULT_PASSWORD;
    }

    public void deleteRemoteFolder() throws Exception {
        ScopeProvider.instance.set(scope);
        IClient client = new StorageClient(serviceClass, serviceName, owner, AccessType.SHARED, MemoryType.VOLATILE).getClient();
        // IClient client = new StorageClient(serviceClass, serviceName, owner, AccessType.SHARED, gscope).getClient();
        AnalysisLogger.getLogger().info("Removing Remote Dir " + remoteDir);
        client.removeDir().RDir(remoteDir);
        AnalysisLogger.getLogger().info("Removed");
    }

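    // Uploads the content of localDir to remoteDir on the storage service and
    // records the names and URLs of the files; if forceupload is false and the
    // remote folder is not empty, the upload is skipped and only the URLs are
    // resolved.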
    private void uploadFilesOnStorage(boolean forceupload) throws Exception {
        ScopeProvider.instance.set(scope);
        IClient client = new StorageClient(serviceClass, serviceName, owner, AccessType.SHARED, MemoryType.VOLATILE).getClient();
        // IClient client = new StorageClient(serviceClass, serviceName, owner, AccessType.SHARED, gscope).getClient();
        File dir = new File(localDir);
        File[] files = dir.listFiles();
        AnalysisLogger.getLogger().info("Start uploading");
        filenames = new ArrayList<String>();
        fileurls = new ArrayList<String>();
        boolean uploadFiles = forceupload;
        // if we do not force upload then check if the folder is already there
        if (!uploadFiles) {
            List<StorageObject> remoteObjects = client.showDir().RDir(remoteDir);
            // only upload files if they are not yet uploaded
            if (remoteObjects.size() == 0)
                uploadFiles = true;
        }
        if (!uploadFiles)
            AnalysisLogger.getLogger().info("No need to upload files");

        AnalysisLogger.getLogger().info("Loading files");
        for (File sfile : files) {
            if (sfile.getName().startsWith("."))
                continue;

            String localf = sfile.getAbsolutePath();
            String filename = sfile.getName();
            String remotef = remoteDir + sfile.getName();
            if (uploadFiles) {
                client.put(true).LFile(localf).RFile(remotef);
                AnalysisLogger.getLogger().info("Uploading File " + localf + " as remote file " + remotef);
            }
            String url = client.getUrl().RFile(remotef);
            // AnalysisLogger.getLogger().info("URL obtained: " + url);
            filenames.add(filename);
            fileurls.add(url);
        }
        AnalysisLogger.getLogger().info("Loading finished");
    }

    private void broadcastListenCommandToExecutorNodes() throws Exception {
        AnalysisLogger.getLogger().info("Submitting script to Remote Queue " + queueName);
        List<WorkerWatcher> tasksProxies = new ArrayList<WorkerWatcher>();
        try {
            findNodes(scope);
        } catch (Exception e) {
            AnalysisLogger.getLogger().info("Error in Finding nodes - using previous value");
        }
        activeNodes = actualNumberOfNodes;
        // launch the tasks
        for (int i = 0; i < actualNumberOfNodes; i++) {
            try {
                contactNodes(tasksProxies, i, queueName, queueUSER, queuePWD, queueURL, queueResponse, session, "false");
            } catch (Exception e) {
                e.printStackTrace();
                AnalysisLogger.getLogger().info("Error in Contacting nodes");
            }
        }
    }

    private void createClientProducer() throws Exception {
        AnalysisLogger.getLogger().info("Creating Message Queue and Producer");
        // create the Producer
        QueueManager qm = new QueueManager();
        qm.createAndConnect(queueUSER, queuePWD, queueURL, queueName);
        producer = new Producer(qm, queueName);
        AnalysisLogger.getLogger().info("Producer OK");
    }

    private void createClientConsumer() throws Exception {
        AnalysisLogger.getLogger().info("Creating Response Message Queue and Consumer");
        // create the listener
        statuslistener = new StatusListener();
        QueueManager qm1 = new QueueManager();
        qm1.createAndConnect(queueUSER, queuePWD, queueURL, queueResponse);
        consumer = new Consumer(qm1, statuslistener, statuslistener, queueResponse);
        AnalysisLogger.getLogger().info("Consumers OK");
    }

    // per-message tracking: which messages are still pending and how many times
    // each one has been resent
    boolean activeMessages[];
    public int resentMessages[];

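    // Publishes one message per argument on the job queue; each message carries
    // its ORDER index so that worker responses can be matched back to it.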
    private void sendMessages() throws Exception {
        int i = 0;
        numberOfMessages = arguments.size();
        totalNumberOfMessages = numberOfMessages;
        AnalysisLogger.getLogger().info("Messages To Send " + numberOfMessages);
        activeMessages = new boolean[numberOfMessages];
        resentMessages = new int[numberOfMessages];
        for (String argum : arguments) {
            Map<String, Object> inputs = generateInputMessage(filenames, fileurls, outputDir, script, argum, i, scope, serviceClass, serviceName, owner, remoteDir, session, configuration, deletefiles, false);
            producer.sendMessage(inputs, 0);
            AnalysisLogger.getLogger().info("Send " + i);
            activeMessages[i] = true;
            i++;
        }
        AnalysisLogger.getLogger().info("Messages Sent " + numberOfMessages);
    }

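    // Builds the payload of a task message: file names and URLs, script, the
    // per-task argument string, the ordering index, scope and storage
    // coordinates, the session id and the response queue to reply to.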
    private Map<String, Object> generateInputMessage(Object filenames, Object fileurls, String outputDir, String script, String argum, int i, String scope, String serviceClass, String serviceName, String owner, String remoteDir, String session, String configuration, boolean deletefiles, boolean duplicateMessage) {
        Map<String, Object> inputs = new HashMap<String, Object>();

        inputs.put(ATTRIBUTE.FILE_NAMES.name(), filenames);
        inputs.put(ATTRIBUTE.FILE_URLS.name(), fileurls);
        inputs.put(ATTRIBUTE.OUTPUTDIR.name(), outputDir);
        inputs.put(ATTRIBUTE.SCRIPT.name(), script);
        inputs.put(ATTRIBUTE.ARGUMENTS.name(), argum + " " + duplicateMessage);
        inputs.put(ATTRIBUTE.ORDER.name(), "" + i);
        inputs.put(ATTRIBUTE.SCOPE.name(), scope);
        inputs.put(ATTRIBUTE.SERVICE_CLASS.name(), serviceClass);
        inputs.put(ATTRIBUTE.SERVICE_NAME.name(), serviceName);
        inputs.put(ATTRIBUTE.OWNER.name(), owner);
        inputs.put(ATTRIBUTE.REMOTEDIR.name(), remoteDir);
        inputs.put(ATTRIBUTE.CLEAN_CACHE.name(), "" + deletefiles);
        inputs.put(ATTRIBUTE.QSESSION.name(), session);
        inputs.put(ATTRIBUTE.CONFIGURATION.name(), configuration);
        inputs.put(ATTRIBUTE.TOPIC_RESPONSE_NAME.name(), queueResponse);
        inputs.put(ATTRIBUTE.QUEUE_USER.name(), queueUSER);
        inputs.put(ATTRIBUTE.QUEUE_PASSWORD.name(), queuePWD);
        inputs.put(ATTRIBUTE.QUEUE_URL.name(), queueURL);
        return inputs;
    }

    private Map<String, Object> generateWorkerInput(String queueName, String queueUser, String queuePassword, String queueURL, String queueResponse, String session, String purge) {

        Map<String, Object> inputs = new HashMap<String, Object>();

        inputs.put(ATTRIBUTE.TOPIC_NAME.name(), ScriptIOWorker.toInputString(queueName));
        inputs.put(ATTRIBUTE.QUEUE_USER.name(), ScriptIOWorker.toInputString(queueUser));
        inputs.put(ATTRIBUTE.QUEUE_PASSWORD.name(), ScriptIOWorker.toInputString(queuePassword));
        inputs.put(ATTRIBUTE.QUEUE_URL.name(), ScriptIOWorker.toInputString(queueURL));
        inputs.put(ATTRIBUTE.TOPIC_RESPONSE_NAME.name(), ScriptIOWorker.toInputString(queueResponse));
        inputs.put(ATTRIBUTE.QSESSION.name(), session);
        inputs.put(ATTRIBUTE.ERASE.name(), purge);
        return inputs;
    }

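    // Periodic task that repeats node discovery and re-sends the listen command
    // to every Executor node while the computation is running.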
    public class Broadcaster extends TimerTask {

        @Override
        public void run() {
            try {
                AnalysisLogger.getLogger().info("(((((((((((((((((((((((((((------Broadcasting Information To Watchers------)))))))))))))))))))))))))))");
                broadcastListenCommandToExecutorNodes();
                AnalysisLogger.getLogger().info("(((((((((((((((((((((((((((------END Broadcasting Information To Watchers------)))))))))))))))))))))))))))");
            } catch (Exception e) {
                e.printStackTrace();
                AnalysisLogger.getLogger().info("--------------------------------Broadcaster: Error Sending Listen Message to Executors------)))))))))))))))))))))))))))");
            }
        }
    }

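    // Watchdog: aborts the computation when no worker response has been observed
    // for more than maxTime milliseconds (the clock is reset by onMessage).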
    public class ComputationTimerWatcher extends TimerTask {

        long maxTime;
        long lastTimeClock;

        public ComputationTimerWatcher(long maxtime) {
            this.maxTime = maxtime;
            this.lastTimeClock = System.currentTimeMillis();
        }

        public void reset() {
            lastTimeClock = System.currentTimeMillis();
        }

        public void setmaxTime(long maxTime) {
            this.maxTime = maxTime;
        }

        @Override
        public void run() {
            try {
                long t0 = System.currentTimeMillis();
                AnalysisLogger.getLogger().info("Computation Watcher Timing Is " + (t0 - lastTimeClock) + " max computation time is " + maxTime);
                if ((t0 - lastTimeClock) > maxTime) {
                    AnalysisLogger.getLogger().info("Computation Watcher - Computation Timeout: Closing Queue Job Manager!!!");
                    abort();
                }
            } catch (Exception e) {
                e.printStackTrace();
                AnalysisLogger.getLogger().info("Error Taking clock");
            }
        }
    }

    public synchronized void abort() {
        AnalysisLogger.getLogger().info("Computation Aborted");
        this.abort = true;
    }

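    // Listens on the response queue: messages of the current session are
    // acknowledged and handled according to their STATUS (STARTED, PROCESSING,
    // FINISHED, FATAL_ERROR) by creating, resetting or destroying the per-message
    // QueueWorkerWatcher, decrementing the pending-message counter, or resending
    // the message until maxFailureTries is exhausted.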
    public class StatusListener implements MessageListener, ExceptionListener {

        private QueueWorkerWatcher[] watchers;

        synchronized public void onException(JMSException ex) {
            abort();
            AnalysisLogger.getLogger().info("JMS Exception occurred. Shutting down client.");
        }

        private synchronized void addWatcher(int order) {
            if (watchers == null)
                watchers = new QueueWorkerWatcher[totalNumberOfMessages];

            QueueWorkerWatcher watcher = watchers[order];
            if (watcher != null) {
                destroyWatcher(order);
            }

            Map<String, Object> message = generateInputMessage(filenames, fileurls, outputDir, script, arguments.get(order), order, scope, serviceClass, serviceName, owner, remoteDir, session, configuration, deletefiles, true);
            watchers[order] = new QueueWorkerWatcher(producer, message, order);
        }

        private synchronized void resetWatcher(int order) {
            if (watchers == null)
                watchers = new QueueWorkerWatcher[totalNumberOfMessages];
            else if (watchers[order] != null)
                watchers[order].resetTime();
        }

        private synchronized void destroyWatcher(int order) {
            if (watchers != null && watchers[order] != null) {
                if (watchers[order].hasResent())
                    resentMessages[order] = resentMessages[order] + 1;

                watchers[order].destroy();
                watchers[order] = null;
                AnalysisLogger.getLogger().info("Destroyed Watcher number " + order);
            }
        }

        public synchronized void destroyAllWatchers() {
            if (watchers != null) {
                for (int i = 0; i < watchers.length; i++) {
                    destroyWatcher(i);
                }
            }
        }

        public void onMessage(Message message) {

            // get message
            try {

                HashMap<String, Object> details = (HashMap<String, Object>) message.getObjectProperty(ATTRIBUTE.CONTENT.name());
                String status = (String) details.get(ATTRIBUTE.STATUS.name());
                String order = "" + details.get(ATTRIBUTE.ORDER.name());
                String nodeaddress = (String) details.get(ATTRIBUTE.NODE.name());
                String msession = (String) details.get(ATTRIBUTE.QSESSION.name());
                Object error = details.get(ATTRIBUTE.ERROR.name());

                AnalysisLogger.getLogger().info("Current session " + session);
                if ((msession != null) && (msession.equals(session))) {
                    AnalysisLogger.getLogger().info("Session " + session + " is right - acknowledge");
                    message.acknowledge();
                    AnalysisLogger.getLogger().info("Session " + session + " acknowledged");
                    int orderInt = -1;
                    try {
                        orderInt = Integer.parseInt(order);
                    } catch (Exception e3) {
                        e3.printStackTrace();
                    }
                    if (orderInt > -1) {

                        // reset the watcher
                        if (computationWatcher != null)
                            computationWatcher.reset();

                        AnalysisLogger.getLogger().info("Task number " + order + " is " + status + " on node " + nodeaddress + " and session " + session);

                        if (status.equals(ATTRIBUTE.STARTED.name())) {
                            computingNodes++;
                            addWatcher(orderInt);
                        }
                        if (status.equals(ATTRIBUTE.PROCESSING.name())) {

                            resetWatcher(orderInt);
                        } else if (status.equals(ATTRIBUTE.FINISHED.name())) {

                            totalmessages++;
                            computingNodes--;
                            destroyWatcher(orderInt);
                            if (numberOfMessages > 0)
                                numberOfMessages--;

                            AnalysisLogger.getLogger().info("Remaining " + numberOfMessages + " messages to manage");
                            activeMessages[orderInt] = false;

                        } else if (status.equals(ATTRIBUTE.FATAL_ERROR.name())) {
                            if (error != null) {
                                AnalysisLogger.getLogger().info("REPORTED FATAL_ERROR on " + nodeaddress + " : ");
                                AnalysisLogger.getLogger().info(error);
                            }

                            computingNodes--;
                            if (maxFailureTries <= 0) {
                                AnalysisLogger.getLogger().info("Too many Failures - Aborting");
                                destroyAllWatchers();
                                abort();
                            } else {
                                AnalysisLogger.getLogger().info("Failure Occurred - Now Resending Message " + orderInt);
                                resentMessages[orderInt] = resentMessages[orderInt] + 1;
                                maxFailureTries--;
                                // resend message
                                Map<String, Object> retrymessage = generateInputMessage(filenames, fileurls, outputDir, script, arguments.get(orderInt), orderInt, scope, serviceClass, serviceName, owner, remoteDir, session, configuration, deletefiles, true);
                                producer.sendMessage(retrymessage, QCONSTANTS.timeToLive);
                                AnalysisLogger.getLogger().info("Failure Occurred - Resent Message " + orderInt);
                            }

                        }

                    } else
                        AnalysisLogger.getLogger().info("Ignoring message " + order + " with status " + status);
                } else {
                    AnalysisLogger.getLogger().info("wrong session " + msession + " ignoring message");
                    // consumer.manager.session.recover();
                }
            } catch (Exception e) {

                AnalysisLogger.getLogger().info("Error reading details ", e);
                AnalysisLogger.getLogger().info("...Aborting Job...");
                abort();

            }
        }
    }

}