UrlsController/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java

package eu.openaire.urls_controller.components;
import com.google.gson.Gson;
import com.google.gson.JsonSyntaxException;
import eu.openaire.urls_controller.UrlsControllerApplication;
import eu.openaire.urls_controller.configuration.DatabaseConnector;
import eu.openaire.urls_controller.controllers.BulkImportController;
import eu.openaire.urls_controller.controllers.ShutdownController;
import eu.openaire.urls_controller.controllers.StatsController;
import eu.openaire.urls_controller.controllers.UrlsController;
import eu.openaire.urls_controller.models.WorkerInfo;
import eu.openaire.urls_controller.payloads.requests.WorkerReport;
import eu.openaire.urls_controller.services.UrlsServiceImpl;
import eu.openaire.urls_controller.util.FileUtils;
import eu.openaire.urls_controller.util.GenericUtils;
import io.micrometer.core.instrument.MeterRegistry;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.http.ResponseEntity;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.text.DecimalFormat;
import java.util.*;
import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
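/**
 * Holds the scheduled tasks of the Controller:
 * checking the results of the background tasks, handling the graceful-shutdown procedure,
 * retrying or cleaning up old workerReports and stale assignments, and refreshing the Prometheus metrics.
 */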
@Component
public class ScheduledTasks {
private static final Logger logger = LoggerFactory.getLogger(ScheduledTasks.class);
@Autowired
FileUtils fileUtils;
@Autowired
UrlsServiceImpl urlsService;
@Autowired
private UrlsControllerApplication urlsControllerApplication;
private final StatsController statsController;
private final UrlsController urlsController;
private int assignmentsLimit;
public static final DecimalFormat df = new DecimalFormat("0.00");
private final String workerReportsDirPath;
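// Prometheus gauges, registered in the constructor and refreshed periodically by "updatePrometheusMetrics()".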
public static final AtomicInteger numOfAllPayloads = new AtomicInteger(0);
public static final AtomicInteger numOfPayloadsAggregatedByServiceThroughCrawling = new AtomicInteger(0);
public static final AtomicInteger numOfPayloadsAggregatedByServiceThroughBulkImport = new AtomicInteger(0);
public static final AtomicInteger numOfPayloadsAggregatedByService = new AtomicInteger(0);
public static final AtomicInteger numOfLegacyPayloads = new AtomicInteger(0);
public static final AtomicInteger numOfRecordsInspectedByServiceThroughCrawling = new AtomicInteger(0);
public ScheduledTasks(@Value("${services.pdfaggregation.controller.workerReportsDirPath}") String workerReportsDirPath, @Value("${services.pdfaggregation.controller.assignmentLimit}") int assignmentsLimit, StatsController statsController, UrlsController urlsController, MeterRegistry registry)
{
if ( !workerReportsDirPath.endsWith("/") )
workerReportsDirPath += "/";
this.workerReportsDirPath = workerReportsDirPath; // This dir will be created later.
this.statsController = statsController;
this.urlsController = urlsController;
this.assignmentsLimit = assignmentsLimit; // Take the value from the constructor-parameter, since the "@Value"-annotated field is not injected yet at construction-time.
jsonStringBuilder = new StringBuilder(assignmentsLimit * 500); // Pre-size the builder for the expected size of a workerReport's json-content.
registry.gauge("numOfAllPayloads", numOfAllPayloads);
registry.gauge("numOfPayloadsAggregatedByServiceThroughCrawling", numOfPayloadsAggregatedByServiceThroughCrawling);
registry.gauge("numOfPayloadsAggregatedByServiceThroughBulkImport", numOfPayloadsAggregatedByServiceThroughBulkImport);
registry.gauge("numOfPayloadsAggregatedByService", numOfPayloadsAggregatedByService);
registry.gauge("numOfLegacyPayloads", numOfLegacyPayloads);
registry.gauge("numOfRecordsInspectedByServiceThroughCrawling", numOfRecordsInspectedByServiceThroughCrawling);
}
@Scheduled(initialDelay = 1_800_000, fixedDelay = 900_000) // Execute this method 30 mins from the start and 15 mins after each last execution, in order for some tasks to have been gathered.
//@Scheduled(initialDelay = 60_000, fixedDelay = 20_000) // Just for testing (every 20 secs).
// The initial delay is larger, because we have to wait some time for at least one worker to finish retrieving the full-texts from thousands of publications, whereas later on we will have a lot of workerReports waiting to be processed.
public void checkResultsOfBackgroundTasks()
{
int sizeOfFutures = UrlsController.futuresOfBackgroundTasks.size();
if ( sizeOfFutures == 0 )
return;
logger.debug("Going to check the results of " + sizeOfFutures + " background tasks.");
// Calling ".get()" on each future will block the current scheduling thread until a result is returned (whether the callable task succeeded or failed).
// So this scheduled-task will not be executed again until all current callableTasks have finished executing.
int numFailedTasks = 0;
List<Future<Boolean>> futuresToDelete = new ArrayList<>(sizeOfFutures);
for ( int i=0; i < sizeOfFutures; ++i ) { // Check the futures up to the current "sizeOfFutures". The size of the list may increase while we check, but here we will check only the first ones (e.g. the first 10 futures).
Future<Boolean> future = null;
try {
future = UrlsController.futuresOfBackgroundTasks.get(i);
if ( ! future.get() ) // Get and see if an exception is thrown. This blocks the current thread, until the task of the future has finished.
numFailedTasks ++;
} catch (ExecutionException ee) { // These can be serious errors like an "out of memory exception" (Java HEAP).
logger.error(GenericUtils.getSelectedStackTraceForCausedException(ee, "Background_task_" + i + " failed with: ", null, 30));
numFailedTasks ++;
} catch (CancellationException ce) {
logger.error("Background_task_" + i + " was cancelled: " + ce.getMessage());
numFailedTasks ++;
} catch (InterruptedException ie) {
logger.error("Background_task_" + i + " was interrupted: " + ie.getMessage());
numFailedTasks ++;
} catch (IndexOutOfBoundsException ioobe) {
logger.error("IOOBE for background_task_" + i + " in the futures-list! " + ioobe.getMessage());
// Only here, the "future" will be null.
} finally {
if ( future != null ) // It may be null in case we got an IOOBE.
futuresToDelete.add(future); // Do not delete them directly here, as the indexes will get messed up and we will get "IOOBE".
}
}
for ( Future<Boolean> future : futuresToDelete )
UrlsController.futuresOfBackgroundTasks.remove(future); // We do not need it anymore. This is the safest way to delete them without removing newly added futures as well.
if ( numFailedTasks > 0 )
logger.warn(numFailedTasks + " out of " + sizeOfFutures + " background tasks have failed!");
else if ( logger.isTraceEnabled() )
logger.trace("All of the " + sizeOfFutures + " background tasks have succeeded.");
}
@Scheduled(initialDelay = 600_000, fixedDelay = 7_200_000) // Check every 2 hours. The initial delay is 10 minutes, to allow the service to shut down quickly in case of a problem at startup, but also to account for the initial communication with the Workers, where a problem may appear.
//@Scheduled(initialDelay = 60_000, fixedDelay = 20_000) // Just for testing (every 20 secs).
public void checkIfServiceIsReadyForShutdown()
{
if ( ! ShutdownController.shouldShutdownService )
return; // Either the service was never instructed to shut down, or the user canceled the request.
// Check whether there are still background tasks to be processed. Either workerReport or Bulk-import requests.
int numOfFutures = UrlsController.futuresOfBackgroundTasks.size();
if ( numOfFutures > 0 ) {
logger.debug("There are still " + numOfFutures + " backgroundTasks waiting to be executed or have their status checked..");
return;
}
// Here, the check above may have given a result of < 0 >, but a new task may be submitted for execution right after and still be awaiting execution..
// The crawling-jobs can finish safely, by avoiding the shutdown for as long as at least one worker is still running (waiting for the Controller to verify that the assignments-batch is completed).
// The bulk-import procedures have their bulkImport DIR registered in the "bulkImportDirsUnderProcessing", before their task is submitted for execution.
// So the Controller will not shut down as long as either of these task-types has not finished.
// Check whether there are any active bulk-import procedures.
int numOfBulkImportDirsUnderProcessing = BulkImportController.bulkImportDirsUnderProcessing.size();
if ( numOfBulkImportDirsUnderProcessing > 0 ) {
logger.debug("There are still " + numOfBulkImportDirsUnderProcessing + " bulkImportDirsUnderProcessing..");
return;
}
// Check whether the workers have not shut down yet, which means that they are either crawling assignments and/or waiting for the Controller to process their WorkerReport, before shutting down.
Set<String> workerIds = UrlsController.workersInfoMap.keySet();
if ( workerIds.size() > 0 ) {
for ( String workerId : workerIds ) {
WorkerInfo workerInfo = UrlsController.workersInfoMap.get(workerId); // The workerId is certainly inside the map and has a workerInfo value.
if ( ! workerInfo.getHasShutdown() ) {
logger.debug("At least one worker (with IP: " + workerInfo.getWorkerIP() + ") is still active. Waiting for all workers to shut down, before shutting down the service.");
return; // If at least 1 worker is still active, then do not shut down the Controller.
}
}
logger.info("All workers have already shutdown. Shutting down the Controller..");
} else
logger.info("No workers have participated in the service yet. The Controller will shut-down now.");
// IMPORTANT: If a worker has crashed, then it will not have informed the Controller. So the Controller will think that it is still running and will not shut down..!
// In this case, we have to manually shut down the service, from the Docker CLI.
// Any left-over worker-reports are kept to be retried next time the Controller starts.
urlsControllerApplication.gentleAppShutdown();
}
@Value("${services.pdfaggregation.controller.isTestEnvironment}")
private boolean isTestEnvironment;
@Scheduled(initialDelay = 43_200_000, fixedDelay = Long.MAX_VALUE) // Run 12 hours after startup (in order for all workers to be back online) and then never again.
//@Scheduled(initialDelay = 420_000, fixedDelay = Long.MAX_VALUE) // Just for testing (7 mins after startup, only once).
public void checkAndProcessOldAndUnsuccessfulWorkerReports()
{
// We make sure an initial delay is in place before this is executed, since all workers have to be up and running in order to be able to answer the full-texts-requests.
if ( UrlsController.numOfActiveWorkers.get() == 0 ) {
long timeToWait = (isTestEnvironment ? 1_200_000 : 43_200_000); // 20 mins | 12 hours
logger.warn("None of the workers is participating in the service, at the moment. Will wait " + ((timeToWait /1000) /60) + " minutes and try again..");
try {
Thread.sleep(timeToWait);
} catch (InterruptedException ie) {
logger.warn("The wait-period was interrupted! Will try either way.");
}
if ( UrlsController.numOfActiveWorkers.get() == 0 ) {
logger.error("None of the workers is participating in the service, yet again. Will not process any leftover workerReports!");
return;
}
}
// In case the Controller processes older, failed workerReports, it may be that the worker from which a report came is not currently online.
// So its IP won't be stored in the map. Even if we have its IP in the report itself, it's of no use if the worker is offline, since we cannot get the full-texts.
// We do not care for attempts, if the payloads are not there.
inspectWorkerReportsAndTakeAction(ActionForWorkerReports.process_previous_failed);
// In case a worker-report among those "left-overs" fails again, it will just be removed by the scheduler once 7 days have passed and it is still there.
}
@Scheduled(initialDelay = 86_400_000, fixedDelay = 86_400_000) // Run after one day, every day.
//@Scheduled(initialDelay = 1_200_000, fixedDelay = 1_200_000) // Just for testing (every 1200 secs).
public void checkAndProcessRecentUnsuccessfulWorkerReports()
{
// Some hours after their failure, and before the workers delete the related full-texts as obsolete, we retry these workerReports,
// hoping that any previous error was temporary.
inspectWorkerReportsAndTakeAction(ActionForWorkerReports.process_current_failed);
}
@Scheduled(initialDelay = 604_800_000, fixedDelay = 604_800_000) // Run every 7 days.
//@Scheduled(initialDelay = 1_200_000, fixedDelay = 1_200_000) // Just for testing (every 1200 secs).
public void checkAndDeleteOldWorkerReports()
{
// The failed workerReports are kept for 7 days, for manual debugging purposes. Even if the service does not make more than 2 attempts to import them.
inspectWorkerReportsAndTakeAction(ActionForWorkerReports.delete_old);
}
@Scheduled(initialDelay = 259_200_000, fixedDelay = 259_200_000) // Run every 3 days. 3 days after startup.
//@Scheduled(initialDelay = 1_200_000, fixedDelay = 1_200_000) // Just for testing (every 1200 secs).
public void checkAndDeleteUnhandledAssignments()
{
// Remove the assignments having a "date" older than 3 days.
// For some reason, sometimes, the worker cannot receive the assignments in time.
// In this case, no job is created for those assignments and no workerReport gets stored in storage.
// The assignments just remain in the table, and the urls cannot be rechecked.
Calendar calendar = Calendar.getInstance();
calendar.add(Calendar.DAY_OF_MONTH, -3); // Subtract 3 days from the current date.
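// Acquire the global database-lock, so that this deletion does not interleave with other database operations running in the background.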
DatabaseConnector.databaseLock.lock();
urlsService.deleteAssignmentsWithOlderDate(calendar.getTimeInMillis()); // Any error-log is written inside.
DatabaseConnector.databaseLock.unlock();
}
// Scheduled Metrics for Prometheus.
// Prometheus scrapes for metrics usually every 15 seconds, but that is an extremely short time-period for DB-statistics.
@Scheduled(fixedDelay = 21_600_000) // Every 6 hours run the following queries to the database and register the metric.
//@Scheduled(initialDelay = 60_000, fixedDelay = 1_200_000) // For general testing only.
//@Scheduled(initialDelay = 60_000, fixedDelay = 120_000) // For debug testing only.
public void updatePrometheusMetrics()
{
ResponseEntity<?> responseEntity = statsController.getNumberOfAllPayloads(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfAllPayloads.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfPayloadsAggregatedByServiceThroughCrawling(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfPayloadsAggregatedByServiceThroughCrawling.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfPayloadsAggregatedByServiceThroughBulkImport(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfPayloadsAggregatedByServiceThroughBulkImport.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfPayloadsAggregatedByService(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfPayloadsAggregatedByService.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfLegacyPayloads(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfLegacyPayloads.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfRecordsInspectedByServiceThroughCrawling(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfRecordsInspectedByServiceThroughCrawling.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
// TODO - Export more complex data; <numOfAllPayloadsPerDatasource>, <numOfAllPayloadsPerPublicationYear>,
// <numOfAggregatedPayloadsPerDatasource>, ..., <numOfBulkImportedPayloadsPerDatasource>, ...
}
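// The three possible actions to be taken on the stored workerReport-files: retry the failed reports which were left over from a previous run,
// retry the recently-failed reports of the current run, or delete the reports (and their assignment-records) which are too old to be processed.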
enum ActionForWorkerReports {process_previous_failed, process_current_failed, delete_old}
// TODO - Maybe make these numbers configurable from the "application.yml" file.
private static final double daysToWaitBeforeProcessing = 0.5; // 12 hours
private static final double maxDaysToAllowProcessing = 1.9; // 45.6 hours
// The Workers wait at most 48 hours before deleting the full-text files. So there is no point to try and process the report after that time-frame.
// These reports will have to wait a bit for the scheduler to assign them to threads, before actually being processed.
private static final Gson gson = new Gson(); // This is "transient" by default.
private static StringBuilder jsonStringBuilder = null;
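// The "jsonStringBuilder" is reused for every workerReport-file, in order to avoid re-allocating a large buffer; it gets reset with "setLength(0)" after each use.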
private static final int daysDivisor = (1000 * 60 * 60 * 24); // In order to get the time-difference in days. We divide with: /1000 to get seconds, /60 to get minutes, /60 to get hours and /24 to get days.
private static final double daysToWaitBeforeDeletion = 7.0;
private void inspectWorkerReportsAndTakeAction(ActionForWorkerReports actionForWorkerReports)
{
if ( actionForWorkerReports == ActionForWorkerReports.process_previous_failed )
logger.debug("Going to check and process any unsuccessful workerReports from the previous run.");
else if ( actionForWorkerReports == ActionForWorkerReports.process_current_failed )
logger.debug("Going to check and process any unsuccessful workerReports which are between " + daysToWaitBeforeProcessing + " and " + daysToWaitBeforeDeletion + " days old (inclusive).");
else
logger.debug("Going to check and remove any leftover workerReports, which are more than " + daysToWaitBeforeDeletion + " days old.");
// The failed workerReports are kept for 7 days, for manual debugging purposes. Even if the service does not make more than 2 attempts to import them.
try {
File[] workerReportSubDirs = getWorkerReportSubDirs();
if ( workerReportSubDirs == null )
return;
long currentTime = 0L;
if ( actionForWorkerReports != ActionForWorkerReports.process_previous_failed )
currentTime = System.currentTimeMillis();
int numWorkerReportsToBeHandled = 0;
int numWorkerReportsHandled = 0;
for ( File workerReportSubDir : workerReportSubDirs )
{
File[] workerReportFiles = workerReportSubDir.listFiles(File::isFile);
if ( workerReportFiles == null ) {
logger.error("There was an error when getting the workerReports of \"workerReportSubDir\": " + workerReportSubDir);
return;
} else if ( workerReportFiles.length == 0 ) {
logger.debug("The \"workerReportSubDir\" \"" + workerReportSubDir.getName() + "\" is empty, so there is nothing to take action on.");
continue; // Do not stop the whole inspection; move on to the next worker's sub-directory.
}
for ( File workerReportFile : workerReportFiles )
{
String workerReportName = workerReportFile.getName();
if ( actionForWorkerReports == ActionForWorkerReports.process_previous_failed ) {
if ( workerReportName.contains("failed") ) {
numWorkerReportsToBeHandled ++;
if ( processFailedWorkerReport(workerReportFile, workerReportName) )
numWorkerReportsHandled ++;
}
} else {
long lastModified = workerReportFile.lastModified();
if ( logger.isTraceEnabled() )
logger.trace("The workerReport \"" + workerReportName + "\" was last modified on: " + new Date(lastModified));
double elapsedDays = (double) (currentTime - lastModified) / daysDivisor;
if ( actionForWorkerReports == ActionForWorkerReports.process_current_failed ) {
if ( (elapsedDays >= daysToWaitBeforeProcessing) && (elapsedDays <= maxDaysToAllowProcessing)
&& workerReportName.contains("failed") ) {
numWorkerReportsToBeHandled ++;
if ( processFailedWorkerReport(workerReportFile, workerReportName) )
numWorkerReportsHandled ++;
}
} else { // Deletion..
if ( elapsedDays > daysToWaitBeforeDeletion ) {
// Enough time has passed, so this workerReport-file should be deleted.
logger.warn("The workerReport \"" + workerReportName + "\" was last modified " + df.format(elapsedDays) + " days ago (passed the " + daysToWaitBeforeDeletion + " days limit) and will be deleted.");
numWorkerReportsToBeHandled ++;
if ( fileUtils.deleteFile(workerReportFile.getAbsolutePath()) // Either successful or failed.
&& !workerReportName.contains("successful") // If this report has failed or its state is unknown (it was never renamed), then delete the assignment-records. For the successful ones, the records have already been deleted.
&& extractAssignmentsCounterAndDeleteRelatedAssignmentRecords(workerReportName) )
numWorkerReportsHandled ++;
}
}
}
}// end reports loop
}// end sub-dirs loop
String initMsg = "The action \"" + actionForWorkerReports.toString() + "\" was applied to " + numWorkerReportsHandled + " workerReports.";
int numWorkerReportsFailedToBeHandled = (numWorkerReportsToBeHandled - numWorkerReportsHandled);
if ( numWorkerReportsFailedToBeHandled > 0 )
logger.warn(initMsg + " " + numWorkerReportsFailedToBeHandled + " workerReports failed to be handled!");
else
logger.debug(initMsg);
} catch (Exception e) {
logger.error("Error when inspecting the workerReports!", e);
}
}
private File[] getWorkerReportSubDirs() throws Exception
{
File workerReportsDir = new File(workerReportsDirPath);
if ( !workerReportsDir.isDirectory() ) {
logger.error("The \"workerReportsDir\" (" + workerReportsDirPath + ") does not exist!"); // This base dir should always exist!
return null;
}
// The worker reports are inside "worker_X" sub-dirs.
File[] workerReportSubDirs = workerReportsDir.listFiles(File::isDirectory);
if ( workerReportSubDirs == null ) {
logger.error("There was an error when getting the subDirs of \"workerReportsDir\": " + workerReportsDir);
return null;
} else if ( workerReportSubDirs.length == 0 ) { // The worker_X sub-dirs do not normally get deleted. So this is a warning that either no workers are running (wasting time) or that something bad happened to these directories.
logger.warn("The \"workerReportsDir\" is empty. None of the workers returned a workerReport, so there is nothing to process.");
return null;
} else
return workerReportSubDirs;
}
private boolean processFailedWorkerReport(File workerReportFile, String workerReportName)
{
logger.debug("Going to load and parse the failed workerReport: " + workerReportName);
// Load the file's json content into a "WorkerReport" object.
try ( BufferedReader bfRead = new BufferedReader(new FileReader(workerReportFile), FileUtils.halfMb) ) {
String line;
while ( (line = bfRead.readLine()) != null ) // The line, without any line-termination-characters.
jsonStringBuilder.append(line).append(GenericUtils.endOfLine);
} catch (Exception e) {
logger.error("Problem when acquiring the contents of workerReport \"" + workerReportName + "\"!", e);
jsonStringBuilder.setLength(0); // Reset the StringBuilder without de-allocating.
// This failed report will be retried by the scheduler, inside the allowed time-period.
// If it never reached success, then its file and the assignment-records will be deleted by the scheduler in the following days.
return false;
}
WorkerReport workerReport = null;
try {
workerReport = gson.fromJson(jsonStringBuilder.toString(), WorkerReport.class);
} catch (JsonSyntaxException jse) {
logger.error("Problem when parsing the workerReport \"" + workerReportName + "\": " + jse.getMessage());
// Same handling as mentioned in the above "catch".
return false;
} finally {
jsonStringBuilder.setLength(0); // Reset the StringBuilder without de-allocating.
}
this.urlsController.addWorkerReport(workerReport); // This will check and add the workerReport to the background jobs' scheduler.
return true;
}
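// The workerReport-filenames contain the assignments-counter right before the "_report" part (with an optional state-suffix like "failed" or "successful" before ".json");
// that counter is extracted with the regex below, in order to delete the related assignment-records from the database.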
private static final Pattern ASSIGNMENTS_COUNTER_REPORT_FILTER = Pattern.compile(".*?(\\d+)_report[\\w]*\\.json$"); // The reluctant ".*?" makes the group capture the whole counter (a greedy ".*" would leave only its last digit), and the dot before "json" is escaped.
private boolean extractAssignmentsCounterAndDeleteRelatedAssignmentRecords(String workerReportName)
{
// We need to delete the records from the "assignment" table, in order for them to be retried in the future.
// Take the counter from workerReportName.
Matcher matcher = ASSIGNMENTS_COUNTER_REPORT_FILTER.matcher(workerReportName);
if ( ! matcher.matches() ) {
logger.error("Could not match the report \"" + workerReportName + "\" with regex: " + ASSIGNMENTS_COUNTER_REPORT_FILTER);
return false;
}
String counterString = matcher.group(1);
if ( (counterString == null) || counterString.isEmpty() ) {
logger.error("Could not extract the \"assignmentCounter\" from report: " + workerReportName);
return false;
}
int curReportAssignmentsCounter;
try {
curReportAssignmentsCounter = Integer.parseInt(counterString);
} catch (NumberFormatException nfe) {
logger.error("Could not parse the \"curReportAssignmentsCounter\" (" + counterString + ") which was extracted from report: " + workerReportName);
return false;
}
logger.debug("Will delete the assignments of the old, not-successful, workerReport: " + workerReportName);
DatabaseConnector.databaseLock.lock();
urlsService.deleteAssignmentsBatch(curReportAssignmentsCounter); // Any error-log is written inside.
DatabaseConnector.databaseLock.unlock();
return true;
}
}