Add new Prometheus metrics:

- averageFulltextsTransferSizeOfWorkerReports
- averageSuccessPercentageOfWorkerReports
Lampros Smyrnaios 2024-06-14 17:27:52 +03:00
parent 7e7fc35d1e
commit ab18ac5ff8
4 changed files with 30 additions and 1 deletion

View File

@@ -80,6 +80,8 @@ Note: The Shutdown Service API is accessible by the Controller's host machine.
- "**numOfPayloadsAggregatedByService**"
- "**numOfLegacyPayloads**"
- "**numOfRecordsInspectedByServiceThroughCrawling**"
- "**averageFulltextsTransferSizeOfWorkerReports**"
- "**averageSuccessPercentageOfWorkerReports**"
- "**getAssignments_time_seconds_max**": Time taken to return the assignments.
- "**addWorkerReport_time_seconds**": Time taken to add the WorkerReport.
<br>

View File

@@ -1,5 +1,6 @@
package eu.openaire.urls_controller.components;
import com.google.common.util.concurrent.AtomicDouble;
import com.google.gson.Gson;
import com.google.gson.JsonSyntaxException;
import eu.openaire.urls_controller.UrlsControllerApplication;
@@ -31,6 +32,7 @@ import java.util.concurrent.CancellationException;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.Future;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@@ -68,6 +70,12 @@ public class ScheduledTasks {
public static final AtomicInteger numOfRecordsInspectedByServiceThroughCrawling = new AtomicInteger(0);
public static final AtomicLong totalFulltextWorkerReportSize = new AtomicLong(0); // This is only useful for computing the "averageSize", since this value resets to ZERO upon restart (and we do not care about getting the SUM of the sizes of each fulltext from the DB).
public static final AtomicLong averageFulltextsTransferSizeOfWorkerReports = new AtomicLong(0);
public static final AtomicDouble totalSuccessPercentagesOfWorkerReports = new AtomicDouble(0);
public static final AtomicDouble averageSuccessPercentageOfWorkerReports = new AtomicDouble(0);
public ScheduledTasks(@Value("${services.pdfaggregation.controller.workerReportsDirPath}") String workerReportsDirPath, StatsController statsController, UrlsController urlsController, MeterRegistry registry)
{
@@ -84,6 +92,8 @@ public class ScheduledTasks {
registry.gauge("numOfPayloadsAggregatedByService", numOfPayloadsAggregatedByService);
registry.gauge("numOfLegacyPayloads", numOfLegacyPayloads);
registry.gauge("numOfRecordsInspectedByServiceThroughCrawling", numOfRecordsInspectedByServiceThroughCrawling);
registry.gauge("averageFulltextsTransferSizeOfWorkerReports", averageFulltextsTransferSizeOfWorkerReports);
registry.gauge("averageSuccessPercentageOfWorkerReports", averageSuccessPercentageOfWorkerReports);
}
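For context on why these counters live in static fields and are registered once in the constructor: Micrometer's `registry.gauge(name, number)` keeps only a weak reference to the supplied `Number` and reads its current value at scrape time, so the application must hold a strong reference and simply update it. A minimal, self-contained sketch (assuming micrometer-core and Guava on the classpath; the class and field names are illustrative, not the project's):

```java
import com.google.common.util.concurrent.AtomicDouble;
import io.micrometer.core.instrument.MeterRegistry;
import io.micrometer.core.instrument.simple.SimpleMeterRegistry;

public class GaugeRegistrationSketch {
    // Kept in a field: Micrometer only weak-references the Number, so something must hold it alive.
    static final AtomicDouble avgSuccess = new AtomicDouble(0);

    public static void main(String[] args) {
        MeterRegistry registry = new SimpleMeterRegistry();
        registry.gauge("averageSuccessPercentageOfWorkerReports", avgSuccess);

        avgSuccess.set(87.5); // e.g. updated later by a scheduled task
        // The gauge reflects the latest value of the tracked Number at read/scrape time.
        double value = registry.get("averageSuccessPercentageOfWorkerReports").gauge().value();
        System.out.println(value); // 87.5
    }
}
```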
@@ -300,6 +310,16 @@ public class ScheduledTasks {
numOfRecordsInspectedByServiceThroughCrawling.set(Integer.parseInt(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
// Record the batch size to know if the zstd files transferred are getting larger or smaller over time.
long numWorkerReportsInt = UrlsServiceImpl.numOfWorkerReportsProcessed.get();
if ( numWorkerReportsInt > 0 ) {
averageFulltextsTransferSizeOfWorkerReports.set(totalFulltextWorkerReportSize.get() / numWorkerReportsInt);
averageSuccessPercentageOfWorkerReports.set(totalSuccessPercentagesOfWorkerReports.get() / numWorkerReportsInt); // The result will still be a percentage <= 100.
}
logger.info("averageFulltextsTransferSizeOfWorkerReports = " + (averageFulltextsTransferSizeOfWorkerReports.get() / 1048576) + " MB"); // Divide by (1024 * 1024) to get the value in MB.
logger.info("averageSuccessPercentageOfWorkerReports = " + df.format(averageSuccessPercentageOfWorkerReports.get()) + " %");
// The above two metrics are not very accurate for a small number of workerReports, since, at the time of assignments, the "numWorkerReportsInt" may include 1-2 workerReports which have not yet reached the stage of calculating their percentage or receiving their fulltexts from the workers.
// Nevertheless, over time, after processing tens of workerReports, they show a near-accurate picture of the averages.
// TODO - Export more complex data; <numOfAllPayloadsPerDatasource>, <numOfAllPayloadsPerPublicationYear>,
// <numOfAggregatedPayloadsPerDatasource>, ..., <numOfBulkImportedPayloadsPerDatasource>, ...
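To make the running-average bookkeeping above concrete, here is a standalone sketch of the same pattern with illustrative names and made-up numbers: each processed workerReport adds its zstd-file size and its success percentage to a running total, and the gauges are refreshed by dividing those totals by the number of reports processed in the current run (the totals reset to zero on restart, as the comments above note).

```java
import com.google.common.util.concurrent.AtomicDouble;
import java.util.concurrent.atomic.AtomicLong;

public class RunningAverageSketch {
    static final AtomicLong totalFulltextReportSize = new AtomicLong(0);
    static final AtomicDouble totalSuccessPercentages = new AtomicDouble(0);
    static final AtomicLong reportsProcessed = new AtomicLong(0);

    // Called once per processed workerReport.
    static void onWorkerReport(long zstdBytes, double successPercentage) {
        totalFulltextReportSize.addAndGet(zstdBytes);
        totalSuccessPercentages.addAndGet(successPercentage);
        reportsProcessed.incrementAndGet();
    }

    public static void main(String[] args) {
        onWorkerReport(3_145_728, 40.0);   // 3 MB report, 40 % success
        onWorkerReport(5_242_880, 60.0);   // 5 MB report, 60 % success
        long n = reportsProcessed.get();
        System.out.println("avg transfer size = " + (totalFulltextReportSize.get() / n / 1048576) + " MB"); // 4 MB
        System.out.println("avg success       = " + (totalSuccessPercentages.get() / n) + " %");            // 50.0 %
    }
}
```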

View File

@@ -2,6 +2,7 @@ package eu.openaire.urls_controller.util;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.SetMultimap;
import eu.openaire.urls_controller.components.ScheduledTasks;
import eu.openaire.urls_controller.configuration.DatabaseConnector;
import eu.openaire.urls_controller.models.Error;
import eu.openaire.urls_controller.models.Payload;
@@ -301,13 +302,16 @@ public class FileUtils {
public boolean saveArchive(InputStream inputStream, File zstdFile)
{
long numBytesOfZstdFile = 0;
try ( BufferedInputStream inStream = new BufferedInputStream(inputStream, tenMb);
BufferedOutputStream outStream = new BufferedOutputStream(Files.newOutputStream(zstdFile.toPath()), tenMb) )
{
int readBytes;
while ( (readBytes = inStream.read()) != -1 ) {
outStream.write(readBytes);
numBytesOfZstdFile++;
}
ScheduledTasks.totalFulltextWorkerReportSize.addAndGet(numBytesOfZstdFile); // Add this zstd file's size, in order to compute the average size per workerReport later.
return true;
} catch (Exception e) {
logger.error("Could not save the zstd file \"" + zstdFile.getName() + "\": " + e.getMessage(), e);

View File

@@ -1,6 +1,7 @@
package eu.openaire.urls_controller.util;
import com.google.common.collect.HashMultimap;
import eu.openaire.urls_controller.components.ScheduledTasks;
import eu.openaire.urls_controller.controllers.UrlsController;
import eu.openaire.urls_controller.models.Payload;
import eu.openaire.urls_controller.models.UrlReport;
@@ -199,7 +200,9 @@ public class FilesHandler {
return FileUtils.UploadFullTextsResponse.successful_without_fulltexts; // It was handled, no error.
}
logger.info("NumFullTextsFound by assignments_" + assignmentsBatchCounter + " = " + numValidFullTextsFound + " (out of " + sizeOfUrlReports + " | about " + df.format(numValidFullTextsFound * 100.0 / sizeOfUrlReports) + "%).");
double successPercentage = (numValidFullTextsFound * 100.0 / sizeOfUrlReports);
ScheduledTasks.totalSuccessPercentagesOfWorkerReports.addAndGet(successPercentage);
logger.info("NumFullTextsFound by assignments_" + assignmentsBatchCounter + " = " + numValidFullTextsFound + " (out of " + sizeOfUrlReports + " | about " + df.format(successPercentage) + "%).");
// TODO - Have a prometheus GAUGE to hold the value of the above percentage, so that we can track the success-rates over time.
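A small aside on the percentage arithmetic above: the `100.0` literal forces floating-point division, which keeps the fractional part that plain int arithmetic would truncate. A tiny self-contained illustration with made-up numbers:

```java
public class PercentageSketch {
    public static void main(String[] args) {
        int numValidFullTextsFound = 67, sizeOfUrlReports = 200;
        System.out.println(numValidFullTextsFound * 100 / sizeOfUrlReports);    // 33   (int division truncates)
        System.out.println(numValidFullTextsFound * 100.0 / sizeOfUrlReports);  // 33.5 (double division keeps the fraction)
    }
}
```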