Expose the following statistics as prometheus-metrics and create/update a stats-endpoint for each one:

- "numOfPayloadsAggregatedByServiceThroughCrawling"
 - "numOfPayloadsAggregatedByServiceThroughBulkImport"
 - "numOfPayloadsAggregatedByService"
 - "numOfLegacyPayloads"
 - "numOfRecordsInspectedByServiceThroughCrawling" (renamed from "numOfInspectedRecords")
This commit is contained in:
Lampros Smyrnaios 2023-06-23 15:22:26 +03:00
parent b6b1cb08b9
commit 0f4b63c4a9
5 changed files with 111 additions and 27 deletions

View File

@ -20,12 +20,18 @@ For interacting with the database we use [**Impala**](https://impala.apache.org/
**Statistics API**:
- "**getNumberOfAllPayloads**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfAllPayloads** <br>
This endpoint returns the total number of payloads existing in the database, independently of the way they were aggregated. This includes the payloads created by other pieces of software, before the PDF-Aggregation-Service was created.
- "**getNumberOfPayloadsAggregatedByServiceThroughCrawling**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfPayloadsAggregatedByServiceThroughCrawling** <br>
This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through crawling.
- "**getNumberOfPayloadsAggregatedByServiceThroughBulkImport**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfPayloadsAggregatedByServiceThroughBulkImport** <br>
This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through bulk-import procedures, from compatible datasources.
- "**getNumberOfPayloadsAggregatedByService**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfPayloadsAggregatedByService** <br>
This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself. It excludes the payloads aggregated by other methods, by applying a Date-filter for the records created in 2021 or later.
This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, both through crawling and bulk-import procedures.
- "**getNumberOfLegacyPayloads**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfLegacyPayloads** <br>
This endpoint returns the number of payloads which were aggregated by methods other thant the PDF Aggregation Service.
- "**getNumberOfPayloadsForDatasource**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfPayloadsForDatasource?datasourceId=\<givenDatasourceId\>** <br>
This endpoint returns the number of payloads which belong to the datasource specified by the given datasourceID.
- "**getNumberOfRecordsInspected**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfRecordsInspected** <br>
This endpoint returns the number of records inspected by the PDF-Aggregation-Service.
- "**getNumberOfRecordsInspectedByServiceThroughCrawling**" endpoint: **http://\<IP\>:\<PORT\>/api/stats/getNumberOfRecordsInspectedByServiceThroughCrawling** <br>
This endpoint returns the number of records inspected by the PDF-Aggregation-Service through crawling.
<br>
<br>

View File

@ -40,8 +40,13 @@ public class ScheduledTasks {
private final String workerReportsDirPath;
AtomicInteger numOfAllPayloads = new AtomicInteger(0);
AtomicInteger numOfInspectedRecords = new AtomicInteger(0);
public static final AtomicInteger numOfAllPayloads = new AtomicInteger(0);
public static final AtomicInteger numOfPayloadsAggregatedByServiceThroughCrawling = new AtomicInteger(0);
public static final AtomicInteger numOfPayloadsAggregatedByServiceThroughBulkImport = new AtomicInteger(0);
public static final AtomicInteger numOfPayloadsAggregatedByService = new AtomicInteger(0);
public static final AtomicInteger numOfLegacyPayloads = new AtomicInteger(0);
public static final AtomicInteger numOfRecordsInspectedByServiceThroughCrawling = new AtomicInteger(0);
public ScheduledTasks(@Value("${services.pdfaggregation.controller.workerReportsDirPath}") String workerReportsDirPath, StatsController statsController, MeterRegistry registry)
@ -54,7 +59,11 @@ public class ScheduledTasks {
this.statsController = statsController;
registry.gauge("numOfAllPayloads", numOfAllPayloads);
registry.gauge("numOfInspectedRecords", numOfInspectedRecords);
registry.gauge("numOfPayloadsAggregatedByServiceThroughCrawling", numOfPayloadsAggregatedByServiceThroughCrawling);
registry.gauge("numOfPayloadsAggregatedByServiceThroughBulkImport", numOfPayloadsAggregatedByServiceThroughBulkImport);
registry.gauge("numOfPayloadsAggregatedByService", numOfPayloadsAggregatedByService);
registry.gauge("numOfLegacyPayloads", numOfLegacyPayloads);
registry.gauge("numOfRecordsInspectedByServiceThroughCrawling", numOfRecordsInspectedByServiceThroughCrawling);
}
@ -179,9 +188,29 @@ public class ScheduledTasks {
numOfAllPayloads.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfRecordsInspected(true);
responseEntity = statsController.getNumberOfPayloadsAggregatedByServiceThroughCrawling(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfInspectedRecords.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
numOfPayloadsAggregatedByServiceThroughCrawling.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfPayloadsAggregatedByServiceThroughBulkImport(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfPayloadsAggregatedByServiceThroughBulkImport.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfPayloadsAggregatedByService(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfPayloadsAggregatedByService.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfLegacyPayloads(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfLegacyPayloads.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.
responseEntity = statsController.getNumberOfRecordsInspectedByServiceThroughCrawling(true);
if ( responseEntity.getStatusCode().value() == 200 ) {
numOfRecordsInspectedByServiceThroughCrawling.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails)
} // Any error is already logged.

View File

@ -12,6 +12,7 @@ import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RequestParam;
import org.springframework.web.bind.annotation.RestController;
/**
* This controller returns statistics for the database.
*/
@ -35,20 +36,68 @@ public class StatsController {
if ( ! isCalledFromScheduler )
logger.info("Received a \"getNumberOfAllPayloads\" request.");
final String getPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload";
return statsService.getNumberOfPayloads(getPayloadsNumberQuery, "payloads");
final String getAllPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload";
return statsService.getNumberOfPayloads(getAllPayloadsNumberQuery, "all payloads");
}
/**
* This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself.
* It excludes the payloads aggregated by other methods, by applying a Date-filter for the records created in 2021 or later.
* This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through crawling.
* */
@GetMapping("getNumberOfPayloadsAggregatedByServiceThroughCrawling")
public ResponseEntity<?> getNumberOfPayloadsAggregatedByServiceThroughCrawling(boolean isCalledFromScheduler)
{
if ( ! isCalledFromScheduler )
logger.info("Received a \"getNumberOfPayloadsAggregatedByServiceThroughCrawling\" request.");
String getNumOfPayloadsAggregatedByServiceThroughCrawlingQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_aggregated";
return statsService.getNumberOfPayloads(getNumOfPayloadsAggregatedByServiceThroughCrawlingQuery, "payloads aggregated by the Service through crawling");
}
/**
* This endpoint returns the number of payloads aggregated by this Service, through BulkImport procedures with compatible datasources..
* */
@GetMapping("getNumberOfPayloadsAggregatedByServiceThroughBulkImport")
public ResponseEntity<?> getNumberOfPayloadsAggregatedByServiceThroughBulkImport(boolean isCalledFromScheduler)
{
if ( ! isCalledFromScheduler )
logger.info("Received a \"getNumberOfPayloadsAggregatedByServiceThroughBulkImport\" request.");
String getNumOfPayloadsAggregatedByServiceThroughBulkImportQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_bulk_import";
return statsService.getNumberOfPayloads(getNumOfPayloadsAggregatedByServiceThroughBulkImportQuery, "payloads aggregated by the Service through BulkImport procedures");
}
/**
* This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through crawling AND bulk-import procedures.
* */
@GetMapping("getNumberOfPayloadsAggregatedByService")
public ResponseEntity<?> getNumberOfPayloadsAggregatedByService() {
logger.info("Received a \"getNumberOfPayloadsAggregatedByService\" request.");
String getPayloadsAggregatedQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_aggregated";
return statsService.getNumberOfPayloads(getPayloadsAggregatedQuery, "payloads retrieved by the PDF Aggregation Service");
public ResponseEntity<?> getNumberOfPayloadsAggregatedByService(boolean isCalledFromScheduler)
{
if ( ! isCalledFromScheduler )
logger.info("Received a \"getNumberOfPayloadsAggregatedByService\" request.");
String getNumOfPayloadsAggregatedByServiceQuery = "select count(id) from\n" +
" (select id from " + ImpalaConnector.databaseName + ".payload_aggregated\n" +
" union all\n" +
" select id from " + ImpalaConnector.databaseName + ".payload_bulk_import)\n" +
" as payloads_from_service";
return statsService.getNumberOfPayloads(getNumOfPayloadsAggregatedByServiceQuery, "payloads aggregated by the Service, through both crawling and bulk-import procedures");
}
/**
* This endpoint returns the number of legacy payloads, which were aggregated by methods other thant the PDF Aggregation Service.
* */
@GetMapping("getNumberOfLegacyPayloads")
public ResponseEntity<?> getNumberOfLegacyPayloads(boolean isCalledFromScheduler)
{
if ( ! isCalledFromScheduler )
logger.info("Received a \"getNumberOfLegacyPayloads\" request.");
String getNumOfLegacyPayloadsQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_legacy";
return statsService.getNumberOfPayloads(getNumOfLegacyPayloadsQuery, "legacy payloads");
}
@ -58,14 +107,14 @@ public class StatsController {
@GetMapping("getNumberOfPayloadsForDatasource")
public ResponseEntity<?> getNumberOfPayloadsForDatasource(@RequestParam String datasourceId) {
logger.info("Received a \"getNumberOfPayloadsForDatasource\" request.");
final String getPayloadsNumberForDatasourceQuery =
final String getNumOfPayloadsForDatasourceQuery =
"select count(p.id) from " + ImpalaConnector.databaseName + ".payload p\n" +
" join " + ImpalaConnector.databaseName + ".publication pu on pu.id=p.id and pu.datasourceid=\"" + datasourceId + "\"";
if ( logger.isTraceEnabled() )
logger.trace("getPayloadsNumberForDatasourceQuery:\n" + getPayloadsNumberForDatasourceQuery);
logger.trace("getNumOfPayloadsForDatasourceQuery:\n" + getNumOfPayloadsForDatasourceQuery);
return statsService.getNumberOfPayloads(getPayloadsNumberForDatasourceQuery, "payloads related to datasourceId \"" + datasourceId + "\"");
return statsService.getNumberOfPayloads(getNumOfPayloadsForDatasourceQuery, "payloads related to datasourceId \"" + datasourceId + "\"");
}
@ -108,15 +157,15 @@ public class StatsController {
/**
* This endpoint returns the number of records inspected by the PDF-Aggregation-Service.
* This endpoint returns the number of records inspected by the PDF-Aggregation-Service, through crawling.
* */
@GetMapping("getNumberOfRecordsInspected")
public ResponseEntity<?> getNumberOfRecordsInspected(boolean isCalledFromScheduler)
@GetMapping("getNumberOfRecordsInspectedByServiceThroughCrawling")
public ResponseEntity<?> getNumberOfRecordsInspectedByServiceThroughCrawling(boolean isCalledFromScheduler)
{
if ( ! isCalledFromScheduler )
logger.info("Received a \"getNumberOfRecordsInspected\" request.");
logger.info("Received a \"getNumberOfRecordsInspectedByServiceThroughCrawling\" request.");
return statsService.getNumberOfRecordsInspected();
return statsService.getNumberOfRecordsInspectedByServiceThroughCrawling();
}
}

View File

@ -6,6 +6,6 @@ public interface StatsService {
ResponseEntity<?> getNumberOfPayloads(String getPayloadsNumberQuery, String extraMsg);
ResponseEntity<?> getNumberOfRecordsInspected();
ResponseEntity<?> getNumberOfRecordsInspectedByServiceThroughCrawling();
}

View File

@ -41,7 +41,7 @@ public class StatsServiceImpl implements StatsService {
}
public ResponseEntity<?> getNumberOfRecordsInspected()
public ResponseEntity<?> getNumberOfRecordsInspectedByServiceThroughCrawling()
{
// Note that until all the records are inspected, the "attempt" table contains all the inspected records PLUS very few duplicates (id-url) which come from the publications-database.
// After all the records are inspected, it contains duplicate records of more and more id-urls, as time goes on, since for every eligible record the Service re-attempts to get the full-text.
@ -53,7 +53,7 @@ public class StatsServiceImpl implements StatsService {
Object result = jdbcTemplate.queryForObject(getInspectedRecordsNumberQuery, Integer.class);
if ( result != null ) {
int numOfInspectedRecords = (int) result;
logger.info("Number of inspected records from the database \"" + ImpalaConnector.databaseName + "\" is " + numOfInspectedRecords);
logger.info("Number of crawling-inspected records from the database \"" + ImpalaConnector.databaseName + "\" is " + numOfInspectedRecords);
return new ResponseEntity<>(numOfInspectedRecords, HttpStatus.OK);
} else
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The inspected records' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getInspectedRecordsNumberQuery: " + getInspectedRecordsNumberQuery);