From 0f4b63c4a96194af8e7a6e4fb441792af4c33a9a Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Fri, 23 Jun 2023 15:22:26 +0300 Subject: [PATCH] Expose the following statistics as prometheus-metrics and create/update a stats-endpoint for each one: - "numOfPayloadsAggregatedByServiceThroughCrawling" - "numOfPayloadsAggregatedByServiceThroughBulkImport" - "numOfPayloadsAggregatedByService" - "numOfLegacyPayloads" - "numOfRecordsInspectedByServiceThroughCrawling" (renamed from "numOfInspectedRecords") --- README.md | 12 ++- .../components/ScheduledTasks.java | 39 +++++++-- .../controllers/StatsController.java | 81 +++++++++++++++---- .../services/StatsService.java | 2 +- .../services/StatsServiceImpl.java | 4 +- 5 files changed, 111 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index e58b7a0..da1c121 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,18 @@ For interacting with the database we use [**Impala**](https://impala.apache.org/ **Statistics API**: - "**getNumberOfAllPayloads**" endpoint: **http://\:\/api/stats/getNumberOfAllPayloads**
This endpoint returns the total number of payloads existing in the database, independently of the way they were aggregated. This includes the payloads created by other pieces of software, before the PDF-Aggregation-Service was created. +- "**getNumberOfPayloadsAggregatedByServiceThroughCrawling**" endpoint: **http://\:\/api/stats/getNumberOfPayloadsAggregatedByServiceThroughCrawling**
+ This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through crawling. +- "**getNumberOfPayloadsAggregatedByServiceThroughBulkImport**" endpoint: **http://\:\/api/stats/getNumberOfPayloadsAggregatedByServiceThroughBulkImport**
+ This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through bulk-import procedures, from compatible datasources. - "**getNumberOfPayloadsAggregatedByService**" endpoint: **http://\:\/api/stats/getNumberOfPayloadsAggregatedByService**
- This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself. It excludes the payloads aggregated by other methods, by applying a Date-filter for the records created in 2021 or later. + This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, both through crawling and bulk-import procedures. +- "**getNumberOfLegacyPayloads**" endpoint: **http://\:\/api/stats/getNumberOfLegacyPayloads**
+ This endpoint returns the number of payloads which were aggregated by methods other thant the PDF Aggregation Service. - "**getNumberOfPayloadsForDatasource**" endpoint: **http://\:\/api/stats/getNumberOfPayloadsForDatasource?datasourceId=\**
This endpoint returns the number of payloads which belong to the datasource specified by the given datasourceID. -- "**getNumberOfRecordsInspected**" endpoint: **http://\:\/api/stats/getNumberOfRecordsInspected**
- This endpoint returns the number of records inspected by the PDF-Aggregation-Service. +- "**getNumberOfRecordsInspectedByServiceThroughCrawling**" endpoint: **http://\:\/api/stats/getNumberOfRecordsInspectedByServiceThroughCrawling**
+ This endpoint returns the number of records inspected by the PDF-Aggregation-Service through crawling.

diff --git a/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java b/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java index 20b248a..fdbfb76 100644 --- a/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java +++ b/src/main/java/eu/openaire/urls_controller/components/ScheduledTasks.java @@ -40,8 +40,13 @@ public class ScheduledTasks { private final String workerReportsDirPath; - AtomicInteger numOfAllPayloads = new AtomicInteger(0); - AtomicInteger numOfInspectedRecords = new AtomicInteger(0); + public static final AtomicInteger numOfAllPayloads = new AtomicInteger(0); + public static final AtomicInteger numOfPayloadsAggregatedByServiceThroughCrawling = new AtomicInteger(0); + public static final AtomicInteger numOfPayloadsAggregatedByServiceThroughBulkImport = new AtomicInteger(0); + public static final AtomicInteger numOfPayloadsAggregatedByService = new AtomicInteger(0); + public static final AtomicInteger numOfLegacyPayloads = new AtomicInteger(0); + + public static final AtomicInteger numOfRecordsInspectedByServiceThroughCrawling = new AtomicInteger(0); public ScheduledTasks(@Value("${services.pdfaggregation.controller.workerReportsDirPath}") String workerReportsDirPath, StatsController statsController, MeterRegistry registry) @@ -54,7 +59,11 @@ public class ScheduledTasks { this.statsController = statsController; registry.gauge("numOfAllPayloads", numOfAllPayloads); - registry.gauge("numOfInspectedRecords", numOfInspectedRecords); + registry.gauge("numOfPayloadsAggregatedByServiceThroughCrawling", numOfPayloadsAggregatedByServiceThroughCrawling); + registry.gauge("numOfPayloadsAggregatedByServiceThroughBulkImport", numOfPayloadsAggregatedByServiceThroughBulkImport); + registry.gauge("numOfPayloadsAggregatedByService", numOfPayloadsAggregatedByService); + registry.gauge("numOfLegacyPayloads", numOfLegacyPayloads); + registry.gauge("numOfRecordsInspectedByServiceThroughCrawling", numOfRecordsInspectedByServiceThroughCrawling); } @@ -179,9 +188,29 @@ public class ScheduledTasks { numOfAllPayloads.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) } // Any error is already logged. - responseEntity = statsController.getNumberOfRecordsInspected(true); + responseEntity = statsController.getNumberOfPayloadsAggregatedByServiceThroughCrawling(true); if ( responseEntity.getStatusCode().value() == 200 ) { - numOfInspectedRecords.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) + numOfPayloadsAggregatedByServiceThroughCrawling.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) + } // Any error is already logged. + + responseEntity = statsController.getNumberOfPayloadsAggregatedByServiceThroughBulkImport(true); + if ( responseEntity.getStatusCode().value() == 200 ) { + numOfPayloadsAggregatedByServiceThroughBulkImport.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) + } // Any error is already logged. + + responseEntity = statsController.getNumberOfPayloadsAggregatedByService(true); + if ( responseEntity.getStatusCode().value() == 200 ) { + numOfPayloadsAggregatedByService.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) + } // Any error is already logged. + + responseEntity = statsController.getNumberOfLegacyPayloads(true); + if ( responseEntity.getStatusCode().value() == 200 ) { + numOfLegacyPayloads.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) + } // Any error is already logged. + + responseEntity = statsController.getNumberOfRecordsInspectedByServiceThroughCrawling(true); + if ( responseEntity.getStatusCode().value() == 200 ) { + numOfRecordsInspectedByServiceThroughCrawling.set(Integer.valueOf(responseEntity.getBody().toString())); // (any other cast method fails) } // Any error is already logged. diff --git a/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java b/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java index 2bb3b0d..4ff5cf6 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java @@ -12,6 +12,7 @@ import org.springframework.web.bind.annotation.RequestMapping; import org.springframework.web.bind.annotation.RequestParam; import org.springframework.web.bind.annotation.RestController; + /** * This controller returns statistics for the database. */ @@ -35,20 +36,68 @@ public class StatsController { if ( ! isCalledFromScheduler ) logger.info("Received a \"getNumberOfAllPayloads\" request."); - final String getPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload"; - return statsService.getNumberOfPayloads(getPayloadsNumberQuery, "payloads"); + final String getAllPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload"; + return statsService.getNumberOfPayloads(getAllPayloadsNumberQuery, "all payloads"); } /** - * This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself. - * It excludes the payloads aggregated by other methods, by applying a Date-filter for the records created in 2021 or later. + * This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through crawling. + * */ + @GetMapping("getNumberOfPayloadsAggregatedByServiceThroughCrawling") + public ResponseEntity getNumberOfPayloadsAggregatedByServiceThroughCrawling(boolean isCalledFromScheduler) + { + if ( ! isCalledFromScheduler ) + logger.info("Received a \"getNumberOfPayloadsAggregatedByServiceThroughCrawling\" request."); + + String getNumOfPayloadsAggregatedByServiceThroughCrawlingQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_aggregated"; + return statsService.getNumberOfPayloads(getNumOfPayloadsAggregatedByServiceThroughCrawlingQuery, "payloads aggregated by the Service through crawling"); + } + + + /** + * This endpoint returns the number of payloads aggregated by this Service, through BulkImport procedures with compatible datasources.. + * */ + @GetMapping("getNumberOfPayloadsAggregatedByServiceThroughBulkImport") + public ResponseEntity getNumberOfPayloadsAggregatedByServiceThroughBulkImport(boolean isCalledFromScheduler) + { + if ( ! isCalledFromScheduler ) + logger.info("Received a \"getNumberOfPayloadsAggregatedByServiceThroughBulkImport\" request."); + + String getNumOfPayloadsAggregatedByServiceThroughBulkImportQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_bulk_import"; + return statsService.getNumberOfPayloads(getNumOfPayloadsAggregatedByServiceThroughBulkImportQuery, "payloads aggregated by the Service through BulkImport procedures"); + } + + + /** + * This endpoint returns the number of payloads aggregated by the PDF-Aggregated-Service itself, through crawling AND bulk-import procedures. * */ @GetMapping("getNumberOfPayloadsAggregatedByService") - public ResponseEntity getNumberOfPayloadsAggregatedByService() { - logger.info("Received a \"getNumberOfPayloadsAggregatedByService\" request."); - String getPayloadsAggregatedQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_aggregated"; - return statsService.getNumberOfPayloads(getPayloadsAggregatedQuery, "payloads retrieved by the PDF Aggregation Service"); + public ResponseEntity getNumberOfPayloadsAggregatedByService(boolean isCalledFromScheduler) + { + if ( ! isCalledFromScheduler ) + logger.info("Received a \"getNumberOfPayloadsAggregatedByService\" request."); + + String getNumOfPayloadsAggregatedByServiceQuery = "select count(id) from\n" + + " (select id from " + ImpalaConnector.databaseName + ".payload_aggregated\n" + + " union all\n" + + " select id from " + ImpalaConnector.databaseName + ".payload_bulk_import)\n" + + " as payloads_from_service"; + return statsService.getNumberOfPayloads(getNumOfPayloadsAggregatedByServiceQuery, "payloads aggregated by the Service, through both crawling and bulk-import procedures"); + } + + + /** + * This endpoint returns the number of legacy payloads, which were aggregated by methods other thant the PDF Aggregation Service. + * */ + @GetMapping("getNumberOfLegacyPayloads") + public ResponseEntity getNumberOfLegacyPayloads(boolean isCalledFromScheduler) + { + if ( ! isCalledFromScheduler ) + logger.info("Received a \"getNumberOfLegacyPayloads\" request."); + + String getNumOfLegacyPayloadsQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload_legacy"; + return statsService.getNumberOfPayloads(getNumOfLegacyPayloadsQuery, "legacy payloads"); } @@ -58,14 +107,14 @@ public class StatsController { @GetMapping("getNumberOfPayloadsForDatasource") public ResponseEntity getNumberOfPayloadsForDatasource(@RequestParam String datasourceId) { logger.info("Received a \"getNumberOfPayloadsForDatasource\" request."); - final String getPayloadsNumberForDatasourceQuery = + final String getNumOfPayloadsForDatasourceQuery = "select count(p.id) from " + ImpalaConnector.databaseName + ".payload p\n" + " join " + ImpalaConnector.databaseName + ".publication pu on pu.id=p.id and pu.datasourceid=\"" + datasourceId + "\""; if ( logger.isTraceEnabled() ) - logger.trace("getPayloadsNumberForDatasourceQuery:\n" + getPayloadsNumberForDatasourceQuery); + logger.trace("getNumOfPayloadsForDatasourceQuery:\n" + getNumOfPayloadsForDatasourceQuery); - return statsService.getNumberOfPayloads(getPayloadsNumberForDatasourceQuery, "payloads related to datasourceId \"" + datasourceId + "\""); + return statsService.getNumberOfPayloads(getNumOfPayloadsForDatasourceQuery, "payloads related to datasourceId \"" + datasourceId + "\""); } @@ -108,15 +157,15 @@ public class StatsController { /** - * This endpoint returns the number of records inspected by the PDF-Aggregation-Service. + * This endpoint returns the number of records inspected by the PDF-Aggregation-Service, through crawling. * */ - @GetMapping("getNumberOfRecordsInspected") - public ResponseEntity getNumberOfRecordsInspected(boolean isCalledFromScheduler) + @GetMapping("getNumberOfRecordsInspectedByServiceThroughCrawling") + public ResponseEntity getNumberOfRecordsInspectedByServiceThroughCrawling(boolean isCalledFromScheduler) { if ( ! isCalledFromScheduler ) - logger.info("Received a \"getNumberOfRecordsInspected\" request."); + logger.info("Received a \"getNumberOfRecordsInspectedByServiceThroughCrawling\" request."); - return statsService.getNumberOfRecordsInspected(); + return statsService.getNumberOfRecordsInspectedByServiceThroughCrawling(); } } diff --git a/src/main/java/eu/openaire/urls_controller/services/StatsService.java b/src/main/java/eu/openaire/urls_controller/services/StatsService.java index e0be0ea..fe669b2 100644 --- a/src/main/java/eu/openaire/urls_controller/services/StatsService.java +++ b/src/main/java/eu/openaire/urls_controller/services/StatsService.java @@ -6,6 +6,6 @@ public interface StatsService { ResponseEntity getNumberOfPayloads(String getPayloadsNumberQuery, String extraMsg); - ResponseEntity getNumberOfRecordsInspected(); + ResponseEntity getNumberOfRecordsInspectedByServiceThroughCrawling(); } diff --git a/src/main/java/eu/openaire/urls_controller/services/StatsServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/StatsServiceImpl.java index 2e9c19f..ebfca8b 100644 --- a/src/main/java/eu/openaire/urls_controller/services/StatsServiceImpl.java +++ b/src/main/java/eu/openaire/urls_controller/services/StatsServiceImpl.java @@ -41,7 +41,7 @@ public class StatsServiceImpl implements StatsService { } - public ResponseEntity getNumberOfRecordsInspected() + public ResponseEntity getNumberOfRecordsInspectedByServiceThroughCrawling() { // Note that until all the records are inspected, the "attempt" table contains all the inspected records PLUS very few duplicates (id-url) which come from the publications-database. // After all the records are inspected, it contains duplicate records of more and more id-urls, as time goes on, since for every eligible record the Service re-attempts to get the full-text. @@ -53,7 +53,7 @@ public class StatsServiceImpl implements StatsService { Object result = jdbcTemplate.queryForObject(getInspectedRecordsNumberQuery, Integer.class); if ( result != null ) { int numOfInspectedRecords = (int) result; - logger.info("Number of inspected records from the database \"" + ImpalaConnector.databaseName + "\" is " + numOfInspectedRecords); + logger.info("Number of crawling-inspected records from the database \"" + ImpalaConnector.databaseName + "\" is " + numOfInspectedRecords); return new ResponseEntity<>(numOfInspectedRecords, HttpStatus.OK); } else return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The inspected records' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getInspectedRecordsNumberQuery: " + getInspectedRecordsNumberQuery);