- Refactor the payloads-statistics code and provide two endpoints: "getNumberOfPayloadsAggregatedByService", which returns the number of payloads aggregated only by the PDF-Aggregation-Service, and "getNumberOfAllPayloads", which returns the number of all payloads existing in the database, including the ones aggregated in the past by other pieces of software.

- Update README.md.
- Make sure the docker image is clean-built, by avoiding the use of cache.
This commit is contained in:
Lampros Smyrnaios 2023-02-02 17:58:47 +02:00
parent c9f33d3afa
commit 49fefefafd
3 changed files with 52 additions and 26 deletions

View File

@ -6,8 +6,12 @@ The database used is the [Impala](https://impala.apache.org/).<br>
<br>
**Statistics API**:
- "**getNumberOfPayloads**" endpoint: **http://IP:PORT/api/stats/getNumberOfPayloads**
- "**getNumberOfRecordsInspected**" endpoint: **http://IP:PORT/api/stats/getNumberOfRecordsInspected**
- "**getNumberOfAllPayloads**" endpoint: **http://<IP>:<PORT>/api/stats/getNumberOfAllPayloads** <br>
This endpoint returns the total number of payloads existing in the database, independently of the way they were aggregated. This includes the payloads created by other pieces of software, before the PDF-Aggregation-Service was created.
- "**getNumberOfPayloadsAggregatedByService**" endpoint: **http://<IP>:<PORT>/api/stats/getNumberOfPayloadsAggregatedByService** <br>
This endpoint returns the number of payloads aggregated by the PDF-Aggregation-Service itself. It excludes the payloads aggregated by other methods, by applying a Date-filter for the records created in 2021 or later.
- "**getNumberOfRecordsInspected**" endpoint: **http://<IP>:<PORT>/api/stats/getNumberOfRecordsInspected** <br>
This endpoint returns the number of records inspected by the PDF-Aggregation-Service.
<br>
<br>

View File

@ -53,7 +53,7 @@ if [[ justInstall -eq 0 ]]; then
echo -e "\nBuilding docker image..\n"
sudo docker --version || handle_error "Docker was not found!" 3
dockerImage=${username}"/urls_controller:latest"
sudo docker build -t "${dockerImage}" .
sudo docker build --no-cache -t "${dockerImage}" .
echo -e "\nPushing docker image.. (the account password is required, otherwise it will not be pushed, but it will continue to run)..\n"
(sudo docker login -u "${username}" && sudo docker push "${dockerImage}") || true
(sudo mkdir -p "$HOME"/tmp/config && sudo cp ./src/main/resources/application.properties "$HOME"/tmp/config) || true # This also replaces an existing "application.properties".

View File

@ -14,7 +14,7 @@ import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
/**
* This controller returns statistics for the database.
* This controller returns statistics for the database.
*/
@RestController
@RequestMapping("/stats")
@ -25,35 +25,37 @@ public class StatsController {
@Autowired
private JdbcTemplate jdbcTemplate;
@GetMapping("getNumberOfPayloads")
public ResponseEntity<?> getNumberOfPayloads()
{
logger.info("Received a \"getNumberOfPayloads\" request.");
/**
* This endpoint returns the total number of payloads existing in the database, independently of the way they were aggregated.
* This includes the payloads created by other pieces of software, before the PDF-Aggregation-Service was created.
* */
@GetMapping("getNumberOfAllPayloads")
public ResponseEntity<?> getNumberOfAllPayloads() {
logger.info("Received a \"getNumberOfAllPayloads\" request.");
final String getPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload";
try {
Object result = jdbcTemplate.queryForObject(getPayloadsNumberQuery, Integer.class);
if ( result != null ) {
int numOfPayloads = (int) result;
logger.info("Number of payloads in the database \"" + ImpalaConnector.databaseName + "\" is " + numOfPayloads);
return new ResponseEntity<>(numOfPayloads, HttpStatus.OK);
} else
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The payloads' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getPayloadsNumberQuery: " + getPayloadsNumberQuery);
} catch (EmptyResultDataAccessException erdae) {
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The payloads' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getPayloadsNumberQuery: " + getPayloadsNumberQuery);
} catch (Exception e) {
String errorMsg = "Problem when executing \"getPayloadsNumberQuery\": " + getPayloadsNumberQuery;
logger.error(errorMsg, e);
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
// We may get a "Class Cast Exception", in case the Impala returns a non-integer value.
}
return getNumberOfPayloads(getPayloadsNumberQuery, "");
}
/**
 * Reports how many payloads were aggregated by the PDF-Aggregation-Service itself.
 * Payloads aggregated by other methods are excluded, by counting only the records whose "date" is 2021-01-01 or later.
 *
 * @return the response produced by {@code getNumberOfPayloads(String, String)} for the date-filtered count-query.
 */
@GetMapping("getNumberOfPayloadsAggregatedByService")
public ResponseEntity<?> getNumberOfPayloadsAggregatedByService() {
    logger.info("Received a \"getNumberOfPayloadsAggregatedByService\" request.");

    // Count only the payloads dated 2021-01-01 or later.
    final String dateFilter = " where `date` >= cast(cast('2021-01-01' as timestamp) as bigint)";
    final String countQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload" + dateFilter;

    return getNumberOfPayloads(countQuery, "retrieved by the PDF Aggregation Service");
}
/**
* This endpoint returns the number of records inspected by the PDF-Aggregation-Service.
* */
@GetMapping("getNumberOfRecordsInspected")
public ResponseEntity<?> getNumberOfRecordsInspected()
{
// Note that until all the records are inspected, the "attempt" table contains all the inspected records +very few duplicates (id-url) which come from the publications-database.
// Note that until all the records are inspected, the "attempt" table contains all the inspected records PLUS very few duplicates (id-url) which come from the publications-database.
// After all the records are inspected, it contains duplicate records of more and more id-urls, as time goes on, since for every eligible record the Service re-attempts to get the full-text.
// So in order to get the number of inspected records, we want the distinct number, which at some point will remain stable, even though the Service will retry some records again and again.
// Before all the records are inspected, this endpoint will report all the inspected records MINUS the duplicate records which come straight from the "publication" table.
@ -79,4 +81,24 @@ public class StatsController {
}
}
/**
 * Executes the given count-query and wraps its result in an HTTP response.
 *
 * @param getPayloadsNumberQuery the SQL query to execute; its single result-value is requested as an {@code Integer}.
 * @param extraMsg an optional description of the counted payloads, used only in the log-message; it may be empty or null.
 * @return a 200-response carrying the number of payloads, or a 500-response carrying an error-message,
 *         when the query returns no value or its execution fails.
 */
private ResponseEntity<?> getNumberOfPayloads(String getPayloadsNumberQuery, String extraMsg) {
    // The same message is used for both the "null result" and the "empty result" cases.
    final String notRetrievedMsg = "The payloads' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getPayloadsNumberQuery: " + getPayloadsNumberQuery;
    try {
        Integer numOfPayloads = jdbcTemplate.queryForObject(getPayloadsNumberQuery, Integer.class);
        if ( numOfPayloads == null ) {
            logger.error(notRetrievedMsg);  // Do not fail silently; leave a trace in the logs.
            return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(notRetrievedMsg);
        }
        // Avoid a double-space (or a "null") in the log-message, when no extra description is given.
        String payloadsDescription = ((extraMsg == null) || extraMsg.isEmpty()) ? "" : (extraMsg + " ");
        logger.info("Number of payloads " + payloadsDescription + "in the database \"" + ImpalaConnector.databaseName + "\" is " + numOfPayloads);
        return new ResponseEntity<>(numOfPayloads, HttpStatus.OK);
    } catch (EmptyResultDataAccessException erdae) {
        logger.error(notRetrievedMsg, erdae);
        return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(notRetrievedMsg);
    } catch (Exception e) {
        // This also covers the case where Impala returns a value which cannot be converted to an integer.
        String errorMsg = "Problem when executing \"getPayloadsNumberQuery\": " + getPayloadsNumberQuery;
        logger.error(errorMsg, e);
        return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
    }
}
}