From e51ee9dd27ed6248e49d3a1f0261f1e62b043be0 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 15 Dec 2022 14:04:22 +0200 Subject: [PATCH] - Add info about the Stats API usage in "README.md". - Optimize performance in "ParquetFileUtils.createAndLoadParquetDataIntoAttemptTable()" and "ParquetFileUtils.createAndLoadParquetDataIntoPayloadTable()". - Handle the "EmptyResultDataAccessException" inside "StatsController". - Optimize gradle's performance. - Code polishing. --- README.md | 8 +++++++- build.gradle | 2 +- .../controllers/StatsController.java | 13 +++++++++---- .../urls_controller/controllers/UrlController.java | 2 +- .../urls_controller/util/ParquetFileUtils.java | 6 ++++-- 5 files changed, 22 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c564c7a..09a134e 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,13 @@ The Controller's Application receives requests coming from the [Workers](https://code-repo.d4science.org/lsmyrnaios/UrlsWorker) , constructs an assignments-list with data received from a database and returns the list to the workers.
Then, it receives the "WorkerReports", it requests the full-texts from the workers, in batches, and uploads them on the S3-Object-Store. Finally, it writes the related reports, along with the updated file-locations into the database.
-The database used is the [Impala](https://impala.apache.org/) .
+The database used is [Impala](https://impala.apache.org/).
+
+Statistics API: +- "**getNumberOfPayloads**" endpoint: **http://IP:PORT/api/stats/getNumberOfPayloads** +- "**getNumberOfRecordsInspected**" endpoint: **http://IP:PORT/api/stats/getNumberOfRecordsInspected** +
+
To install and run the application: - Run ```git clone``` and then ```cd UrlsController```. diff --git a/build.gradle b/build.gradle index 2addcba..7309bca 100644 --- a/build.gradle +++ b/build.gradle @@ -116,7 +116,7 @@ configurations.implementation { } // Set increased lower and upper limits for the java-execution. -tasks.withType(JavaExec) { +tasks.withType(JavaExec).configureEach { jvmArgs = ['-Xms512m', '-Xmx8g'] } diff --git a/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java b/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java index 8b2fa77..5ae5521 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/StatsController.java @@ -5,6 +5,7 @@ import eu.openaire.urls_controller.configuration.ImpalaConnector; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.beans.factory.annotation.Autowired; +import org.springframework.dao.EmptyResultDataAccessException; import org.springframework.http.HttpStatus; import org.springframework.http.ResponseEntity; import org.springframework.jdbc.core.JdbcTemplate; @@ -29,7 +30,7 @@ public class StatsController { { logger.info("Received a \"getNumberOfPayloads\" request."); - String getPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload"; + final String getPayloadsNumberQuery = "select count(id) from " + ImpalaConnector.databaseName + ".payload"; try { Object result = jdbcTemplate.queryForObject(getPayloadsNumberQuery, Integer.class); if ( result != null ) { @@ -38,6 +39,8 @@ public class StatsController { return new ResponseEntity<>(numOfPayloads, HttpStatus.OK); } else return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The payloads' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getPayloadsNumberQuery: " + getPayloadsNumberQuery); + } catch 
(EmptyResultDataAccessException erdae) { + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The payloads' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getPayloadsNumberQuery: " + getPayloadsNumberQuery); } catch (Exception e) { String errorMsg = "Problem when executing \"getPayloadsNumberQuery\": " + getPayloadsNumberQuery; logger.error(errorMsg, e); @@ -51,13 +54,13 @@ public class StatsController { public ResponseEntity getNumberOfRecordsInspected() { // Note that until all the records are inspected, the "attempt" table contains all the inspected records +very few duplicates (id-url) which come from the publications-database. - // After all the records are inspected, it contains duplicate records of more and more id-urls, as time goes one, since for every eligible record the Service re-attmepts to get the full-text. - // So in order to get the number of inspected records, we want the distinct number, which at some point it will remain stable, even though the Service will try aganin and again some of the records. + // After all the records are inspected, it contains duplicate records of more and more id-urls, as time goes on, since for every eligible record the Service re-attempts to get the full-text. + // So in order to get the number of inspected records, we want the distinct number, which at some point will remain stable, even though the Service will retry some records again and again. // Before all the records are inspected, this endpoint will report all the inspected records MINUS the duplicate records which come straight from the "publication" table. 
logger.info("Received a \"getNumberOfRecordsInspected\" request."); - String getInspectedRecordsNumberQuery = "select count(dist.id) from (select distinct id, original_url from " + ImpalaConnector.databaseName + ".attempt) as dist"; + final String getInspectedRecordsNumberQuery = "select count(dist.id) from (select distinct id, original_url from " + ImpalaConnector.databaseName + ".attempt) as dist"; try { Object result = jdbcTemplate.queryForObject(getInspectedRecordsNumberQuery, Integer.class); if ( result != null ) { @@ -66,6 +69,8 @@ public class StatsController { return new ResponseEntity<>(numOfInspectedRecords, HttpStatus.OK); } else return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The inspected records' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getInspectedRecordsNumberQuery: " + getInspectedRecordsNumberQuery); + } catch (EmptyResultDataAccessException erdae) { + return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("The inspected records' number could not be retrieved from the database \"" + ImpalaConnector.databaseName + "\" using the getInspectedRecordsNumberQuery: " + getInspectedRecordsNumberQuery); } catch (Exception e) { String errorMsg = "Problem when executing \"getInspectedRecordsNumberQuery\": " + getInspectedRecordsNumberQuery; logger.error(errorMsg, e); diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 85cbc6e..c0cb5a3 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -205,7 +205,7 @@ public class UrlController { } - public static ExecutorService insertsExecutor = Executors.newFixedThreadPool(6); + public static final ExecutorService insertsExecutor = Executors.newFixedThreadPool(6); @PostMapping("addWorkerReport") public 
ResponseEntity addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) { diff --git a/src/main/java/eu/openaire/urls_controller/util/ParquetFileUtils.java b/src/main/java/eu/openaire/urls_controller/util/ParquetFileUtils.java index 68445c6..b2c3a16 100644 --- a/src/main/java/eu/openaire/urls_controller/util/ParquetFileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/ParquetFileUtils.java @@ -188,8 +188,9 @@ public class ParquetFileUtils { public boolean createAndLoadParquetDataIntoAttemptTable(int attemptsIncNum, List urlReports, long curReportAssignments, String currentParquetPath) { - List recordList = new ArrayList<>(); + List recordList = new ArrayList<>(urlReports.size()); GenericData.Record record; + for ( UrlReport urlReport : urlReports ) { Payload payload = urlReport.getPayload(); if ( payload == null ) { @@ -244,7 +245,7 @@ public class ParquetFileUtils { public boolean createAndLoadParquetDataIntoPayloadTable(List urlReports, long curReportAssignments, String currentParquetPath) { - List recordList = new ArrayList<>(); + List recordList = new ArrayList<>((int) (urlReports.size() * 0.2)); GenericData.Record record; for ( UrlReport urlReport : urlReports ) @@ -315,6 +316,7 @@ public class ParquetFileUtils { try (ParquetWriter writer = AvroParquetWriter.builder(outputFile).withSchema(schema) .withCompressionCodec(CompressionCodecName.GZIP).build()) + // When the app runs inside a Docker Container, it is NOT guaranteed that all compression-types will work. For example, the "SNAPPY"-compression does NOT work, while the "GZIP" works. { //logger.debug("Going to write to \"" + fullFilePath + "\" the record list: " + recordList); // DEBUG! for ( GenericRecord record : recordList ) {