From b6340066a70c9d5a1542c8bfa6def2dd21a70641 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 28 Sep 2022 22:34:33 +0300 Subject: [PATCH] - Improve handling of the case, where the full-texts were found, but the Controller could not acquire them from the Worker. - Add/improve logs and comments. - Code cleanup. --- .../controllers/UrlController.java | 19 ++++++++++++------- .../urls_controller/util/FileUtils.java | 6 +++--- 2 files changed, 15 insertions(+), 10 deletions(-) diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index b4e8f2d..dc5a281 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -257,8 +257,11 @@ public class UrlController { } else if ( uploadFullTextsResponse == FileUtils.UploadFullTextsResponse.unsuccessful ) { logger.error("Failed to get and/or upload the fullTexts for batch-assignments_" + curReportAssignments); - // The docUrls were still found! Just update ALL the fileLocations, sizes, hashes and mimetypes, to show that the files are not available and continue with writing the attempts and the payloads. + // The docUrls were still found! Just update ALL the fileLocations, sizes, hashes and mimetypes, to show that the files are not available. fileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports, false); + // We write only the payloads which are connected with retrieved full-texts, uploaded to S3-Object-Store. + // We continue with writing the "attempts", as we want to avoid re-checking the failed-urls later. + // The urls which give full-text (no matter if we could not get it from the worker), are flagged as "couldRetry" anyway, so they will be picked-up to be checked again later. } else logger.debug("Finished uploading the full-texts from batch-assignments_" + curReportAssignments); @@ -270,7 +273,8 @@ public class UrlController { final String insertIntoAttemptBaseQuery = "INSERT INTO " + ImpalaConnector.databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)"; final int[] attemptArgTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR}; - final AtomicInteger failedCount = new AtomicInteger(0); + final AtomicInteger failedQueriesCount = new AtomicInteger(0); + // Split the "UrlReports" into some sub-lists int sizeOfEachSubList = (int)(sizeOUrlReports * 0.2); @@ -316,7 +320,6 @@ public class UrlController { } ImpalaConnector.databaseLock.lock(); - try { // Invoke all the tasks and wait for them to finish before moving to the next batch. insertsExecutor.invokeAll(callableTasks); } catch (InterruptedException ie) { // In this case, any unfinished tasks are cancelled. @@ -329,7 +332,7 @@ public class UrlController { return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg); } - int failedQueries = failedCount.get(); + int failedQueries = failedQueriesCount.get(); String failedQueriesMsg = failedQueries + " out of " + (sizeOUrlReports *2) + " failed to be processed!"; logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables" + ((failedQueries > 0) ? (", although " + failedQueriesMsg) : ".") + " Going to merge the parquet files for those tables."); @@ -354,11 +357,14 @@ public class UrlController { ImpalaConnector.databaseLock.unlock(); return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg); } - ImpalaConnector.databaseLock.unlock(); logger.debug("Finished merging the database tables."); - return ResponseEntity.status(HttpStatus.OK).body(failedQueriesMsg); + if ( uploadFullTextsResponse == FileUtils.UploadFullTextsResponse.unsuccessful ) + return ResponseEntity.status(HttpStatus.MULTI_STATUS).body("The full-text files failed to be acquired from the worker!\n" + failedQueriesMsg); + else + return ResponseEntity.status(HttpStatus.OK).body(failedQueriesMsg); + } @@ -391,7 +397,6 @@ public class UrlController { try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons. Object[] args = new Object[] {payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(), urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()}; - jdbcTemplate.update(insertIntoAttemptBaseQuery, args, attemptArgTypes); } catch (Exception e) { logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + e.getMessage()); diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index fc7b0a4..40cb146 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -178,7 +178,7 @@ public class FileUtils { ArrayList allFileNames = new ArrayList<>(allFileNamesWithPayloads.keySet()); int numAllFullTexts = allFileNames.size(); if ( numAllFullTexts == 0 ) { - logger.warn("The retrieved files where < 0 > for assignments_" + assignmentsBatchCounter + " | from worker: " + workerId); + logger.warn("No full-text files were retrieved for assignments_" + assignmentsBatchCounter + " | from worker: " + workerId); return UploadFullTextsResponse.successful; // It was handled, no error. } @@ -207,7 +207,7 @@ public class FileUtils { } } catch (RuntimeException re) { // The "cause" was logged inside "getConnection()". - failedBatches += (1 + (numOfBatches - batchCounter)); // Add this and the rest of the failed batches. + failedBatches += (1 + (numOfBatches - batchCounter)); // The "failedBatches" will have the previously failedBatches + this one + the remaining batches which will likely fail too, thus, they will not be tested. break; } @@ -368,7 +368,7 @@ public class FileUtils { } - public static String getMessageFromResponseBody(HttpURLConnection conn, boolean isError) { + public String getMessageFromResponseBody(HttpURLConnection conn, boolean isError) { StringBuilder msgStrB = new StringBuilder(500); try ( BufferedReader br = new BufferedReader(new InputStreamReader((isError ? conn.getErrorStream() : conn.getInputStream()))) ) { // Try-with-resources String inputLine;