From dea257b87f47922f3903e5f904a9271cf87df1fa Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 6 Dec 2021 20:18:30 +0200 Subject: [PATCH] - Fix a bug, which caused the get-full-texts request to fail, because of the wrong "requestAssignmentsCounter". - Fix a bug, which caused multiple workers to get assigned the same batch-counter, while the assignment-tasks where different. - Set a max-size limit to the amount of space the logs can use. Over that size, the older logs will be deleted. - Show the error-message returned from the Worker, when a getFullTexts-request fails. - Improve some log-messages. - Update dependencies. - Code cleanup. --- build.gradle | 2 +- gradle/wrapper/gradle-wrapper.properties | 2 +- installAndRun.sh | 2 +- .../controllers/UrlController.java | 27 ++++---- .../urls_controller/util/FileUtils.java | 61 ++++++++++++------- src/main/resources/logback-spring.xml | 7 ++- 6 files changed, 61 insertions(+), 40 deletions(-) diff --git a/build.gradle b/build.gradle index 1f9aa0f..b902fd7 100644 --- a/build.gradle +++ b/build.gradle @@ -43,7 +43,7 @@ dependencies { // https://mvnrepository.com/artifact/com.google.guava/guava implementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre' - implementation 'io.minio:minio:8.3.3' + implementation 'io.minio:minio:8.3.4' // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' // This is required by the minio, as Spring uses a version which is not supported by minio. diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index e750102..84d1f85 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,5 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.1-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/installAndRun.sh b/installAndRun.sh index a3bb3e3..4079bc5 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -8,7 +8,7 @@ elif [[ $# -gt 1 ]]; then echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh "; exit 1 fi -gradleVersion="7.3" +gradleVersion="7.3.1" if [[ justInstall -eq 0 ]]; then diff --git a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java index 8484a9a..2d270c8 100644 --- a/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java +++ b/src/main/java/eu/openaire/urls_controller/controllers/UrlController.java @@ -230,8 +230,9 @@ public class UrlController { ImpalaConnector.closeConnection(con); ImpalaConnector.databaseLock.unlock(); - logger.info("Sending batch-assignments_" + assignmentsBatchCounter.incrementAndGet() + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + "."); - return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(assignmentsBatchCounter.get(), assignments)); + long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet(); + logger.info("Sending batch-assignments_" + curAssignmentsBatchCounter + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + "."); + return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); } @@ -251,11 +252,12 @@ public class UrlController { return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg); } - logger.info("Received the WorkerReport for batch-assignments_" + workerReport.getAssignmentRequestCounter() + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database."); + long curReportAssignments = workerReport.getAssignmentRequestCounter(); + logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database."); // Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location". - if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, assignmentsBatchCounter, workerReport.getWorkerId()) ) { - logger.error("Failed to get and/or upload the fullTexts for assignments_" + assignmentsBatchCounter); + if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, workerReport.getWorkerId()) ) { + logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments); // The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads. FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports); } @@ -299,12 +301,11 @@ public class UrlController { for ( UrlReport urlReport : urlReports ) { Payload payload = urlReport.getPayload(); if ( payload == null ) { - logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + assignmentsBatchCounter); + logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments); payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!"; continue; } - String tempFullQueryString = null; try { // We use a "PreparedStatement" to do insertions, for security reasons. preparedInsertPayloadStatement.setString(1, payload.getId()); preparedInsertPayloadStatement.setString(2, payload.getOriginal_url()); @@ -322,10 +323,9 @@ public class UrlController { preparedInsertPayloadStatement.setString(7, payload.getHash()); preparedInsertPayloadStatement.setString(8, payload.getLocation()); preparedInsertPayloadStatement.setString(9, payload.getProvenance()); - tempFullQueryString = preparedInsertPayloadStatement.toString(); preparedInsertPayloadStatement.executeUpdate(); } catch (SQLException sqle) { - logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\":\n" + tempFullQueryString + "\n" + sqle.getMessage() + "\n\n"); + logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": " + sqle.getMessage() + "\n\n"); } Error error = urlReport.getError(); @@ -341,10 +341,9 @@ public class UrlController { preparedInsertAttemptStatement.setString(4, urlReport.getStatus().toString()); preparedInsertAttemptStatement.setString(5, String.valueOf(error.getType())); // This covers the case of "null". preparedInsertAttemptStatement.setString(6, error.getMessage()); - tempFullQueryString = preparedInsertAttemptStatement.toString(); preparedInsertAttemptStatement.executeUpdate(); } catch (SQLException sqle) { - logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\":\n" + tempFullQueryString + "\n" + sqle.getMessage() + "\n\n"); + logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage() + "\n\n"); } }//end for-loop @@ -488,9 +487,9 @@ public class UrlController { if ( scanner != null ) // Check if the initial value is null. scanner.close(); - logger.info("Sending batch_" + assignmentsBatchCounter.incrementAndGet() + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); - - return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(assignmentsBatchCounter.get(), assignments)); + long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet(); + logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); + return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments)); } } diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 02249bc..1abfe0d 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -11,10 +11,7 @@ import org.springframework.boot.configurationprocessor.json.JSONException; import org.springframework.boot.configurationprocessor.json.JSONObject; import javax.servlet.http.HttpServletRequest; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; +import java.io.*; import java.net.HttpURLConnection; import java.net.URL; import java.nio.file.Files; @@ -27,7 +24,6 @@ import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Scanner; -import java.util.concurrent.atomic.AtomicLong; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -111,7 +107,7 @@ public class FileUtils { public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator; private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). - public static boolean getAndUploadFullTexts(List urlReports, HttpServletRequest request, AtomicLong assignmentsBatchCounter, String workerId) + public static boolean getAndUploadFullTexts(List urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) { // The Controller have to request the files from the Worker, in order to upload them to the S3. // We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database. @@ -176,24 +172,24 @@ public class FileUtils { File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation); int failedBatches = 0; - for ( int i=1; i <= numOfBatches; ++i ) + for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) { - List fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, i, numOfBatches); - HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, i, fileNamesForCurBatch, numOfBatches, workerId); + List fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, batchCounter); + HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, batchCounter, fileNamesForCurBatch, numOfBatches, workerId); if ( conn == null ) { updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch); failedBatches ++; continue; // To the next batch. } - String targetLocation = curAssignmentsBaseLocation + "batch_" + i + File.separator; + String targetLocation = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator; File curBatchDir = new File(targetLocation); try { // Get the extracted files., Path targetPath = Files.createDirectories(Paths.get(targetLocation)); // Unzip the file. Iterate over the PDFs and upload each one of them and get the S3-Url - String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + i + ".zip"; + String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip"; File zipFile = new File(zipFileFullPath); if ( ! saveZipFile(conn, zipFile) ) { @@ -254,10 +250,10 @@ public class FileUtils { setUnretrievedFullText(payload); } - logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + " on S3-ObjectStore."); + logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore."); } catch (Exception e) { - logger.error("Could not extract and upload the full-texts for batch_" + i + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6). + logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6). updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch); failedBatches ++; } finally { @@ -270,7 +266,7 @@ public class FileUtils { // Check if none of the batches were handled.. if ( failedBatches == numOfBatches ) { - logger.error("None of the " + numOfBatches + " batches could be handled!"); + logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId); return false; } else { replaceNotUploadedFileLocations(urlReports); @@ -279,9 +275,10 @@ public class FileUtils { } - private static HttpURLConnection getConnection(String baseUrl, AtomicLong assignmentsBatchCounter, int batchNum, List fileNamesForCurBatch, int totalBatches, String workerId) + private static HttpURLConnection getConnection(String baseUrl, long assignmentsBatchCounter, int batchNum, List fileNamesForCurBatch, int totalBatches, String workerId) { - String requestUrl = getRequestUrlForBatch(baseUrl, batchNum, fileNamesForCurBatch); + baseUrl += batchNum + "/"; + String requestUrl = getRequestUrlForBatch(baseUrl, fileNamesForCurBatch); logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]"); HttpURLConnection conn = null; try { @@ -291,7 +288,7 @@ public class FileUtils { conn.connect(); int statusCode = conn.getResponseCode(); if ( statusCode != 200 ) { - logger.warn("HTTP-" + statusCode + ": Problem with when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl); + logger.warn("HTTP-" + statusCode + ": " + getErrorMessageFromResponseBody(conn) + "\nProblem when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl); return null; } } catch (Exception e) { @@ -302,7 +299,29 @@ public class FileUtils { } - private static List getFileNamesForBatch(List allFileNames, int numAllFullTexts, int curBatch, int numOfBatches) + private static String getErrorMessageFromResponseBody(HttpURLConnection conn) + { + StringBuilder errorMsgStrB = new StringBuilder(500); + try ( BufferedReader br = new BufferedReader(new InputStreamReader(conn.getErrorStream())) ) { // Try-with-resources + String inputLine; + while ( (inputLine = br.readLine()) != null ) + { + if ( !inputLine.isEmpty() ) { + errorMsgStrB.append(inputLine); + } + } + return (errorMsgStrB.length() != 0) ? errorMsgStrB.toString() : null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function. + } catch ( IOException ioe ) { + logger.error("IOException when retrieving the error-message: " + ioe.getMessage()); + return null; + } catch ( Exception e ) { + logger.error("Could not extract the errorMessage!", e); + return null; + } + } + + + private static List getFileNamesForBatch(List allFileNames, int numAllFullTexts, int curBatch) { int initialIndex = ((curBatch-1) * numOfFullTextsPerBatch); int endingIndex = (curBatch * numOfFullTextsPerBatch); @@ -321,12 +340,12 @@ public class FileUtils { } - private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 100); + private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 50); // TODO - Make it THREAD-LOCAL, if we move to multi-thread batch requests. - private static String getRequestUrlForBatch(String baseUrl, int curBatch, List fileNamesForCurBatch) + private static String getRequestUrlForBatch(String baseUrl, List fileNamesForCurBatch) { - sb.append(baseUrl).append(curBatch).append("/"); + sb.append(baseUrl); int numFullTextsCurBatch = fileNamesForCurBatch.size(); for ( int j=0; j < numFullTextsCurBatch; ++j ){ sb.append(fileNamesForCurBatch.get(j)); diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml index a1972a0..86ba426 100644 --- a/src/main/resources/logback-spring.xml +++ b/src/main/resources/logback-spring.xml @@ -1,15 +1,18 @@ - + logs/UrlsController.log logs/UrlsController.%i.log.zip + 1 + 20 50MB + UTF-8 %d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n @@ -24,7 +27,7 @@ - + \ No newline at end of file