forked from lsmyrnaios/UrlsController
- Fix a bug, which caused the get-full-texts request to fail, because of the wrong "requestAssignmentsCounter".
- Fix a bug, which caused multiple workers to get assigned the same batch-counter, while the assignment-tasks were different. - Set a max-size limit to the amount of space the logs can use. Over that size, the older logs will be deleted. - Show the error-message returned from the Worker, when a getFullTexts-request fails. - Improve some log-messages. - Update dependencies. - Code cleanup.
This commit is contained in:
parent
15224c6468
commit
dea257b87f
|
@ -43,7 +43,7 @@ dependencies {
|
|||
// https://mvnrepository.com/artifact/com.google.guava/guava
|
||||
implementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
|
||||
|
||||
implementation 'io.minio:minio:8.3.3'
|
||||
implementation 'io.minio:minio:8.3.4'
|
||||
|
||||
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
|
||||
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' // This is required by the minio, as Spring uses a version which is not supported by minio.
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.1-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
|
|
|
@ -8,7 +8,7 @@ elif [[ $# -gt 1 ]]; then
|
|||
echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh <justInstall: 0 | 1>"; exit 1
|
||||
fi
|
||||
|
||||
gradleVersion="7.3"
|
||||
gradleVersion="7.3.1"
|
||||
|
||||
if [[ justInstall -eq 0 ]]; then
|
||||
|
||||
|
|
|
@ -230,8 +230,9 @@ public class UrlController {
|
|||
ImpalaConnector.closeConnection(con);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
|
||||
logger.info("Sending batch-assignments_" + assignmentsBatchCounter.incrementAndGet() + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + ".");
|
||||
return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(assignmentsBatchCounter.get(), assignments));
|
||||
long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet();
|
||||
logger.info("Sending batch-assignments_" + curAssignmentsBatchCounter + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + ".");
|
||||
return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments));
|
||||
}
|
||||
|
||||
|
||||
|
@ -251,11 +252,12 @@ public class UrlController {
|
|||
return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg);
|
||||
}
|
||||
|
||||
logger.info("Received the WorkerReport for batch-assignments_" + workerReport.getAssignmentRequestCounter() + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
|
||||
long curReportAssignments = workerReport.getAssignmentRequestCounter();
|
||||
logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
|
||||
|
||||
// Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location".
|
||||
if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, assignmentsBatchCounter, workerReport.getWorkerId()) ) {
|
||||
logger.error("Failed to get and/or upload the fullTexts for assignments_" + assignmentsBatchCounter);
|
||||
if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, workerReport.getWorkerId()) ) {
|
||||
logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments);
|
||||
// The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads.
|
||||
FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports);
|
||||
}
|
||||
|
@ -299,12 +301,11 @@ public class UrlController {
|
|||
for ( UrlReport urlReport : urlReports ) {
|
||||
Payload payload = urlReport.getPayload();
|
||||
if ( payload == null ) {
|
||||
logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + assignmentsBatchCounter);
|
||||
logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments);
|
||||
payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!";
|
||||
continue;
|
||||
}
|
||||
|
||||
String tempFullQueryString = null;
|
||||
try { // We use a "PreparedStatement" to do insertions, for security reasons.
|
||||
preparedInsertPayloadStatement.setString(1, payload.getId());
|
||||
preparedInsertPayloadStatement.setString(2, payload.getOriginal_url());
|
||||
|
@ -322,10 +323,9 @@ public class UrlController {
|
|||
preparedInsertPayloadStatement.setString(7, payload.getHash());
|
||||
preparedInsertPayloadStatement.setString(8, payload.getLocation());
|
||||
preparedInsertPayloadStatement.setString(9, payload.getProvenance());
|
||||
tempFullQueryString = preparedInsertPayloadStatement.toString();
|
||||
preparedInsertPayloadStatement.executeUpdate();
|
||||
} catch (SQLException sqle) {
|
||||
logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\":\n" + tempFullQueryString + "\n" + sqle.getMessage() + "\n\n");
|
||||
logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": " + sqle.getMessage() + "\n\n");
|
||||
}
|
||||
|
||||
Error error = urlReport.getError();
|
||||
|
@ -341,10 +341,9 @@ public class UrlController {
|
|||
preparedInsertAttemptStatement.setString(4, urlReport.getStatus().toString());
|
||||
preparedInsertAttemptStatement.setString(5, String.valueOf(error.getType())); // This covers the case of "null".
|
||||
preparedInsertAttemptStatement.setString(6, error.getMessage());
|
||||
tempFullQueryString = preparedInsertAttemptStatement.toString();
|
||||
preparedInsertAttemptStatement.executeUpdate();
|
||||
} catch (SQLException sqle) {
|
||||
logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\":\n" + tempFullQueryString + "\n" + sqle.getMessage() + "\n\n");
|
||||
logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage() + "\n\n");
|
||||
}
|
||||
}//end for-loop
|
||||
|
||||
|
@ -488,9 +487,9 @@ public class UrlController {
|
|||
if ( scanner != null ) // Check if the initial value is null.
|
||||
scanner.close();
|
||||
|
||||
logger.info("Sending batch_" + assignmentsBatchCounter.incrementAndGet() + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId);
|
||||
|
||||
return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(assignmentsBatchCounter.get(), assignments));
|
||||
long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet();
|
||||
logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId);
|
||||
return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments));
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -11,10 +11,7 @@ import org.springframework.boot.configurationprocessor.json.JSONException;
|
|||
import org.springframework.boot.configurationprocessor.json.JSONObject;
|
||||
|
||||
import javax.servlet.http.HttpServletRequest;
|
||||
import java.io.File;
|
||||
import java.io.FileOutputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.*;
|
||||
import java.net.HttpURLConnection;
|
||||
import java.net.URL;
|
||||
import java.nio.file.Files;
|
||||
|
@ -27,7 +24,6 @@ import java.util.ArrayList;
|
|||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Scanner;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
|
@ -111,7 +107,7 @@ public class FileUtils {
|
|||
public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator;
|
||||
private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
||||
|
||||
public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, AtomicLong assignmentsBatchCounter, String workerId)
|
||||
public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId)
|
||||
{
|
||||
// The Controller have to request the files from the Worker, in order to upload them to the S3.
|
||||
// We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database.
|
||||
|
@ -176,24 +172,24 @@ public class FileUtils {
|
|||
File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation);
|
||||
|
||||
int failedBatches = 0;
|
||||
for ( int i=1; i <= numOfBatches; ++i )
|
||||
for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter )
|
||||
{
|
||||
List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, i, numOfBatches);
|
||||
HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, i, fileNamesForCurBatch, numOfBatches, workerId);
|
||||
List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, batchCounter);
|
||||
HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, batchCounter, fileNamesForCurBatch, numOfBatches, workerId);
|
||||
if ( conn == null ) {
|
||||
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
|
||||
failedBatches ++;
|
||||
continue; // To the next batch.
|
||||
}
|
||||
|
||||
String targetLocation = curAssignmentsBaseLocation + "batch_" + i + File.separator;
|
||||
String targetLocation = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
|
||||
File curBatchDir = new File(targetLocation);
|
||||
try {
|
||||
// Get the extracted files.,
|
||||
Path targetPath = Files.createDirectories(Paths.get(targetLocation));
|
||||
|
||||
// Unzip the file. Iterate over the PDFs and upload each one of them and get the S3-Url
|
||||
String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + i + ".zip";
|
||||
String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip";
|
||||
File zipFile = new File(zipFileFullPath);
|
||||
|
||||
if ( ! saveZipFile(conn, zipFile) ) {
|
||||
|
@ -254,10 +250,10 @@ public class FileUtils {
|
|||
setUnretrievedFullText(payload);
|
||||
}
|
||||
|
||||
logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + " on S3-ObjectStore.");
|
||||
logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Could not extract and upload the full-texts for batch_" + i + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
|
||||
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
|
||||
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
|
||||
failedBatches ++;
|
||||
} finally {
|
||||
|
@ -270,7 +266,7 @@ public class FileUtils {
|
|||
|
||||
// Check if none of the batches were handled..
|
||||
if ( failedBatches == numOfBatches ) {
|
||||
logger.error("None of the " + numOfBatches + " batches could be handled!");
|
||||
logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId);
|
||||
return false;
|
||||
} else {
|
||||
replaceNotUploadedFileLocations(urlReports);
|
||||
|
@ -279,9 +275,10 @@ public class FileUtils {
|
|||
}
|
||||
|
||||
|
||||
private static HttpURLConnection getConnection(String baseUrl, AtomicLong assignmentsBatchCounter, int batchNum, List<String> fileNamesForCurBatch, int totalBatches, String workerId)
|
||||
private static HttpURLConnection getConnection(String baseUrl, long assignmentsBatchCounter, int batchNum, List<String> fileNamesForCurBatch, int totalBatches, String workerId)
|
||||
{
|
||||
String requestUrl = getRequestUrlForBatch(baseUrl, batchNum, fileNamesForCurBatch);
|
||||
baseUrl += batchNum + "/";
|
||||
String requestUrl = getRequestUrlForBatch(baseUrl, fileNamesForCurBatch);
|
||||
logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");
|
||||
HttpURLConnection conn = null;
|
||||
try {
|
||||
|
@ -291,7 +288,7 @@ public class FileUtils {
|
|||
conn.connect();
|
||||
int statusCode = conn.getResponseCode();
|
||||
if ( statusCode != 200 ) {
|
||||
logger.warn("HTTP-" + statusCode + ": Problem with when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
|
||||
logger.warn("HTTP-" + statusCode + ": " + getErrorMessageFromResponseBody(conn) + "\nProblem when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
|
||||
return null;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
|
@ -302,7 +299,29 @@ public class FileUtils {
|
|||
}
|
||||
|
||||
|
||||
private static List<String> getFileNamesForBatch(List<String> allFileNames, int numAllFullTexts, int curBatch, int numOfBatches)
|
||||
private static String getErrorMessageFromResponseBody(HttpURLConnection conn)
|
||||
{
|
||||
StringBuilder errorMsgStrB = new StringBuilder(500);
|
||||
try ( BufferedReader br = new BufferedReader(new InputStreamReader(conn.getErrorStream())) ) { // Try-with-resources
|
||||
String inputLine;
|
||||
while ( (inputLine = br.readLine()) != null )
|
||||
{
|
||||
if ( !inputLine.isEmpty() ) {
|
||||
errorMsgStrB.append(inputLine);
|
||||
}
|
||||
}
|
||||
return (errorMsgStrB.length() != 0) ? errorMsgStrB.toString() : null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function.
|
||||
} catch ( IOException ioe ) {
|
||||
logger.error("IOException when retrieving the error-message: " + ioe.getMessage());
|
||||
return null;
|
||||
} catch ( Exception e ) {
|
||||
logger.error("Could not extract the errorMessage!", e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
private static List<String> getFileNamesForBatch(List<String> allFileNames, int numAllFullTexts, int curBatch)
|
||||
{
|
||||
int initialIndex = ((curBatch-1) * numOfFullTextsPerBatch);
|
||||
int endingIndex = (curBatch * numOfFullTextsPerBatch);
|
||||
|
@ -321,12 +340,12 @@ public class FileUtils {
|
|||
}
|
||||
|
||||
|
||||
private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 100);
|
||||
private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 50);
|
||||
// TODO - Make it THREAD-LOCAL, if we move to multi-thread batch requests.
|
||||
|
||||
private static String getRequestUrlForBatch(String baseUrl, int curBatch, List<String> fileNamesForCurBatch)
|
||||
private static String getRequestUrlForBatch(String baseUrl, List<String> fileNamesForCurBatch)
|
||||
{
|
||||
sb.append(baseUrl).append(curBatch).append("/");
|
||||
sb.append(baseUrl);
|
||||
int numFullTextsCurBatch = fileNamesForCurBatch.size();
|
||||
for ( int j=0; j < numFullTextsCurBatch; ++j ){
|
||||
sb.append(fileNamesForCurBatch.get(j));
|
||||
|
|
|
@ -1,15 +1,18 @@
|
|||
<configuration debug="false">
|
||||
|
||||
<appender name="File" class="ch.qos.logback.core.rolling.RollingFileAppender">
|
||||
<appender name="RollingFile" class="ch.qos.logback.core.rolling.RollingFileAppender">
|
||||
<file>logs/UrlsController.log</file>
|
||||
|
||||
<rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
|
||||
<fileNamePattern>logs/UrlsController.%i.log.zip</fileNamePattern>
|
||||
<minIndex>1</minIndex>
|
||||
<maxIndex>20</maxIndex>
|
||||
</rollingPolicy>
|
||||
|
||||
<triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
|
||||
<maxFileSize>50MB</maxFileSize>
|
||||
</triggeringPolicy>
|
||||
|
||||
<encoder>
|
||||
<charset>UTF-8</charset>
|
||||
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n</pattern>
|
||||
|
@ -24,7 +27,7 @@
|
|||
</appender>
|
||||
|
||||
<root level="debug">
|
||||
<appender-ref ref="File" />
|
||||
<appender-ref ref="RollingFile" />
|
||||
</root>
|
||||
|
||||
</configuration>
|
Loading…
Reference in New Issue