- Fix a bug that caused the get-full-texts request to fail because of the wrong "requestAssignmentsCounter".

- Fix a bug that caused multiple workers to get assigned the same batch-counter, even though the assignment-tasks were different.
- Set a max-size limit on the amount of disk space the logs can use. Once that limit is exceeded, the oldest logs will be deleted.
- Show the error-message returned from the Worker, when a getFullTexts-request fails.
- Improve some log-messages.
- Update dependencies.
- Code cleanup.
springify_project
Lampros Smyrnaios 2 years ago
parent 15224c6468
commit dea257b87f

@ -43,7 +43,7 @@ dependencies {
// https://mvnrepository.com/artifact/com.google.guava/guava // https://mvnrepository.com/artifact/com.google.guava/guava
implementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre' implementation group: 'com.google.guava', name: 'guava', version: '31.0.1-jre'
implementation 'io.minio:minio:8.3.3' implementation 'io.minio:minio:8.3.4'
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' // This is required by the minio, as Spring uses a version which is not supported by minio. implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' // This is required by the minio, as Spring uses a version which is not supported by minio.

@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.1-bin.zip
zipStoreBase=GRADLE_USER_HOME zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists zipStorePath=wrapper/dists

@ -8,7 +8,7 @@ elif [[ $# -gt 1 ]]; then
echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh <justInstall: 0 | 1>"; exit 1 echo -e "Wrong number of arguments given: ${#}\nPlease execute it like: script.sh <justInstall: 0 | 1>"; exit 1
fi fi
gradleVersion="7.3" gradleVersion="7.3.1"
if [[ justInstall -eq 0 ]]; then if [[ justInstall -eq 0 ]]; then

@ -230,8 +230,9 @@ public class UrlController {
ImpalaConnector.closeConnection(con); ImpalaConnector.closeConnection(con);
ImpalaConnector.databaseLock.unlock(); ImpalaConnector.databaseLock.unlock();
logger.info("Sending batch-assignments_" + assignmentsBatchCounter.incrementAndGet() + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + "."); long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet();
return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(assignmentsBatchCounter.get(), assignments)); logger.info("Sending batch-assignments_" + curAssignmentsBatchCounter + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + ".");
return ResponseEntity.status(HttpStatus.OK).body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments));
} }
@ -251,11 +252,12 @@ public class UrlController {
return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg); return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg);
} }
logger.info("Received the WorkerReport for batch-assignments_" + workerReport.getAssignmentRequestCounter() + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database."); long curReportAssignments = workerReport.getAssignmentRequestCounter();
logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
// Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location". // Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location".
if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, assignmentsBatchCounter, workerReport.getWorkerId()) ) { if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, workerReport.getWorkerId()) ) {
logger.error("Failed to get and/or upload the fullTexts for assignments_" + assignmentsBatchCounter); logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments);
// The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads. // The docUrls were still found! Just update ALL the fileLocations. sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads.
FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports); FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports);
} }
@ -299,12 +301,11 @@ public class UrlController {
for ( UrlReport urlReport : urlReports ) { for ( UrlReport urlReport : urlReports ) {
Payload payload = urlReport.getPayload(); Payload payload = urlReport.getPayload();
if ( payload == null ) { if ( payload == null ) {
logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + assignmentsBatchCounter); logger.error("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments);
payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!"; payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!";
continue; continue;
} }
String tempFullQueryString = null;
try { // We use a "PreparedStatement" to do insertions, for security reasons. try { // We use a "PreparedStatement" to do insertions, for security reasons.
preparedInsertPayloadStatement.setString(1, payload.getId()); preparedInsertPayloadStatement.setString(1, payload.getId());
preparedInsertPayloadStatement.setString(2, payload.getOriginal_url()); preparedInsertPayloadStatement.setString(2, payload.getOriginal_url());
@ -322,10 +323,9 @@ public class UrlController {
preparedInsertPayloadStatement.setString(7, payload.getHash()); preparedInsertPayloadStatement.setString(7, payload.getHash());
preparedInsertPayloadStatement.setString(8, payload.getLocation()); preparedInsertPayloadStatement.setString(8, payload.getLocation());
preparedInsertPayloadStatement.setString(9, payload.getProvenance()); preparedInsertPayloadStatement.setString(9, payload.getProvenance());
tempFullQueryString = preparedInsertPayloadStatement.toString();
preparedInsertPayloadStatement.executeUpdate(); preparedInsertPayloadStatement.executeUpdate();
} catch (SQLException sqle) { } catch (SQLException sqle) {
logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\":\n" + tempFullQueryString + "\n" + sqle.getMessage() + "\n\n"); logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": " + sqle.getMessage() + "\n\n");
} }
Error error = urlReport.getError(); Error error = urlReport.getError();
@ -341,10 +341,9 @@ public class UrlController {
preparedInsertAttemptStatement.setString(4, urlReport.getStatus().toString()); preparedInsertAttemptStatement.setString(4, urlReport.getStatus().toString());
preparedInsertAttemptStatement.setString(5, String.valueOf(error.getType())); // This covers the case of "null". preparedInsertAttemptStatement.setString(5, String.valueOf(error.getType())); // This covers the case of "null".
preparedInsertAttemptStatement.setString(6, error.getMessage()); preparedInsertAttemptStatement.setString(6, error.getMessage());
tempFullQueryString = preparedInsertAttemptStatement.toString();
preparedInsertAttemptStatement.executeUpdate(); preparedInsertAttemptStatement.executeUpdate();
} catch (SQLException sqle) { } catch (SQLException sqle) {
logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\":\n" + tempFullQueryString + "\n" + sqle.getMessage() + "\n\n"); logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage() + "\n\n");
} }
}//end for-loop }//end for-loop
@ -488,9 +487,9 @@ public class UrlController {
if ( scanner != null ) // Check if the initial value is null. if ( scanner != null ) // Check if the initial value is null.
scanner.close(); scanner.close();
logger.info("Sending batch_" + assignmentsBatchCounter.incrementAndGet() + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId); long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet();
logger.info("Sending batch_" + curAssignmentsBatchCounter + " with " + assignments.size() + " assignments (" + FileUtils.duplicateIdUrlEntries.get() + " more assignments were discarded as duplicates), to worker with ID: " + workerId);
return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(assignmentsBatchCounter.get(), assignments)); return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments));
} }
} }

@ -11,10 +11,7 @@ import org.springframework.boot.configurationprocessor.json.JSONException;
import org.springframework.boot.configurationprocessor.json.JSONObject; import org.springframework.boot.configurationprocessor.json.JSONObject;
import javax.servlet.http.HttpServletRequest; import javax.servlet.http.HttpServletRequest;
import java.io.File; import java.io.*;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection; import java.net.HttpURLConnection;
import java.net.URL; import java.net.URL;
import java.nio.file.Files; import java.nio.file.Files;
@ -27,7 +24,6 @@ import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List; import java.util.List;
import java.util.Scanner; import java.util.Scanner;
import java.util.concurrent.atomic.AtomicLong;
import java.util.regex.Matcher; import java.util.regex.Matcher;
import java.util.regex.Pattern; import java.util.regex.Pattern;
@ -111,7 +107,7 @@ public class FileUtils {
public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator; public static final String baseTargetLocation = System.getProperty("user.dir") + File.separator + "fullTexts" + File.separator;
private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, AtomicLong assignmentsBatchCounter, String workerId) public static boolean getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId)
{ {
// The Controller have to request the files from the Worker, in order to upload them to the S3. // The Controller have to request the files from the Worker, in order to upload them to the S3.
// We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database. // We will have to UPDATE the "location" of each of those files in the UrlReports and then insert them all into the database.
@ -176,24 +172,24 @@ public class FileUtils {
File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation); File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation);
int failedBatches = 0; int failedBatches = 0;
for ( int i=1; i <= numOfBatches; ++i ) for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter )
{ {
List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, i, numOfBatches); List<String> fileNamesForCurBatch = getFileNamesForBatch(allFileNames, numAllFullTexts, batchCounter);
HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, i, fileNamesForCurBatch, numOfBatches, workerId); HttpURLConnection conn = getConnection(baseUrl, assignmentsBatchCounter, batchCounter, fileNamesForCurBatch, numOfBatches, workerId);
if ( conn == null ) { if ( conn == null ) {
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch); updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
failedBatches ++; failedBatches ++;
continue; // To the next batch. continue; // To the next batch.
} }
String targetLocation = curAssignmentsBaseLocation + "batch_" + i + File.separator; String targetLocation = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
File curBatchDir = new File(targetLocation); File curBatchDir = new File(targetLocation);
try { try {
// Get the extracted files., // Get the extracted files.,
Path targetPath = Files.createDirectories(Paths.get(targetLocation)); Path targetPath = Files.createDirectories(Paths.get(targetLocation));
// Unzip the file. Iterate over the PDFs and upload each one of them and get the S3-Url // Unzip the file. Iterate over the PDFs and upload each one of them and get the S3-Url
String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + i + ".zip"; String zipFileFullPath = targetLocation + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip";
File zipFile = new File(zipFileFullPath); File zipFile = new File(zipFileFullPath);
if ( ! saveZipFile(conn, zipFile) ) { if ( ! saveZipFile(conn, zipFile) ) {
@ -254,10 +250,10 @@ public class FileUtils {
setUnretrievedFullText(payload); setUnretrievedFullText(payload);
} }
logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + " on S3-ObjectStore."); logger.info("Finished uploading " + numUploadedFiles + " full-texts of assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
} catch (Exception e) { } catch (Exception e) {
logger.error("Could not extract and upload the full-texts for batch_" + i + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6). logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch); updateUrlReportsForCurBatchTOHaveNoFullTextFiles(payloadsHashMap, fileNamesForCurBatch);
failedBatches ++; failedBatches ++;
} finally { } finally {
@ -270,7 +266,7 @@ public class FileUtils {
// Check if none of the batches were handled.. // Check if none of the batches were handled..
if ( failedBatches == numOfBatches ) { if ( failedBatches == numOfBatches ) {
logger.error("None of the " + numOfBatches + " batches could be handled!"); logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId);
return false; return false;
} else { } else {
replaceNotUploadedFileLocations(urlReports); replaceNotUploadedFileLocations(urlReports);
@ -279,9 +275,10 @@ public class FileUtils {
} }
private static HttpURLConnection getConnection(String baseUrl, AtomicLong assignmentsBatchCounter, int batchNum, List<String> fileNamesForCurBatch, int totalBatches, String workerId) private static HttpURLConnection getConnection(String baseUrl, long assignmentsBatchCounter, int batchNum, List<String> fileNamesForCurBatch, int totalBatches, String workerId)
{ {
String requestUrl = getRequestUrlForBatch(baseUrl, batchNum, fileNamesForCurBatch); baseUrl += batchNum + "/";
String requestUrl = getRequestUrlForBatch(baseUrl, fileNamesForCurBatch);
logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]"); logger.info("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");
HttpURLConnection conn = null; HttpURLConnection conn = null;
try { try {
@ -291,7 +288,7 @@ public class FileUtils {
conn.connect(); conn.connect();
int statusCode = conn.getResponseCode(); int statusCode = conn.getResponseCode();
if ( statusCode != 200 ) { if ( statusCode != 200 ) {
logger.warn("HTTP-" + statusCode + ": Problem with when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl); logger.warn("HTTP-" + statusCode + ": " + getErrorMessageFromResponseBody(conn) + "\nProblem when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
return null; return null;
} }
} catch (Exception e) { } catch (Exception e) {
@ -302,7 +299,29 @@ public class FileUtils {
} }
private static List<String> getFileNamesForBatch(List<String> allFileNames, int numAllFullTexts, int curBatch, int numOfBatches) private static String getErrorMessageFromResponseBody(HttpURLConnection conn)
{
StringBuilder errorMsgStrB = new StringBuilder(500);
try ( BufferedReader br = new BufferedReader(new InputStreamReader(conn.getErrorStream())) ) { // Try-with-resources
String inputLine;
while ( (inputLine = br.readLine()) != null )
{
if ( !inputLine.isEmpty() ) {
errorMsgStrB.append(inputLine);
}
}
return (errorMsgStrB.length() != 0) ? errorMsgStrB.toString() : null; // Make sure we return a "null" on empty string, to better handle the case in the caller-function.
} catch ( IOException ioe ) {
logger.error("IOException when retrieving the error-message: " + ioe.getMessage());
return null;
} catch ( Exception e ) {
logger.error("Could not extract the errorMessage!", e);
return null;
}
}
private static List<String> getFileNamesForBatch(List<String> allFileNames, int numAllFullTexts, int curBatch)
{ {
int initialIndex = ((curBatch-1) * numOfFullTextsPerBatch); int initialIndex = ((curBatch-1) * numOfFullTextsPerBatch);
int endingIndex = (curBatch * numOfFullTextsPerBatch); int endingIndex = (curBatch * numOfFullTextsPerBatch);
@ -321,12 +340,12 @@ public class FileUtils {
} }
private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 100); private static final StringBuilder sb = new StringBuilder(numOfFullTextsPerBatch * 50);
// TODO - Make it THREAD-LOCAL, if we move to multi-thread batch requests. // TODO - Make it THREAD-LOCAL, if we move to multi-thread batch requests.
private static String getRequestUrlForBatch(String baseUrl, int curBatch, List<String> fileNamesForCurBatch) private static String getRequestUrlForBatch(String baseUrl, List<String> fileNamesForCurBatch)
{ {
sb.append(baseUrl).append(curBatch).append("/"); sb.append(baseUrl);
int numFullTextsCurBatch = fileNamesForCurBatch.size(); int numFullTextsCurBatch = fileNamesForCurBatch.size();
for ( int j=0; j < numFullTextsCurBatch; ++j ){ for ( int j=0; j < numFullTextsCurBatch; ++j ){
sb.append(fileNamesForCurBatch.get(j)); sb.append(fileNamesForCurBatch.get(j));

@ -1,15 +1,18 @@
<configuration debug="false"> <configuration debug="false">
<appender name="File" class="ch.qos.logback.core.rolling.RollingFileAppender"> <appender name="RollingFile" class="ch.qos.logback.core.rolling.RollingFileAppender">
<file>logs/UrlsController.log</file> <file>logs/UrlsController.log</file>
<rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy"> <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy">
<fileNamePattern>logs/UrlsController.%i.log.zip</fileNamePattern> <fileNamePattern>logs/UrlsController.%i.log.zip</fileNamePattern>
<minIndex>1</minIndex>
<maxIndex>20</maxIndex>
</rollingPolicy> </rollingPolicy>
<triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy"> <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy">
<maxFileSize>50MB</maxFileSize> <maxFileSize>50MB</maxFileSize>
</triggeringPolicy> </triggeringPolicy>
<encoder> <encoder>
<charset>UTF-8</charset> <charset>UTF-8</charset>
<pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n</pattern> <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{36}.%M\(@%line\) - %msg%n</pattern>
@ -24,7 +27,7 @@
</appender> </appender>
<root level="debug"> <root level="debug">
<appender-ref ref="File" /> <appender-ref ref="RollingFile" />
</root> </root>
</configuration> </configuration>
Loading…
Cancel
Save