- Greatly reduce the maximum amount of storage space needed, by deleting the files of each full-texts batch right after they are uploaded to the S3 Object Store.

- Add a check for when the retrieved full-texts batch is missing some of the requested files, and log a warning in that case.
- Update dependencies.
This commit is contained in:
Lampros Smyrnaios 2023-01-23 20:23:21 +02:00
parent d8773e6ebb
commit dc8f0f2bd1
2 changed files with 18 additions and 9 deletions

View File

@ -1,5 +1,5 @@
plugins {
id 'org.springframework.boot' version '2.7.7'
id 'org.springframework.boot' version '2.7.8'
id 'io.spring.dependency-management' version '1.1.0'
id 'java'
}
@ -104,7 +104,7 @@ dependencies {
// Add back some updated version of the needed dependencies.
implementation 'org.apache.thrift:libthrift:0.17.0'
implementation 'com.fasterxml.woodstox:woodstox-core:6.4.0'
implementation 'com.fasterxml.woodstox:woodstox-core:6.5.0'
// https://mvnrepository.com/artifact/org.json/json
implementation 'org.json:json:20220924'

View File

@ -227,7 +227,7 @@ public class FileUtils {
return UploadFullTextsResponse.successful; // It was handled, no error.
}
// Request the full-texts in batches, compressed in a zstd tar.
// Request the full-texts in batches, compressed in a zstd tar file.
int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch);
int remainingFiles = (numAllFullTexts % numOfFullTextsPerBatch);
if ( remainingFiles > 0 ) { // Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
@ -243,7 +243,7 @@ public class FileUtils {
// TODO - The least we have to do is to expose the port-assignment somewhere more obvious like inside the "application.properties" file.
String curAssignmentsBaseLocation = baseFilesLocation + "assignments_" + assignmentsBatchCounter + File.separator;
// Note: the "curAssignmentsBaseLocation"-directory will be create once the first batch sub-directory is called for creation.
// Note: the "curAssignmentsBaseLocation"-directory will be created once the first batch subdirectory is called for creation.
int failedBatches = 0;
for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) {
@ -263,9 +263,10 @@ public class FileUtils {
// Get the extracted files.
String targetDirectory = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
Path curBatchPath = null;
try {
// Create this batch-directory.
Path curBatchPath = Files.createDirectories(Paths.get(targetDirectory));
curBatchPath = Files.createDirectories(Paths.get(targetDirectory));
// The base-directory will be created along with the first batch directory.
// Save and decompress the zstd file. Iterate over the PDFs and upload each one of them and get the S3-Url.
@ -282,18 +283,26 @@ public class FileUtils {
fileDecompressor.decompressFiles(zstdFileFullPath, curBatchPath);
String[] fileNames = new File(targetDirectory).list();
if ( (fileNames == null) || (fileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred..
logger.error("No full-text fileNames where extracted from directory: " + targetDirectory);
String[] extractedFileNames = new File(targetDirectory).list();
if ( (extractedFileNames == null) || (extractedFileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred..
logger.error("No full-texts' fleNames where extracted from directory: " + targetDirectory);
failedBatches ++;
continue; // To the next batch.
}
else if ( (extractedFileNames.length -2) != fileNamesForCurBatch.size() ) {
logger.warn("The number of extracted files (" + (extractedFileNames.length -2) + ") was not equal to the number of the current-batch's files (" + fileNamesForCurBatch.size() + ").");
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
// since, at the end of each assignments-batch, an iteration will be made over all the non-retrieved and non-uploaded full-texts and the app will set their locations to null.
}
uploadFullTexts(fileNames, targetDirectory, allFileNamesWithPayloads);
uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads);
} catch (Exception e) {
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
failedBatches ++;
} finally {
if ( curBatchPath != null )
deleteDirectory(curBatchPath.toFile());
}
} // End of batches.