- Heavily reduce the maximum amount of space needed, by deleting the files of each full-texts batch, right after they are uploaded to the S3 Object Store.
- Add a check for when the retrieved full-texts batch is missing some of the requested files, and log a warning.
- Update dependencies.
This commit is contained in:
parent
d8773e6ebb
commit
dc8f0f2bd1
|
@ -1,5 +1,5 @@
|
|||
plugins {
|
||||
id 'org.springframework.boot' version '2.7.7'
|
||||
id 'org.springframework.boot' version '2.7.8'
|
||||
id 'io.spring.dependency-management' version '1.1.0'
|
||||
id 'java'
|
||||
}
|
||||
|
@ -104,7 +104,7 @@ dependencies {
|
|||
|
||||
// Add back some updated version of the needed dependencies.
|
||||
implementation 'org.apache.thrift:libthrift:0.17.0'
|
||||
implementation 'com.fasterxml.woodstox:woodstox-core:6.4.0'
|
||||
implementation 'com.fasterxml.woodstox:woodstox-core:6.5.0'
|
||||
|
||||
// https://mvnrepository.com/artifact/org.json/json
|
||||
implementation 'org.json:json:20220924'
|
||||
|
|
|
@ -227,7 +227,7 @@ public class FileUtils {
|
|||
return UploadFullTextsResponse.successful; // It was handled, no error.
|
||||
}
|
||||
|
||||
// Request the full-texts in batches, compressed in a zstd tar.
|
||||
// Request the full-texts in batches, compressed in a zstd tar file.
|
||||
int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch);
|
||||
int remainingFiles = (numAllFullTexts % numOfFullTextsPerBatch);
|
||||
if ( remainingFiles > 0 ) { // Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
|
||||
|
@ -243,7 +243,7 @@ public class FileUtils {
|
|||
// TODO - The least we have to do is to expose the port-assignment somewhere more obvious, like inside the "application.properties" file.
|
||||
|
||||
String curAssignmentsBaseLocation = baseFilesLocation + "assignments_" + assignmentsBatchCounter + File.separator;
|
||||
// Note: the "curAssignmentsBaseLocation"-directory will be create once the first batch sub-directory is called for creation.
|
||||
// Note: the "curAssignmentsBaseLocation"-directory will be created once the first batch subdirectory is called for creation.
|
||||
|
||||
int failedBatches = 0;
|
||||
for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) {
|
||||
|
@ -263,9 +263,10 @@ public class FileUtils {
|
|||
|
||||
// Get the extracted files.
|
||||
String targetDirectory = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator;
|
||||
Path curBatchPath = null;
|
||||
try {
|
||||
// Create this batch-directory.
|
||||
Path curBatchPath = Files.createDirectories(Paths.get(targetDirectory));
|
||||
curBatchPath = Files.createDirectories(Paths.get(targetDirectory));
|
||||
// The base-directory will be created along with the first batch directory.
|
||||
|
||||
// Save and decompress the zstd file. Iterate over the PDFs and upload each one of them and get the S3-Url.
|
||||
|
@ -282,18 +283,26 @@ public class FileUtils {
|
|||
|
||||
fileDecompressor.decompressFiles(zstdFileFullPath, curBatchPath);
|
||||
|
||||
String[] fileNames = new File(targetDirectory).list();
|
||||
if ( (fileNames == null) || (fileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred..
|
||||
logger.error("No full-text fileNames where extracted from directory: " + targetDirectory);
|
||||
String[] extractedFileNames = new File(targetDirectory).list();
|
||||
if ( (extractedFileNames == null) || (extractedFileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred..
|
||||
logger.error("No full-texts' fleNames where extracted from directory: " + targetDirectory);
|
||||
failedBatches ++;
|
||||
continue; // To the next batch.
|
||||
}
|
||||
else if ( (extractedFileNames.length -2) != fileNamesForCurBatch.size() ) {
|
||||
logger.warn("The number of extracted files (" + (extractedFileNames.length -2) + ") was not equal to the number of the current-batch's files (" + fileNamesForCurBatch.size() + ").");
|
||||
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
|
||||
// since, at the end of each assignments-batch, an iteration is made over all the non-retrieved and non-uploaded full-texts, and the app sets their locations to null.
|
||||
}
|
||||
|
||||
uploadFullTexts(fileNames, targetDirectory, allFileNamesWithPayloads);
|
||||
uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads);
|
||||
|
||||
} catch (Exception e) {
|
||||
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
|
||||
failedBatches ++;
|
||||
} finally {
|
||||
if ( curBatchPath != null )
|
||||
deleteDirectory(curBatchPath.toFile());
|
||||
}
|
||||
} // End of batches.
|
||||
|
||||
|
|
Loading…
Reference in New Issue