diff --git a/build.gradle b/build.gradle index 9d8dc8e..bc79d88 100644 --- a/build.gradle +++ b/build.gradle @@ -1,5 +1,5 @@ plugins { - id 'org.springframework.boot' version '2.7.7' + id 'org.springframework.boot' version '2.7.8' id 'io.spring.dependency-management' version '1.1.0' id 'java' } @@ -104,7 +104,7 @@ dependencies { // Add back some updated version of the needed dependencies. implementation 'org.apache.thrift:libthrift:0.17.0' - implementation 'com.fasterxml.woodstox:woodstox-core:6.4.0' + implementation 'com.fasterxml.woodstox:woodstox-core:6.5.0' // https://mvnrepository.com/artifact/org.json/json implementation 'org.json:json:20220924' diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 62f04d4..5b58bfe 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -227,7 +227,7 @@ public class FileUtils { return UploadFullTextsResponse.successful; // It was handled, no error. } - // Request the full-texts in batches, compressed in a zstd tar. + // Request the full-texts in batches, compressed in a zstd tar file. int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch); int remainingFiles = (numAllFullTexts % numOfFullTextsPerBatch); if ( remainingFiles > 0 ) { // Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are. @@ -243,7 +243,7 @@ public class FileUtils { // TODO - The least we have to do it to expose the port-assignment somewhere more obvious like inside the "application.properties" file. String curAssignmentsBaseLocation = baseFilesLocation + "assignments_" + assignmentsBatchCounter + File.separator; - // Note: the "curAssignmentsBaseLocation"-directory will be create once the first batch sub-directory is called for creation. 
+ // Note: the "curAssignmentsBaseLocation"-directory will be created once the first batch subdirectory is called for creation. int failedBatches = 0; for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) { @@ -263,9 +263,10 @@ public class FileUtils { // Get the extracted files. String targetDirectory = curAssignmentsBaseLocation + "batch_" + batchCounter + File.separator; + Path curBatchPath = null; try { // Create this batch-directory. - Path curBatchPath = Files.createDirectories(Paths.get(targetDirectory)); + curBatchPath = Files.createDirectories(Paths.get(targetDirectory)); // The base-directory will be created along with the first batch directory. // Save and decompress the zstd file. Iterate over the PDFs and upload each one of them and get the S3-Url. @@ -282,18 +283,26 @@ public class FileUtils { fileDecompressor.decompressFiles(zstdFileFullPath, curBatchPath); - String[] fileNames = new File(targetDirectory).list(); - if ( (fileNames == null) || (fileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred.. - logger.error("No full-text fileNames where extracted from directory: " + targetDirectory); + String[] extractedFileNames = new File(targetDirectory).list(); + if ( (extractedFileNames == null) || (extractedFileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred.. + logger.error("No full-texts' fleNames where extracted from directory: " + targetDirectory); failedBatches ++; continue; // To the next batch. 
} + else if ( (extractedFileNames.length -2) != fileNamesForCurBatch.size() ) { + logger.warn("The number of extracted files (" + (extractedFileNames.length -2) + ") was not equal to the number of the current-batch's files (" + fileNamesForCurBatch.size() + ")."); + // We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to null, + // since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null. + } - uploadFullTexts(fileNames, targetDirectory, allFileNamesWithPayloads); + uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads); } catch (Exception e) { logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6). failedBatches ++; + } finally { + if ( curBatchPath != null ) + deleteDirectory(curBatchPath.toFile()); } } // End of batches.