diff --git a/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java b/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java index c0595fb..1c2460d 100644 --- a/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java +++ b/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java @@ -3,7 +3,7 @@ package eu.openaire.urls_worker.controllers; import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin; import eu.openaire.urls_worker.services.FileStorageService; import eu.openaire.urls_worker.util.FilesCompressor; -import eu.openaire.urls_worker.util.FilesZipper; +import org.apache.commons.io.FileDeleteStrategy; import org.apache.commons.io.FileUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -36,6 +36,8 @@ public class FullTextsController { } + // TODO - Remove the "Improved" from the endpoint's name, now that the previous "simple"-endpoint is removed. + // TODO - This has to happen at the same time as the related change in the API-call from the Controller! @GetMapping("getFullTextsImproved/{assignmentsCounter:[\\d]+}/{totalBatches:[\\d]+}/{batchCounter:[\\d]+}/{fileNamesWithExtensions}") public Object getMultipleFullTextsImproved(@PathVariable long assignmentsCounter, @PathVariable int totalBatches, @PathVariable int batchCounter, @PathVariable List fileNamesWithExtensions) { @@ -73,6 +75,7 @@ public class FullTextsController { String errorMsg = "Failed to create the zstd file for \"batchCounter\"-" + batchCounter; logger.error(errorMsg); return ResponseEntity.internalServerError().body(errorMsg); + // The related files will be deleted later, upon completing the Worker-report process, in "AssignmentsHandler.postWorkerReport()". 
} if ( batchCounter == totalBatches ) @@ -89,6 +92,11 @@ public class FullTextsController { String errorMsg = "Could not load the FileInputStream of the zstd-tar-file \"" + zstdTarFileFullPath + "\"!"; logger.error(errorMsg, e); return ResponseEntity.internalServerError().body(errorMsg); + } finally { + // In some cases, the full-texts might be too large and their total number too, + // so if we leave them be, and wait for all batches to finish, we may get a "java.io.IOException: No space left on device" error. + deleteFulltextBatchFiles(currentAssignmentsBaseFullTextsPath, assignmentsCounter, batchCounter, fileNamesWithExtensions); + // The ".tar.zstd" file of this batch, for which we pass a stream to the Controller, will be deleted after the next batch, or after all batches are transferred and handled by the Controller. } // The related fulltext and (zstd-)tar files will be deleted in "AssignmentsHandler.postWorkerReport()", after the Controller has finished transferring them. They will be deleted even in case of a Controller-error. @@ -96,67 +104,6 @@ public class FullTextsController { } - @Deprecated - @GetMapping("getFullTexts/{assignmentsCounter:[\\d]+}/{totalZipBatches:[\\d]+}/{zipBatchCounter:[\\d]+}/{fileNamesWithExtensions}") - public Object getMultipleFullTexts(@PathVariable long assignmentsCounter, @PathVariable int totalZipBatches, @PathVariable int zipBatchCounter, @PathVariable List fileNamesWithExtensions) { - - int fileNamesListNum = fileNamesWithExtensions.size(); - if ( (fileNamesListNum == 1) && (fileNamesWithExtensions.get(0).length() == 0) ) { // In case the last "/" in the url was given (without any files following), then this list will not be empty, but have one empty item instead. - // In case the url does not end in "/", then Spring will automatically return an "HTTP-BadRequest". 
- String errorMsg = "An empty \"fileNamesWithExtensions\" list was given from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter; - logger.error(errorMsg); - return ResponseEntity.badRequest().body(errorMsg); - } - - if ( totalZipBatches == 0 ) { - String errorMsg = "The given \"totalZipBatches\" (" + totalZipBatches + ") was < 0 >!"; - logger.error(errorMsg); - return ResponseEntity.badRequest().body(errorMsg); - } - else if ( zipBatchCounter > totalZipBatches ) { - String errorMsg = "The given \"zipBatchCounter\" (" + zipBatchCounter + ") is greater than the \"totalZipBatches\" (" + totalZipBatches + ")!"; - logger.error(errorMsg); - return ResponseEntity.badRequest().body(errorMsg); - } - - logger.info("Received a \"getMultipleFullTexts\" request for returning a zip-file containing " + fileNamesListNum + " full-texts, from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter + " (out of " + totalZipBatches + ")."); - - String currentAssignmentsBaseFullTextsPath = assignmentsBaseDir + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator; - - if ( ! (new File(currentAssignmentsBaseFullTextsPath).isDirectory()) ) { - String errorMsg = "The base directory for assignments_" + assignmentsCounter + " was not found: " + currentAssignmentsBaseFullTextsPath; - logger.error(errorMsg); - return ResponseEntity.badRequest().body(errorMsg); - } - - File zipFile = FilesZipper.zipMultipleFilesAndGetZip(assignmentsCounter, zipBatchCounter, fileNamesWithExtensions, currentAssignmentsBaseFullTextsPath); - if ( zipFile == null ) { - String errorMsg = "Failed to create the zip file for \"zipBatchCounter\"-" + zipBatchCounter; - logger.error(errorMsg); - return ResponseEntity.internalServerError().body(errorMsg); - } - - if ( zipBatchCounter == totalZipBatches ) - logger.debug("Will return the " + ((totalZipBatches > 1) ? 
"last" : "only one") + " batch (" + zipBatchCounter + ") of Assignments_" + assignmentsCounter + " to the Controller."); - - String zipName = zipFile.getName(); - String zipFileFullPath = currentAssignmentsBaseFullTextsPath + zipName; - try { - return ResponseEntity.ok() - .contentType(MediaType.APPLICATION_OCTET_STREAM) - .header(HttpHeaders.CONTENT_DISPOSITION, "inline; filename=\"" + zipName + "\"") - .body(new InputStreamResource(Files.newInputStream(Paths.get(zipFileFullPath)))); - } catch (Exception e) { - String errorMsg = "Could not load the FileInputStream of the zip-file \"" + zipFileFullPath + "\"!"; - logger.error(errorMsg, e); - return ResponseEntity.internalServerError().body(errorMsg); - } - - // The related fulltext and zip files will be deleted in "AssignmentsHandler.postWorkerReport()", after the Controller has finished transferring them. They will be deleted even in case of a Controller-error. - // In case of an error and file-deletion, the related id-url records will just be re-processed in the future by some (maybe different) Worker. - } - - @GetMapping("getFullText/{assignmentsCounter:[\\d]+}/{fileNameWithExtension:[\\w_:]+.[\\w]{2,10}}") public ResponseEntity getFullText(@PathVariable long assignmentsCounter, @PathVariable String fileNameWithExtension) { @@ -203,4 +150,39 @@ public class FullTextsController { } } + + public static void deleteFulltextBatchFiles(String assignmentsBatchDir, long assignmentsCounter, long fulltextsBatch, List filenames) + { + // We will delete all the files related to the given fulltexts-batch, along with the created tar and zstd files. + + for ( String fileName : filenames ) + deleteFile(assignmentsBatchDir + fileName); + + // Now let's delete the ".tar" and ".tar.zstd" files as well. 
+ String partialNonBatchFileName = assignmentsBatchDir + "assignments_" + assignmentsCounter + "_full-texts_"; + deleteFile(partialNonBatchFileName + fulltextsBatch + ".tar"); + + // The ".tar.zstd" file of this batch will be deleted by the next batch or at the end of these assignments. + // Now we will delete the zstd file of the previous batch. + if ( fulltextsBatch >= 2 ) + deleteFile(partialNonBatchFileName + (fulltextsBatch -1) + ".tar.zstd"); + + // We do not use a fulltexts-batch directory, since even if it makes the deletion faster, it will make the full-texts delivery to the controller slower, + // since we will need to move the requested full-texts to that directory before continuing with tarring and compressing the files and sending them over to the Controller. + // Also, we cannot pre-create such directories, since it will add complexity in the download process and also some of the full-texts may not be requested by the Controller (because of duplicates). + } + + + public static boolean deleteFile(String fileFullPathString) + { + File currentFile = new File(fileFullPathString); + try { + FileDeleteStrategy.FORCE.delete(currentFile); + } catch (IOException e) { + logger.error("Error when deleting the file: " + fileFullPathString); + return false; + } + return true; + } + } diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java index b57cc63..85c658f 100644 --- a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java +++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java @@ -160,7 +160,7 @@ public class AssignmentsHandler { timesClearingDomainAndPathTrackingData ++; timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless. 
} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) { - GenericUtils.clearDomainAndPathBlockingData(); + GenericUtils.clearBlockingData(); timesClearingDomainAndPathBlockingData ++; } @@ -219,11 +219,15 @@ public class AssignmentsHandler { urlReports.clear(); // Reset, without de-allocating. assignmentsForPlugins.clear(); - // It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them. - // In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database. - // When all the id-urls are processed at least one time, the Controller will start returning all the "couldRetry" records without a related "payload"-record. FullTextsController.deleteDirectory(assignmentRequestCounter); + // Even though we delete the full-texts batch-by-batch, some files may not have been previously deleted, since they may be duplicates of others found by previous assignments-batches + // and thus, they may have not been requested by the Controller (and thus not deleted after transferring the batches). + // Also, the ".tar.zstd" file of last batch will be deleted here, as well as the whole directory itself. } + + // Note: It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them. + // In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database. + // When all the id-urls are processed at least one time, the Controller will start returning all the "couldRetry" records without a related "payload"-record. 
} diff --git a/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java b/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java deleted file mode 100644 index 2d24e46..0000000 --- a/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java +++ /dev/null @@ -1,74 +0,0 @@ -package eu.openaire.urls_worker.util; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.io.File; -import java.io.FileInputStream; -import java.io.FileNotFoundException; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.util.List; -import java.util.zip.ZipEntry; -import java.util.zip.ZipOutputStream; - - -public class FilesZipper -{ - private static final Logger logger = LoggerFactory.getLogger(FilesZipper.class); - - - public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List filesToZip, String baseDirectory) - { - String zipFileFullPath = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip"; - // For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the Controller. 
- - int numZippedFiles = 0; - File zipFile = new File(zipFileFullPath); - try ( ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile.toPath()), StandardCharsets.UTF_8) ) - { - for ( String file : filesToZip ) { - if ( zipAFile(file, zos, baseDirectory) ) - numZippedFiles ++; - } - } catch (Exception e) { - logger.error("Exception when creating the zip-file: " + zipFileFullPath, e); - return null; - } - logger.debug("Zipped " + numZippedFiles + " (out of " + filesToZip.size() + ") files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter); - return zipFile; - } - - - private static boolean zipAFile(String fileName, ZipOutputStream zos, String baseDir) - { - boolean shouldCloseEntry = false; // Useful in order to know if we should close the entry (an Exception may appear, and so we should not try to close it). - String fullFileName = baseDir + fileName; - try (FileInputStream fis = new FileInputStream(fullFileName)) { - zos.putNextEntry(new ZipEntry(fileName)); - shouldCloseEntry = true; - int readByte; - while ( (readByte = fis.read()) != -1 ) { - zos.write(readByte); - } - } catch (FileNotFoundException fnfe) { - logger.error("Error zipping file: " + fullFileName, fnfe.getMessage()); - return false; - } catch (Exception e) { - if ( ! e.getMessage().contains("duplicate") ) - logger.error("Error zipping file: " + fullFileName, e); - return false; - } finally { - if ( shouldCloseEntry ) { - try { - zos.closeEntry(); // close just the ZipEntry here (not the ZipOutputStream) - } catch (IOException e) { - logger.error("", e); - } - } - } - return true; - } - -}