- Delete the transferred full-texts as soon as possible, in order to mitigate the "No space left on device"-error, which may appear, in case we have some very large files.

- Use the new "GenericUtils.clearBlockingData()" method from the "PublicationsRetriever" library.
- Remove the deprecated "getMultipleFullTexts"-endpoint, along with the Zip-related code.
This commit is contained in:
Lampros Smyrnaios 2023-01-18 16:55:59 +02:00
parent 7dd5719bff
commit bd0d9eb36f
3 changed files with 52 additions and 140 deletions

View File

@ -3,7 +3,7 @@ package eu.openaire.urls_worker.controllers;
import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin;
import eu.openaire.urls_worker.services.FileStorageService;
import eu.openaire.urls_worker.util.FilesCompressor;
import eu.openaire.urls_worker.util.FilesZipper;
import org.apache.commons.io.FileDeleteStrategy;
import org.apache.commons.io.FileUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@ -36,6 +36,8 @@ public class FullTextsController {
}
// TODO - Remove the "Improved" from the endpoint's name, now that the previous "simple"-endpoint is removed.
// TODO - This has to happen at the same time as the related change in the API-call from the Controller!
@GetMapping("getFullTextsImproved/{assignmentsCounter:[\\d]+}/{totalBatches:[\\d]+}/{batchCounter:[\\d]+}/{fileNamesWithExtensions}")
public Object getMultipleFullTextsImproved(@PathVariable long assignmentsCounter, @PathVariable int totalBatches, @PathVariable int batchCounter, @PathVariable List<String> fileNamesWithExtensions) {
@ -73,6 +75,7 @@ public class FullTextsController {
String errorMsg = "Failed to create the zstd file for \"batchCounter\"-" + batchCounter;
logger.error(errorMsg);
return ResponseEntity.internalServerError().body(errorMsg);
// The related files will be deleted later, upon completing the Worker-report process, in "AssignmentsHandler.postWorkerReport()".
}
if ( batchCounter == totalBatches )
@ -89,6 +92,11 @@ public class FullTextsController {
String errorMsg = "Could not load the FileInputStream of the zstd-tar-file \"" + zstdTarFileFullPath + "\"!";
logger.error(errorMsg, e);
return ResponseEntity.internalServerError().body(errorMsg);
} finally {
// In some cases, the full-texts might be too large and their total number too,
// so if we leave them be, and wait for all batches to finish, we may get a "java.io.IOException: No space left on device" error.
deleteFulltextBatchFiles(currentAssignmentsBaseFullTextsPath, assignmentsCounter, batchCounter, fileNamesWithExtensions);
// The ".tar.zstd" file of this batch, for which we pass a stream to the Controller, will be deleted after the next batch, or after all batches are transferred and handled by the Controller.
}
// The related fulltext and (zstd-)tar files will be deleted in "AssignmentsHandler.postWorkerReport()", after the Controller has finished transferring them. They will be deleted even in case of a Controller-error.
@ -96,67 +104,6 @@ public class FullTextsController {
}
/**
 * Deprecated endpoint which packs the requested full-text files of an assignments-batch into a zip-file
 * and streams that zip-file back to the caller.
 *
 * The request is answered with an HTTP-BadRequest when the file-names list is effectively empty,
 * when "totalZipBatches" is zero, when "zipBatchCounter" exceeds "totalZipBatches",
 * or when the base directory of the given assignments does not exist.
 * It is answered with an HTTP-InternalServerError when the zip-file cannot be created or opened.
 *
 * @param assignmentsCounter the number identifying the assignments whose full-texts are requested
 * @param totalZipBatches the total number of zip-batches of these assignments
 * @param zipBatchCounter the number of the currently requested zip-batch
 * @param fileNamesWithExtensions the names (with extensions) of the full-text files to include in the zip-file
 * @return a "ResponseEntity" carrying either the zip-file's stream or an error-message
 */
@Deprecated
@GetMapping("getFullTexts/{assignmentsCounter:[\\d]+}/{totalZipBatches:[\\d]+}/{zipBatchCounter:[\\d]+}/{fileNamesWithExtensions}")
public Object getMultipleFullTexts(@PathVariable long assignmentsCounter, @PathVariable int totalZipBatches, @PathVariable int zipBatchCounter, @PathVariable List<String> fileNamesWithExtensions) {

	final int requestedFilesCount = fileNamesWithExtensions.size();

	// When the url ends with "/" (without any files following), the list is not empty, but holds a single empty item instead.
	// When the url does not end in "/", Spring automatically returns an "HTTP-BadRequest".
	boolean holdsOnlyAnEmptyName = (requestedFilesCount == 1) && fileNamesWithExtensions.get(0).isEmpty();
	if ( holdsOnlyAnEmptyName ) {
		String emptyListMsg = "An empty \"fileNamesWithExtensions\" list was given from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter;
		logger.error(emptyListMsg);
		return ResponseEntity.badRequest().body(emptyListMsg);
	}

	// Validate the batch-numbers, before touching the disk.
	if ( totalZipBatches == 0 ) {
		String zeroBatchesMsg = "The given \"totalZipBatches\" (" + totalZipBatches + ") was < 0 >!";
		logger.error(zeroBatchesMsg);
		return ResponseEntity.badRequest().body(zeroBatchesMsg);
	}
	if ( zipBatchCounter > totalZipBatches ) {
		String counterTooLargeMsg = "The given \"zipBatchCounter\" (" + zipBatchCounter + ") is greater than the \"totalZipBatches\" (" + totalZipBatches + ")!";
		logger.error(counterTooLargeMsg);
		return ResponseEntity.badRequest().body(counterTooLargeMsg);
	}

	logger.info("Received a \"getMultipleFullTexts\" request for returning a zip-file containing " + requestedFilesCount + " full-texts, from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter + " (out of " + totalZipBatches + ").");

	// The full-texts of each assignments-batch live in their own directory.
	String currentAssignmentsBaseFullTextsPath = assignmentsBaseDir + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator;
	File assignmentsFullTextsDir = new File(currentAssignmentsBaseFullTextsPath);
	if ( ! assignmentsFullTextsDir.isDirectory() ) {
		String missingDirMsg = "The base directory for assignments_" + assignmentsCounter + " was not found: " + currentAssignmentsBaseFullTextsPath;
		logger.error(missingDirMsg);
		return ResponseEntity.badRequest().body(missingDirMsg);
	}

	File zipFile = FilesZipper.zipMultipleFilesAndGetZip(assignmentsCounter, zipBatchCounter, fileNamesWithExtensions, currentAssignmentsBaseFullTextsPath);
	if ( zipFile == null ) {
		String zipFailureMsg = "Failed to create the zip file for \"zipBatchCounter\"-" + zipBatchCounter;
		logger.error(zipFailureMsg);
		return ResponseEntity.internalServerError().body(zipFailureMsg);
	}

	if ( zipBatchCounter == totalZipBatches ) {
		String batchDescription = (totalZipBatches > 1) ? "last" : "only one";
		logger.debug("Will return the " + batchDescription + " batch (" + zipBatchCounter + ") of Assignments_" + assignmentsCounter + " to the Controller.");
	}

	String zipName = zipFile.getName();
	String zipFileFullPath = currentAssignmentsBaseFullTextsPath + zipName;
	try {
		// The response-headers are prepared first and the zip-file's stream is opened last, right before it is handed over as the body.
		ResponseEntity.BodyBuilder responseBuilder = ResponseEntity.ok()
				.contentType(MediaType.APPLICATION_OCTET_STREAM)
				.header(HttpHeaders.CONTENT_DISPOSITION, "inline; filename=\"" + zipName + "\"");
		return responseBuilder.body(new InputStreamResource(Files.newInputStream(Paths.get(zipFileFullPath))));
	} catch (Exception e) {
		String streamFailureMsg = "Could not load the FileInputStream of the zip-file \"" + zipFileFullPath + "\"!";
		logger.error(streamFailureMsg, e);
		return ResponseEntity.internalServerError().body(streamFailureMsg);
	}

	// The related fulltext and zip files will be deleted in "AssignmentsHandler.postWorkerReport()", after the Controller has finished transferring them. They will be deleted even in case of a Controller-error.
	// In case of an error and file-deletion, the related id-url records will just be re-processed in the future by some (maybe different) Worker.
}
@GetMapping("getFullText/{assignmentsCounter:[\\d]+}/{fileNameWithExtension:[\\w_:]+.[\\w]{2,10}}")
public ResponseEntity<?> getFullText(@PathVariable long assignmentsCounter, @PathVariable String fileNameWithExtension) {
@ -203,4 +150,39 @@ public class FullTextsController {
}
}
/**
 * Deletes the files related to an already-handled full-texts batch:
 * the given full-text files, the ".tar" file of this batch and the ".tar.zstd" file of the PREVIOUS batch (if there is one).
 * The ".tar.zstd" file of the current batch is left in place; it gets deleted by the next batch, or at the end of these assignments.
 *
 * @param assignmentsBatchDir the directory holding the files of these assignments; it is used as a plain prefix of each file-name
 * @param assignmentsCounter the number identifying the assignments
 * @param fulltextsBatch the number of the full-texts batch whose files should be deleted
 * @param filenames the names of the full-text files of this batch
 */
public static void deleteFulltextBatchFiles(String assignmentsBatchDir, long assignmentsCounter, long fulltextsBatch, List<String> filenames)
{
	// A fulltexts-batch directory is not used, since even if it would make the deletion faster, it would make the full-texts delivery to the Controller slower,
	// as the requested full-texts would have to be moved to that directory, before being tarred, compressed and sent over to the Controller.
	// Also, such directories cannot be pre-created, since that would add complexity in the download process and some of the full-texts may not be requested by the Controller (because of duplicates).

	// Everything which is common in the names of the archive-files, up to the batch-number.
	final String archiveFilePrefix = assignmentsBatchDir + "assignments_" + assignmentsCounter + "_full-texts_";

	// First, the individual full-text files of this batch.
	filenames.forEach(name -> deleteFile(assignmentsBatchDir + name));

	// Then, the ".tar" file of this batch.
	deleteFile(archiveFilePrefix + fulltextsBatch + ".tar");

	// Finally, the ".tar.zstd" file of the previous batch. The first batch has no predecessor.
	if ( fulltextsBatch > 1 )
		deleteFile(archiveFilePrefix + (fulltextsBatch - 1) + ".tar.zstd");
}
/**
 * Deletes the file with the given path, using the "FileDeleteStrategy.FORCE" strategy.
 *
 * @param fileFullPathString the full path of the file to delete
 * @return "true" if no error was reported by the deletion, "false" if an "IOException" was thrown
 */
public static boolean deleteFile(String fileFullPathString)
{
	File currentFile = new File(fileFullPathString);
	try {
		FileDeleteStrategy.FORCE.delete(currentFile);
	} catch (IOException e) {
		// Hand the exception over to the logger, so that the actual cause of the failure is not lost.
		logger.error("Error when deleting the file: " + fileFullPathString, e);
		return false;
	}
	return true;
}
}

View File

@ -160,7 +160,7 @@ public class AssignmentsHandler {
timesClearingDomainAndPathTrackingData ++;
timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless.
} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) {
GenericUtils.clearDomainAndPathBlockingData();
GenericUtils.clearBlockingData();
timesClearingDomainAndPathBlockingData ++;
}
@ -219,11 +219,15 @@ public class AssignmentsHandler {
urlReports.clear(); // Reset, without de-allocating.
assignmentsForPlugins.clear();
// It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them.
// In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database.
// When all the id-urls are processed at least one time, the Controller will start returning all the "couldRetry" records without a related "payload"-record.
FullTextsController.deleteDirectory(assignmentRequestCounter);
// Even though we delete the full-texts batch-by-batch, some files may not have been previously deleted, since they may be duplicates of others found by previous assignments-batches
// and thus, they may have not been requested by the Controller (and thus not deleted after transferring the batches).
// Also, the ".tar.zstd" file of last batch will be deleted here, as well as the whole directory itself.
}
// Note: It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them.
// In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database.
// When all the id-urls are processed at least one time, the Controller will start returning all the "couldRetry" records without a related "payload"-record.
}

View File

@ -1,74 +0,0 @@
package eu.openaire.urls_worker.util;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.util.List;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;
/**
 * Utility class which packs a batch of full-text files into a single zip-file.
 */
public class FilesZipper
{
	private static final Logger logger = LoggerFactory.getLogger(FilesZipper.class);

	// The size of the buffer used when copying a file into the zip-file.
	private static final int COPY_BUFFER_SIZE = 8192;


	/**
	 * Creates the zip-file of the given batch, inside the "baseDirectory", and adds the given files to it.
	 * A file which fails to be zipped is skipped; it does not fail the whole zip-file.
	 *
	 * @param assignmentsCounter the number identifying the assignments; it is used in the name of the zip-file
	 * @param zipBatchCounter the number of the batch; it is used in the name of the zip-file
	 * @param filesToZip the names of the files to add, relative to the "baseDirectory"
	 * @param baseDirectory the directory holding the files; it is used as a plain prefix, so it is expected to end with the file-separator
	 * @return the created zip-file, or "null" if an exception was thrown while creating it
	 */
	public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List<String> filesToZip, String baseDirectory)
	{
		String zipFileFullPath = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip";
		// For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the Controller.

		int numZippedFiles = 0;
		File zipFile = new File(zipFileFullPath);

		try ( ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile.toPath()), StandardCharsets.UTF_8) )
		{
			for ( String file : filesToZip ) {
				if ( zipAFile(file, zos, baseDirectory) )
					numZippedFiles ++;
			}
		} catch (Exception e) {
			logger.error("Exception when creating the zip-file: " + zipFileFullPath, e);
			return null;
		}

		logger.debug("Zipped " + numZippedFiles + " (out of " + filesToZip.size() + ") files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter);
		return zipFile;
	}


	/**
	 * Adds a single file, as a new entry, to the given zip-stream.
	 *
	 * @param fileName the name of the file, relative to the "baseDir"; it is also used as the name of the zip-entry
	 * @param zos the zip-stream to write to; it is NOT closed by this method
	 * @param baseDir the directory holding the file; it is used as a plain prefix of the "fileName"
	 * @return "true" if the file was written to the zip-stream, "false" if it was not found or an error occurred
	 */
	private static boolean zipAFile(String fileName, ZipOutputStream zos, String baseDir)
	{
		boolean shouldCloseEntry = false;	// Useful in order to know if we should close the entry (an Exception may appear, and so we should not try to close it).

		String fullFileName = baseDir + fileName;
		try ( FileInputStream fis = new FileInputStream(fullFileName) ) {
			zos.putNextEntry(new ZipEntry(fileName));
			shouldCloseEntry = true;
			// Copy the data in chunks, instead of byte-by-byte, which would cause one read-call per byte on the unbuffered stream.
			byte[] buffer = new byte[COPY_BUFFER_SIZE];
			int bytesRead;
			while ( (bytesRead = fis.read(buffer)) != -1 ) {
				zos.write(buffer, 0, bytesRead);
			}
		} catch (FileNotFoundException fnfe) {
			// The exception-message is appended to the log-message. If given as an extra argument, without a "{}" placeholder, it would be silently discarded by the logger.
			logger.error("Error zipping file: " + fullFileName + " | " + fnfe.getMessage());
			return false;
		} catch (Exception e) {
			// The "duplicate entry" errors are expected and thus not logged. The exception-message may be null, so it is checked first.
			String exceptionMessage = e.getMessage();
			if ( (exceptionMessage == null) || !exceptionMessage.contains("duplicate") )
				logger.error("Error zipping file: " + fullFileName, e);
			return false;
		} finally {
			if ( shouldCloseEntry ) {
				try {
					zos.closeEntry();	// close just the ZipEntry here (not the ZipOutputStream)
				} catch (IOException e) {
					logger.error("Error when closing the zip-entry of file: " + fullFileName, e);
				}
			}
		}
		return true;
	}
}