- Delete the transferred full-texts as soon as possible, in order to mitigate the "No space left on device" error, which may appear when some of the files are very large.
- Use the new "GenericUtils.clearBlockingData()" method from the "PublicationsRetriever" library.
- Remove the deprecated "getMultipleFullTexts"-endpoint, along with the Zip-related code.
This commit is contained in:
parent
7dd5719bff
commit
bd0d9eb36f
|
@ -3,7 +3,7 @@ package eu.openaire.urls_worker.controllers;
|
|||
import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin;
|
||||
import eu.openaire.urls_worker.services.FileStorageService;
|
||||
import eu.openaire.urls_worker.util.FilesCompressor;
|
||||
import eu.openaire.urls_worker.util.FilesZipper;
|
||||
import org.apache.commons.io.FileDeleteStrategy;
|
||||
import org.apache.commons.io.FileUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
@ -36,6 +36,8 @@ public class FullTextsController {
|
|||
}
|
||||
|
||||
|
||||
// TODO - Remove the "Improved" from the endpoint's name, now that the previous "simple"-endpoint is removed.
|
||||
// TODO - This has to happen at the same time with the related change in the API-call from the Controller!
|
||||
@GetMapping("getFullTextsImproved/{assignmentsCounter:[\\d]+}/{totalBatches:[\\d]+}/{batchCounter:[\\d]+}/{fileNamesWithExtensions}")
|
||||
public Object getMultipleFullTextsImproved(@PathVariable long assignmentsCounter, @PathVariable int totalBatches, @PathVariable int batchCounter, @PathVariable List<String> fileNamesWithExtensions) {
|
||||
|
||||
|
@ -73,6 +75,7 @@ public class FullTextsController {
|
|||
String errorMsg = "Failed to create the zstd file for \"batchCounter\"-" + batchCounter;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.internalServerError().body(errorMsg);
|
||||
// The related files will be deleted later, upon completing the Worker-report process, in "AssignmentsHandler.postWorkerReport()".
|
||||
}
|
||||
|
||||
if ( batchCounter == totalBatches )
|
||||
|
@ -89,6 +92,11 @@ public class FullTextsController {
|
|||
String errorMsg = "Could not load the FileInputStream of the zstd-tar-file \"" + zstdTarFileFullPath + "\"!";
|
||||
logger.error(errorMsg, e);
|
||||
return ResponseEntity.internalServerError().body(errorMsg);
|
||||
} finally {
|
||||
// In some cases, the full-texts might be too large and their total number too,
|
||||
// so if we leave them be, and wait for all batches to finish, we may get a "java.io.IOException: No space left on device" error.
|
||||
deleteFulltextBatchFiles(currentAssignmentsBaseFullTextsPath, assignmentsCounter, batchCounter, fileNamesWithExtensions);
|
||||
// The ".tar.zstd" file of this batch, for which we pass a stream to the Controller, will be deleted after the next batch, or after all batches are transferred and handled by the Controller.
|
||||
}
|
||||
|
||||
// The related fulltext and (zstd-)tar files will be deleted in "AssignmentsHandler.postWorkerReport()", after the Controller has finished transferring them. They will be deleted even in case of a Controller-error.
|
||||
|
@ -96,67 +104,6 @@ public class FullTextsController {
|
|||
}
|
||||
|
||||
|
||||
@Deprecated
|
||||
@GetMapping("getFullTexts/{assignmentsCounter:[\\d]+}/{totalZipBatches:[\\d]+}/{zipBatchCounter:[\\d]+}/{fileNamesWithExtensions}")
|
||||
public Object getMultipleFullTexts(@PathVariable long assignmentsCounter, @PathVariable int totalZipBatches, @PathVariable int zipBatchCounter, @PathVariable List<String> fileNamesWithExtensions) {
|
||||
|
||||
int fileNamesListNum = fileNamesWithExtensions.size();
|
||||
if ( (fileNamesListNum == 1) && (fileNamesWithExtensions.get(0).length() == 0) ) { // In case the last "/" in the url was given (without any files following), then this list will not be empty, but have one empty item instead.
|
||||
// In case the url does not end in "/", then Spring will automatically return an "HTTP-BadRequest".
|
||||
String errorMsg = "An empty \"fileNamesWithExtensions\" list was given from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
if ( totalZipBatches == 0 ) {
|
||||
String errorMsg = "The given \"totalZipBatches\" (" + totalZipBatches + ") was < 0 >!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
else if ( zipBatchCounter > totalZipBatches ) {
|
||||
String errorMsg = "The given \"zipBatchCounter\" (" + zipBatchCounter + ") is greater than the \"totalZipBatches\" (" + totalZipBatches + ")!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
logger.info("Received a \"getMultipleFullTexts\" request for returning a zip-file containing " + fileNamesListNum + " full-texts, from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter + " (out of " + totalZipBatches + ").");
|
||||
|
||||
String currentAssignmentsBaseFullTextsPath = assignmentsBaseDir + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator;
|
||||
|
||||
if ( ! (new File(currentAssignmentsBaseFullTextsPath).isDirectory()) ) {
|
||||
String errorMsg = "The base directory for assignments_" + assignmentsCounter + " was not found: " + currentAssignmentsBaseFullTextsPath;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.badRequest().body(errorMsg);
|
||||
}
|
||||
|
||||
File zipFile = FilesZipper.zipMultipleFilesAndGetZip(assignmentsCounter, zipBatchCounter, fileNamesWithExtensions, currentAssignmentsBaseFullTextsPath);
|
||||
if ( zipFile == null ) {
|
||||
String errorMsg = "Failed to create the zip file for \"zipBatchCounter\"-" + zipBatchCounter;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.internalServerError().body(errorMsg);
|
||||
}
|
||||
|
||||
if ( zipBatchCounter == totalZipBatches )
|
||||
logger.debug("Will return the " + ((totalZipBatches > 1) ? "last" : "only one") + " batch (" + zipBatchCounter + ") of Assignments_" + assignmentsCounter + " to the Controller.");
|
||||
|
||||
String zipName = zipFile.getName();
|
||||
String zipFileFullPath = currentAssignmentsBaseFullTextsPath + zipName;
|
||||
try {
|
||||
return ResponseEntity.ok()
|
||||
.contentType(MediaType.APPLICATION_OCTET_STREAM)
|
||||
.header(HttpHeaders.CONTENT_DISPOSITION, "inline; filename=\"" + zipName + "\"")
|
||||
.body(new InputStreamResource(Files.newInputStream(Paths.get(zipFileFullPath))));
|
||||
} catch (Exception e) {
|
||||
String errorMsg = "Could not load the FileInputStream of the zip-file \"" + zipFileFullPath + "\"!";
|
||||
logger.error(errorMsg, e);
|
||||
return ResponseEntity.internalServerError().body(errorMsg);
|
||||
}
|
||||
|
||||
// The related fulltext and zip files will be deleted in "AssignmentsHandler.postWorkerReport()", after the Controller has finished transferring them. They will be deleted even in case of a Controller-error.
|
||||
// In case of an error and file-deletion, the related id-url records will just be re-processed in the future by some (maybe different) Worker.
|
||||
}
|
||||
|
||||
|
||||
@GetMapping("getFullText/{assignmentsCounter:[\\d]+}/{fileNameWithExtension:[\\w_:]+.[\\w]{2,10}}")
|
||||
public ResponseEntity<?> getFullText(@PathVariable long assignmentsCounter, @PathVariable String fileNameWithExtension) {
|
||||
|
||||
|
@ -203,4 +150,39 @@ public class FullTextsController {
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
public static void deleteFulltextBatchFiles(String assignmentsBatchDir, long assignmentsCounter, long fulltextsBatch, List<String> filenames)
|
||||
{
|
||||
// We will delete all the files related to the given fulltexts-batch, along with the created tar and zstd files.
|
||||
|
||||
for ( String fileName : filenames )
|
||||
deleteFile(assignmentsBatchDir + fileName);
|
||||
|
||||
// Now let's delete the ".tar" and ".tar.zstd" files as well.
|
||||
String partialNonBatchFileName = assignmentsBatchDir + "assignments_" + assignmentsCounter + "_full-texts_";
|
||||
deleteFile(partialNonBatchFileName + fulltextsBatch + ".tar");
|
||||
|
||||
// The ".tar.zstd" file of this batch will be deleted by the next batch or in the end of these assignments.
|
||||
// Now we will delete the zstd file of the previous assignments.
|
||||
if ( fulltextsBatch >= 2 )
|
||||
deleteFile(partialNonBatchFileName + (fulltextsBatch -1) + ".tar.zstd");
|
||||
|
||||
// We do not use a fulltexts-batch directory, since even if it makes the deletion faster, it will make the full-texts delivery to the controller slower,
|
||||
// since we will need to move the requested full-texts to that directory before continuing with tarring and compressing the files and sending them over to the Controller.
|
||||
// Also, we cannot pre-create such directories, since it will add complexity in the download process and also some of the full-texts may not be requested by the Controller (because of duplicates).
|
||||
}
|
||||
|
||||
|
||||
public static boolean deleteFile(String fileFullPathString)
|
||||
{
|
||||
File currentFile = new File(fileFullPathString);
|
||||
try {
|
||||
FileDeleteStrategy.FORCE.delete(currentFile);
|
||||
} catch (IOException e) {
|
||||
logger.error("Error when deleting the file: " + fileFullPathString);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -160,7 +160,7 @@ public class AssignmentsHandler {
|
|||
timesClearingDomainAndPathTrackingData ++;
|
||||
timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless.
|
||||
} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) {
|
||||
GenericUtils.clearDomainAndPathBlockingData();
|
||||
GenericUtils.clearBlockingData();
|
||||
timesClearingDomainAndPathBlockingData ++;
|
||||
}
|
||||
|
||||
|
@ -219,11 +219,15 @@ public class AssignmentsHandler {
|
|||
urlReports.clear(); // Reset, without de-allocating.
|
||||
assignmentsForPlugins.clear();
|
||||
|
||||
// It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them.
|
||||
// In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database.
|
||||
// When all the id-urls are processed at least one time, the Controller will start returning all the "couldRetry" records without a related "payload"-record.
|
||||
FullTextsController.deleteDirectory(assignmentRequestCounter);
|
||||
// Even though we delete the full-texts batch-by-batch, some files may not have been previously deleted, since they may be duplicates of others found by previous assignments-batches
|
||||
// and thus, they may have not been requested by the Controller (and thus not deleted after transferring the batches).
|
||||
// Also, the ".tar.zstd" file of last batch will be deleted here, as well as the whole directory itself.
|
||||
}
|
||||
|
||||
// Note: It is possible that one or more full-texts-batches, are not sent to the Controller, or that the Controller failed to process them.
|
||||
// In that case, the related "attempt"-records will keep their "success" state, but the related "payload" records will not be inserted into the database.
|
||||
// When all the id-urls are processed at least one time, the Controller will start returning all the "couldRetry" records without a related "payload"-record.
|
||||
}
|
||||
|
||||
|
||||
|
|
|
@ -1,74 +0,0 @@
|
|||
package eu.openaire.urls_worker.util;
|
||||
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.FileInputStream;
|
||||
import java.io.FileNotFoundException;
|
||||
import java.io.IOException;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Files;
|
||||
import java.util.List;
|
||||
import java.util.zip.ZipEntry;
|
||||
import java.util.zip.ZipOutputStream;
|
||||
|
||||
|
||||
public class FilesZipper
|
||||
{
|
||||
private static final Logger logger = LoggerFactory.getLogger(FilesZipper.class);
|
||||
|
||||
|
||||
public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List<String> filesToZip, String baseDirectory)
|
||||
{
|
||||
String zipFileFullPath = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip";
|
||||
// For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the Controller.
|
||||
|
||||
int numZippedFiles = 0;
|
||||
File zipFile = new File(zipFileFullPath);
|
||||
try ( ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile.toPath()), StandardCharsets.UTF_8) )
|
||||
{
|
||||
for ( String file : filesToZip ) {
|
||||
if ( zipAFile(file, zos, baseDirectory) )
|
||||
numZippedFiles ++;
|
||||
}
|
||||
} catch (Exception e) {
|
||||
logger.error("Exception when creating the zip-file: " + zipFileFullPath, e);
|
||||
return null;
|
||||
}
|
||||
logger.debug("Zipped " + numZippedFiles + " (out of " + filesToZip.size() + ") files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter);
|
||||
return zipFile;
|
||||
}
|
||||
|
||||
|
||||
private static boolean zipAFile(String fileName, ZipOutputStream zos, String baseDir)
|
||||
{
|
||||
boolean shouldCloseEntry = false; // Useful in order to know if we should close the entry (an Exception may appear, and so we should not try to close it).
|
||||
String fullFileName = baseDir + fileName;
|
||||
try (FileInputStream fis = new FileInputStream(fullFileName)) {
|
||||
zos.putNextEntry(new ZipEntry(fileName));
|
||||
shouldCloseEntry = true;
|
||||
int readByte;
|
||||
while ( (readByte = fis.read()) != -1 ) {
|
||||
zos.write(readByte);
|
||||
}
|
||||
} catch (FileNotFoundException fnfe) {
|
||||
logger.error("Error zipping file: " + fullFileName, fnfe.getMessage());
|
||||
return false;
|
||||
} catch (Exception e) {
|
||||
if ( ! e.getMessage().contains("duplicate") )
|
||||
logger.error("Error zipping file: " + fullFileName, e);
|
||||
return false;
|
||||
} finally {
|
||||
if ( shouldCloseEntry ) {
|
||||
try {
|
||||
zos.closeEntry(); // close just the ZipEntry here (not the ZipOutputStream)
|
||||
} catch (IOException e) {
|
||||
logger.error("", e);
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in New Issue