UrlsWorker/src/main/java/eu/openaire/urls_worker/util/FilesCompressor.java

111 lines
4.6 KiB
Java

package eu.openaire.urls_worker.util;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveOutputStream;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorOutputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.io.*;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.List;
/**
 * Utility for packing multiple full-text files into a single ".tar" archive and then compressing
 * that archive with ZStandard, so that the files can be delivered as one stream.
 */
public class FilesCompressor {

    private static final Logger logger = LoggerFactory.getLogger(FilesCompressor.class);

    /** Buffer size used when feeding the tar-archive to the ZStandard compressor (1 MiB). */
    private static final int COMPRESSION_BUFFER_SIZE = 1048576;

    /** Buffer size used when copying a single file into the tar-archive (64 KiB). */
    private static final int TAR_COPY_BUFFER_SIZE = 65536;


    /**
     * Creates a tar-archive containing the given files and compresses it with ZStandard.
     * The resulting file is named like: {@code assignments_2_full-texts_4.tar.zstd}, where {@code 2} is the
     * assignments-counter and {@code 4} is the batch-counter.
     * The intermediate ".tar" file is NOT deleted by this method.
     *
     * @param assignmentsCounter the assignments-number, used in the archive's file-name
     * @param zipBatchCounter the batch-number, used in the archive's file-name
     * @param filesToCompress the names of the files to include, relative to {@code baseDirectory}
     * @param baseDirectory the directory holding the files and receiving the archives; it is concatenated
     *                      directly with the file-names, so it is expected to end with a file-separator
     * @return the ZStandard-compressed file, or {@code null} if the tar-archive or the compressed file could not be created
     */
    public static File compressMultipleFilesIntoOne(long assignmentsCounter, int zipBatchCounter, List<String> filesToCompress, String baseDirectory)
    {
        File tarFile = getTarArchiveWithFullTexts(filesToCompress, baseDirectory, assignmentsCounter, zipBatchCounter);
        if ( tarFile == null )
            return null;    // The error-cause is already logged.

        // The "TAR" archive is not compressed, but it helps deliver multiple full-texts with a single Stream.
        // Then, we compress the archive, using Facebook's "ZStandard" algorithm, which delivers both high compression-rate and compression and decompression efficiency.
        String tarFilePath = tarFile.getPath();
        File zStandardFile = new File(tarFilePath + ".zstd");

        try ( InputStream in = Files.newInputStream(Paths.get(tarFilePath));
              ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath()))) )
        {
            copy(in, zOut, COMPRESSION_BUFFER_SIZE);
        } catch (Exception e) {
            logger.error("Exception when compressing the tar-archive: {}", tarFilePath, e);
            return null;
        }
        return zStandardFile;
    }


    /**
     * Creates a TAR archive with the requested full-text files, which later will be compressed.
     * Files which fail to be added are logged and skipped; they do not abort the creation of the archive.
     *
     * @return the tar-file, or {@code null} if the given list is {@code null} or the archive could not be created
     */
    private static File getTarArchiveWithFullTexts(List<String> filesToTar, String baseDir, long assignmentsCounter, int tarBatchCounter)
    {
        if ( filesToTar == null ) {
            // Fail early, before an empty archive-file gets created on disk.
            logger.error("The given files-list was null for assignments_{}, batch_{}", assignmentsCounter, tarBatchCounter);
            return null;
        }

        // For example: assignments_2_full-texts_4.tar | where < 4 > is referred to the 4th batch of files requested by the Controller.
        String tarFileFullPath = baseDir + "assignments_" + assignmentsCounter + "_full-texts_" + tarBatchCounter + ".tar";

        // https://commons.apache.org/proper/commons-compress/examples.html
        int numTarredFiles = 0;
        File tarFile = new File(tarFileFullPath);
        try ( TarArchiveOutputStream taos = new TarArchiveOutputStream(new BufferedOutputStream(Files.newOutputStream(tarFile.toPath()))) )
        {
            for ( String fileName : filesToTar ) {
                if ( addTarEntry(taos, fileName, baseDir) )
                    numTarredFiles ++;
            }
        } catch (Exception e) {
            logger.error("Exception when creating the tar-file: {}", tarFileFullPath, e);
            return null;
        }

        logger.debug("Tarred {} (out of {}) files for assignments_{}, batch_{}", numTarredFiles, filesToTar.size(), assignmentsCounter, tarBatchCounter);
        return tarFile;
    }


    /**
     * Adds a single file, as a new entry, to the given tar-archive stream.
     *
     * @param taos the open tar-archive stream; it is not closed by this method
     * @param fileName the name of the file, relative to {@code baseDir}; it is also used as the entry's name
     * @param baseDir the directory which holds the file
     * @return {@code true} only if the whole file was written and its entry was closed successfully
     */
    private static boolean addTarEntry(TarArchiveOutputStream taos, String fileName, String baseDir)
    {
        String fullFileName = baseDir + fileName;
        boolean entryOpened = false;    // Useful in order to know if we should close the entry (an Exception may appear before it is opened, and so we should not try to close it).
        boolean succeeded = false;

        // The input-stream is opened first, so that a missing or unreadable file fails before an entry is put in the archive.
        try ( InputStream fileStream = Files.newInputStream(Paths.get(fullFileName)) )
        {
            TarArchiveEntry entry = new TarArchiveEntry(fileName);
            entry.setSize(Files.size(Paths.get(fullFileName)));    // Yes, tar requires that we set the size beforehand..
            taos.putArchiveEntry(entry);
            entryOpened = true;
            copy(fileStream, taos, TAR_COPY_BUFFER_SIZE);
            succeeded = true;
        } catch (Exception e) {
            logger.error("Exception when adding the file \"{}\" to the tar-archive!", fullFileName, e);
        } finally {
            if ( entryOpened ) {
                try {
                    taos.closeArchiveEntry();    // Close just the entry here (not the whole tar-stream).
                } catch (IOException e) {
                    logger.error("Exception when closing the tar-entry of the file \"{}\"!", fullFileName, e);
                    succeeded = false;    // An entry which failed to close must not be counted as tarred.
                }
            }
        }
        return succeeded;
    }


    /**
     * Copies all remaining bytes from the input-stream to the output-stream, in blocks of the given size.
     * Neither stream is closed by this method.
     */
    private static void copy(InputStream in, OutputStream out, int bufferSize) throws IOException
    {
        final byte[] buffer = new byte[bufferSize];
        int numBytes;
        while ( (numBytes = in.read(buffer)) != -1 ) {
            out.write(buffer, 0, numBytes);
        }
    }
}