UrlsController/src/main/java/eu/openaire/urls_controller/util/FileDecompressor.java

56 lines
2.4 KiB
Java

package eu.openaire.urls_controller.util;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
/**
 * Unpacks a zstd-compressed tar archive in two steps: the zstd stream is first decompressed
 * into a tar file on disk, then every entry of that tar file is extracted into a target directory.
 */
@Component
public class FileDecompressor {

    private static final Logger logger = LoggerFactory.getLogger(FileDecompressor.class);

    /** Extension which is stripped from the source path in order to name the intermediate tar file. */
    private static final String ZSTD_EXTENSION = ".zstd";

    /** Size of the block used when moving bytes between streams. */
    private static final int COPY_BUFFER_SIZE = 64 * 1024;


    /**
     * Decompresses the given zstd file into a tar file and extracts the entries of that tar file into {@code targetDir}.
     * The intermediate tar file is NOT deleted by this method; it is written to the path of the source, without the ".zstd" extension.
     * (Original note: the batch-directory holding the tar-file and the extracted full-texts is deleted after the full-texts are uploaded.)
     *
     * @param zstdSource the path of the zstd-compressed tar archive
     * @param targetDir the directory in which the entries are extracted; missing parent directories of an entry are created
     * @throws IllegalArgumentException if no tar-file path, different from the source path, can be derived from {@code zstdSource}
     * @throws IOException if an entry would be written outside of {@code targetDir}, or if reading / writing fails
     * @throws Exception any other failure is passed to the caller unchanged
     */
    public void decompressFiles(String zstdSource, Path targetDir) throws Exception
    {
        Path tarPath = resolveTarPath(zstdSource);
        decompressZstd(Paths.get(zstdSource), tarPath);

        // Now we have a decompressed tar-file, which we will Un-tar, in order to extract the full-text files.
        extractTar(tarPath, targetDir);
    }


    /** Derives the path of the intermediate tar file, by removing the ".zstd" extension from the source path. */
    private static Path resolveTarPath(String zstdSource)
    {
        String tarLocation;
        if ( zstdSource.endsWith(ZSTD_EXTENSION) )
            tarLocation = zstdSource.substring(0, zstdSource.length() - ZSTD_EXTENSION.length());
        else    // Legacy fallback: the extension is not at the end, so remove its first occurrence, as it was done before.
            tarLocation = StringUtils.replace(zstdSource, ZSTD_EXTENSION, "", 1);

        // If nothing was removed, the output would be opened on top of the input and truncate it, before it is read.
        if ( tarLocation.isEmpty() || tarLocation.equals(zstdSource) )
            throw new IllegalArgumentException("Cannot derive a tar-file path from the zstd source: \"" + zstdSource + "\"");

        return Paths.get(tarLocation);
    }


    /** Decompresses the zstd file into the given tar file (created or overwritten). */
    private static void decompressZstd(Path zstdPath, Path tarPath) throws IOException
    {
        try ( ZstdCompressorInputStream zsIn = new ZstdCompressorInputStream(new BufferedInputStream(Files.newInputStream(zstdPath), FileUtils.tenMb));
            OutputStream out = new BufferedOutputStream(Files.newOutputStream(tarPath), FileUtils.tenMb) )
        {
            copy(zsIn, out, new byte[COPY_BUFFER_SIZE]);
        }
    }


    /** Extracts every entry of the given tar file inside the target directory. */
    private static void extractTar(Path tarPath, Path targetDir) throws IOException
    {
        Path baseDir = targetDir.toAbsolutePath().normalize();
        byte[] buffer = new byte[COPY_BUFFER_SIZE];    // Reused for all the entries.

        // The buffer-size belongs to the "BufferedInputStream". Passing it to the "TarArchiveInputStream" would set the tar block-size instead.
        try ( TarArchiveInputStream tarInput = new TarArchiveInputStream(new BufferedInputStream(Files.newInputStream(tarPath), FileUtils.tenMb)) )
        {
            TarArchiveEntry entry;
            while ( (entry = (TarArchiveEntry) tarInput.getNextEntry()) != null )
            {
                Path entryPath = targetDir.resolve(entry.getName());

                // Refuse the entries which point outside of the target directory (e.g. "../file" or an absolute path).
                if ( !entryPath.toAbsolutePath().normalize().startsWith(baseDir) )
                    throw new IOException("The tar entry \"" + entry.getName() + "\" resolves outside of the target directory: " + targetDir);

                if ( entry.isDirectory() ) {
                    Files.createDirectories(entryPath);
                    continue;
                }

                Path parentDir = entryPath.getParent();
                if ( (parentDir != null) && !Files.isDirectory(parentDir) )
                    Files.createDirectories(parentDir);

                // Copy an individual entry. The tar-stream returns -1 when the current entry ends.
                try ( OutputStream entryOut = new BufferedOutputStream(Files.newOutputStream(entryPath), FileUtils.tenMb) ) {
                    copy(tarInput, entryOut, buffer);
                }    // The exception will be given to the caller method.
                // No need to close the tarEntry (no "close()" method is provided).
            }
        }
    }


    /** Moves all the remaining bytes of "in" to "out", block by block. Neither of the streams is closed. */
    private static void copy(InputStream in, OutputStream out, byte[] buffer) throws IOException
    {
        int bytesRead;
        while ( (bytesRead = in.read(buffer)) != -1 )
            out.write(buffer, 0, bytesRead);
    }
}