From d630f1619888ac69ac481d6ae1179a0e9d97b707 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 10 Jun 2024 18:21:35 +0300 Subject: [PATCH] Improve the compression of fulltext files: - Fix not using the big bufferSize it was supposed to use. - Make sure the maximum compression-level is used. Before, the invalid value "bufferSize" was passed as the level, and it is unclear which real compression-level it was changed to inside the zstd-library: 19 or 22 (the latter is only allowed through "ultra mode"). It was probably the ultra-level, as this "switch" seems to be required only through the CLI. - Exclude the possibly outdated "commons-compress" transitive dependency from the "publications_retriever" dependency. --- build.gradle | 1 + .../java/eu/openaire/urls_worker/util/FilesCompressor.java | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/build.gradle b/build.gradle index c211e31..6f84353 100644 --- a/build.gradle +++ b/build.gradle @@ -42,6 +42,7 @@ dependencies { exclude group: 'ch.qos.logback', module: 'logback-classic' exclude group: 'org.slf4j', module: 'slf4j-api' exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems. 
+ exclude group: 'org.apache.commons', module: 'commons-compress' } implementation group: 'com.google.guava', name: 'guava', version: '33.2.0-jre' diff --git a/src/main/java/eu/openaire/urls_worker/util/FilesCompressor.java b/src/main/java/eu/openaire/urls_worker/util/FilesCompressor.java index 95e754d..22dc208 100644 --- a/src/main/java/eu/openaire/urls_worker/util/FilesCompressor.java +++ b/src/main/java/eu/openaire/urls_worker/util/FilesCompressor.java @@ -1,5 +1,6 @@ package eu.openaire.urls_worker.util; +import com.github.luben.zstd.Zstd; import eu.openaire.urls_worker.controllers.FullTextsController; import eu.openaire.urls_worker.models.TarFileResult; import org.apache.commons.compress.archivers.tar.TarArchiveEntry; @@ -24,6 +25,8 @@ public class FilesCompressor { public static final int bufferSize = (5 * 1_048_576); // 5 Mb + public static final int maxCompressionLevel = Zstd.maxCompressionLevel(); + public static File compressMultipleFilesIntoOne(long assignmentsCounter, int tarBatchCounter, List filesToCompress, String baseDirectory) { @@ -51,7 +54,7 @@ public class FilesCompressor { File zStandardFile = new File(zStandardFileFullPath); try ( BufferedInputStream in = new BufferedInputStream(Files.newInputStream(Paths.get(tarFilePath)), bufferSize); - ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath())), bufferSize) ) + ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath()), bufferSize), maxCompressionLevel) ) { int readByte; while ( (readByte = in.read()) != -1 )