Improve the compression of fulltext files:

- Fix the output buffer not using the large "bufferSize" it was supposed to use; the value was mistakenly passed to the compressor instead of to the BufferedOutputStream.
- Make sure the maximum compression level is used. Previously, the invalid value "bufferSize" was passed as the level, and it is unclear which real compression level the zstd library mapped it to: 19, or 22 (normally allowed only through "ultra mode"). Probably the ultra level, as that "switch" seems to be required only by the CLI. A before/after sketch follows this list.
- Exclude the possibly outdated "commons-compress" transitive dependency from the "publications_retriever" dependency.
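
For clarity, a minimal before/after sketch of the stream construction (the class name "ZstdLevelSketch" is hypothetical; the constants and API calls mirror the diff below):

import com.github.luben.zstd.Zstd;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorOutputStream;

import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;

class ZstdLevelSketch {

    static final int bufferSize = (5 * 1_048_576); // 5 MB, same constant as FilesCompressor

    static OutputStream openZstdOutput(Path zstFile) throws IOException {
        // Before the fix, "bufferSize" landed in the level parameter of
        // ZstdCompressorOutputStream(OutputStream, int level), while the
        // BufferedOutputStream silently fell back to its small default buffer:
        //   new ZstdCompressorOutputStream(
        //           new BufferedOutputStream(Files.newOutputStream(zstFile)), bufferSize)
        //
        // After the fix, the 5 MB buffer goes to the BufferedOutputStream and the
        // real maximum zstd level (22 in current zstd-jni) goes to the compressor:
        return new ZstdCompressorOutputStream(
                new BufferedOutputStream(Files.newOutputStream(zstFile), bufferSize),
                Zstd.maxCompressionLevel());
    }
}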
Lampros Smyrnaios 2024-06-10 18:21:35 +03:00
parent 107908a733
commit d630f16198
2 changed files with 5 additions and 1 deletion

build.gradle

@@ -42,6 +42,7 @@ dependencies {
         exclude group: 'ch.qos.logback', module: 'logback-classic'
         exclude group: 'org.slf4j', module: 'slf4j-api'
         exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
+        exclude group: 'org.apache.commons', module: 'commons-compress'
     }
     implementation group: 'com.google.guava', name: 'guava', version: '33.2.0-jre'
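
One way to confirm the exclusion took effect (assuming the project's standard Gradle wrapper) is to print the resolved dependency tree and check which "commons-compress" version remains:

./gradlew dependencies --configuration runtimeClasspath | grep -i commons-compress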

FilesCompressor.java

@@ -1,5 +1,6 @@
 package eu.openaire.urls_worker.util;
 
+import com.github.luben.zstd.Zstd;
 import eu.openaire.urls_worker.controllers.FullTextsController;
 import eu.openaire.urls_worker.models.TarFileResult;
 import org.apache.commons.compress.archivers.tar.TarArchiveEntry;

@@ -24,6 +25,8 @@ public class FilesCompressor {
 
     public static final int bufferSize = (5 * 1_048_576); // 5 Mb
 
+    public static final int maxCompressionLevel = Zstd.maxCompressionLevel();
+
     public static File compressMultipleFilesIntoOne(long assignmentsCounter, int tarBatchCounter, List<String> filesToCompress, String baseDirectory)
     {

@@ -51,7 +54,7 @@ public class FilesCompressor {
         File zStandardFile = new File(zStandardFileFullPath);
 
         try ( BufferedInputStream in = new BufferedInputStream(Files.newInputStream(Paths.get(tarFilePath)), bufferSize);
-              ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath())), bufferSize) )
+              ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath()), bufferSize), maxCompressionLevel) )
         {
             int readByte;
             while ( (readByte = in.read()) != -1 )
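
For reference, a hedged round-trip sketch: reading a produced ".tar.zstd" file back with the same commons-compress classes, e.g. to spot-check that it decompresses cleanly (the "ZstdTarCheck" class is hypothetical and not part of this commit):

import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;

class ZstdTarCheck {

    // Lists the entries of a ".tar.zstd" file, verifying it decompresses cleanly.
    static void listEntries(Path tarZstdFile) throws IOException {
        try ( TarArchiveInputStream tarIn = new TarArchiveInputStream(
                new ZstdCompressorInputStream(
                        new BufferedInputStream(Files.newInputStream(tarZstdFile)))) ) {
            TarArchiveEntry entry;
            while ( (entry = tarIn.getNextTarEntry()) != null )
                System.out.println(entry.getName() + " (" + entry.getSize() + " bytes)");
        }
    }
}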