Improve the compression of full-text files:
- Fix not using the big bufferSize it was supposed to use. - Make sure the maximum compression level is used. Before, the invalid value "bufferSize" was passed as the level, and it is unclear which real compression level the zstd library changed it to (19, or 22 — the latter only allowed through "ultra mode"); probably the ultra level, though, as this "switch" seems to be required only through the CLI. - Exclude the possibly outdated "commons-compress" transitive dependency from the "publications_retriever" dependency.
This commit is contained in:
parent
107908a733
commit
d630f16198
|
@@ -42,6 +42,7 @@ dependencies {
|
||||||
exclude group: 'ch.qos.logback', module: 'logback-classic'
|
exclude group: 'ch.qos.logback', module: 'logback-classic'
|
||||||
exclude group: 'org.slf4j', module: 'slf4j-api'
|
exclude group: 'org.slf4j', module: 'slf4j-api'
|
||||||
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
|
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
|
||||||
|
exclude group: 'org.apache.commons', module: 'commons-compress'
|
||||||
}
|
}
|
||||||
|
|
||||||
implementation group: 'com.google.guava', name: 'guava', version: '33.2.0-jre'
|
implementation group: 'com.google.guava', name: 'guava', version: '33.2.0-jre'
|
||||||
|
|
|
@@ -1,5 +1,6 @@
|
||||||
package eu.openaire.urls_worker.util;
|
package eu.openaire.urls_worker.util;
|
||||||
|
|
||||||
|
import com.github.luben.zstd.Zstd;
|
||||||
import eu.openaire.urls_worker.controllers.FullTextsController;
|
import eu.openaire.urls_worker.controllers.FullTextsController;
|
||||||
import eu.openaire.urls_worker.models.TarFileResult;
|
import eu.openaire.urls_worker.models.TarFileResult;
|
||||||
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
||||||
|
@@ -24,6 +25,8 @@ public class FilesCompressor {
|
||||||
|
|
||||||
public static final int bufferSize = (5 * 1_048_576); // 5 Mb
|
public static final int bufferSize = (5 * 1_048_576); // 5 Mb
|
||||||
|
|
||||||
|
public static final int maxCompressionLevel = Zstd.maxCompressionLevel();
|
||||||
|
|
||||||
|
|
||||||
public static File compressMultipleFilesIntoOne(long assignmentsCounter, int tarBatchCounter, List<String> filesToCompress, String baseDirectory)
|
public static File compressMultipleFilesIntoOne(long assignmentsCounter, int tarBatchCounter, List<String> filesToCompress, String baseDirectory)
|
||||||
{
|
{
|
||||||
|
@@ -51,7 +54,7 @@ public class FilesCompressor {
|
||||||
File zStandardFile = new File(zStandardFileFullPath);
|
File zStandardFile = new File(zStandardFileFullPath);
|
||||||
|
|
||||||
try ( BufferedInputStream in = new BufferedInputStream(Files.newInputStream(Paths.get(tarFilePath)), bufferSize);
|
try ( BufferedInputStream in = new BufferedInputStream(Files.newInputStream(Paths.get(tarFilePath)), bufferSize);
|
||||||
ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath())), bufferSize) )
|
ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(new BufferedOutputStream(Files.newOutputStream(zStandardFile.toPath()), bufferSize), maxCompressionLevel) )
|
||||||
{
|
{
|
||||||
int readByte;
|
int readByte;
|
||||||
while ( (readByte = in.read()) != -1 )
|
while ( (readByte = in.read()) != -1 )
|
||||||
|
|
Loading…
Reference in New Issue