95 lines
5.0 KiB
Java
95 lines
5.0 KiB
Java
package eu.openaire.urls_controller.util;
|
|
|
|
import com.google.common.collect.SetMultimap;
|
|
import eu.openaire.urls_controller.models.Payload;
|
|
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
|
|
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
|
|
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;
|
|
import org.apache.commons.lang.StringUtils;
|
|
import org.slf4j.Logger;
|
|
import org.slf4j.LoggerFactory;
|
|
import org.springframework.beans.factory.annotation.Autowired;
|
|
import org.springframework.stereotype.Component;
|
|
|
|
import java.io.BufferedInputStream;
|
|
import java.io.BufferedOutputStream;
|
|
import java.io.File;
|
|
import java.nio.file.Files;
|
|
import java.nio.file.Path;
|
|
import java.nio.file.Paths;
|
|
import java.util.List;
|
|
|
|
@Component
|
|
public class FileDecompressor {
|
|
|
|
private static final Logger logger = LoggerFactory.getLogger(FileDecompressor.class);
|
|
|
|
@Autowired
|
|
private FileUtils fileUtils;
|
|
|
|
|
|
boolean decompressAndUploadFullTexts(String zstdFileFullPath, Path curBatchPath, String targetDirectory, List<String> fileNamesForCurBatch, int batchCounter, SetMultimap<String, Payload> allFileNamesWithPayloads, long assignmentsBatchCounter)
|
|
{
|
|
try {
|
|
decompressFiles(zstdFileFullPath, curBatchPath);
|
|
String[] extractedFileNames = new File(targetDirectory).list();
|
|
if ( extractedFileNames == null ) {
|
|
logger.error("There was an error when acquiring the list of extracted files of directory: " + targetDirectory);
|
|
return false;
|
|
} else if ( extractedFileNames.length == 0 ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred..
|
|
logger.error("No full-texts' fleNames where extracted from directory: " + targetDirectory);
|
|
return false;
|
|
} else if ( extractedFileNames.length != fileNamesForCurBatch.size() ) {
|
|
logger.warn("The number of extracted files (" + extractedFileNames.length + ") was not equal to the number of files (" + fileNamesForCurBatch.size() + ") of the current batch_" + batchCounter);
|
|
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
|
|
// since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null.
|
|
}
|
|
fileUtils.uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads, batchCounter);
|
|
return true;
|
|
} catch (Exception e) {
|
|
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + GenericUtils.endOfLine + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
|
|
return false;
|
|
} finally {
|
|
fileUtils.deleteDirectory(curBatchPath.toFile());
|
|
}
|
|
}
|
|
|
|
public void decompressFiles(String zstdSource, Path targetDir) throws Exception
|
|
{
|
|
// Decompress the zstd file.
|
|
String tarPathStr = StringUtils.replace(zstdSource, ".zstd", "", 1); // Remove the ".zstd" extension.
|
|
Path tarPath = Paths.get(tarPathStr);
|
|
int readByte = -1;
|
|
|
|
try ( ZstdCompressorInputStream zsIn = new ZstdCompressorInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(zstdSource)), FileUtils.tenMb));
|
|
BufferedOutputStream out = new BufferedOutputStream(Files.newOutputStream(tarPath), FileUtils.tenMb) )
|
|
{
|
|
while ( (readByte = zsIn.read()) != -1 )
|
|
out.write(readByte);
|
|
} finally {
|
|
fileUtils.deleteFile(zstdSource); // Delete the initial zstd file.
|
|
}
|
|
|
|
// Now we have a decompressed tar-file, which we will Un-tar, in order to extract the full-text files.
|
|
try ( TarArchiveInputStream tarInput = new TarArchiveInputStream(new BufferedInputStream(Files.newInputStream(tarPath)), FileUtils.tenMb) )
|
|
{
|
|
TarArchiveEntry entry;
|
|
while ( ((entry = (TarArchiveEntry) tarInput.getNextEntry()) != null) )
|
|
{
|
|
// Copy an individual entry.
|
|
try ( BufferedOutputStream bufferedOutputStream = new BufferedOutputStream(Files.newOutputStream(targetDir.resolve(entry.getName())), FileUtils.tenMb) ) {
|
|
while ( (readByte = tarInput.read()) != -1 )
|
|
bufferedOutputStream.write(readByte);
|
|
} // The exception will be given to the caller method.
|
|
// No need to close the tarEntry (no "close()" method is provided).
|
|
}
|
|
} finally {
|
|
fileUtils.deleteFile(tarPathStr); // Delete the decompressed tar file.
|
|
}
|
|
|
|
// Now we have a batch-directory which contains the tar-file along with the extracted full-text files.
|
|
// After uploading the full-texts, the batch-directories will be deleted.
|
|
}
|
|
|
|
}
|