- Use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings significant benefits in compression rate and speed.

- Update the MinIO dependency.
- Code polishing.
Lampros Smyrnaios 2023-01-10 13:34:54 +02:00
parent d1a4c84289
commit 8876089022
7 changed files with 99 additions and 79 deletions

README.md

@@ -17,3 +17,7 @@ To install and run the application:
If you want to just run the app, then run the script with the argument "1": ```./installAndRun.sh 1```.<br>
If you want to build and run the app on a **Docker Container**, then run the script with the argument "0" followed by the argument "1": ```./installAndRun.sh 0 1```.<br>
<br>
Implementation notes:
- For transferring the full-text files, we use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings significant benefits in compression rate and speed; a minimal usage sketch follows below.
- The names of the uploaded full-text files are of the following form: "***datasourceID/recordId::fileHash.pdf***"
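As a minimal sketch (not the project's actual code), here is how a file can be round-tripped through Zstandard with Apache Commons Compress, the library used here; note that its zstd streams delegate to the `zstd-jni` native binding, which must be on the classpath. File names are illustrative:

```java
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorOutputStream;

import java.io.BufferedInputStream;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class ZstdRoundTrip {
    public static void main(String[] args) throws Exception {
        Path tarFile = Paths.get("fullTexts.tar");          // hypothetical input archive
        Path zstdFile = Paths.get("fullTexts.tar.zstd");    // compressed output
        Path restoredTar = Paths.get("fullTexts.restored.tar");

        // Compress: wrap the destination stream in a ZstdCompressorOutputStream.
        try ( InputStream in = Files.newInputStream(tarFile);
              ZstdCompressorOutputStream zOut = new ZstdCompressorOutputStream(Files.newOutputStream(zstdFile)) ) {
            in.transferTo(zOut);
        }

        // Decompress: wrap the source stream in a ZstdCompressorInputStream.
        try ( ZstdCompressorInputStream zIn = new ZstdCompressorInputStream(new BufferedInputStream(Files.newInputStream(zstdFile)));
              OutputStream out = Files.newOutputStream(restoredTar) ) {
            zIn.transferTo(out);
        }
    }
}
```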

build.gradle

@@ -45,7 +45,10 @@ dependencies {
// https://mvnrepository.com/artifact/org.apache.commons/commons-lang3
implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.12.0'
implementation 'io.minio:minio:8.4.6'
// https://mvnrepository.com/artifact/org.apache.commons/commons-compress
implementation 'org.apache.commons:commons-compress:1.22'
implementation 'io.minio:minio:8.5.0'
// https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
implementation("com.cloudera.impala:jdbc:2.5.31") {

UrlController.java

@@ -257,7 +257,7 @@ public class UrlController {
else
logger.debug("Finished uploading the full-texts from batch-assignments_" + curReportAssignments);
String currentParquetPath = parquetFileUtils.parquetBaseLocalDirectoryPath + "assignments_" + assignmentsBatchCounter.get() + File.separator;
String currentParquetPath = parquetFileUtils.parquetBaseLocalDirectoryPath + "assignments_" + curReportAssignments + File.separator;
java.nio.file.Path parquetDirPath = Paths.get(currentParquetPath);
if ( !Files.isDirectory(parquetDirPath) ) {
try {

FileDecompressor.java

@@ -0,0 +1,56 @@
package eu.openaire.urls_controller.util;
import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
import org.apache.commons.compress.compressors.zstandard.ZstdCompressorInputStream;
import org.apache.commons.lang.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import java.io.BufferedInputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
@Component
public class FileDecompressor {
private static final Logger logger = LoggerFactory.getLogger(FileDecompressor.class);
public void decompressFiles(String zstdSource, Path targetDir) throws Exception
{
// Decompress the zstd file.
Path tarPath = Paths.get(StringUtils.replace(zstdSource, ".zstd", "", 1)); // Remove the ".zstd" extension.
try ( ZstdCompressorInputStream zsIn = new ZstdCompressorInputStream(new BufferedInputStream(Files.newInputStream(Paths.get(zstdSource))));
OutputStream out = Files.newOutputStream(tarPath) )
{
final byte[] buffer = new byte[1048576]; // 1 MB
int n = 0;
while ( (n = zsIn.read(buffer)) != -1 ) {
out.write(buffer, 0, n);
}
}
// Now we have a decompressed tar-file, which we will Un-tar, in order to extract the full-text files.
try ( TarArchiveInputStream tarInput = new TarArchiveInputStream(new BufferedInputStream(Files.newInputStream(tarPath))) )
{
TarArchiveEntry entry;
while ( ((entry = (TarArchiveEntry) tarInput.getNextEntry()) != null) )
{
String entryName = entry.getName();
Path targetFilePath = targetDir.resolve(entryName);
Files.copy(tarInput, targetFilePath, StandardCopyOption.REPLACE_EXISTING); // Copy an individual entry.
// No need to close the tarEntry.
}
}
// Now we have a batch-directory which contains the tar-file along with the extracted full-text files.
// After uploading the full-texts, the batch-directories will be deleted.
}
}
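A hypothetical usage sketch for this new component (the real call site lives in `FileUtils`, further below in this commit); the batch paths are made up:

```java
import eu.openaire.urls_controller.util.FileDecompressor;

import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class DecompressSketch {
    public static void main(String[] args) throws Exception {
        FileDecompressor fileDecompressor = new FileDecompressor(); // a plain class; also usable outside Spring
        Path curBatchPath = Files.createDirectories(Paths.get("assignments_5/batch_1")); // hypothetical batch directory
        String zstdFileFullPath = "assignments_5/batch_1/fullTexts_5_1.tar.zstd";        // hypothetical archive path
        fileDecompressor.decompressFiles(zstdFileFullPath, curBatchPath);
        // "curBatchPath" now holds the intermediate tar-file plus the extracted full-text files.
    }
}
```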

FileUnZipper.java

@@ -1,52 +0,0 @@
package eu.openaire.urls_controller.util;
import org.springframework.stereotype.Component;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@Component
public class FileUnZipper {
public void unzipFolder(Path source, Path target) throws Exception {
try ( ZipInputStream zis = new ZipInputStream(Files.newInputStream(source.toFile().toPath())) ) {
// Iterate over the files in zip and unzip them.
ZipEntry zipEntry = zis.getNextEntry();
while ( zipEntry != null ) {
String zipEntryName = zipEntry.getName();
Path targetPath = zipSlipProtect(zipEntryName, target);
if ( zipEntryName.endsWith(File.separator) ) // If we have a directory.
Files.createDirectories(targetPath);
else {
// Some zip-files store only the file-paths and not separate directories. We need to create parent directories, e.g data/folder/file.txt
Path parentPath = targetPath.getParent();
if ( (parentPath != null) && Files.notExists(parentPath) ) {
Files.createDirectories(parentPath);
}
Files.copy(zis, targetPath, StandardCopyOption.REPLACE_EXISTING); // Copy an individual entry.
}
zis.closeEntry();
zipEntry = zis.getNextEntry();
}
// Here the "zipEntry" will always be "null", so no "current ZIP entry" will be open, (so the "closeEntry()" is not needed).
}
}
// Protect from a Zip Slip attack: https://snyk.io/research/zip-slip-vulnerability
public Path zipSlipProtect(String zipEntryName, Path targetDir) throws IOException {
Path targetDirResolved = targetDir.resolve(zipEntryName);
// Make sure normalized file still has targetDir as its prefix, else throw an exception.
Path normalizePath = targetDirResolved.normalize();
if ( !normalizePath.startsWith(targetDir) ) {
throw new IOException("Bad zip entry: " + zipEntryName);
}
return normalizePath;
}
}

FileUtils.java

@@ -46,7 +46,8 @@ public class FileUtils {
private S3ObjectStore s3ObjectStore;
@Autowired
private FileUnZipper fileUnZipper;
private FileDecompressor fileDecompressor;
public enum UploadFullTextsResponse {successful, unsuccessful, databaseError}
@@ -226,7 +227,7 @@ public class FileUtils {
return UploadFullTextsResponse.successful; // It was handled, no error.
}
// Request the full-texts in batches, compressed in zip.
// Request the full-texts in batches, compressed in a zstd tar.
int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch);
int remainingFiles = (numAllFullTexts % numOfFullTextsPerBatch);
if ( remainingFiles > 0 ) { // Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
@@ -236,10 +237,13 @@ public class FileUtils {
logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numAllFullTexts + " distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches (" + numOfFullTextsPerBatch + " files each).");
// Check if one full text is left out because of the division. Put it in the last batch.
String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTexts/" + assignmentsBatchCounter + "/" + numOfBatches + "/";
String baseUrl = "http://" + remoteAddr + ":1881/api/full-texts/getFullTextsImproved/" + assignmentsBatchCounter + "/" + numOfBatches + "/";
// TODO - The worker should send the port in which it accepts requests, along with the current request.
// TODO - The least we have to do is to expose the port-assignment somewhere more obvious, like inside the "application.properties" file.
String curAssignmentsBaseLocation = baseFilesLocation + "assignments_" + assignmentsBatchCounter + File.separator;
File curAssignmentsBaseDir = new File(curAssignmentsBaseLocation);
// Note: the "curAssignmentsBaseLocation"-directory will be created once the first batch sub-directory is created.
int failedBatches = 0;
for ( int batchCounter = 1; batchCounter <= numOfBatches; ++batchCounter ) {
@@ -264,26 +268,28 @@ public class FileUtils {
Path curBatchPath = Files.createDirectories(Paths.get(targetDirectory));
// The base-directory will be created along with the first batch directory.
// Unzip the file. Iterate over the PDFs and upload each one of them and get the S3-Url
String zipFileFullPath = targetDirectory + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".zip";
File zipFile = new File(zipFileFullPath);
// Save and decompress the zstd file. Iterate over the PDFs and upload each one of them and get the S3-Url.
String zstdFileFullPath = targetDirectory + "fullTexts_" + assignmentsBatchCounter + "_" + batchCounter + ".tar.zstd";
File zstdFile = new File(zstdFileFullPath);
if ( ! saveZipFile(conn, zipFile) ) {
if ( ! saveArchive(conn, zstdFile) ) {
failedBatches ++;
continue; // To the next batch.
}
//logger.debug("The zip file has been saved: " + zipFileFullPath); // DEBUG!
//logger.debug("The zstd file has been saved: " + zstdFileFullPath); // DEBUG!
fileUnZipper.unzipFolder(Paths.get(zipFileFullPath), curBatchPath);
// We do not call "conn.disconnect()", since more requests are about to be made to the worker in the near future.
fileDecompressor.decompressFiles(zstdFileFullPath, curBatchPath);
String[] fileNames = new File(targetDirectory).list();
if ( (fileNames == null) || (fileNames.length <= 1) ) { // The directory might have only one file, the "zip-file", if the full-texts failed to be unzipped..
if ( (fileNames == null) || (fileNames.length <= 2) ) { // The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred..
logger.error("No full-text fileNames where extracted from directory: " + targetDirectory);
failedBatches ++;
continue; // To the next batch.
}
uploadFullTexts(fileNames, targetDirectory, zipFileFullPath, allFileNamesWithPayloads);
uploadFullTexts(fileNames, targetDirectory, allFileNamesWithPayloads);
} catch (Exception e) {
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + "\n" + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
@@ -292,7 +298,7 @@ public class FileUtils {
} // End of batches.
updateUrlReportsToHaveNoFullTextFiles(urlReports, true); // Make sure all records without an S3-Url have < null > file-data (some batches or uploads might have failed).
deleteDirectory(curAssignmentsBaseDir);
deleteDirectory(new File(curAssignmentsBaseLocation));
if ( failedBatches == numOfBatches ) {
logger.error("None of the " + numOfBatches + " batches could be handled for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId);
@@ -315,12 +321,12 @@ public class FileUtils {
conn.connect();
int statusCode = conn.getResponseCode();
if ( statusCode != 200 ) {
logger.warn("HTTP-" + statusCode + ": " + getMessageFromResponseBody(conn, true) + "\nProblem when requesting the ZipFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
logger.warn("HTTP-" + statusCode + ": " + getMessageFromResponseBody(conn, true) + "\nProblem when requesting the ZstdFile of batch_" + batchNum + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl);
return null;
}
} catch (Exception e) {
String exMessage = e.getMessage();
logger.warn("Problem when requesting the ZipFile of batch_" + batchNum + " of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl + "\n" + exMessage);
logger.warn("Problem when requesting the ZstdFile of batch_" + batchNum + " of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and requestUrl: " + requestUrl + "\n" + exMessage);
if ( exMessage.contains("Connection refused") ) {
logger.error("Since we received a \"Connection refused\", all of the remaining batches (" + (totalBatches - batchNum) + ") will not be requested!");
throw new RuntimeException();
@@ -331,14 +337,13 @@ public class FileUtils {
}
private void uploadFullTexts(String[] fileNames, String targetDirectory, String zipFileFullPath, SetMultimap<String, Payload> allFileNamesWithPayloads)
private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads)
{
// Iterate over the files and upload them to S3.
//int numUploadedFiles = 0;
for( String fileName : fileNames )
{
String fileFullPath = targetDirectory + fileName;
if ( fileFullPath.equals(zipFileFullPath) ) // Exclude the zip-file from uploading.
if ( fileName.contains(".tar") ) // Exclude the tar-files from uploading.
continue;
// Check if this stored file is related to one or more Payloads from the Set. Defend against malicious file injection. It does not add more overhead, since we already need the "fileRelatedPayloads".
@@ -399,9 +404,11 @@ public class FileUtils {
continue;
}
String fileFullPath = targetDirectory + fileName; // The fullPath to the local file.
// Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible parentheses with the increasing number, which mark a duplicate of the same ID-fileName.
// Now we append the file-hash, so it is guaranteed that the filename will be unique.
fileName = datasourceId + "/" + fileNameID + "::" + hash + dotFileExtension;
fileName = datasourceId + "/" + fileNameID + "::" + hash + dotFileExtension; // This is the fileName to be used in the objectStore, not of the local file!
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
@@ -412,8 +419,8 @@ public class FileUtils {
// Else, the record will have its file-data set to "null", at the end of this method.
}
//logger.debug("Finished uploading " + numUploadedFiles + " full-texts (out of " + (fileNames.length -1) + " distinct files) from assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
// (fileNames.length -1) --> minus the zip-file
//logger.debug("Finished uploading " + numUploadedFiles + " full-texts (out of " + (fileNames.length -2) + " distinct files) from assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
// (fileNames.length -2) --> minus the zstd and the tar files.
}
@@ -468,19 +475,19 @@ public class FileUtils {
}
public boolean saveZipFile(HttpURLConnection conn, File zipFile) {
public boolean saveArchive(HttpURLConnection conn, File zstdFile) {
InputStream inStream = null;
FileOutputStream outStream = null;
try {
inStream = conn.getInputStream();
outStream = new FileOutputStream(zipFile);
outStream = new FileOutputStream(zstdFile);
int readByte;
while ( (readByte = inStream.read()) != -1 ) {
outStream.write(readByte);
}
return true;
} catch (Exception e) {
logger.error("Could not save the zip file \"" + zipFile.getName() + "\": " + e.getMessage(), e);
logger.error("Could not save the zstd file \"" + zstdFile.getName() + "\": " + e.getMessage(), e);
return false;
} finally {
try {
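To make the batch arithmetic of this file's changes concrete, a standalone sketch with made-up numbers (the real `numOfFullTextsPerBatch` is a configuration value of the Controller):

```java
public class BatchMathSketch {
    public static void main(String[] args) {
        int numAllFullTexts = 1205;       // hypothetical count of distinct, non-already-uploaded full-texts
        int numOfFullTextsPerBatch = 500; // hypothetical batch size

        int numOfBatches = (numAllFullTexts / numOfFullTextsPerBatch);   // 1205 / 500 = 2
        int remainingFiles = (numAllFullTexts % numOfFullTextsPerBatch); // 1205 % 500 = 205
        if ( remainingFiles > 0 )
            numOfBatches++; // An extra batch for the remainder: 3 batches in total (500 + 500 + 205).

        System.out.println(numOfBatches); // 3 -- and at least 1 for any file-count > 0.
    }
}
```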

ParquetFileUtils.java

@@ -318,8 +318,10 @@ public class ParquetFileUtils {
try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(outputFile).withSchema(schema)
.withCompressionCodec(CompressionCodecName.GZIP).build())
// When the app runs inside a Docker Container, it is NOT guaranteed that all compression-types will work. For example, the "SNAPPY"-compression does NOT work, while the "GZIP" works.
{
// When the app runs inside a Docker Container, it is NOT guaranteed that all compression-types will work. For example, the "SNAPPY"-compression does NOT work, while the "GZIP" works.
// Also, we would prefer ZSTD over GZIP, but the old version of the Impala-Database does not support it..
//logger.debug("Going to write to \"" + fullFilePath + "\" the record list: " + recordList); // DEBUG!
for ( GenericRecord record : recordList ) {
//logger.debug("Writing to \"" + fullFilePath + "\" the record: " + record); // DEBUG!