forked from lsmyrnaios/UrlsController
Move some code from "FileUtils.getAndUploadFullTexts()" to two separate methods.
This commit is contained in:
parent 56d233d38e
commit b9b29dd51c
@@ -112,36 +112,7 @@ public class FileUtils {
    HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.

    HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple payloads for the same fileHash.
    // The "Hash" part of the multimap helps with avoiding duplicate fileHashes.

    for ( UrlReport urlReport : urlReports )
    {
        Payload payload = urlReport.getPayload();
        if ( payload == null )
            continue;

        String fileLocation = payload.getLocation();
        if ( fileLocation == null )
            continue; // The full-text was not retrieved for this UrlReport.

        // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
        // If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
        // If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
        String fileHash = payload.getHash();
        if ( fileHash != null )
        {
            hashesWithPayloads.put(fileHash, payload); // Hold multiple payloads per fileHash.
            // There are 2 cases, which contribute to that:
            // 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved, but instead, the location, hash and size of the file is copied to the other payload.
            // 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.

            // In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
            // TODO - Implement a fileHash-check lagorithm in the Worker's side ("PublicationsRetriever"), to avoid keeping those files in storage.

        } else // This should never happen..
            logger.error("Payload: " + payload + " has a null fileHash!");
    }// end-for
    HashMultimap<String, Payload> hashesWithPayloads = getHashesWithPayloads(urlReports, sizeOfUrlReports); // Holds multiple payloads for the same fileHash.

    Set<String> fileHashes = hashesWithPayloads.keySet();
    int fileHashesSetSize = fileHashes.size(); // Get the size of the keysSet, instead of the whole multimap.
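The extracted getHashesWithPayloads() groups payloads by file-hash so that duplicate files surface through the multimap's key set, as the comments above describe. Below is a minimal, self-contained sketch of that grouping idea, assuming Guava's HashMultimap and a simplified FileRecord stand-in for the project's Payload class (illustrative names only, not the actual code):

    import com.google.common.collect.HashMultimap;
    import java.util.Set;

    public class HashGroupingSketch {

        // Simplified stand-in for the project's Payload class (illustrative only).
        record FileRecord(String id, String hash, String location) {}

        public static void main(String[] args) {
            // Two records whose downloads produced the same file-hash (the duplicate case
            // described in the diff's comments), plus one unique record.
            FileRecord a = new FileRecord("pub-1", "HASH_A", "worker1/file_1.pdf");
            FileRecord b = new FileRecord("pub-2", "HASH_A", "worker1/file_2.pdf");
            FileRecord c = new FileRecord("pub-3", "HASH_B", "worker1/file_3.pdf");

            // Group records by file-hash; a single key may hold several values.
            HashMultimap<String, FileRecord> hashesWithRecords = HashMultimap.create();
            for (FileRecord r : new FileRecord[]{a, b, c}) {
                if (r.hash() != null)
                    hashesWithRecords.put(r.hash(), r);
            }

            // keySet() yields each distinct hash exactly once.
            Set<String> fileHashes = hashesWithRecords.keySet();
            System.out.println(fileHashes.size() + " distinct hashes: " + fileHashes);
            System.out.println("Records sharing HASH_A: " + hashesWithRecords.get("HASH_A"));
        }
    }

With HashMultimap, keySet() gives each distinct hash once while get(hash) returns every payload that shares it, which is what lets one previously-uploaded file location be reused for all of them.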
@@ -150,32 +121,7 @@ public class FileUtils {
        return UploadFullTextsResponse.successful_without_fulltexts; // It was handled, no error.
    }

    // Prepare the "fileHashListString" to be used inside the "getHashLocationsQuery". Create the following string-pattern:
    // ("HASH_1", "HASH_2", ...)
    int stringBuilderCapacity = ((fileHashesSetSize * 32) + (fileHashesSetSize -1) +2);

    String getHashLocationsQuery = "select distinct `hash`, `location` from " + DatabaseConnector.databaseName + ".payload where `hash` in "
            + getQueryListString(new ArrayList<>(fileHashes), fileHashesSetSize, stringBuilderCapacity);

    HashMap<String, String> hashLocationMap = new HashMap<>(fileHashesSetSize/2); // No multimap is needed since only one location is returned for each fileHash.

    DatabaseConnector.databaseLock.lock(); // The execution uses the database.
    try {
        jdbcTemplate.query(getHashLocationsQuery, rs -> {
            try { // For each of the 4 columns returned, do the following. The column-indexing starts from 1.
                hashLocationMap.put(rs.getString(1), rs.getString(2));
            } catch (SQLException sqle) {
                logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
            }
        });
    } catch (EmptyResultDataAccessException erdae) {
        logger.warn("No previously-found hash-locations where found for assignments_" + assignmentsBatchCounter);
    } catch (Exception e) {
        logger.error("Unexpected error when checking for already-found file-hashes!", e);
        // We will continue with storing the files, we do not want to lose them.
    } finally {
        DatabaseConnector.databaseLock.unlock();
    }
    HashMap<String, String> hashLocationMap = getHashLocationMap(fileHashes, fileHashesSetSize, assignmentsBatchCounter);

    for ( String fileHash : fileHashes )
    {
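This hunk replaces the inline hash-lookup with a call to getHashLocationMap(). The comments quoted above state what the remainder of the loop (not shown in this hunk) does with the result: when a location is already known for a hash, the record is pointed at that existing S3 file instead of being requested from the Worker again. The following sketch is only an assumed illustration of that decision step, using a simplified Payload stand-in; the real loop body lives outside this hunk:

    import java.util.*;

    public class ReuseExistingLocationSketch {

        // Minimal stand-in for Payload (illustrative; the real class lives in the project).
        static class Payload {
            String hash;
            String location;
            Payload(String hash, String location) { this.hash = hash; this.location = location; }
        }

        public static void main(String[] args) {
            // Locations already known for some hashes (what getHashLocationMap would return).
            Map<String, String> hashLocationMap = Map.of("HASH_A", "s3://bucket/HASH_A.pdf");

            // Payloads grouped by hash, as in the first hunk.
            Map<String, List<Payload>> hashesWithPayloads = new HashMap<>();
            hashesWithPayloads.put("HASH_A", List.of(new Payload("HASH_A", "worker1/file_1.pdf")));
            hashesWithPayloads.put("HASH_B", List.of(new Payload("HASH_B", "worker1/file_2.pdf")));

            List<String> filesToRequest = new ArrayList<>();
            for (Map.Entry<String, List<Payload>> entry : hashesWithPayloads.entrySet()) {
                String alreadyUploadedLocation = hashLocationMap.get(entry.getKey());
                if (alreadyUploadedLocation != null) {
                    // A file with this hash is already in S3: point the payloads to it
                    // and do not request the file from the Worker again.
                    for (Payload p : entry.getValue())
                        p.location = alreadyUploadedLocation;
                } else {
                    // Not seen before: the Worker's copy has to be fetched and uploaded.
                    for (Payload p : entry.getValue())
                        filesToRequest.add(p.location);
                }
            }
            System.out.println("Files to request from the Worker: " + filesToRequest);
        }
    }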
@@ -319,6 +265,76 @@ public class FileUtils {
    }


    public HashMultimap<String, Payload> getHashesWithPayloads(List<UrlReport> urlReports, int sizeOfUrlReports)
    {
        HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple payloads for the same fileHash.
        // The "Hash" part of the multimap helps with avoiding duplicate fileHashes.

        for ( UrlReport urlReport : urlReports )
        {
            Payload payload = urlReport.getPayload();
            if ( payload == null )
                continue;

            String fileLocation = payload.getLocation();
            if ( fileLocation == null )
                continue; // The full-text was not retrieved for this UrlReport.

            // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
            // If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
            // If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
            String fileHash = payload.getHash();
            if ( fileHash != null )
            {
                hashesWithPayloads.put(fileHash, payload); // Hold multiple payloads per fileHash.
                // There are 2 cases, which contribute to that:
                // 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved, but instead, the location, hash and size of the file is copied to the other payload.
                // 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.

                // In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
                // TODO - Implement a fileHash-check algorithm in the Worker's side ("PublicationsRetriever"), to avoid keeping those files in storage.

            } else // This should never happen..
                logger.error("Payload: " + payload + " has a null fileHash!");
        }// end-for

        return hashesWithPayloads;
    }


    public HashMap<String, String> getHashLocationMap(Set<String> fileHashes, int fileHashesSetSize, long assignmentsBatchCounter)
    {
        // Prepare the "fileHashListString" to be used inside the "getHashLocationsQuery". Create the following string-pattern:
        // ("HASH_1", "HASH_2", ...)
        int stringBuilderCapacity = ((fileHashesSetSize * 32) + (fileHashesSetSize -1) +2);

        String getHashLocationsQuery = "select distinct `hash`, `location` from " + DatabaseConnector.databaseName + ".payload where `hash` in "
                + getQueryListString(new ArrayList<>(fileHashes), fileHashesSetSize, stringBuilderCapacity);

        HashMap<String, String> hashLocationMap = new HashMap<>(fileHashesSetSize/2); // No multimap is needed since only one location is returned for each fileHash.

        DatabaseConnector.databaseLock.lock(); // The execution uses the database.
        try {
            jdbcTemplate.query(getHashLocationsQuery, rs -> {
                try { // For each of the 4 columns returned, do the following. The column-indexing starts from 1.
                    hashLocationMap.put(rs.getString(1), rs.getString(2));
                } catch (SQLException sqle) {
                    logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
                }
            });
        } catch (EmptyResultDataAccessException erdae) {
            logger.warn("No previously-found hash-locations where found for assignments_" + assignmentsBatchCounter);
        } catch (Exception e) {
            logger.error("Unexpected error when checking for already-found file-hashes!", e);
            // We will continue with storing the files, we do not want to lose them.
        } finally {
            DatabaseConnector.databaseLock.unlock();
        }

        return hashLocationMap;
    }


    private boolean getAndSaveFullTextBatch(List<String> fileNamesForCurBatch, String baseUrl, long assignmentsBatchCounter, int batchCounter, int numOfBatches,
                                            String zstdFileFullPath, String workerId) throws RuntimeException
    {
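getHashLocationMap() relies on a getQueryListString() helper whose implementation is not part of this diff; per the comment, it renders the hash set as ("HASH_1", "HASH_2", ...) for the SQL IN-clause, and the capacity estimate budgets roughly 32 characters per hash plus one separator between hashes and the two parentheses (for 3 hashes: 3*32 + 2 + 2 = 100). A hypothetical equivalent, for illustration only:

    import java.util.Arrays;
    import java.util.List;

    public class QueryListStringSketch {

        // Hypothetical equivalent of the helper referenced in the diff: it turns a list of
        // hashes into the ("HASH_1", "HASH_2", ...) pattern used inside the IN-clause.
        static String getQueryListString(List<String> values, int size, int capacity) {
            StringBuilder sb = new StringBuilder(capacity);
            sb.append('(');
            for (int i = 0; i < size; i++) {
                sb.append('"').append(values.get(i)).append('"');
                if (i < size - 1)
                    sb.append(',');
            }
            return sb.append(')').toString();
        }

        public static void main(String[] args) {
            List<String> fileHashes = Arrays.asList("HASH_1", "HASH_2", "HASH_3");
            int size = fileHashes.size();
            // Same capacity estimate as the diff: ~32 chars per hash, one separator between
            // hashes, plus the two parentheses.
            int capacity = (size * 32) + (size - 1) + 2;
            String inClause = getQueryListString(fileHashes, size, capacity);
            System.out.println("select distinct `hash`, `location` from payload where `hash` in " + inClause);
        }
    }

Pre-sizing the StringBuilder this way avoids intermediate re-allocations when a batch contains thousands of hashes.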