From b9b29dd51cd10d22d71f860d9db662a24b342f6e Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Wed, 20 Mar 2024 16:53:03 +0200
Subject: [PATCH] Move some code from "FileUtils.getAndUploadFullTexts()" to
 two separate methods.

---
 .../urls_controller/util/FileUtils.java | 128 ++++++++++--------
 1 file changed, 72 insertions(+), 56 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
index bf2b265..deb2366 100644
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@@ -112,36 +112,7 @@ public class FileUtils {
         HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3);    // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
 
-        HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3);    // Holds multiple payloads for the same fileHash.
-        // The "Hash" part of the multimap helps with avoiding duplicate fileHashes.
-
-        for ( UrlReport urlReport : urlReports )
-        {
-            Payload payload = urlReport.getPayload();
-            if ( payload == null )
-                continue;
-
-            String fileLocation = payload.getLocation();
-            if ( fileLocation == null )
-                continue;    // The full-text was not retrieved for this UrlReport.
-
-            // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
-            // If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
-            // If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
-            String fileHash = payload.getHash();
-            if ( fileHash != null )
-            {
-                hashesWithPayloads.put(fileHash, payload);    // Hold multiple payloads per fileHash.
-                // There are 2 cases, which contribute to that:
-                // 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved, but instead, the location, hash and size of the file is copied to the other payload.
-                // 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.
-
-                // In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
-                // TODO - Implement a fileHash-check lagorithm in the Worker's side ("PublicationsRetriever"), to avoid keeping those files in storage.
-
-            } else    // This should never happen..
-                logger.error("Payload: " + payload + " has a null fileHash!");
-        }// end-for
+        HashMultimap<String, Payload> hashesWithPayloads = getHashesWithPayloads(urlReports, sizeOfUrlReports);    // Holds multiple payloads for the same fileHash.
 
         Set<String> fileHashes = hashesWithPayloads.keySet();
         int fileHashesSetSize = fileHashes.size();    // Get the size of the keysSet, instead of the whole multimap.
             return UploadFullTextsResponse.successful_without_fulltexts;    // It was handled, no error.
         }
 
-        // Prepare the "fileHashListString" to be used inside the "getHashLocationsQuery". Create the following string-pattern:
-        // ("HASH_1", "HASH_2", ...)
-        int stringBuilderCapacity = ((fileHashesSetSize * 32) + (fileHashesSetSize -1) +2);
-
-        String getHashLocationsQuery = "select distinct `hash`, `location` from " + DatabaseConnector.databaseName + ".payload where `hash` in "
-                + getQueryListString(new ArrayList<>(fileHashes), fileHashesSetSize, stringBuilderCapacity);
-
-        HashMap<String, String> hashLocationMap = new HashMap<>(fileHashesSetSize/2);    // No multimap is needed since only one location is returned for each fileHash.
-
-        DatabaseConnector.databaseLock.lock();    // The execution uses the database.
-        try {
-            jdbcTemplate.query(getHashLocationsQuery, rs -> {
-                try {    // For each of the 4 columns returned, do the following. The column-indexing starts from 1.
-                    hashLocationMap.put(rs.getString(1), rs.getString(2));
-                } catch (SQLException sqle) {
-                    logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
-                }
-            });
-        } catch (EmptyResultDataAccessException erdae) {
-            logger.warn("No previously-found hash-locations where found for assignments_" + assignmentsBatchCounter);
-        } catch (Exception e) {
-            logger.error("Unexpected error when checking for already-found file-hashes!", e);
-            // We will continue with storing the files, we do not want to lose them.
-        } finally {
-            DatabaseConnector.databaseLock.unlock();
-        }
+        HashMap<String, String> hashLocationMap = getHashLocationMap(fileHashes, fileHashesSetSize, assignmentsBatchCounter);
 
         for ( String fileHash : fileHashes )
         {
@@ -319,6 +265,76 @@ public class FileUtils {
     }
 
 
+    public HashMultimap<String, Payload> getHashesWithPayloads(List<UrlReport> urlReports, int sizeOfUrlReports)
+    {
+        HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3);    // Holds multiple payloads for the same fileHash.
+        // The "Hash" part of the multimap helps with avoiding duplicate fileHashes.
+
+        for ( UrlReport urlReport : urlReports )
+        {
+            Payload payload = urlReport.getPayload();
+            if ( payload == null )
+                continue;
+
+            String fileLocation = payload.getLocation();
+            if ( fileLocation == null )
+                continue;    // The full-text was not retrieved for this UrlReport.
+
+            // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
+            // If no result is returned, then this record is not previously found, so go ahead and add it to the list of files to request from the worker.
+            // If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
+            String fileHash = payload.getHash();
+            if ( fileHash != null )
+            {
+                hashesWithPayloads.put(fileHash, payload);    // Hold multiple payloads per fileHash.
+                // There are 2 cases, which contribute to that:
+                // 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved, but instead, the location, hash and size of the file are copied to the other payload.
+                // 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.
+
+                // In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
+                // TODO - Implement a fileHash-check algorithm on the Worker's side ("PublicationsRetriever"), to avoid keeping those files in storage.
+
+            } else    // This should never happen..
+ logger.error("Payload: " + payload + " has a null fileHash!"); + }// end-for + + return hashesWithPayloads; + } + + + public HashMap getHashLocationMap(Set fileHashes, int fileHashesSetSize, long assignmentsBatchCounter) + { + // Prepare the "fileHashListString" to be used inside the "getHashLocationsQuery". Create the following string-pattern: + // ("HASH_1", "HASH_2", ...) + int stringBuilderCapacity = ((fileHashesSetSize * 32) + (fileHashesSetSize -1) +2); + + String getHashLocationsQuery = "select distinct `hash`, `location` from " + DatabaseConnector.databaseName + ".payload where `hash` in " + + getQueryListString(new ArrayList<>(fileHashes), fileHashesSetSize, stringBuilderCapacity); + + HashMap hashLocationMap = new HashMap<>(fileHashesSetSize/2); // No multimap is needed since only one location is returned for each fileHash. + + DatabaseConnector.databaseLock.lock(); // The execution uses the database. + try { + jdbcTemplate.query(getHashLocationsQuery, rs -> { + try { // For each of the 4 columns returned, do the following. The column-indexing starts from 1. + hashLocationMap.put(rs.getString(1), rs.getString(2)); + } catch (SQLException sqle) { + logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle); + } + }); + } catch (EmptyResultDataAccessException erdae) { + logger.warn("No previously-found hash-locations where found for assignments_" + assignmentsBatchCounter); + } catch (Exception e) { + logger.error("Unexpected error when checking for already-found file-hashes!", e); + // We will continue with storing the files, we do not want to lose them. + } finally { + DatabaseConnector.databaseLock.unlock(); + } + + return hashLocationMap; + } + + private boolean getAndSaveFullTextBatch(List fileNamesForCurBatch, String baseUrl, long assignmentsBatchCounter, int batchCounter, int numOfBatches, String zstdFileFullPath, String workerId) throws RuntimeException {