forked from lsmyrnaios/UrlsController
Move some code from "FileUtils.getAndUploadFullTexts()" to two separate methods.
This commit is contained in:
parent
56d233d38e
commit
b9b29dd51c
|
@ -112,36 +112,7 @@ public class FileUtils {
|
||||||
|
|
||||||
HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
|
HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
|
||||||
|
|
||||||
HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple payloads for the same fileHash.
|
HashMultimap<String, Payload> hashesWithPayloads = getHashesWithPayloads(urlReports, sizeOfUrlReports); // Holds multiple payloads for the same fileHash.
|
||||||
// The "Hash" part of the multimap helps with avoiding duplicate fileHashes.
|
|
||||||
|
|
||||||
for ( UrlReport urlReport : urlReports )
|
|
||||||
{
|
|
||||||
Payload payload = urlReport.getPayload();
|
|
||||||
if ( payload == null )
|
|
||||||
continue;
|
|
||||||
|
|
||||||
String fileLocation = payload.getLocation();
|
|
||||||
if ( fileLocation == null )
|
|
||||||
continue; // The full-text was not retrieved for this UrlReport.
|
|
||||||
|
|
||||||
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
|
|
||||||
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
|
|
||||||
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
|
|
||||||
String fileHash = payload.getHash();
|
|
||||||
if ( fileHash != null )
|
|
||||||
{
|
|
||||||
hashesWithPayloads.put(fileHash, payload); // Hold multiple payloads per fileHash.
|
|
||||||
// There are 2 cases, which contribute to that:
|
|
||||||
// 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved, but instead, the location, hash and size of the file is copied to the other payload.
|
|
||||||
// 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.
|
|
||||||
|
|
||||||
// In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
|
|
||||||
// TODO - Implement a fileHash-check algorithm in the Worker's side ("PublicationsRetriever"), to avoid keeping those files in storage.
|
|
||||||
|
|
||||||
} else // This should never happen..
|
|
||||||
logger.error("Payload: " + payload + " has a null fileHash!");
|
|
||||||
}// end-for
|
|
||||||
|
|
||||||
Set<String> fileHashes = hashesWithPayloads.keySet();
|
Set<String> fileHashes = hashesWithPayloads.keySet();
|
||||||
int fileHashesSetSize = fileHashes.size(); // Get the size of the keysSet, instead of the whole multimap.
|
int fileHashesSetSize = fileHashes.size(); // Get the size of the keysSet, instead of the whole multimap.
|
||||||
|
@ -150,32 +121,7 @@ public class FileUtils {
|
||||||
return UploadFullTextsResponse.successful_without_fulltexts; // It was handled, no error.
|
return UploadFullTextsResponse.successful_without_fulltexts; // It was handled, no error.
|
||||||
}
|
}
|
||||||
|
|
||||||
// Prepare the "fileHashListString" to be used inside the "getHashLocationsQuery". Create the following string-pattern:
|
HashMap<String, String> hashLocationMap = getHashLocationMap(fileHashes, fileHashesSetSize, assignmentsBatchCounter);
|
||||||
// ("HASH_1", "HASH_2", ...)
|
|
||||||
int stringBuilderCapacity = ((fileHashesSetSize * 32) + (fileHashesSetSize -1) +2);
|
|
||||||
|
|
||||||
String getHashLocationsQuery = "select distinct `hash`, `location` from " + DatabaseConnector.databaseName + ".payload where `hash` in "
|
|
||||||
+ getQueryListString(new ArrayList<>(fileHashes), fileHashesSetSize, stringBuilderCapacity);
|
|
||||||
|
|
||||||
HashMap<String, String> hashLocationMap = new HashMap<>(fileHashesSetSize/2); // No multimap is needed since only one location is returned for each fileHash.
|
|
||||||
|
|
||||||
DatabaseConnector.databaseLock.lock(); // The execution uses the database.
|
|
||||||
try {
|
|
||||||
jdbcTemplate.query(getHashLocationsQuery, rs -> {
|
|
||||||
try { // For each of the 4 columns returned, do the following. The column-indexing starts from 1.
|
|
||||||
hashLocationMap.put(rs.getString(1), rs.getString(2));
|
|
||||||
} catch (SQLException sqle) {
|
|
||||||
logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
} catch (EmptyResultDataAccessException erdae) {
|
|
||||||
logger.warn("No previously-found hash-locations where found for assignments_" + assignmentsBatchCounter);
|
|
||||||
} catch (Exception e) {
|
|
||||||
logger.error("Unexpected error when checking for already-found file-hashes!", e);
|
|
||||||
// We will continue with storing the files, we do not want to lose them.
|
|
||||||
} finally {
|
|
||||||
DatabaseConnector.databaseLock.unlock();
|
|
||||||
}
|
|
||||||
|
|
||||||
for ( String fileHash : fileHashes )
|
for ( String fileHash : fileHashes )
|
||||||
{
|
{
|
||||||
|
@ -319,6 +265,76 @@ public class FileUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public HashMultimap<String, Payload> getHashesWithPayloads(List<UrlReport> urlReports, int sizeOfUrlReports)
|
||||||
|
{
|
||||||
|
HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3); // Holds multiple payloads for the same fileHash.
|
||||||
|
// The "Hash" part of the multimap helps with avoiding duplicate fileHashes.
|
||||||
|
|
||||||
|
for ( UrlReport urlReport : urlReports )
|
||||||
|
{
|
||||||
|
Payload payload = urlReport.getPayload();
|
||||||
|
if ( payload == null )
|
||||||
|
continue;
|
||||||
|
|
||||||
|
String fileLocation = payload.getLocation();
|
||||||
|
if ( fileLocation == null )
|
||||||
|
continue; // The full-text was not retrieved for this UrlReport.
|
||||||
|
|
||||||
|
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
|
||||||
|
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
|
||||||
|
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
|
||||||
|
String fileHash = payload.getHash();
|
||||||
|
if ( fileHash != null )
|
||||||
|
{
|
||||||
|
hashesWithPayloads.put(fileHash, payload); // Hold multiple payloads per fileHash.
|
||||||
|
// There are 2 cases, which contribute to that:
|
||||||
|
// 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved, but instead, the location, hash and size of the file is copied to the other payload.
|
||||||
|
// 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.
|
||||||
|
|
||||||
|
// In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
|
||||||
|
// TODO - Implement a fileHash-check algorithm in the Worker's side ("PublicationsRetriever"), to avoid keeping those files in storage.
|
||||||
|
|
||||||
|
} else // This should never happen..
|
||||||
|
logger.error("Payload: " + payload + " has a null fileHash!");
|
||||||
|
}// end-for
|
||||||
|
|
||||||
|
return hashesWithPayloads;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public HashMap<String, String> getHashLocationMap(Set<String> fileHashes, int fileHashesSetSize, long assignmentsBatchCounter)
|
||||||
|
{
|
||||||
|
// Prepare the "fileHashListString" to be used inside the "getHashLocationsQuery". Create the following string-pattern:
|
||||||
|
// ("HASH_1", "HASH_2", ...)
|
||||||
|
int stringBuilderCapacity = ((fileHashesSetSize * 32) + (fileHashesSetSize -1) +2);
|
||||||
|
|
||||||
|
String getHashLocationsQuery = "select distinct `hash`, `location` from " + DatabaseConnector.databaseName + ".payload where `hash` in "
|
||||||
|
+ getQueryListString(new ArrayList<>(fileHashes), fileHashesSetSize, stringBuilderCapacity);
|
||||||
|
|
||||||
|
HashMap<String, String> hashLocationMap = new HashMap<>(fileHashesSetSize/2); // No multimap is needed since only one location is returned for each fileHash.
|
||||||
|
|
||||||
|
DatabaseConnector.databaseLock.lock(); // The execution uses the database.
|
||||||
|
try {
|
||||||
|
jdbcTemplate.query(getHashLocationsQuery, rs -> {
|
||||||
|
try { // For each of the 4 columns returned, do the following. The column-indexing starts from 1.
|
||||||
|
hashLocationMap.put(rs.getString(1), rs.getString(2));
|
||||||
|
} catch (SQLException sqle) {
|
||||||
|
logger.error("No value was able to be retrieved from one of the columns of row_" + rs.getRow(), sqle);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
} catch (EmptyResultDataAccessException erdae) {
|
||||||
|
logger.warn("No previously-found hash-locations where found for assignments_" + assignmentsBatchCounter);
|
||||||
|
} catch (Exception e) {
|
||||||
|
logger.error("Unexpected error when checking for already-found file-hashes!", e);
|
||||||
|
// We will continue with storing the files, we do not want to lose them.
|
||||||
|
} finally {
|
||||||
|
DatabaseConnector.databaseLock.unlock();
|
||||||
|
}
|
||||||
|
|
||||||
|
return hashLocationMap;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
private boolean getAndSaveFullTextBatch(List<String> fileNamesForCurBatch, String baseUrl, long assignmentsBatchCounter, int batchCounter, int numOfBatches,
|
private boolean getAndSaveFullTextBatch(List<String> fileNamesForCurBatch, String baseUrl, long assignmentsBatchCounter, int batchCounter, int numOfBatches,
|
||||||
String zstdFileFullPath, String workerId) throws RuntimeException
|
String zstdFileFullPath, String workerId) throws RuntimeException
|
||||||
{
|
{
|
||||||
|
|
Loading…
Reference in New Issue