// TODO - Unify this ExecutorService with the hash-matching executorService, since one will ALWAYS be called after the other. So why maintain two ExecServices?
HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3);	// Holds multiple values for any key, if a fileName (key) has many IDs (values) associated with it.
String alreadyFoundFileLocation = hashLocationMap.get(fileHash);	// Only one location has been retrieved per fileHash.
if ( alreadyFoundFileLocation != null ) {
// Fill the payloads with locations from the "previously-found-hashes."
payload.setLocation(alreadyFoundFileLocation);
if ( logger.isTraceEnabled() )
	logger.trace("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + alreadyFoundFileLocation + "\".");	// DEBUG!
numFilesFoundFromPreviousAssignmentsBatches++;
numValidFullTextsFound++;	// We trust the location to be valid.
}
else {	// This file has not been found before.
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
logger.error("Failed to match the \"fileLocation\": \""+fileLocation+"\" of id: \""+payload.getId()+"\", originalUrl: \""+payload.getOriginal_url()+"\", using this regex: "+FILENAME_ID_EXTENSION);
logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \""+fileLocation+"\", of id: \""+payload.getId()+"\", originalUrl: \""+payload.getOriginal_url()+"\", using this regex: "+FILENAME_ID_EXTENSION);
numFullTextsWithProblematicLocations++;
continue;
}
numValidFullTextsFound++;
allFileNamesWithPayloads.put(fileNameWithExtension, payload);	// Duplicate key-value pairs are not stored.
// Task with ID-1 might have an "ID-1.pdf" file, while a task with ID-2 can also have an "ID-1.pdf" file, as pdf-url-2 might be the same as pdf-url-1; thus, the ID-2 file was not downloaded again.
return UploadFullTextsResponse.successful_without_fulltexts;	// It's not what we want, but it's not an error either.
}
ArrayList<String> allFileNames = new ArrayList<>(allFileNamesWithPayloads.keySet());	// The number of fulltexts is lower than the number of payloads, since multiple payloads may lead to the same file.
logger.info(numValidFullTextsFound + " fulltexts were retrieved for assignments_" + assignmentsBatchCounter + ", from worker: \"" + workerId + "\", but all of them have been retrieved before.");
logger.info("NumFullTextsFound by assignments_" + assignmentsBatchCounter + " = " + numValidFullTextsFound + " (out of " + sizeOfUrlReports + " | about " + df.format(numValidFullTextsFound * 100.0 / sizeOfUrlReports) + "%).");
if ( remainingFiles > 0 ) {	// Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numFullTextsToBeRequested + " distinct, non-already-uploaded fullTexts (total is: " + numValidFullTextsFound + "). Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches (" + numOfFullTextsPerBatch + " files each, except for the final batch, which will have " + remainingFiles + " files).");
logger.debug("The assignments_" + assignmentsBatchCounter + " have " + numFullTextsToBeRequested + " distinct, non-already-uploaded fullTexts (total is: " + numValidFullTextsFound + "). Going to request them from the Worker \"" + workerId + "\", in " + numOfBatches + " batches (" + numOfFullTextsPerBatch + " files each).");
// TODO - Currently, for big assignments (e.g. 10000), it takes 2 mins (actually 1.5 mins after using the Zstandard compression) for the worker to zstd the files and return them, FOR EACH BATCH.
// Also, it takes around 3 mins for the Controller to process the received files, FOR EACH BATCH.
// So, for 24 batches, it takes around 24 * (2 + 3) = 120 mins to process all the full-texts of each assignments-batch.
// The worker will not have 2 parallel requests for zstd files, so the single CPU there will not be stressed to zstd many files in parallel.
// Admittedly, the Controller may end up receiving the new zstd file from the Worker before it has finished uploading the previously received files to S3.
// TODO - BUT, we can make the new thread "WAIT" for the previous one to finish.
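// A minimal sketch of the "WAIT" idea from the TODO above: submit each batch's upload-and-process task to a
// single-threaded executor (java.util.concurrent), so the tasks run strictly in submission order and a new
// batch is handled only after the previous one has finished. The names "receivedZstdBatchFiles" and
// "uploadToS3AndProcess" are hypothetical, not part of the current code.
ExecutorService uploadExecutor = Executors.newSingleThreadExecutor();
for ( File zstdBatchFile : receivedZstdBatchFiles )
	uploadExecutor.submit(() -> uploadToS3AndProcess(zstdBatchFile));	// Queued; starts only after the previously submitted task completes.
uploadExecutor.shutdown();	// Accept no new tasks; the already-submitted ones still run, in order.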
logger.error("Could not create the \"curBatchPath\" directory: "+targetDirectory+GenericUtils.endOfLine+e.getMessage(),e);// It shows the response body (after Spring v.2.5.6).
failedBatches+=(1+(numOfBatches-batchCounter));// The "failedBatches" will have the previously failedBatches + this one + the remaining batches which will likely fail too, thus, they will not be tested. Some initial batches may have succeeded.
removeUnretrievedFullTextsFromUrlReports(urlReports,true);// Make sure all records without an S3-Url have < null > file-data (some batches or uploads might have failed).
// Check and warn about the number of failed payloads.
// Possible reasons: failure to check their hash in the DB, the file was not found inside the worker, a whole batch failed to be delivered from the worker, or files failed to be uploaded to S3.
logger.warn(numFailedPayloads + " payloads (out of " + numInitialPayloads + ") failed to be processed for assignments_" + assignmentsBatchCounter + ", for worker: " + workerId);
HashMultimap<String, Payload> hashesWithPayloads = HashMultimap.create((sizeOfUrlReports / 5), 3);	// Holds multiple payloads for the same fileHash.
// The "Hash" part of the multimap helps with avoiding duplicate fileHashes.
for ( UrlReport urlReport : urlReports )
{
Payload payload = urlReport.getPayload();
if ( payload == null )
continue;
String fileLocation = payload.getLocation();
if ( fileLocation == null )
continue;	// The full-text was not retrieved for this UrlReport.
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
String fileHash = payload.getHash();
if ( fileHash != null )
{
hashesWithPayloads.put(fileHash, payload);	// Hold multiple payloads per fileHash.
// There are 2 cases, which contribute to that:
// 1) Different publication-IDs end up giving the same full-text-url, resulting in the same file. Those duplicates are not saved; instead, the location, hash and size of the file are copied to the other payload.
// 2) Different publication-IDs end up giving different full-text-urls which point to the same file. Although very rare, in this case, the file is downloaded again by the Worker and has a different name.
// In either case, the duplicate file will not be transferred to the Controller, but in the 2nd one it takes up extra space, at least for some time.
// TODO - Implement a fileHash-check algorithm on the Worker's side ("PublicationsRetriever"), to avoid keeping those duplicate files in storage (a sketch follows the if/else below).
} else	// This should never happen.
logger.error("Payload: " + payload + " has a null fileHash!");
HashMap<String, String> hashLocationMap = new HashMap<>(fileHashesSetSize / 2);	// No multimap is needed, since only one location is returned for each fileHash.
DatabaseConnector.databaseLock.lock();	// The execution uses the database.
try {
jdbcTemplate.query(getHashLocationsQuery, rs -> {
try {	// For each of the 4 columns returned, do the following. The column-indexing starts from 1.
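// A hedged guess at the row-callback's body (the exact columns returned by "getHashLocationsQuery" are not
// shown in this excerpt); assuming the fileHash and its fileLocation are among the returned columns, taken
// here as the first two:
hashLocationMap.put(rs.getString(1), rs.getString(2));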
if ( (extractedFileNames == null) || (extractedFileNames.length <= 2) ) {	// The directory might have only two files, the "tar-file" and the "tar.zstd-file", if the full-texts failed to be decompressed or untarred.
logger.error("No full-texts' fileNames were extracted from directory: " + targetDirectory);
logger.warn("The number of extracted files (" + (extractedFileNames.length - 2) + ") was not equal to the number of files (" + fileNamesForCurBatch.size() + ") of the current batch_" + batchCounter);
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
// since, at the end of each assignments-batch, an iteration will be made and the app will set the locations of all the non-retrieved and non-uploaded full-texts to <null>.
logger.error("Could not extract and upload the full-texts for batch_"+batchCounter+" of assignments_"+assignmentsBatchCounter+GenericUtils.endOfLine+e.getMessage(),e);// It shows the response body (after Spring v.2.5.6).
//logger.debug("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");
// TODO - The above update will also enable us to improve edge-case management, like making sure we do not create a whole new batch for just a few files.
// Check this case for example, where we have one extra batch with the network, compression-decompression, transfer, uploading etc. overhead:
// 2023-02-06 12:17:26.893 [http-nio-1880-exec-8] DEBUG e.o.urls_controller.util.FileUtils.getAndUploadFullTexts(@235) - The assignments_12 have 211 distinct non-already-uploaded fullTexts.
// Going to request them from the Worker "worker_X", in 4 batches (70 files each, except for the final batch, which will have 1 files).
// If we are not limited by the url-length, we can easily say that if fewer than 10 files remain for the last batch, then add them to the previous batch (e.g. the last batch will have up to 79 files).
// If 10 or more files remain, then we will make an extra batch.
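// A hedged sketch of the edge-case handling proposed above ("MIN_REMAINING_FILES_FOR_EXTRA_BATCH" and
// "sizeOfLastBatch" are hypothetical names, not existing code): if only a few files remain, fold them into
// the previous batch instead of creating a tiny extra one.
final int MIN_REMAINING_FILES_FOR_EXTRA_BATCH = 10;
int sizeOfLastBatch = ((remainingFiles > 0) ? remainingFiles : numOfFullTextsPerBatch);
if ( (remainingFiles > 0) && (remainingFiles < MIN_REMAINING_FILES_FOR_EXTRA_BATCH) ) {
	numOfBatches --;	// Drop the tiny extra batch..
	sizeOfLastBatch = (numOfFullTextsPerBatch + remainingFiles);	// ..and let the previous batch absorb its files (e.g. 70 + 9 = 79).
}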
logger.warn("HTTP-"+statusCode+": "+errMsg+"\n\nProblem when requesting the ZstdFile of batch_"+batchNum+" from the Worker with ID \""+workerId+"\" and requestUrl: "+requestUrl);
if(((statusCode>=500)&&(statusCode<=599))
||((statusCode==400)&&((errMsg!=null)&&errMsg.contains("The base directory for assignments_"+assignmentsBatchCounter+" was not found"))))
logger.warn("Problem when requesting the ZstdFile of batch_"+batchNum+" of assignments_"+assignmentsBatchCounter+" from the Worker with ID \""+workerId+"\" and requestUrl: "+requestUrl+GenericUtils.endOfLine+exMessage);
logger.error("Since we received a \"Connection refused\", from \""+workerId+"\", all of the remaining batches ("+(totalBatches-batchNum)+") will not be requested!");
// Check if this stored file is related to one or more Payloads from the Set. Defend against malicious file injection. It does not add more overhead, since we already need the "fileRelatedPayloads".
// Prepare the filename as: "datasourceid/publicationid::hash.pdf"
// All related payloads point to this exact same file, BUT they may be related with different urlIDs, which in turn may be related with different datasourceIDs.
// This file could have been found from different urlIds and thus be related to multiple datasourceIds.
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
logger.error("Failed to match the \""+fileName+"\" with the regex: "+FILENAME_ID_EXTENSION);
continue;
}
// The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
// Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
String fileNameID = matcher.group(4);
if ( (fileNameID == null) || fileNameID.isEmpty() ) {
logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash);	// This name is for the uploaded file, in the S3 Object Store.
if ( filenameForS3 == null )	// The error is logged inside.
// Mark this full-text as not-retrieved, since it will be deleted from local-storage. The retrieved link to the full-text ("actual_url") will be kept, for now.
payload.setLocation(null);	// This will cause the payload to not be inserted into the "payload" table in the database. Only the "attempt" record will be inserted.
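// A hedged sketch of the lookup described in the comments further above (in the actual flow it would run
// before "constructS3FileName"; the original implementation is not shown in this excerpt): among the payloads
// related to this file, find the one whose ID equals the extracted "fileNameID", so that the datasourceId is
// taken from that specific payload and not from the first or a random related one.
Payload payloadRelatedToFileNameID = null;
for ( Payload relatedPayload : fileRelatedPayloads ) {
	if ( fileNameID.equals(relatedPayload.getId()) ) {
		payloadRelatedToFileNameID = relatedPayload;
		break;
	}
}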
final HashMultimap<String, Payload> urlToPayloadsMultimap = HashMultimap.create((numInitialPayloads / 3), 3);	// Holds multiple values for any key, if a url (key) has many payloads (values) associated with it.
.flatMap(payload -> Stream.of(payload.getOriginal_url(), payload.getActual_url()))	// Add both "original_url" and "actual_url" in the final results.
.collect(Collectors.toList());
// Prepare the "urlsToRetrieveRelatedIDs" to be used inside the "getDataForPayloadPrefillQuery". Create the following string-pattern: ("URL_1", "URL_2", ...)
logger.error("No results retrieved from the \"getDataForPayloadPrefillQuery\", when trying to prefill payloads, from assignment_"+assignmentsBatchCounter+".");
logger.error("Some results were retrieved from the \"getDataForPayloadPrefillQuery\", but no data could be extracted from them, when trying to prefill payloads, from assignment_"+assignmentsBatchCounter+".");
logger.debug("Final number of UrlReports is "+urlReports.size()+" | assignmentsBatchCounter: "+assignmentsBatchCounter);
// In order to avoid assigning these "prefill" records to workers before they are inserted in the "attempt" and "payload" tables,
// we have to make sure this method is called inside a "DB-locked" code block and that the "DB-unlock" happens only after all records are loaded into the DB-tables.
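// A minimal sketch of the calling pattern described above ("prefillPayloads" and "insertAttemptsAndPayloads"
// are hypothetical method names): the DB stays locked from before the prefill happens until the records are
// loaded into the "attempt" and "payload" tables, so they cannot be assigned to workers in between.
DatabaseConnector.databaseLock.lock();
try {
	prefillPayloads(urlReports, assignmentsBatchCounter);	// Must run while the DB is locked.
	insertAttemptsAndPayloads(urlReports);	// Load the records into the DB-tables.
} finally {
	DatabaseConnector.databaseLock.unlock();	// Unlock only after all records are loaded.
}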