- Fix not updating the fileLocation with the s3Url for records which share the same full-text.
- Set only one delete-order per assignments-batch's files, instead of one (or more, by mistake) per zip-batch.
- Set the HttpStatus to "204 - NO_CONTENT", when no assignments are available to be returned to the Worker.
- Fix not unlocking the "dataBaseLock" in case of a "dataBase-connection"-error, in "addWorkerReport()".
- Improve some log-messages.
- Change the log-level for the "S3-bucket already exists" message.
- Update Gradle.
- Optimize imports.
- Code cleanup.
logger.debug("Finished gathering "+assignmentsSize+" assignments for worker with id \""+workerId+"\". Going to insert them into the \"assignment\" table and then return them to the worker.");
logger.info("Received the WorkerReport for batch-assignments_"+curReportAssignments+", from the worker with id: "+curWorkerId+". It contains "+urlReports.size()+" urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
logger.error("Failed to get and/or upload the fullTexts for assignments_"+curReportAssignments);
// The docUrls were still found! Just update ALL the fileLocations, sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads.
// The "databaseLock" was unlocked inside the "FileUtils.getAndUploadFullTexts" to avoid blocking the database while doing large irrelevant tasks like transferring files.
if(error==null){// A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of the loop)
if(error==null){// A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of this loop).
logger.warn("Error was \"null\" for \"urlReport\": "+urlReport+"\nSetting an empty object with \"null\" members.");
error=newError(null,null);
}
@ -409,6 +415,7 @@ public class UrlController {
// This will delete the rows of the "assignment" table which refer to the curWorkerId. As we have non-kudu Impala tables, the Delete operation can only succeed through a "merge" operation of the rest of the data.
// Only the rows referring to OTHER workerIDs get stored in a temp-table, while the "assignment" table gets deleted. Then, the temp_table becomes the "assignment" table.
// We do not need to keep the assignment-info anymore, the "findAssignmentsQuery" checks the payload table for previously handled tasks.
mergeErrorMsg=FileUtils.mergeParquetFiles("assignment",con," WHERE workerid != ",curWorkerId);
HashMultimap<String,String>allFileNamesWithIDsHashMap=HashMultimap.create((urlReports.size()/5),3);// Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
// Use the same prepared-statement for all requests, to improve speed (just like when inserting similar things to the DB).
if(resultSet.next()){// Move the "cursor" to the first row. If there is any data..
fileLocation=resultSet.getString(1);
if(fileLocation!=null){// If the full-text of this record is already-found.
payload.setLocation(fileLocation);// Set the location to the older identical file, which was uploaded to S3.
logger.debug("The record with ID \""+payload.getId()+"\" has an \"alreadyRetrieved\" file, with hash \""+fileHash+"\" and location \""+fileLocation+"\".");
numFilesFoundFromPreviousAssignmentsBatches++;
continue;
}
}
}catch(Exceptione){
logger.error("Error when executing or acquiring data from the the \"getFileLocationForHashQuery\"!\n"+e.getMessage());
}
}
// If the full-text of this record was not found by a previous batch..
fileLocation=payload.getLocation();
if(fileLocation!=null){// If the docFile was downloaded (without an error)..
allFileNamesWithIDsHashMap.put(fileNameWithExtension,payload.getId());// The keys and the values are not duplicate. Task with ID-1 might have an "ID-1.pdf" file.
// While a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1.
}
}
}
// Close the Prepared Statement.
try{
if(getFileLocationForHashPreparedStatement!=null)
getFileLocationForHashPreparedStatement.close();
}catch(SQLExceptionsqle){
logger.error("Failed to close the \"getFileLocationForHashPreparedStatement\"!\n"+sqle.getMessage());
}finally{
ImpalaConnector.databaseLock.unlock();// The rest work of this function does not use the database.
}
logger.info("NumFullTextUrlsFound by assignments_"+assignmentsBatchCounter+" = "+numFullTextUrlsFound+" (out of "+urlReports.size()+").");
logger.warn("The file retrieved by the Worker where < 0 > for assignments_"+assignmentsBatchCounter);
@ -164,17 +222,17 @@ public class FileUtils {
if((numAllFullTexts%numOfFullTextsPerBatch)>0)// Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
numOfBatches++;
logger.debug("The assignments_"+assignmentsBatchCounter+" have "+numAllFullTexts+" fullTexts. Going to request them from the Worker, in " +numOfBatches+" batches.");
logger.debug("The assignments_"+assignmentsBatchCounter+" have "+numAllFullTexts+" distinct non-already-uploaded fullTexts. Going to request them from the Worker \"" +workerId+"\", in " +numOfBatches+" batches.");
// Check if one full-text is left out because of the division. Put it in the last batch.
// Index all Payloads to be more efficiently searched later.
HashMultimap<String,Payload>payloadsHashMultimap =HashMultimap.create((urlReports.size()/3),3);// Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
if(fileRelatedIDs.isEmpty()){// In case the "fileName" is not inside the "allFileNamesWithIDsHashMap" HashMultimap.
logger.error("The stored file \""+fileName+"\" is not related to any ID which had a file requested from the Worker!");
continue;
}
if(!location.endsWith(fileName)){// That should NEVER happen...
logger.error("The location \""+location+"\" of the payload matched with the ID \""+id+"\" is not ending with the filename it was supposed to \""+fileName+"\"");
if(isFileNameProblematic(fileName,payloadsHashMultimap))// Do some more checks.
continue;
}
// At this point, we know that this file is related with one or more IDs of the payloads AND it has a valid fileName.
// Let's try to upload the file to S3 and update the payloads of all related IDs, either in successful upload or not.
payload.setLocation(s3Url);// Update the file-location to the new S3-url.
numUploadedFiles++;
}else
setUnretrievedFullText(payload);
setFullTextForMultipleIDs(payloadsHashMultimap,fileRelatedIDs,s3Url);// It checks whether (s3Url != null) and acts accordingly.
numUploadedFiles++;
}
// Else, the record will have its file-data set to "null", in the end of this method.
}
logger.info("Finished uploading "+numUploadedFiles+" full-texts of assignments_"+assignmentsBatchCounter+", batch_"+batchCounter+" on S3-ObjectStore.");
logger.info("Finished uploading "+numUploadedFiles+" full-texts (out of "+(fileNames.length-1)+" distinct files) from assignments_"+assignmentsBatchCounter+", batch_"+batchCounter+" on S3-ObjectStore.");
// (fileNames.length -1) --> minus the zip-file
}catch(Exceptione){
logger.error("Could not extract and upload the full-texts for batch_"+batchCounter+" of assignments_"+assignmentsBatchCounter+"\n"+e.getMessage(),e);// It shows the response body (after Spring v.2.5.6).
deleteDirectory(curBatchDir);// Delete the files of this batch (including the zip-file).
}
}// End of batches.
@ -278,7 +324,7 @@ public class FileUtils {
logger.error("None of the "+numOfBatches+" batches could be handled for assignments_"+assignmentsBatchCounter+", for worker: "+workerId);
returnfalse;
}else{
replaceNotUploadedFileLocations(urlReports);
replaceNotUploadedFileLocations(urlReports);// Make sure all records without an s3Url have null file-data.
returntrue;
}
}
@ -315,9 +361,8 @@ public class FileUtils {
StringinputLine;
while((inputLine=br.readLine())!=null)
{
if(!inputLine.isEmpty()){
if(!inputLine.isEmpty())
errorMsgStrB.append(inputLine);
}
}
return(errorMsgStrB.length()!=0)?errorMsgStrB.toString():null;// Make sure we return a "null" on empty string, to better handle the case in the caller-function.
logger.error("The given fileName \""+fileName+"\" was invalid! Could not be matched with matcher: "+matcher);
returntrue;
}
StringfileID=matcher.group(1);
if((fileID==null)||fileID.isEmpty()){
logger.error("The given fileName \""+fileName+"\" was invalid. No fileID was extracted!");
returntrue;
}
// Take the payloads which are related with this ID. An ID might have multiple original-urls, thus multiple payloads.
// The ID we have here, is the one from the first record which reached to this file.
// There might be other records pointing to this file. But, in order to mark this file as "valid", we have to match it with at least one of the records-IDs.
// We do this process to avoid handling and uploading irrelevant files which could find their way to the working directory (either because of a Worker's error or any other type of malfunction or even malicious action).
logger.error("The given fileID \""+fileID+"\" was not part of the \"payloadsHashMultimap\"!");
returntrue;
}
// Search through the payloads to find at least one match, in order for this file to NOT be "problematic".
for(Payloadpayload:payloads)
{
Stringlocation=payload.getLocation();
if((location!=null)&&location.endsWith(fileName))
returnfalse;// It's not problematic.
}
logger.error("None of the locations of the payloads matched with the ID \""+fileID+"\" are ending with the filename \""+fileName+"\" they were supposed to.");
logger.error("The given id \""+id+"\" (coming from the \"allFileNamesWithIDsHashMap\"), is not found inside the \"payloadsHashMultimap\"!");
continue;
}
for(Payloadpayload:payloads)
if(payload.getHash()!=null)// Update only for the records which led to a file, not all the records of this ID (an ID might have multiple original_urls pointing to different directions).
payload.setLocation(s3Url);// Update the file-location to the new S3-url. All the other file-data is already set from the Worker.