// Create a temporary parquet copy of the table, filtered by the given where-clause.
// NOTE(review): Impala DDL cannot take bound parameters, so the names are concatenated into the SQL.
// Make sure "tableName", "whereClause" and "parameter" are never user-controlled (SQL-injection risk).
jdbcTemplate.execute("CREATE TABLE "+ImpalaConnector.databaseName+"."+tableName+"_tmp stored as parquet AS SELECT * FROM "+ImpalaConnector.databaseName+"."+tableName+whereClause+parameter);
HashMultimap<String,Payload>allFileNamesWithPayloads=HashMultimap.create((urlReportsSize/5),3);// Holds multiple values for any key: one fileName (key) may be associated with many payloads (values). Initial sizing assumes roughly 1/5 of the urlReports yield distinct fileNames, with ~3 payloads each — TODO confirm these ratios against production data.
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
// No fileLocation is found, it's ok. It will be null by default.
} catch (Exception e) {
	// Log the full exception (with stack trace); the query name identifies the failing operation.
	logger.error("Error when executing or acquiring data from the \"getFileLocationForHashQuery\"!\n", e);
	// TODO - SHOULD WE RETURN A "UploadFullTextsResponse.databaseError" AND force the caller to not even insert the payloads to the database??
	// TODO - Since the database will have problems.. there is no point in trying to insert the payloads to Impala (we will handle it like: we tried to insert and got an error).
	// TODO - In case we DO return, UNLOCK the database-lock and close the Prepared statement (it's not auto-closed here) and the Database connection.
if(alreadyFoundFileLocation!=null){// If the full-text of this record was already found and uploaded (deduplicated by file-hash).
payload.setLocation(alreadyFoundFileLocation);// Set the location to the older identical file, which was uploaded to S3. The other file-data is identical.
//logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + alreadyFoundFileLocation + "\"."); // DEBUG!
continue;// Do not request the file from the worker, it's already uploaded. Move on. The "location" will be filled by the "setFullTextForMultiplePayloads()" method, later.
allFileNamesWithPayloads.put(fileNameWithExtension,payload);// Multimap insert: the (key, value) pairs are unique, but one fileName-key can map to multiple payloads.
// Task with ID-1 might have an "ID-1.pdf" file, while a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1, thus, the ID-2 file was not downloaded again.
logger.info("NumFullTextsFound by assignments_"+assignmentsBatchCounter+" = "+numFullTextsFound+" (out of "+urlReportsSize+" | about "+df.format(numFullTextsFound*100.0/urlReportsSize)+"%).");
if((numAllFullTexts%numOfFullTextsPerBatch)>0)// Add an extra batch for the remaining files. This guarantees at least one batch will exist no matter how few (>0) the files are.
logger.debug("The assignments_"+assignmentsBatchCounter+" have "+numAllFullTexts+" distinct non-already-uploaded fullTexts. Going to request them from the Worker \""+workerId+"\", in "+numOfBatches+" batches.");
failedBatches+=(1+(numOfBatches-batchCounter));// The "failedBatches" will have the previously failedBatches + this one + the remaining batches which will likely fail too, thus, they will not be tested.
logger.error("Could not extract and upload the full-texts for batch_"+batchCounter+" of assignments_"+assignmentsBatchCounter+"\n"+e.getMessage(),e);// It shows the response body (after Spring v.2.5.6).
updateUrlReportsToHaveNoFullTextFiles(urlReports,true);// Make sure all records without an S3-Url have < null > file-data (some batches or uploads might have failed).
//logger.debug("Going to request the batch_" + batchNum + " (out of " + totalBatches + ") with " + fileNamesForCurBatch.size() + " fullTexts, of assignments_" + assignmentsBatchCounter + " from the Worker with ID \"" + workerId + "\" and baseRequestUrl: " + baseUrl + "[fileNames]");
logger.warn("HTTP-"+statusCode+": "+getMessageFromResponseBody(conn,true)+"\nProblem when requesting the ZipFile of batch_"+batchNum+" from the Worker with ID \""+workerId+"\" and requestUrl: "+requestUrl);
logger.warn("Problem when requesting the ZipFile of batch_"+batchNum+" of assignments_"+assignmentsBatchCounter+" from the Worker with ID \""+workerId+"\" and requestUrl: "+requestUrl+"\n"+exMessage);
if(exMessage.contains("Connection refused")){// NOTE(review): matching on the exception-message text is fragile (locale/JDK dependent) — consider checking the exception type (e.g. ConnectException) instead.
logger.error("Since we received a \"Connection refused\", all of the remaining batches ("+(totalBatches-batchNum)+") will not be requested!");
if(fileFullPath.equals(zipFileFullPath))// Exclude the zip-file itself from uploading; only its extracted contents are wanted.
continue;
// Check if this stored file is related to one or more Payloads from the Set. Defend against malicious file injection. It does not add more overhead, since we already need the "fileRelatedPayloads".
if(fileRelatedPayloads.isEmpty()){// In case the "fileName" is not inside the "allFileNamesWithPayloads" HashMultimap.
logger.error("The stored file \""+fileName+"\" is not related to any Payload returned from the Worker!");
continue;// Skip this unexpected file; it was never requested from the Worker.
}
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
try{
// Prepare the filename as: "datasourceid/publicationid::hash.pdf"
// All related payloads point to this exact same file, BUT, may be related with different urlIDs, which in turn be related with different datasourceIDs.
// This file could have been found from different urlIds and thus be related to multiple datasourceIds.
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
logger.error("Failed to extract the \"dotFileExtension\" from \""+fileName+"\".");
continue;// Without the file-extension, the S3 file-name cannot be built; skip this file.
}
// This file is related with some payloads, in a sense that these payloads have urls which lead to the same full-text url.
// These payloads might have different IDs and sourceUrls. But, in the end, the different sourceUrls give the same full-text.
// Below, we make sure we pick the "datasource" from the payload, which has the same id as the full-text's name.
// If there are multiple payloads with the same id, which point to the same file, then we can take whatever datasource we want from those payloads.
// It is possible that payloads with same IDs, but different sourceUrls pointing to the same full-text, can be related with different datasources
// (especially for IDs of type: "doiboost_____::XXXXXXXXXXXXXXXXXXXXX").
// It does not really matter, since the first-ever payload to give this full-text could very well be another one,
// since the crawling happens in multiple threads which compete with each other for CPU time.
StringdatasourceId=null;
Stringhash=null;
booleanisFound=false;
for(Payloadpayload:fileRelatedPayloads){
if(fileNameID.equals(payload.getId())){
datasourceId=payload.getDatasourceId();
hash=payload.getHash();
isFound=true;
break;
}
}
if(!isFound){// This should never normally happen. If it does, then a very bad change will have taken place.
logger.error("The \"fileNameID\" ("+fileNameID+") was not found inside the \"fileRelatedPayloads\" for fileName: "+fileName);
continue;
}
// Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
// Now we append the file-hash, so it is guaranteed that the filename will be unique.
// Mark this full-text as not-retrieved, since it will be deleted from local-storage. The retrieved link to the full-text ("actual_url") will be kept, for now.
payload.setLocation(null);// This will cause the payload to not be inserted into the "payload" table in the database. Only the "attempt" record will be inserted.