- Replace the "numFullTextUrlsFound" counter with a "numFullTextsFound" counter, so that it reflects the end result: the number of full-texts that are actually available (i.e. those which were downloaded by the Worker).
- Optimize the gather-fileNames loop.
- Improve a message in "installAndRun.sh".
This commit is contained in:
parent
ad5dbdde9b
commit
88acaae20f
|
@ -45,7 +45,7 @@ if [[ justInstall -eq 0 ]]; then
|
||||||
sudo docker --version || (echo -e "Docker was not found!"; exit 9)
|
sudo docker --version || (echo -e "Docker was not found!"; exit 9)
|
||||||
dockerImage=${username}"/urls_controller:latest"
|
dockerImage=${username}"/urls_controller:latest"
|
||||||
sudo docker build -t "${dockerImage}" .
|
sudo docker build -t "${dockerImage}" .
|
||||||
echo -e "\nPushing docker image.. (the account password is required)..\n"
|
echo -e "\nPushing docker image.. (the account password is required, otherwise it will not be pushed, but it will continue to run)..\n"
|
||||||
(sudo docker login -u "${username}" && sudo docker push "${dockerImage}") || true
|
(sudo docker login -u "${username}" && sudo docker push "${dockerImage}") || true
|
||||||
(sudo mkdir -p "$HOME"/tmp/config && sudo cp ./src/main/resources/application.properties "$HOME"/tmp/config) || true # This also replaces an existing "application.properties".
|
(sudo mkdir -p "$HOME"/tmp/config && sudo cp ./src/main/resources/application.properties "$HOME"/tmp/config) || true # This also replaces an existing "application.properties".
|
||||||
sudo docker run -d --mount type=bind,source="$HOME"/tmp/config,target=/mnt/config -p 1880:1880 "${dockerImage}" && echo "The docker container started running."
|
sudo docker run -d --mount type=bind,source="$HOME"/tmp/config,target=/mnt/config -p 1880:1880 "${dockerImage}" && echo "The docker container started running."
|
||||||
|
|
|
@ -105,33 +105,32 @@ public class FileUtils {
|
||||||
remoteAddr = request.getRemoteAddr();
|
remoteAddr = request.getRemoteAddr();
|
||||||
|
|
||||||
// Get the file-locations.
|
// Get the file-locations.
|
||||||
int numFullTextUrlsFound = 0;
|
int numFullTextsFound = 0;
|
||||||
int numFilesFoundFromPreviousAssignmentsBatches = 0;
|
int numFilesFoundFromPreviousAssignmentsBatches = 0;
|
||||||
HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((urlReports.size() / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
|
HashMultimap<String, Payload> allFileNamesWithPayloads = HashMultimap.create((urlReports.size() / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it.
|
||||||
String getFileLocationForHashQuery = "select `location` from " + databaseName + ".payload where `hash` = ? limit 1" ;
|
String getFileLocationForHashQuery = "select `location` from " + databaseName + ".payload where `hash` = ? limit 1" ;
|
||||||
|
final int[] hashArgType = new int[] {Types.VARCHAR};
|
||||||
|
|
||||||
ImpalaConnector.databaseLock.lock();
|
ImpalaConnector.databaseLock.lock();
|
||||||
|
|
||||||
for ( UrlReport urlReport : urlReports ) {
|
for ( UrlReport urlReport : urlReports )
|
||||||
UrlReport.StatusType statusType = urlReport.getStatus();
|
{
|
||||||
if ( (statusType == null) || statusType.equals(UrlReport.StatusType.non_accessible) ) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
numFullTextUrlsFound ++;
|
|
||||||
|
|
||||||
Payload payload = urlReport.getPayload();
|
Payload payload = urlReport.getPayload();
|
||||||
if ( payload == null )
|
if ( payload == null )
|
||||||
continue;
|
continue;
|
||||||
|
|
||||||
String fileLocation = null;
|
String fileLocation = payload.getLocation();
|
||||||
|
if ( fileLocation == null )
|
||||||
|
continue; // The full-text was not retrieved, go to the next UrlReport.
|
||||||
|
|
||||||
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
|
// Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH.
|
||||||
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
|
// If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker.
|
||||||
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
|
// If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker.
|
||||||
String fileHash = payload.getHash();
|
String fileHash = payload.getHash();
|
||||||
if ( fileHash != null ) {
|
if ( fileHash != null ) {
|
||||||
|
String alreadyFoundFileLocation = null;
|
||||||
try {
|
try {
|
||||||
fileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, new int[] {Types.VARCHAR}, String.class);
|
alreadyFoundFileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, hashArgType, String.class);
|
||||||
} catch (EmptyResultDataAccessException erdae) {
|
} catch (EmptyResultDataAccessException erdae) {
|
||||||
// No fileLocation is found, it's ok. It will be null by default.
|
// No fileLocation is found, it's ok. It will be null by default.
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
|
@ -142,33 +141,33 @@ public class FileUtils {
|
||||||
// Unless we do what it is said above, do not continue to the next UrlReport, this query-exception should not disrupt the normal full-text processing.
|
// Unless we do what it is said above, do not continue to the next UrlReport, this query-exception should not disrupt the normal full-text processing.
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( fileLocation != null ) { // If the full-text of this record is already-found and uploaded.
|
if ( alreadyFoundFileLocation != null ) { // If the full-text of this record is already-found and uploaded.
|
||||||
payload.setLocation(fileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical.
|
payload.setLocation(alreadyFoundFileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical.
|
||||||
//logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\"."); // DEBUG!
|
//logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + alreadyFoundFileLocation + "\"."); // DEBUG!
|
||||||
numFilesFoundFromPreviousAssignmentsBatches ++;
|
numFilesFoundFromPreviousAssignmentsBatches ++;
|
||||||
|
numFullTextsFound ++;
|
||||||
continue; // Do not request the file from the worker, it's already uploaded. Move on.
|
continue; // Do not request the file from the worker, it's already uploaded. Move on.
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If the full-text of this record was not found by a previous batch...
|
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
|
||||||
fileLocation = payload.getLocation();
|
Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
|
||||||
if ( fileLocation != null ) { // If the docFile was downloaded (without an error)..
|
if ( ! matcher.matches() ) {
|
||||||
Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
|
continue;
|
||||||
if ( ! matcher.matches() ) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
String fileNameWithExtension = matcher.group(1);
|
|
||||||
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
allFileNamesWithPayloads.put(fileNameWithExtension, payload); // The keys and the values are not duplicate.
|
|
||||||
// Task with ID-1 might have an "ID-1.pdf" file, while a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1, thus, the ID-2 file was not downloaded again.
|
|
||||||
}
|
}
|
||||||
|
String fileNameWithExtension = matcher.group(1);
|
||||||
|
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
numFullTextsFound ++;
|
||||||
|
allFileNamesWithPayloads.put(fileNameWithExtension, payload); // The keys and the values are not duplicate.
|
||||||
|
// Task with ID-1 might have an "ID-1.pdf" file, while a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1, thus, the ID-2 file was not downloaded again.
|
||||||
}// end-for
|
}// end-for
|
||||||
|
|
||||||
ImpalaConnector.databaseLock.unlock(); // The remaining work of this function does not use the database.
|
ImpalaConnector.databaseLock.unlock(); // The remaining work of this function does not use the database.
|
||||||
|
|
||||||
logger.info("NumFullTextUrlsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextUrlsFound + " (out of " + urlReports.size() + ").");
|
logger.info("NumFullTextsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextsFound + " (out of " + urlReports.size() + ").");
|
||||||
logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches);
|
logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches);
|
||||||
|
|
||||||
ArrayList<String> allFileNames = new ArrayList<>(allFileNamesWithPayloads.keySet());
|
ArrayList<String> allFileNames = new ArrayList<>(allFileNamesWithPayloads.keySet());
|
||||||
|
|
Loading…
Reference in New Issue