diff --git a/installAndRun.sh b/installAndRun.sh index 18e4437..c914792 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -45,7 +45,7 @@ if [[ justInstall -eq 0 ]]; then sudo docker --version || (echo -e "Docker was not found!"; exit 9) dockerImage=${username}"/urls_controller:latest" sudo docker build -t "${dockerImage}" . - echo -e "\nPushing docker image.. (the account password is required)..\n" + echo -e "\nPushing docker image.. (the account password is required, otherwise it will not be pushed, but it will continue to run)..\n" (sudo docker login -u "${username}" && sudo docker push "${dockerImage}") || true (sudo mkdir -p "$HOME"/tmp/config && sudo cp ./src/main/resources/application.properties "$HOME"/tmp/config) || true # This also replaces an existing "application.properties". sudo docker run -d --mount type=bind,source="$HOME"/tmp/config,target=/mnt/config -p 1880:1880 "${dockerImage}" && echo "The docker container started running." diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 0d197a4..90b4f56 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -105,33 +105,32 @@ public class FileUtils { remoteAddr = request.getRemoteAddr(); // Get the file-locations. - int numFullTextUrlsFound = 0; + int numFullTextsFound = 0; int numFilesFoundFromPreviousAssignmentsBatches = 0; HashMultimap allFileNamesWithPayloads = HashMultimap.create((urlReports.size() / 5), 3); // Holds multiple values for any key, if a fileName(key) has many IDs(values) associated with it. String getFileLocationForHashQuery = "select `location` from " + databaseName + ".payload where `hash` = ? limit 1" ; + final int[] hashArgType = new int[] {Types.VARCHAR}; ImpalaConnector.databaseLock.lock(); - for ( UrlReport urlReport : urlReports ) { - UrlReport.StatusType statusType = urlReport.getStatus(); - if ( (statusType == null) || statusType.equals(UrlReport.StatusType.non_accessible) ) { - continue; - } - numFullTextUrlsFound ++; - + for ( UrlReport urlReport : urlReports ) + { Payload payload = urlReport.getPayload(); if ( payload == null ) continue; - String fileLocation = null; + String fileLocation = payload.getLocation(); + if ( fileLocation == null ) + continue; // The full-text was not retrieved, go to the next UrlReport. // Query the payload-table FOR EACH RECORD to get the fileLocation of A PREVIOUS RECORD WITH THE SAME FILE-HASH. // If no result is returned, then this record is not previously found, so go ahead and add it in the list of files to request from the worker. // If a file-location IS returned (for this hash), then this file is already uploaded to the S3. Update the record to point to that file-location and do not request that file from the Worker. String fileHash = payload.getHash(); if ( fileHash != null ) { + String alreadyFoundFileLocation = null; try { - fileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, new int[] {Types.VARCHAR}, String.class); + alreadyFoundFileLocation = jdbcTemplate.queryForObject(getFileLocationForHashQuery, new Object[] {fileHash}, hashArgType, String.class); } catch (EmptyResultDataAccessException erdae) { // No fileLocation is found, it's ok. It will be null by default. } catch (Exception e) { @@ -142,33 +141,33 @@ public class FileUtils { // Unless we do what it is said above, do not continue to the next UrlReport, this query-exception should not disrupt the normal full-text processing. } - if ( fileLocation != null ) { // If the full-text of this record is already-found and uploaded. - payload.setLocation(fileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical. - //logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + fileLocation + "\"."); // DEBUG! + if ( alreadyFoundFileLocation != null ) { // If the full-text of this record is already-found and uploaded. + payload.setLocation(alreadyFoundFileLocation); // Set the location to the older identical file, which was uploaded to S3. The other file-data is identical. + //logger.debug("The record with ID \"" + payload.getId() + "\" has an \"alreadyRetrieved\" file, with hash \"" + fileHash + "\" and location \"" + alreadyFoundFileLocation + "\"."); // DEBUG! numFilesFoundFromPreviousAssignmentsBatches ++; + numFullTextsFound ++; continue; // Do not request the file from the worker, it's already uploaded. Move on. } } - // If the full-text of this record was not found by a previous batch... - fileLocation = payload.getLocation(); - if ( fileLocation != null ) { // If the docFile was downloaded (without an error).. - Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation); - if ( ! matcher.matches() ) { - continue; - } - String fileNameWithExtension = matcher.group(1); - if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) { - continue; - } - allFileNamesWithPayloads.put(fileNameWithExtension, payload); // The keys and the values are not duplicate. - // Task with ID-1 might have an "ID-1.pdf" file, while a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1, thus, the ID-2 file was not downloaded again. + // Extract the "fileNameWithExtension" to be added in the HashMultimap. + Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation); + if ( ! matcher.matches() ) { + continue; } + String fileNameWithExtension = matcher.group(1); + if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) { + continue; + } + + numFullTextsFound ++; + allFileNamesWithPayloads.put(fileNameWithExtension, payload); // The keys and the values are not duplicate. + // Task with ID-1 might have an "ID-1.pdf" file, while a task with ID-2 can also have an "ID-1.pdf" file, as the pdf-url-2 might be the same with pdf-url-1, thus, the ID-2 file was not downloaded again. }// end-for ImpalaConnector.databaseLock.unlock(); // The remaining work of this function does not use the database. - logger.info("NumFullTextUrlsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextUrlsFound + " (out of " + urlReports.size() + ")."); + logger.info("NumFullTextsFound by assignments_" + assignmentsBatchCounter + " = " + numFullTextsFound + " (out of " + urlReports.size() + ")."); logger.debug("NumFilesFoundFromPreviousAssignmentsBatches = " + numFilesFoundFromPreviousAssignmentsBatches); ArrayList allFileNames = new ArrayList<>(allFileNamesWithPayloads.keySet());