From 0032a8018f14e925d999df83d03820c1d289df48 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 17 Jan 2022 10:12:48 +0200 Subject: [PATCH] - Improve search-accuracy of "alreadyDownloaded" full-texts. - Handle the potential error-case of an "alreadyDownloaded" full-text not being discovered inside the "FileUtils.dataToBeLoggedList". --- .../plugins/PublicationsRetrieverPlugin.java | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index 6b68059..12a5d8c 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -158,13 +158,17 @@ public class PublicationsRetrieverPlugin { if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) ) // The reversed order defends against a potential NPE. { status = UrlReport.StatusType.accessible; - if ( comment.startsWith(UrlUtils.alreadyDownloadedByIDMessage, 0) ) { + if ( comment.startsWith(UrlUtils.alreadyDownloadedFromIDMessage, 0) ) { // The file of this docUrl was already downloaded by another docUrl. - String initialId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length()); // The fileName starts right after the "message". - //logger.debug("initialId: " + initialId); // DEBUG! - // Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location. + int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage); + int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length(); + String initialId = comment.substring(UrlUtils.alreadyDownloadedFromIDMessage.length(), indexOfAlreadyDownloadedFromSourceUrlMessage); // The fileName starts right after the "message". + String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl); + //logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl); // DEBUG! + // Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data). for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) { - if ( data_2.getUrlId().equals(initialId) && ("true".equals(data_2.getWasDocumentOrDatasetAccessible())) ) { + if ( data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl)) + && ! data_2.getComment().startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) { fileLocation = data_2.getComment(); size = data_2.getSize(); hash = data_2.getHash(); @@ -172,7 +176,8 @@ public class PublicationsRetrieverPlugin { break; } } - // TODO - The case where the "twin-ID" is not found, should "never" happen. But should we check? How to handle if that is the case..? + // In case the "alreadyDownloaded" full-text is not found, we have an error. + error = new Error(Error.ErrorType.couldRetry, comment + " | That ID-sourceUrl was not found inside the WorkerReport!"); // We can still try to download it from the found docUrl, in the future. } else if ( ! comment.equals(HttpConnUtils.docFileNotRetrievedMessage) ) { // If it was downloaded without an error. fileLocation = comment; // This is the full-file-path. @@ -181,7 +186,7 @@ public class PublicationsRetrieverPlugin { error = new Error(Error.ErrorType.couldRetry, comment); // We can still try to download it from the found docUrl, in the future. if ( error == null ) // If the file was retrieved, in any time. - error = new Error(Error.ErrorType.couldRetry, null); // We do not want to send a "null" object, since it just adds more complicated handling in the controller.. + error = new Error(Error.ErrorType.couldRetry, null); // We do not want to send a "null" Error-object, since it just adds more complicated handling in the controller.. } else { status = UrlReport.StatusType.non_accessible;