From a1f750a0aa386c520bc7f6288269488a805a193b Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Tue, 5 Apr 2022 17:51:45 +0300 Subject: [PATCH] - Handle the case, where, from a group of related records, the initial record which led to a publication-url, failed to have its full-text downloaded. Now we make sure the file-related data for all those related records is kept "null" and a special error is written. - Code optimization. --- .../plugins/PublicationsRetrieverPlugin.java | 48 ++++++++++++------- 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index c4af623..1e53d0a 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -156,6 +156,9 @@ public class PublicationsRetrieverPlugin { } + private static final int lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage = ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length(); + private static final int lengthOfAlreadyDownloadedFromIDMessage = ConnSupportUtils.alreadyDownloadedFromIDMessage.length(); + public static void addUrlReportsToWorkerReport(Collection assignments) { // Index the UrlIds with the DatasourceIds for quick-search later. @@ -176,31 +179,40 @@ public class PublicationsRetrieverPlugin { if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) ) // The reversed order defends against a potential NPE. { status = UrlReport.StatusType.accessible; - if ( comment.startsWith(UrlUtils.alreadyDownloadedFromIDMessage, 0) ) { + if ( comment.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) ) { // The file of this docUrl was already downloaded by another docUrl. - int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage); - int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length(); - String initialId = comment.substring(UrlUtils.alreadyDownloadedFromIDMessage.length(), indexOfAlreadyDownloadedFromSourceUrlMessage); // The fileName starts right after the "message". + int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage); + int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage; + String initialId = comment.substring(lengthOfAlreadyDownloadedFromIDMessage, indexOfAlreadyDownloadedFromSourceUrlMessage); // The fileName starts right after the "message". String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl); //logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl); // DEBUG! // Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data). boolean foundAlreadyDownloadedFullText = false; - for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) { - if ( data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl)) - && ! data_2.getComment().startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) { - fileLocation = data_2.getComment(); - size = data_2.getSize(); - hash = data_2.getHash(); - mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly. - foundAlreadyDownloadedFullText = true; - break; - } + boolean foundIDUrlInWorkerReport = false; + for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) + { + if ( ! (data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))) ) + continue; + + foundIDUrlInWorkerReport = true; + String tempFileLocation = data_2.getComment(); + if ( tempFileLocation.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) || tempFileLocation.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) + continue; + + fileLocation = tempFileLocation; + size = data_2.getSize(); + hash = data_2.getHash(); + mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly. + foundAlreadyDownloadedFullText = true; + break; + } + // In case the "alreadyDownloaded" full-text is not found, we have an error. All file-related data is "null". + if ( !foundAlreadyDownloadedFullText ) { + String addErrorMessage = ((!foundIDUrlInWorkerReport) ? " | That ID-sourceUrl was not found inside the WorkerReport!" : " | The file was not downloaded!"); + error = new Error(Error.ErrorType.couldRetry, comment + addErrorMessage); // We can still try to download it from the found docUrl, in the future. } - // In case the "alreadyDownloaded" full-text is not found, we have an error. - if ( !foundAlreadyDownloadedFullText ) - error = new Error(Error.ErrorType.couldRetry, comment + " | That ID-sourceUrl was not found inside the WorkerReport!"); // We can still try to download it from the found docUrl, in the future. } - else if ( ! comment.contains(HttpConnUtils.docFileNotRetrievedMessage) ) { // If it was downloaded without an error. + else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) { // If it was downloaded without an error. fileLocation = comment; // This is the full-file-path. mimeType = "application/pdf"; } else // Else the file was not retrieved, so all file-related data are kept "null".