From a1f750a0aa386c520bc7f6288269488a805a193b Mon Sep 17 00:00:00 2001
From: LSmyrnaios <lsmyrnaios@gmail.com>
Date: Tue, 5 Apr 2022 17:51:45 +0300
Subject: [PATCH] - Handle the case where, from a group of related records,
 the initial record which led to a publication-url failed to have its
 full-text downloaded. Now we make sure the file-related data for all those
 related records is kept "null" and a special error is written. - Code
 optimization.

---
 .../plugins/PublicationsRetrieverPlugin.java  | 48 ++++++++++++-------
 1 file changed, 30 insertions(+), 18 deletions(-)
diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
index c4af623..1e53d0a 100644
--- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
@@ -156,6 +156,9 @@ public class PublicationsRetrieverPlugin {
     }
 
 
+    private static final int lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage = ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
+    private static final int lengthOfAlreadyDownloadedFromIDMessage = ConnSupportUtils.alreadyDownloadedFromIDMessage.length();
+
     public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
     {
         // Index the UrlIds with the DatasourceIds for quick-search later.
@@ -176,31 +179,40 @@ public class PublicationsRetrieverPlugin {
             if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) )  // The reversed order defends against a potential NPE.
             {
                 status = UrlReport.StatusType.accessible;
-                if ( comment.startsWith(UrlUtils.alreadyDownloadedFromIDMessage, 0) ) {
+                if ( comment.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) ) {
                     // The file of this docUrl was already downloaded by another docUrl.
-                    int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage);
-                    int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
-                    String initialId = comment.substring(UrlUtils.alreadyDownloadedFromIDMessage.length(), indexOfAlreadyDownloadedFromSourceUrlMessage);   // The fileName starts right after the "message".
+                    int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage);
+                    int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage;
+                    String initialId = comment.substring(lengthOfAlreadyDownloadedFromIDMessage, indexOfAlreadyDownloadedFromSourceUrlMessage);   // The fileName starts right after the "message".
                     String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl);
                     //logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl);    // DEBUG!
                     // Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data).
                     boolean foundAlreadyDownloadedFullText = false;
-                    for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
-                        if ( data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))
-                                && ! data_2.getComment().startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) {
-                            fileLocation = data_2.getComment();
-                            size = data_2.getSize();
-                            hash = data_2.getHash();
-                            mimeType = "application/pdf";   // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly.
-                            foundAlreadyDownloadedFullText = true;
-                            break;
-                        }
+                    boolean foundIDUrlInWorkerReport = false;
+                    for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList )
+                    {
+                        if ( ! (data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))) )
+                            continue;
+
+                        foundIDUrlInWorkerReport = true;
+                        String tempFileLocation = data_2.getComment();
+                        if ( tempFileLocation.startsWith(ConnSupportUtils.alreadyDownloadedFromIDMessage, 0) || tempFileLocation.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) )
+                            continue;
+
+                        fileLocation = tempFileLocation;
+                        size = data_2.getSize();
+                        hash = data_2.getHash();
+                        mimeType = "application/pdf";   // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly.
+                        foundAlreadyDownloadedFullText = true;
+                        break;
+                    }
+                    // In case the "alreadyDownloaded" full-text is not found, we have an error. All file-related data is "null".
+                    if ( !foundAlreadyDownloadedFullText ) {
+                        String addErrorMessage = ((!foundIDUrlInWorkerReport) ? " | That ID-sourceUrl was not found inside the WorkerReport!" : " | The file was not downloaded!");
+                        error = new Error(Error.ErrorType.couldRetry, comment + addErrorMessage); // We can still try to download it from the found docUrl, in the future.
                     }
-                    // In case the "alreadyDownloaded" full-text is not found, we have an error.
-                    if ( !foundAlreadyDownloadedFullText )
-                        error = new Error(Error.ErrorType.couldRetry, comment + " | That ID-sourceUrl was not found inside the WorkerReport!"); // We can still try to download it from the found docUrl, in the future.
                 }
-                else if ( ! comment.contains(HttpConnUtils.docFileNotRetrievedMessage) ) {  // If it was downloaded without an error.
+                else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) {  // If it was downloaded without an error.
                     fileLocation = comment; // This is the full-file-path.
                     mimeType = "application/pdf";
                 } else  // Else the file was not retrieved, so all file-related data are kept "null".