- Improve the search accuracy for "alreadyDownloaded" full-texts.

- Handle the potential error case where an "alreadyDownloaded" full-text is not found in the "FileUtils.dataToBeLoggedList".
2 years ago · 0032a8018f
parent d61ff4b6dd
commit 0032a8018f
1 changed file with 12 additions and 7 deletions
--- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
@ -158,13 +158,17 @@ public class PublicationsRetrieverPlugin {
            if ( "true".equals(data.getWasDocumentOrDatasetAccessible()) )  // The reversed order defends against a potential NPE.
            {
                status = UrlReport.StatusType.accessible;
-                if ( comment.startsWith(UrlUtils.alreadyDownloadedByIDMessage, 0) ) {
+                if ( comment.startsWith(UrlUtils.alreadyDownloadedFromIDMessage, 0) ) {
                    // The file of this docUrl was already downloaded by another docUrl.
-                    String initialId = comment.substring(UrlUtils.alreadyDownloadedByIDMessage.length());   // The fileName starts right after the "message".
-                    //logger.debug("initialId: " + initialId);    // DEBUG!
-                    // Search that ID inside the list and if that instance gave the docUrl (there might be multiple ID instances) then get the file-location.
+                    int indexOfAlreadyDownloadedFromSourceUrlMessage = comment.indexOf(UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage);
+                    int indexOfAlreadyDownloadedFromSourceUrl = indexOfAlreadyDownloadedFromSourceUrlMessage + UrlUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length();
+                    String initialId = comment.substring(UrlUtils.alreadyDownloadedFromIDMessage.length(), indexOfAlreadyDownloadedFromSourceUrlMessage);   // The fileName starts right after the "message".
+                    String initialSourceUrl = comment.substring(indexOfAlreadyDownloadedFromSourceUrl);
+                    //logger.debug("initialId: " + initialId + " | sourceUrl: " + initialSourceUrl);    // DEBUG!
+                    // Search that ID and sourceUrl inside the list, if that instance is the first-found one, then get the file-data (there might be duplicate ID-sourceUrl instances, but only one of them has the file-data).
                    for ( DataToBeLogged data_2 : FileUtils.dataToBeLoggedList ) {
-                        if ( data_2.getUrlId().equals(initialId) && ("true".equals(data_2.getWasDocumentOrDatasetAccessible())) ) {
+                        if ( data_2.getUrlId().equals(initialId) && (data_2.getSourceUrl().equals(initialSourceUrl))
+                                && ! data_2.getComment().startsWith(UrlUtils.alreadyDownloadedFromIDMessage) ) {
                            fileLocation = data_2.getComment();
                            size = data_2.getSize();
                            hash = data_2.getHash();
@ -172,7 +176,8 @@ public class PublicationsRetrieverPlugin {
                            break;
                        }
                    }
-                    // TODO - The case where the "twin-ID" is not found, should "never" happen. But should we check? How to handle if that is the case..?
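+                    // Intended for the case where the "alreadyDownloaded" full-text is not found. NOTE(review): this assignment is unguarded, so it also runs after a successful match (the loop's "break"); it should be guarded by a "found" flag.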
+                    error = new Error(Error.ErrorType.couldRetry, comment + " | That ID-sourceUrl was not found inside the WorkerReport!"); // We can still try to download it from the found docUrl, in the future.
                }
                else if ( ! comment.equals(HttpConnUtils.docFileNotRetrievedMessage) ) {  // If it was downloaded without an error.
                    fileLocation = comment; // This is the full-file-path.
@ -181,7 +186,7 @@ public class PublicationsRetrieverPlugin {
                    error = new Error(Error.ErrorType.couldRetry, comment); // We can still try to download it from the found docUrl, in the future.

                if ( error == null )    // If the file was retrieved, in any time.
-                    error = new Error(Error.ErrorType.couldRetry, null);  // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
+                    error = new Error(Error.ErrorType.couldRetry, null);  // We do not want to send a "null" Error-object, since it just adds more complicated handling in the controller..
            }
            else {
                status = UrlReport.StatusType.non_accessible;