From b2788d31a93f78f3c7f4ae234588b429a8f4f2ff Mon Sep 17 00:00:00 2001
From: LSmyrnaios <lsmyrnaios@gmail.com>
Date: Wed, 8 Sep 2021 05:02:14 +0300
Subject: [PATCH] - Integrate the latest changes from the
 "PublicationsRetriever"-plugin. The fileSize and the fileHash are now
 computed inside the plugin. - Set the "mimeType" to "null" when no docFile
 was retrieved. - Signal the scheduler that the worker is ready for work as
 soon as it has finished processing, before it posts the previous data. - Fix
 a minor bug: now return "false" when there is any problem with the url of a
 specific task. - Avoid memory re-allocations for "callableTasks".

---
 .../plugins/PublicationsRetrieverPlugin.java  | 40 +++++--------------
 .../urls_worker/util/AssignmentHandler.java   |  6 +--
 2 files changed, 12 insertions(+), 34 deletions(-)
diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
index f44b6e8..1cb809e 100644
--- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
@@ -1,7 +1,5 @@
 package eu.openaire.urls_worker.plugins;
 
-import com.google.common.hash.Hashing;
-import com.google.common.io.Files;
 import edu.uci.ics.crawler4j.url.URLCanonicalizer;
 import eu.openaire.publications_retriever.PublicationsRetriever;
 import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
@@ -24,7 +22,6 @@ import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Paths;
 import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Executors;
@@ -65,7 +62,7 @@ public class PublicationsRetrieverPlugin {
     }
 
 
-    public static void processAssginmets(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
+    public static void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
     {
         ConnSupportUtils.setKnownMimeTypes();
 
@@ -93,7 +90,7 @@ public class PublicationsRetrieverPlugin {
                 String sourceUrl = urlToCheck;    // Hold it here for the logging-messages.
                 if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
                     logger.warn("Could not canonicalize url: " + sourceUrl);
-                    UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
+                    UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
                     LoaderAndChecker.connProblematicUrls.incrementAndGet();
                     return false;
                 }
@@ -118,7 +115,8 @@ public class PublicationsRetrieverPlugin {
                     List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
                     String wasUrlValid = list.get(0);
                     String couldRetry = list.get(1);
-                    UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
+                    UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
+                    return false;
                 }
                 return true;
             });
@@ -128,7 +126,7 @@ public class PublicationsRetrieverPlugin {
                 logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");
                 LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
                 addUrlReportsToWorkerReport();
-                callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.
+                callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
             }
         }// end tasks-for-loop
     }
@@ -140,10 +138,8 @@ public class PublicationsRetrieverPlugin {
     {
         for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
         {
-            String status = null, fileLocation = null, hash = null;
-            Long size = null;
+            String status = null, fileLocation = null, comment = data.getComment(), mimeType = null;
             Error error = null;
-            String comment = data.getComment();
 
             if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
             {
@@ -164,26 +160,8 @@ public class PublicationsRetrieverPlugin {
                     fileLocation = "File not retrieved";
                 else {
                     fileLocation = comment;
-                    try {
-                        File docFile = new File(fileLocation);
-                        if ( docFile.isFile() ) {
-                            hash = Files.hash(docFile, Hashing.md5()).toString();    // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
-                            //logger.debug("MD5 for file \"" + docFile.getName() + "\": " + hash); // DEBUG!
-                            size = java.nio.file.Files.size(Paths.get(fileLocation));
-                            //logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG!
-                        } else
-                            logger.error("No file was found with path: " + fileLocation);
-                    } catch (Exception e) {
-                        if ( hash == null )
-                            logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation);
-
-                        if ( size == null )
-                            logger.error("Could not retrieve the size of the file: " + fileLocation);
-
-                        e.printStackTrace();
-                    }
+                    mimeType = "application/pdf";
                 }
-
                 error = new Error(null, null);  // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
             }
             else {
@@ -198,7 +176,7 @@ public class PublicationsRetrieverPlugin {
             if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
                 docOrDatasetUrl = null;
 
-            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
+            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), mimeType, data.getSize(), data.getHash(), fileLocation, "crawl:PublicationsRetriever");
             // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
 
             AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
@@ -214,7 +192,7 @@ public class PublicationsRetrieverPlugin {
             List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
             String wasUrlValid = list.get(0);
             String couldRetry = list.get(1);
-            UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
+            UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
             return false;
         }
     }
diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java
index c2dab55..562dd10 100644
--- a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java
+++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java
@@ -104,14 +104,14 @@ public class AssignmentHandler {
         // For now, let's just run all tasks in the generic plugin.
 
         try {
-            PublicationsRetrieverPlugin.processAssginmets(assignmentRequestCounter, assignmentsForPlugins.values());
+            PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values());
         } catch (Exception e) {
             logger.error(e.getMessage(), e);
         }
 
-        postWorkerReport(assignmentRequestCounter);
+        isAvailableForWork = true;  // State this before posting, to catch the soonest next scheduled request.
 
-        isAvailableForWork = true;
+        postWorkerReport(assignmentRequestCounter);
     }