- Integrate the latest changes from the "PublicationsRetriever" plugin. The fileSize and the fileHash are now computed inside the plugin.

- Set the "mimeType" to "null" when no docFile was retrieved.
- Signal the scheduler that the worker is ready for work as soon as it has finished processing, even if it has not yet posted the previous data.
- Fix a minor bug: now return "false" when there is any problem with the url of a specific task.
- Avoid memory re-allocations for "callableTasks".
2021-09-08 05:02:14 +03:00 · 2021-09-08 05:02:14 +03:00 · b2788d31a9
parent 6fd9eed1ec
commit b2788d31a9
2 changed files with 12 additions and 34 deletions
--- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
@ -1,7 +1,5 @@
 package eu.openaire.urls_worker.plugins;

-import com.google.common.hash.Hashing;
-import com.google.common.io.Files;
 import edu.uci.ics.crawler4j.url.URLCanonicalizer;
 import eu.openaire.publications_retriever.PublicationsRetriever;
 import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
@ -24,7 +22,6 @@ import java.io.File;
 import java.io.FileNotFoundException;
 import java.io.FileOutputStream;
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Paths;
 import java.util.*;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Executors;
@ -65,7 +62,7 @@ public class PublicationsRetrieverPlugin {
    }


-    public static void processAssginmets(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
+    public static void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
    {
        ConnSupportUtils.setKnownMimeTypes();

@ -93,7 +90,7 @@ public class PublicationsRetrieverPlugin {
                String sourceUrl = urlToCheck;    // Hold it here for the logging-messages.
                if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
                    logger.warn("Could not canonicalize url: " + sourceUrl);
-                    UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
+                    UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
                    LoaderAndChecker.connProblematicUrls.incrementAndGet();
                    return false;
                }
@ -118,7 +115,8 @@ public class PublicationsRetrieverPlugin {
                    List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
                    String wasUrlValid = list.get(0);
                    String couldRetry = list.get(1);
-                    UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
+                    UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
+                    return false;
                }
                return true;
            });
@ -128,7 +126,7 @@ public class PublicationsRetrieverPlugin {
                logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");
                LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
                addUrlReportsToWorkerReport();
-                callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.
+                callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
            }
        }// end tasks-for-loop
    }
@ -140,10 +138,8 @@ public class PublicationsRetrieverPlugin {
    {
        for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
        {
-            String status = null, fileLocation = null, hash = null;
-            Long size = null;
+            String status = null, fileLocation = null, comment = data.getComment(), mimeType = null;
            Error error = null;
-            String comment = data.getComment();

            if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
            {
@ -164,26 +160,8 @@ public class PublicationsRetrieverPlugin {
                    fileLocation = "File not retrieved";
                else {
                    fileLocation = comment;
-                    try {
-                        File docFile = new File(fileLocation);
-                        if ( docFile.isFile() ) {
-                            hash = Files.hash(docFile, Hashing.md5()).toString();    // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
-                            //logger.debug("MD5 for file \"" + docFile.getName() + "\": " + hash); // DEBUG!
-                            size = java.nio.file.Files.size(Paths.get(fileLocation));
-                            //logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG!
-                        } else
-                            logger.error("No file was found with path: " + fileLocation);
-                    } catch (Exception e) {
-                        if ( hash == null )
-                            logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation);
-
-                        if ( size == null )
-                            logger.error("Could not retrieve the size of the file: " + fileLocation);
-
-                        e.printStackTrace();
-                    }
+                    mimeType = "application/pdf";
                }
-
                error = new Error(null, null);  // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
            }
            else {
@ -198,7 +176,7 @@ public class PublicationsRetrieverPlugin {
            if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
                docOrDatasetUrl = null;

-            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
+            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), mimeType, data.getSize(), data.getHash(), fileLocation, "crawl:PublicationsRetriever");
            // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.

            AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
@ -214,7 +192,7 @@ public class PublicationsRetrieverPlugin {
            List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
            String wasUrlValid = list.get(0);
            String couldRetry = list.get(1);
-            UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
+            UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
            return false;
        }
    }
--- a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java
+++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java
@ -104,14 +104,14 @@ public class AssignmentHandler {
        // For now, let's just run all tasks in the generic plugin.

        try {
-            PublicationsRetrieverPlugin.processAssginmets(assignmentRequestCounter, assignmentsForPlugins.values());
+            PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values());
        } catch (Exception e) {
            logger.error(e.getMessage(), e);
        }

-        postWorkerReport(assignmentRequestCounter);
+        isAvailableForWork = true;  // State this before posting, to catch the soonest next scheduled request.

-        isAvailableForWork = true;
+        postWorkerReport(assignmentRequestCounter);
    }