From b2788d31a93f78f3c7f4ae234588b429a8f4f2ff Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 8 Sep 2021 05:02:14 +0300 Subject: [PATCH] - Integrate the latest changes from the "PublicationsRetriever"-plugin. The fileSize and the fileHash are computed inside the plugin now. - Make the "mimeType" "null", when no docFile was retrieved. - Signal the scheduler that the worker is ready for work, when it has finished processing but not yet posted the previous data. - Fix a minor bug; now return "false" when there is any problem with the url of a specific task. - Avoid memory re-allocations for "callableTasks". --- .../plugins/PublicationsRetrieverPlugin.java | 40 +++++-------------- .../urls_worker/util/AssignmentHandler.java | 6 +-- 2 files changed, 12 insertions(+), 34 deletions(-) diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index f44b6e8..1cb809e 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -1,7 +1,5 @@ package eu.openaire.urls_worker.plugins; -import com.google.common.hash.Hashing; -import com.google.common.io.Files; import edu.uci.ics.crawler4j.url.URLCanonicalizer; import eu.openaire.publications_retriever.PublicationsRetriever; import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException; @@ -24,7 +22,6 @@ import java.io.File; import java.io.FileNotFoundException; import java.io.FileOutputStream; import java.nio.charset.StandardCharsets; -import java.nio.file.Paths; import java.util.*; import java.util.concurrent.Callable; import java.util.concurrent.Executors; @@ -65,7 +62,7 @@ public class PublicationsRetrieverPlugin { } - public static void processAssginmets(Long assignmentRequestCounter, Collection assignments) throws RuntimeException, FileNotFoundException + public static void processAssignments(Long assignmentRequestCounter, Collection assignments) throws RuntimeException, FileNotFoundException { ConnSupportUtils.setKnownMimeTypes(); @@ -93,7 +90,7 @@ public class PublicationsRetrieverPlugin { String sourceUrl = urlToCheck; // Hold it here for the logging-messages. if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) { logger.warn("Could not canonicalize url: " + sourceUrl); - UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false"); + UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null); LoaderAndChecker.connProblematicUrls.incrementAndGet(); return false; } @@ -118,7 +115,8 @@ public class PublicationsRetrieverPlugin { List list = LoaderAndChecker.getWasValidAndCouldRetry(e); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry); + UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null); + return false; } return true; }); @@ -128,7 +126,7 @@ public class PublicationsRetrieverPlugin { logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs."); LoaderAndChecker.invokeAllTasksAndWait(callableTasks); addUrlReportsToWorkerReport(); - callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch. + callableTasks.clear(); // Reset the thread-tasks-list for the next batch. } }// end tasks-for-loop } @@ -140,10 +138,8 @@ public class PublicationsRetrieverPlugin { { for ( DataToBeLogged data : FileUtils.dataToBeLoggedList ) { - String status = null, fileLocation = null, hash = null; - Long size = null; + String status = null, fileLocation = null, comment = data.getComment(), mimeType = null; Error error = null; - String comment = data.getComment(); if ( data.getWasDocumentOrDatasetAccessible().equals("true") ) { @@ -164,26 +160,8 @@ public class PublicationsRetrieverPlugin { fileLocation = "File not retrieved"; else { fileLocation = comment; - try { - File docFile = new File(fileLocation); - if ( docFile.isFile() ) { - hash = Files.hash(docFile, Hashing.md5()).toString(); // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files. - //logger.debug("MD5 for file \"" + docFile.getName() + "\": " + hash); // DEBUG! - size = java.nio.file.Files.size(Paths.get(fileLocation)); - //logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG! - } else - logger.error("No file was found with path: " + fileLocation); - } catch (Exception e) { - if ( hash == null ) - logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation); - - if ( size == null ) - logger.error("Could not retrieve the size of the file: " + fileLocation); - - e.printStackTrace(); - } + mimeType = "application/pdf"; } - error = new Error(null, null); // We do not want to send a "null" object, since it just adds more complicated handling in the controller.. } else { @@ -198,7 +176,7 @@ public class PublicationsRetrieverPlugin { if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) ) docOrDatasetUrl = null; - Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever"); + Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), mimeType, data.getSize(), data.getHash(), fileLocation, "crawl:PublicationsRetriever"); // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified. AssignmentHandler.urlReports.add(new UrlReport(status, payload, error)); @@ -214,7 +192,7 @@ public class PublicationsRetrieverPlugin { List list = LoaderAndChecker.getWasValidAndCouldRetry(e); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry); + UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null); return false; } } diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java index c2dab55..562dd10 100644 --- a/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java +++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentHandler.java @@ -104,14 +104,14 @@ public class AssignmentHandler { // For now, let's just run all tasks in the generic plugin. try { - PublicationsRetrieverPlugin.processAssginmets(assignmentRequestCounter, assignmentsForPlugins.values()); + PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values()); } catch (Exception e) { logger.error(e.getMessage(), e); } - postWorkerReport(assignmentRequestCounter); + isAvailableForWork = true; // State this before posting, to catch the soonest next scheduled request. - isAvailableForWork = true; + postWorkerReport(assignmentRequestCounter); }