- Integrate the latest changes from the "PublicationsRetriever"-plugin. The fileSize and the fileHash are computed inside the plugin now.

- Set the "mimeType" to "null" when no docFile was retrieved.
- Signal the scheduler that the worker is ready for work as soon as it has finished processing, even though it has not yet posted the previous data.
- Fix a minor bug; now return "false" when there is any problem with the url of a specific task.
- Avoid memory re-allocations for "callableTasks".
This commit is contained in:
Lampros Smyrnaios 2021-09-08 05:02:14 +03:00
parent 6fd9eed1ec
commit b2788d31a9
2 changed files with 12 additions and 34 deletions

View File

@ -1,7 +1,5 @@
package eu.openaire.urls_worker.plugins;
import com.google.common.hash.Hashing;
import com.google.common.io.Files;
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
import eu.openaire.publications_retriever.PublicationsRetriever;
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
@ -24,7 +22,6 @@ import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.util.*;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
@ -65,7 +62,7 @@ public class PublicationsRetrieverPlugin {
}
public static void processAssginmets(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
public static void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
{
ConnSupportUtils.setKnownMimeTypes();
@ -93,7 +90,7 @@ public class PublicationsRetrieverPlugin {
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
logger.warn("Could not canonicalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
@ -118,7 +115,8 @@ public class PublicationsRetrieverPlugin {
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
return false;
}
return true;
});
@ -128,7 +126,7 @@ public class PublicationsRetrieverPlugin {
logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");
LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
addUrlReportsToWorkerReport();
callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
}
}// end tasks-for-loop
}
@ -140,10 +138,8 @@ public class PublicationsRetrieverPlugin {
{
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
{
String status = null, fileLocation = null, hash = null;
Long size = null;
String status = null, fileLocation = null, comment = data.getComment(), mimeType = null;
Error error = null;
String comment = data.getComment();
if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
{
@ -164,26 +160,8 @@ public class PublicationsRetrieverPlugin {
fileLocation = "File not retrieved";
else {
fileLocation = comment;
try {
File docFile = new File(fileLocation);
if ( docFile.isFile() ) {
hash = Files.hash(docFile, Hashing.md5()).toString(); // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
//logger.debug("MD5 for file \"" + docFile.getName() + "\": " + hash); // DEBUG!
size = java.nio.file.Files.size(Paths.get(fileLocation));
//logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG!
} else
logger.error("No file was found with path: " + fileLocation);
} catch (Exception e) {
if ( hash == null )
logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation);
if ( size == null )
logger.error("Could not retrieve the size of the file: " + fileLocation);
e.printStackTrace();
}
mimeType = "application/pdf";
}
error = new Error(null, null); // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
}
else {
@ -198,7 +176,7 @@ public class PublicationsRetrieverPlugin {
if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
docOrDatasetUrl = null;
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), mimeType, data.getSize(), data.getHash(), fileLocation, "crawl:PublicationsRetriever");
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
@ -214,7 +192,7 @@ public class PublicationsRetrieverPlugin {
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
return false;
}
}

View File

@ -104,14 +104,14 @@ public class AssignmentHandler {
// For now, let's just run all tasks in the generic plugin.
try {
PublicationsRetrieverPlugin.processAssginmets(assignmentRequestCounter, assignmentsForPlugins.values());
PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values());
} catch (Exception e) {
logger.error(e.getMessage(), e);
}
postWorkerReport(assignmentRequestCounter);
isAvailableForWork = true; // State this before posting, to catch the soonest next scheduled request.
isAvailableForWork = true;
postWorkerReport(assignmentRequestCounter);
}