forked from lsmyrnaios/UrlsWorker
- Integrate the latest changes from the "PublicationsRetriever"-plugin. The fileSize and the fileHash are computed inside the plugin now.
- Set the "mimeType" to "null" when no docFile was retrieved.
- Signal the scheduler that the worker is ready for work as soon as it has finished processing, even if it has not yet posted the previous data.
- Fix a minor bug: now return "false" when there is any problem with the url of a specific task.
- Avoid memory re-allocations for "callableTasks".
This commit is contained in:
parent
6fd9eed1ec
commit
b2788d31a9
|
@ -1,7 +1,5 @@
|
|||
package eu.openaire.urls_worker.plugins;
|
||||
|
||||
import com.google.common.hash.Hashing;
|
||||
import com.google.common.io.Files;
|
||||
import edu.uci.ics.crawler4j.url.URLCanonicalizer;
|
||||
import eu.openaire.publications_retriever.PublicationsRetriever;
|
||||
import eu.openaire.publications_retriever.exceptions.DocFileNotRetrievedException;
|
||||
|
@ -24,7 +22,6 @@ import java.io.File;
|
|||
import java.io.FileNotFoundException;
|
||||
import java.io.FileOutputStream;
|
||||
import java.nio.charset.StandardCharsets;
|
||||
import java.nio.file.Paths;
|
||||
import java.util.*;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.Executors;
|
||||
|
@ -65,7 +62,7 @@ public class PublicationsRetrieverPlugin {
|
|||
}
|
||||
|
||||
|
||||
public static void processAssginmets(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
|
||||
public static void processAssignments(Long assignmentRequestCounter, Collection<Assignment> assignments) throws RuntimeException, FileNotFoundException
|
||||
{
|
||||
ConnSupportUtils.setKnownMimeTypes();
|
||||
|
||||
|
@ -93,7 +90,7 @@ public class PublicationsRetrieverPlugin {
|
|||
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
|
||||
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
|
||||
logger.warn("Could not canonicalize url: " + sourceUrl);
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false");
|
||||
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
|
||||
LoaderAndChecker.connProblematicUrls.incrementAndGet();
|
||||
return false;
|
||||
}
|
||||
|
@ -118,7 +115,8 @@ public class PublicationsRetrieverPlugin {
|
|||
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
|
||||
String wasUrlValid = list.get(0);
|
||||
String couldRetry = list.get(1);
|
||||
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
|
||||
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
});
|
||||
|
@ -128,7 +126,7 @@ public class PublicationsRetrieverPlugin {
|
|||
logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format(((batchCount-1) * taskCount) * 100.0 / tasksSize) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs.");
|
||||
LoaderAndChecker.invokeAllTasksAndWait(callableTasks);
|
||||
addUrlReportsToWorkerReport();
|
||||
callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); // Reset the thread-tasks-list for the next batch.
|
||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||
}
|
||||
}// end tasks-for-loop
|
||||
}
|
||||
|
@ -140,10 +138,8 @@ public class PublicationsRetrieverPlugin {
|
|||
{
|
||||
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||||
{
|
||||
String status = null, fileLocation = null, hash = null;
|
||||
Long size = null;
|
||||
String status = null, fileLocation = null, comment = data.getComment(), mimeType = null;
|
||||
Error error = null;
|
||||
String comment = data.getComment();
|
||||
|
||||
if ( data.getWasDocumentOrDatasetAccessible().equals("true") )
|
||||
{
|
||||
|
@ -164,26 +160,8 @@ public class PublicationsRetrieverPlugin {
|
|||
fileLocation = "File not retrieved";
|
||||
else {
|
||||
fileLocation = comment;
|
||||
try {
|
||||
File docFile = new File(fileLocation);
|
||||
if ( docFile.isFile() ) {
|
||||
hash = Files.hash(docFile, Hashing.md5()).toString(); // These hashing functions are deprecated, but just to inform us that MD5 is not secure. Luckily, we use MD5 just to identify duplicate files.
|
||||
//logger.debug("MD5 for file \"" + docFile.getName() + "\": " + hash); // DEBUG!
|
||||
size = java.nio.file.Files.size(Paths.get(fileLocation));
|
||||
//logger.debug("Size of file \"" + docFile.getName() + "\": " + size); // DEBUG!
|
||||
} else
|
||||
logger.error("No file was found with path: " + fileLocation);
|
||||
} catch (Exception e) {
|
||||
if ( hash == null )
|
||||
logger.error("Could not retrieve the MD5-hash for the file: " + fileLocation);
|
||||
|
||||
if ( size == null )
|
||||
logger.error("Could not retrieve the size of the file: " + fileLocation);
|
||||
|
||||
e.printStackTrace();
|
||||
}
|
||||
mimeType = "application/pdf";
|
||||
}
|
||||
|
||||
error = new Error(null, null); // We do not want to send a "null" object, since it just adds more complicated handling in the controller..
|
||||
}
|
||||
else {
|
||||
|
@ -198,7 +176,7 @@ public class PublicationsRetrieverPlugin {
|
|||
if ( docOrDatasetUrl.equals(UrlUtils.unreachableDocOrDatasetUrlIndicator) || docOrDatasetUrl.equals(UrlUtils.duplicateUrlIndicator) )
|
||||
docOrDatasetUrl = null;
|
||||
|
||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), "application/pdf", size, hash, fileLocation, "crawl:PublicationsRetriever");
|
||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, new Date(), mimeType, data.getSize(), data.getHash(), fileLocation, "crawl:PublicationsRetriever");
|
||||
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
|
||||
AssignmentHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||
|
@ -214,7 +192,7 @@ public class PublicationsRetrieverPlugin {
|
|||
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
|
||||
String wasUrlValid = list.get(0);
|
||||
String couldRetry = list.get(1);
|
||||
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry);
|
||||
UrlUtils.logOutputData(null, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -104,14 +104,14 @@ public class AssignmentHandler {
|
|||
// For now, let's just run all tasks in the generic plugin.
|
||||
|
||||
try {
|
||||
PublicationsRetrieverPlugin.processAssginmets(assignmentRequestCounter, assignmentsForPlugins.values());
|
||||
PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values());
|
||||
} catch (Exception e) {
|
||||
logger.error(e.getMessage(), e);
|
||||
}
|
||||
|
||||
postWorkerReport(assignmentRequestCounter);
|
||||
isAvailableForWork = true; // State this before posting, to catch the soonest next scheduled request.
|
||||
|
||||
isAvailableForWork = true;
|
||||
postWorkerReport(assignmentRequestCounter);
|
||||
}
|
||||
|
||||
|
||||
|
|
Loading…
Reference in New Issue