|
|
|
@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
|
|
|
|
|
import java.sql.Timestamp;
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
|
import java.util.Collection;
|
|
|
|
|
import java.util.HashMap;
|
|
|
|
|
import java.util.List;
|
|
|
|
|
import java.util.concurrent.Callable;
|
|
|
|
|
import java.util.concurrent.Executors;
|
|
|
|
@ -142,7 +143,7 @@ public class PublicationsRetrieverPlugin {
|
|
|
|
|
if ( numFailedTasks > 0 )
|
|
|
|
|
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
|
|
|
|
|
|
|
|
|
addUrlReportsToWorkerReport();
|
|
|
|
|
addUrlReportsToWorkerReport(assignments);
|
|
|
|
|
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
|
|
|
|
|
|
|
|
|
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
|
|
|
@ -150,8 +151,14 @@ public class PublicationsRetrieverPlugin {
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
public static void addUrlReportsToWorkerReport()
|
|
|
|
|
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
|
|
|
|
|
{
|
|
|
|
|
// Index the UrlIds with the DatasourceIds for quick-search later.
|
|
|
|
|
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
|
|
|
|
|
for ( Assignment assignment : assignments ) {
|
|
|
|
|
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
|
|
|
|
|
|
|
|
|
|
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
|
|
|
@ -213,7 +220,10 @@ public class PublicationsRetrieverPlugin {
|
|
|
|
|
if ( (hash != null) && (hash.equals("null")) )
|
|
|
|
|
hash = null;
|
|
|
|
|
|
|
|
|
|
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
|
|
|
|
|
String urlId = data.getUrlId();
|
|
|
|
|
String datasourceId = urlIdsWithDatasourceIds.get(urlId);
|
|
|
|
|
|
|
|
|
|
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId);
|
|
|
|
|
// TODO - If support is added for doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
|
|
|
|
|
|
|
|
|
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
|
|
|
|