- Update the Worker's report to include the datasourceId for each record. The Controller uses it inside the S3 file-names.

- Update dependencies.
master
Lampros Smyrnaios 2 years ago
parent 5fee05e994
commit 31af0a81eb

@ -1,5 +1,5 @@
plugins {
id 'org.springframework.boot' version '2.6.5'
id 'org.springframework.boot' version '2.6.6'
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
id 'java'
}

@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
echo -e "\n\n"
fi
gradleVersion="7.4.1"
gradleVersion="7.4.2"
if [[ justInstall -eq 0 ]]; then

@ -17,7 +17,8 @@ import java.sql.Timestamp;
"size",
"hash",
"location",
"provenance"
"provenance",
"datasourceId"
})
public class Payload {
@ -48,9 +49,13 @@ public class Payload {
@JsonProperty("provenance")
private String provenance; // "crawl:<PluginName>"
@JsonProperty("datasourceId")
private String datasourceId;
public Payload() {}
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
this.id = id;
this.original_url = original_url;
this.actual_url = actual_url;
@ -60,6 +65,7 @@ public class Payload {
this.hash = hash;
this.location = location;
this.provenance = provenance;
this.datasourceId = datasourceId;
}
public String getId() {
@ -134,18 +140,29 @@ public class Payload {
this.provenance = provenance;
}
public String getDatasourceId() {
return datasourceId;
}
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
@Override
public String toString() {
return "Payload{" +
"id='" + id + '\'' +
", original_url='" + original_url + '\'' +
", actual_url='" + actual_url + '\'' +
", timestamp_acquired='" + timestamp_acquired + '\'' +
", timestamp_acquired=" + timestamp_acquired +
", mime_type='" + mime_type + '\'' +
", size='" + size + '\'' +
", size=" + size +
", hash='" + hash + '\'' +
", location='" + location + '\'' +
", provenance='" + provenance + '\'' +
", datasourceId='" + datasourceId + '\'' +
'}';
}
}

@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.concurrent.Callable;
import java.util.concurrent.Executors;
@ -142,7 +143,7 @@ public class PublicationsRetrieverPlugin {
if ( numFailedTasks > 0 )
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
addUrlReportsToWorkerReport();
addUrlReportsToWorkerReport(assignments);
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
@ -150,8 +151,14 @@ public class PublicationsRetrieverPlugin {
}
public static void addUrlReportsToWorkerReport()
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
{
// Index the UrlIds with the DatasourceIds for quick-search later.
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
for ( Assignment assignment : assignments ) {
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
}
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
@ -213,7 +220,10 @@ public class PublicationsRetrieverPlugin {
if ( (hash != null) && (hash.equals("null")) )
hash = null;
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
String urlId = data.getUrlId();
String datasourceId = urlIdsWithDatasourceIds.get(urlId);
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId);
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));

Loading…
Cancel
Save