- Update the Worker's report to include the datasourceId of each record. The Controller uses it as part of the S3 file names.
- Update dependencies.
This commit is contained in:
parent
5fee05e994
commit
31af0a81eb
|
@ -1,5 +1,5 @@
|
|||
plugins {
|
||||
id 'org.springframework.boot' version '2.6.5'
|
||||
id 'org.springframework.boot' version '2.6.6'
|
||||
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
|
||||
id 'java'
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
|
|
|
@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
|
|||
echo -e "\n\n"
|
||||
fi
|
||||
|
||||
gradleVersion="7.4.1"
|
||||
gradleVersion="7.4.2"
|
||||
|
||||
if [[ justInstall -eq 0 ]]; then
|
||||
|
||||
|
|
|
@ -17,7 +17,8 @@ import java.sql.Timestamp;
|
|||
"size",
|
||||
"hash",
|
||||
"location",
|
||||
"provenance"
|
||||
"provenance",
|
||||
"datasourceId"
|
||||
})
|
||||
public class Payload {
|
||||
|
||||
|
@ -48,9 +49,13 @@ public class Payload {
|
|||
@JsonProperty("provenance")
|
||||
private String provenance; // "crawl:<PluginName>"
|
||||
|
||||
@JsonProperty("provenance")
|
||||
private String datasourceId; // "crawl:<PluginName>"
|
||||
|
||||
|
||||
public Payload() {}
|
||||
|
||||
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
|
||||
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
|
||||
this.id = id;
|
||||
this.original_url = original_url;
|
||||
this.actual_url = actual_url;
|
||||
|
@ -60,6 +65,7 @@ public class Payload {
|
|||
this.hash = hash;
|
||||
this.location = location;
|
||||
this.provenance = provenance;
|
||||
this.datasourceId = datasourceId;
|
||||
}
|
||||
|
||||
public String getId() {
|
||||
|
@ -134,18 +140,29 @@ public class Payload {
|
|||
this.provenance = provenance;
|
||||
}
|
||||
|
||||
public String getDatasourceId() {
|
||||
return datasourceId;
|
||||
}
|
||||
|
||||
public void setDatasourceId(String datasourceId) {
|
||||
this.datasourceId = datasourceId;
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public String toString() {
|
||||
return "Payload{" +
|
||||
"id='" + id + '\'' +
|
||||
", original_url='" + original_url + '\'' +
|
||||
", actual_url='" + actual_url + '\'' +
|
||||
", timestamp_acquired='" + timestamp_acquired + '\'' +
|
||||
", timestamp_acquired=" + timestamp_acquired +
|
||||
", mime_type='" + mime_type + '\'' +
|
||||
", size='" + size + '\'' +
|
||||
", size=" + size +
|
||||
", hash='" + hash + '\'' +
|
||||
", location='" + location + '\'' +
|
||||
", provenance='" + provenance + '\'' +
|
||||
", datasourceId='" + datasourceId + '\'' +
|
||||
'}';
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
|
|||
import java.sql.Timestamp;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collection;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.concurrent.Callable;
|
||||
import java.util.concurrent.Executors;
|
||||
|
@ -142,7 +143,7 @@ public class PublicationsRetrieverPlugin {
|
|||
if ( numFailedTasks > 0 )
|
||||
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
||||
|
||||
addUrlReportsToWorkerReport();
|
||||
addUrlReportsToWorkerReport(assignments);
|
||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||
|
||||
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
||||
|
@ -150,8 +151,14 @@ public class PublicationsRetrieverPlugin {
|
|||
}
|
||||
|
||||
|
||||
public static void addUrlReportsToWorkerReport()
|
||||
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
|
||||
{
|
||||
// Index the UrlIds with the DatasourceIds for quick-search later.
|
||||
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
|
||||
for ( Assignment assignment : assignments ) {
|
||||
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
|
||||
}
|
||||
|
||||
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
|
||||
|
||||
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||||
|
@ -213,7 +220,10 @@ public class PublicationsRetrieverPlugin {
|
|||
if ( (hash != null) && (hash.equals("null")) )
|
||||
hash = null;
|
||||
|
||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
|
||||
String urlId = data.getUrlId();
|
||||
String datasourceId = urlIdsWithDatasourceIds.get(urlId);
|
||||
|
||||
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId);
|
||||
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
|
||||
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||
|
|
Loading…
Reference in New Issue