- Update the Worker's report to include the datasourceId for each record. The Controller uses it as part of the S3 file names.
- Update dependencies.
This commit is contained in:
parent
5fee05e994
commit
31af0a81eb
|
@ -1,5 +1,5 @@
|
||||||
plugins {
|
plugins {
|
||||||
id 'org.springframework.boot' version '2.6.5'
|
id 'org.springframework.boot' version '2.6.6'
|
||||||
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
|
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
|
||||||
id 'java'
|
id 'java'
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
zipStorePath=wrapper/dists
|
zipStorePath=wrapper/dists
|
||||||
|
|
|
@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
|
||||||
echo -e "\n\n"
|
echo -e "\n\n"
|
||||||
fi
|
fi
|
||||||
|
|
||||||
gradleVersion="7.4.1"
|
gradleVersion="7.4.2"
|
||||||
|
|
||||||
if [[ justInstall -eq 0 ]]; then
|
if [[ justInstall -eq 0 ]]; then
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,8 @@ import java.sql.Timestamp;
|
||||||
"size",
|
"size",
|
||||||
"hash",
|
"hash",
|
||||||
"location",
|
"location",
|
||||||
"provenance"
|
"provenance",
|
||||||
|
"datasourceId"
|
||||||
})
|
})
|
||||||
public class Payload {
|
public class Payload {
|
||||||
|
|
||||||
|
@ -48,9 +49,13 @@ public class Payload {
|
||||||
@JsonProperty("provenance")
|
@JsonProperty("provenance")
|
||||||
private String provenance; // "crawl:<PluginName>"
|
private String provenance; // "crawl:<PluginName>"
|
||||||
|
|
||||||
|
@JsonProperty("datasourceId")
|
||||||
|
private String datasourceId; // The ID of the datasource of the assignment this record came from. It is used by the Controller inside the S3-fileNames.
|
||||||
|
|
||||||
|
|
||||||
public Payload() {}
|
public Payload() {}
|
||||||
|
|
||||||
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
|
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
|
||||||
this.id = id;
|
this.id = id;
|
||||||
this.original_url = original_url;
|
this.original_url = original_url;
|
||||||
this.actual_url = actual_url;
|
this.actual_url = actual_url;
|
||||||
|
@ -60,6 +65,7 @@ public class Payload {
|
||||||
this.hash = hash;
|
this.hash = hash;
|
||||||
this.location = location;
|
this.location = location;
|
||||||
this.provenance = provenance;
|
this.provenance = provenance;
|
||||||
|
this.datasourceId = datasourceId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
|
@ -134,18 +140,29 @@ public class Payload {
|
||||||
this.provenance = provenance;
|
this.provenance = provenance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getDatasourceId() {
|
||||||
|
return datasourceId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDatasourceId(String datasourceId) {
|
||||||
|
this.datasourceId = datasourceId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Payload{" +
|
return "Payload{" +
|
||||||
"id='" + id + '\'' +
|
"id='" + id + '\'' +
|
||||||
", original_url='" + original_url + '\'' +
|
", original_url='" + original_url + '\'' +
|
||||||
", actual_url='" + actual_url + '\'' +
|
", actual_url='" + actual_url + '\'' +
|
||||||
", timestamp_acquired='" + timestamp_acquired + '\'' +
|
", timestamp_acquired=" + timestamp_acquired +
|
||||||
", mime_type='" + mime_type + '\'' +
|
", mime_type='" + mime_type + '\'' +
|
||||||
", size='" + size + '\'' +
|
", size=" + size +
|
||||||
", hash='" + hash + '\'' +
|
", hash='" + hash + '\'' +
|
||||||
", location='" + location + '\'' +
|
", location='" + location + '\'' +
|
||||||
", provenance='" + provenance + '\'' +
|
", provenance='" + provenance + '\'' +
|
||||||
|
", datasourceId='" + datasourceId + '\'' +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
|
||||||
import java.sql.Timestamp;
|
import java.sql.Timestamp;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.Collection;
|
import java.util.Collection;
|
||||||
|
import java.util.HashMap;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.concurrent.Callable;
|
import java.util.concurrent.Callable;
|
||||||
import java.util.concurrent.Executors;
|
import java.util.concurrent.Executors;
|
||||||
|
@ -142,7 +143,7 @@ public class PublicationsRetrieverPlugin {
|
||||||
if ( numFailedTasks > 0 )
|
if ( numFailedTasks > 0 )
|
||||||
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
||||||
|
|
||||||
addUrlReportsToWorkerReport();
|
addUrlReportsToWorkerReport(assignments);
|
||||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||||
|
|
||||||
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
||||||
|
@ -150,8 +151,14 @@ public class PublicationsRetrieverPlugin {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
public static void addUrlReportsToWorkerReport()
|
public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
|
||||||
{
|
{
|
||||||
|
// Index the UrlIds with the DatasourceIds for quick-search later.
|
||||||
|
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
|
||||||
|
for ( Assignment assignment : assignments ) {
|
||||||
|
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
|
||||||
|
}
|
||||||
|
|
||||||
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
|
Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
|
||||||
|
|
||||||
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
|
||||||
|
@ -213,7 +220,10 @@ public class PublicationsRetrieverPlugin {
|
||||||
if ( (hash != null) && (hash.equals("null")) )
|
if ( (hash != null) && (hash.equals("null")) )
|
||||||
hash = null;
|
hash = null;
|
||||||
|
|
||||||
Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
|
String urlId = data.getUrlId();
|
||||||
|
String datasourceId = urlIdsWithDatasourceIds.get(urlId);
|
||||||
|
|
||||||
|
Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId);
|
||||||
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
// TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||||
|
|
||||||
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));
|
||||||
|
|
Loading…
Reference in New Issue