diff --git a/build.gradle b/build.gradle
index 2c0df82..9d1c903 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,5 +1,5 @@
 plugins {
-    id 'org.springframework.boot' version '2.6.5'
+    id 'org.springframework.boot' version '2.6.6'
     id 'io.spring.dependency-management' version '1.0.11.RELEASE'
     id 'java'
 }
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 00e33ed..aa991fc 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/installAndRun.sh b/installAndRun.sh
index f33e388..bc9bdf9 100755
--- a/installAndRun.sh
+++ b/installAndRun.sh
@@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
     echo -e "\n\n"
 fi
 
-gradleVersion="7.4.1"
+gradleVersion="7.4.2"
 
 if [[ justInstall -eq 0 ]]; then
 
diff --git a/src/main/java/eu/openaire/urls_worker/models/Payload.java b/src/main/java/eu/openaire/urls_worker/models/Payload.java
index 4b618f6..e2095d8 100644
--- a/src/main/java/eu/openaire/urls_worker/models/Payload.java
+++ b/src/main/java/eu/openaire/urls_worker/models/Payload.java
@@ -17,7 +17,8 @@ import java.sql.Timestamp;
         "size",
         "hash",
         "location",
-        "provenance"
+        "provenance",
+        "datasourceId"
 })
 public class Payload {
 
@@ -48,9 +49,13 @@ public class Payload {
     @JsonProperty("provenance")
     private String provenance; // "crawl:"
 
+    @JsonProperty("datasourceId")
+    private String datasourceId; // Id of the datasource of the assignment which produced this payload.
+
+
     public Payload() {}
 
-    public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
+    public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
         this.id = id;
         this.original_url = original_url;
         this.actual_url = actual_url;
@@ -60,6 +65,7 @@ public class Payload {
         this.hash = hash;
         this.location = location;
         this.provenance = provenance;
+        this.datasourceId = datasourceId;
     }
 
     public String getId() {
@@ -134,18 +140,29 @@ public class Payload {
         this.provenance = provenance;
     }
 
+    public String getDatasourceId() {
+        return datasourceId;
+    }
+
+    public void setDatasourceId(String datasourceId) {
+        this.datasourceId = datasourceId;
+    }
+
+
     @Override
     public String toString() {
         return "Payload{" +
                 "id='" + id + '\'' +
                 ", original_url='" + original_url + '\'' +
                 ", actual_url='" + actual_url + '\'' +
-                ", timestamp_acquired='" + timestamp_acquired + '\'' +
+                ", timestamp_acquired=" + timestamp_acquired +
                 ", mime_type='" + mime_type + '\'' +
-                ", size='" + size + '\'' +
+                ", size=" + size +
                 ", hash='" + hash + '\'' +
                 ", location='" + location + '\'' +
                 ", provenance='" + provenance + '\'' +
+                ", datasourceId='" + datasourceId + '\'' +
                 '}';
     }
+
 }
diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
index bfb8fd8..63c9501 100644
--- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
@@ -23,6 +23,7 @@ import java.nio.charset.StandardCharsets;
 import java.sql.Timestamp;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.List;
 import java.util.concurrent.Callable;
 import java.util.concurrent.Executors;
@@ -142,7 +143,7 @@ public class PublicationsRetrieverPlugin {
         if ( numFailedTasks > 0 )
             logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
 
-        addUrlReportsToWorkerReport();
+        addUrlReportsToWorkerReport(assignments);
 
         callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
         UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
@@ -150,8 +151,14 @@ public class PublicationsRetrieverPlugin {
     }
 
 
-    public static void addUrlReportsToWorkerReport()
+    public static void addUrlReportsToWorkerReport(Collection<Assignment> assignments)
     {
+        // Index the UrlIds with the DatasourceIds for quick-search later.
+        HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
+        for ( Assignment assignment : assignments ) {
+            urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
+        }
+
         Timestamp timestamp = new Timestamp(System.currentTimeMillis()); // Store it here, in order to have the same for all current records.
 
         for ( DataToBeLogged data : FileUtils.dataToBeLoggedList )
@@ -213,7 +220,10 @@ public class PublicationsRetrieverPlugin {
             if ( (hash != null) && (hash.equals("null")) )
                 hash = null;
 
-            Payload payload = new Payload(data.getUrlId(), data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever");
+            String urlId = data.getUrlId();
+            String datasourceId = urlIdsWithDatasourceIds.get(urlId);
+
+            Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId);
             // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
 
             AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));