Bump Spring Boot to 2.6.6 and Gradle to 7.4.2, add "datasourceId" to Payload,
and upload full-texts to S3 as "datasourceId/<name>::<hash>.<ext>".

Review notes (all hunk line counts are unchanged):
  * Payload.datasourceId is bound to the JSON property "datasourceId"; it no longer reuses "provenance".
  * The commented-out FILENAME_ID_EXTENSION regex uses Java-escaped backslashes.
  * Indentation inside the hunks was rebuilt from a flattened copy; verify whitespace before applying.

diff --git a/build.gradle b/build.gradle
index 6668f96..520201d 100644
--- a/build.gradle
+++ b/build.gradle
@@ -1,5 +1,5 @@
 plugins {
-    id 'org.springframework.boot' version '2.6.5'
+    id 'org.springframework.boot' version '2.6.6'
     id 'io.spring.dependency-management' version '1.0.11.RELEASE'
     id 'java'
 }
diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties
index 00e33ed..aa991fc 100644
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
diff --git a/installAndRun.sh b/installAndRun.sh
index 3cf6fc1..fcb2e2b 100755
--- a/installAndRun.sh
+++ b/installAndRun.sh
@@ -17,7 +17,7 @@ if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then
     justInstall=0
 fi
 
-gradleVersion="7.4.1"
+gradleVersion="7.4.2"
 
 if [[ justInstall -eq 0 ]]; then
 
diff --git a/src/main/java/eu/openaire/urls_controller/models/Payload.java b/src/main/java/eu/openaire/urls_controller/models/Payload.java
index f87aa84..82560c1 100644
--- a/src/main/java/eu/openaire/urls_controller/models/Payload.java
+++ b/src/main/java/eu/openaire/urls_controller/models/Payload.java
@@ -17,7 +17,8 @@ import java.sql.Timestamp;
         "size",
         "hash",
         "location",
-        "provenance"
+        "provenance",
+        "datasourceId"
 })
 public class Payload {
 
@@ -48,9 +49,13 @@ public class Payload {
     @JsonProperty("provenance")
     private String provenance;  // "crawl:"
 
+    @JsonProperty("datasourceId")
+    private String datasourceId;  // Used as the S3 file-name prefix: "datasourceId/..."
+
+
     public Payload() {}
 
-    public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
+    public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
         this.id = id;
         this.original_url = original_url;
         this.actual_url = actual_url;
@@ -60,6 +65,7 @@ public class Payload {
         this.hash = hash;
         this.location = location;
         this.provenance = provenance;
+        this.datasourceId = datasourceId;
     }
 
     public String getId() {
@@ -134,18 +140,29 @@ public class Payload {
         this.provenance = provenance;
     }
 
+    public String getDatasourceId() {
+        return datasourceId;
+    }
+
+    public void setDatasourceId(String datasourceId) {
+        this.datasourceId = datasourceId;
+    }
+
+
     @Override
     public String toString() {
         return "Payload{" +
                 "id='" + id + '\'' +
                 ", original_url='" + original_url + '\'' +
                 ", actual_url='" + actual_url + '\'' +
-                ", timestamp_acquired='" + timestamp_acquired + '\'' +
+                ", timestamp_acquired=" + timestamp_acquired +
                 ", mime_type='" + mime_type + '\'' +
-                ", size='" + size + '\'' +
-                ", md5='" + hash + '\'' +
+                ", size=" + size +
+                ", hash='" + hash + '\'' +
                 ", location='" + location + '\'' +
                 ", provenance='" + provenance + '\'' +
+                ", datasourceId='" + datasourceId + '\'' +
                 '}';
     }
+
 }
diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
index b1113d0..9883533 100644
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@@ -90,7 +90,13 @@ public class FileUtils {
     @Value("services.pdfaggregation.controller.baseTargetLocation")
     private String baseTargetLocation;
     public static DecimalFormat df = new DecimalFormat("0.00");
-    private final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
+    private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
+
+    // The following regex might be useful in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension". It may even be merged with the above regex.
+    // Possible full-filenames are: "ID.pdf", "ID(12).pdf"
+    //private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\\.[\\w]{2,10})$");
+
+
     private final int numOfFullTextsPerBatch = 70;  // The HTTP-headers cannot be too large (It failed with 100 fileNames).
 
     public UploadFullTextsResponse getAndUploadFullTexts(List urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) {
@@ -243,6 +249,15 @@ public class FileUtils {
 
             // Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
             try {
+                // Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
+                // All related payloads point to this exact same file and have the same datasourceId.
+                Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get();
+                String datasourceId = firstRelatedPayload.getDatasourceId();
+                String hash = firstRelatedPayload.getHash();
+
+                String[] fileNameData = fileName.split("\\.");
+                fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1];
+
                 String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
                 setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
                 numUploadedFiles ++;