- Change the fileNames' structure in the S3-ObjectStore.

- Update dependencies.
This commit is contained in:
Lampros Smyrnaios 2022-04-01 19:24:04 +03:00
parent 48670f3399
commit 5e4fad2479
5 changed files with 41 additions and 9 deletions

View File

@ -1,5 +1,5 @@
plugins { plugins {
id 'org.springframework.boot' version '2.6.5' id 'org.springframework.boot' version '2.6.6'
id 'io.spring.dependency-management' version '1.0.11.RELEASE' id 'io.spring.dependency-management' version '1.0.11.RELEASE'
id 'java' id 'java'
} }

View File

@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
zipStoreBase=GRADLE_USER_HOME zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists zipStorePath=wrapper/dists

View File

@ -17,7 +17,7 @@ if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then
justInstall=0 justInstall=0
fi fi
gradleVersion="7.4.1" gradleVersion="7.4.2"
if [[ justInstall -eq 0 ]]; then if [[ justInstall -eq 0 ]]; then

View File

@ -17,7 +17,8 @@ import java.sql.Timestamp;
"size", "size",
"hash", "hash",
"location", "location",
"provenance" "provenance",
"datasourceId"
}) })
public class Payload { public class Payload {
@ -48,9 +49,13 @@ public class Payload {
@JsonProperty("provenance") @JsonProperty("provenance")
private String provenance; // "crawl:<PluginName>" private String provenance; // "crawl:<PluginName>"
// Identifier of the datasource this payload belongs to; it is used as the leading path-segment of the S3 fileName ("datasourceid/publicationid(123)::hash.pdf").
// It must be bound to its own JSON property ("datasourceId", as listed in the property-order above) and not to "provenance", which is already taken by the field above.
@JsonProperty("datasourceId")
private String datasourceId;
public Payload() {} public Payload() {}
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) { public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
this.id = id; this.id = id;
this.original_url = original_url; this.original_url = original_url;
this.actual_url = actual_url; this.actual_url = actual_url;
@ -60,6 +65,7 @@ public class Payload {
this.hash = hash; this.hash = hash;
this.location = location; this.location = location;
this.provenance = provenance; this.provenance = provenance;
this.datasourceId = datasourceId;
} }
public String getId() { public String getId() {
@ -134,18 +140,29 @@ public class Payload {
this.provenance = provenance; this.provenance = provenance;
} }
/**
 * Returns the datasource identifier stored in this payload.
 *
 * @return the current value of the {@code datasourceId} field (may be {@code null} if it was never set)
 */
public String getDatasourceId() {
return this.datasourceId;
}
/**
 * Replaces the datasource identifier stored in this payload.
 *
 * @param datasourceId the new value for the {@code datasourceId} field
 */
public void setDatasourceId(String datasourceId) {
this.datasourceId = datasourceId;
}
@Override @Override
public String toString() { public String toString() {
return "Payload{" + return "Payload{" +
"id='" + id + '\'' + "id='" + id + '\'' +
", original_url='" + original_url + '\'' + ", original_url='" + original_url + '\'' +
", actual_url='" + actual_url + '\'' + ", actual_url='" + actual_url + '\'' +
", timestamp_acquired='" + timestamp_acquired + '\'' + ", timestamp_acquired=" + timestamp_acquired +
", mime_type='" + mime_type + '\'' + ", mime_type='" + mime_type + '\'' +
", size='" + size + '\'' + ", size=" + size +
", md5='" + hash + '\'' + ", hash='" + hash + '\'' +
", location='" + location + '\'' + ", location='" + location + '\'' +
", provenance='" + provenance + '\'' + ", provenance='" + provenance + '\'' +
", datasourceId='" + datasourceId + '\'' +
'}'; '}';
} }
} }

View File

@ -90,7 +90,13 @@ public class FileUtils {
@Value("services.pdfaggregation.controller.baseTargetLocation") @Value("services.pdfaggregation.controller.baseTargetLocation")
private String baseTargetLocation; private String baseTargetLocation;
public static DecimalFormat df = new DecimalFormat("0.00"); public static DecimalFormat df = new DecimalFormat("0.00");
private final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$"); private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
// The following regex might be useful in a future scenario. It extracts the "plain-filename", the "file-ID" and the "file-extension". It may even be merged with the above regex.
// Possible full-filenames are: "ID.pdf", "ID(12).pdf"
//private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\\.[\\w]{2,10})$");
private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
public UploadFullTextsResponse getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) { public UploadFullTextsResponse getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) {
@ -243,6 +249,15 @@ public class FileUtils {
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end). // Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
try { try {
// Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
// All related payloads point to this exact same file and have the same datasourceId.
Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get();
String datasourceId = firstRelatedPayload.getDatasourceId();
String hash = firstRelatedPayload.getHash();
String[] fileNameData = fileName.split("\\.");
fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1];
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath); String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url); setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
numUploadedFiles ++; numUploadedFiles ++;