- Change the fileNames' structure in the S3-ObjectStore.
- Update dependencies.
This commit is contained in:
parent
48670f3399
commit
5e4fad2479
|
@ -1,5 +1,5 @@
|
||||||
plugins {
|
plugins {
|
||||||
id 'org.springframework.boot' version '2.6.5'
|
id 'org.springframework.boot' version '2.6.6'
|
||||||
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
|
id 'io.spring.dependency-management' version '1.0.11.RELEASE'
|
||||||
id 'java'
|
id 'java'
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
distributionBase=GRADLE_USER_HOME
|
distributionBase=GRADLE_USER_HOME
|
||||||
distributionPath=wrapper/dists
|
distributionPath=wrapper/dists
|
||||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.1-bin.zip
|
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4.2-bin.zip
|
||||||
zipStoreBase=GRADLE_USER_HOME
|
zipStoreBase=GRADLE_USER_HOME
|
||||||
zipStorePath=wrapper/dists
|
zipStorePath=wrapper/dists
|
||||||
|
|
|
@ -17,7 +17,7 @@ if [[ justInstall -eq 1 && shouldRunInDocker -eq 1 ]]; then
|
||||||
justInstall=0
|
justInstall=0
|
||||||
fi
|
fi
|
||||||
|
|
||||||
gradleVersion="7.4.1"
|
gradleVersion="7.4.2"
|
||||||
|
|
||||||
if [[ justInstall -eq 0 ]]; then
|
if [[ justInstall -eq 0 ]]; then
|
||||||
|
|
||||||
|
|
|
@ -17,7 +17,8 @@ import java.sql.Timestamp;
|
||||||
"size",
|
"size",
|
||||||
"hash",
|
"hash",
|
||||||
"location",
|
"location",
|
||||||
"provenance"
|
"provenance",
|
||||||
|
"datasourceId"
|
||||||
})
|
})
|
||||||
public class Payload {
|
public class Payload {
|
||||||
|
|
||||||
|
@ -48,9 +49,13 @@ public class Payload {
|
||||||
@JsonProperty("provenance")
|
@JsonProperty("provenance")
|
||||||
private String provenance; // "crawl:<PluginName>"
|
private String provenance; // "crawl:<PluginName>"
|
||||||
|
|
||||||
|
@JsonProperty("datasourceId")
|
||||||
|
private String datasourceId; // NOTE(review): the previous example value ("crawl:<PluginName>") was copied from "provenance" — confirm the real datasourceId format. The JSON name must match the "datasourceId" entry in @JsonPropertyOrder.
|
||||||
|
|
||||||
|
|
||||||
public Payload() {}
|
public Payload() {}
|
||||||
|
|
||||||
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance) {
|
public Payload(String id, String original_url, String actual_url, Timestamp timestamp_acquired, String mime_type, Long size, String hash, String location, String provenance, String datasourceId) {
|
||||||
this.id = id;
|
this.id = id;
|
||||||
this.original_url = original_url;
|
this.original_url = original_url;
|
||||||
this.actual_url = actual_url;
|
this.actual_url = actual_url;
|
||||||
|
@ -60,6 +65,7 @@ public class Payload {
|
||||||
this.hash = hash;
|
this.hash = hash;
|
||||||
this.location = location;
|
this.location = location;
|
||||||
this.provenance = provenance;
|
this.provenance = provenance;
|
||||||
|
this.datasourceId = datasourceId;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String getId() {
|
public String getId() {
|
||||||
|
@ -134,18 +140,29 @@ public class Payload {
|
||||||
this.provenance = provenance;
|
this.provenance = provenance;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
public String getDatasourceId() {
|
||||||
|
return datasourceId;
|
||||||
|
}
|
||||||
|
|
||||||
|
public void setDatasourceId(String datasourceId) {
|
||||||
|
this.datasourceId = datasourceId;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
@Override
|
@Override
|
||||||
public String toString() {
|
public String toString() {
|
||||||
return "Payload{" +
|
return "Payload{" +
|
||||||
"id='" + id + '\'' +
|
"id='" + id + '\'' +
|
||||||
", original_url='" + original_url + '\'' +
|
", original_url='" + original_url + '\'' +
|
||||||
", actual_url='" + actual_url + '\'' +
|
", actual_url='" + actual_url + '\'' +
|
||||||
", timestamp_acquired='" + timestamp_acquired + '\'' +
|
", timestamp_acquired=" + timestamp_acquired +
|
||||||
", mime_type='" + mime_type + '\'' +
|
", mime_type='" + mime_type + '\'' +
|
||||||
", size='" + size + '\'' +
|
", size=" + size +
|
||||||
", md5='" + hash + '\'' +
|
", hash='" + hash + '\'' +
|
||||||
", location='" + location + '\'' +
|
", location='" + location + '\'' +
|
||||||
", provenance='" + provenance + '\'' +
|
", provenance='" + provenance + '\'' +
|
||||||
|
", datasourceId='" + datasourceId + '\'' +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -90,7 +90,13 @@ public class FileUtils {
|
||||||
@Value("services.pdfaggregation.controller.baseTargetLocation")
|
@Value("services.pdfaggregation.controller.baseTargetLocation")
|
||||||
private String baseTargetLocation;
|
private String baseTargetLocation;
|
||||||
public static DecimalFormat df = new DecimalFormat("0.00");
|
public static DecimalFormat df = new DecimalFormat("0.00");
|
||||||
private final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
|
private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
|
||||||
|
|
||||||
|
// The following regex might be useful in a future scenario. It extracts the "plain-filename", the "file-ID" and the "file-extension". It may even be merged with the above regex.
|
||||||
|
// Possible full-filenames are: "ID.pdf", "ID(12).pdf"
|
||||||
|
//private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\.[\w]{2,10})$");
|
||||||
|
|
||||||
|
|
||||||
private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
||||||
|
|
||||||
public UploadFullTextsResponse getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) {
|
public UploadFullTextsResponse getAndUploadFullTexts(List<UrlReport> urlReports, HttpServletRequest request, long assignmentsBatchCounter, String workerId) {
|
||||||
|
@ -243,6 +249,15 @@ public class FileUtils {
|
||||||
|
|
||||||
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
|
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
|
||||||
try {
|
try {
|
||||||
|
// Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
|
||||||
|
// All related payloads point to this exact same file and have the same datasourceId.
|
||||||
|
Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get();
|
||||||
|
String datasourceId = firstRelatedPayload.getDatasourceId();
|
||||||
|
String hash = firstRelatedPayload.getHash();
|
||||||
|
|
||||||
|
String[] fileNameData = fileName.split("\\.");
|
||||||
|
fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1];
|
||||||
|
|
||||||
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
|
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
|
||||||
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
|
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
|
||||||
numUploadedFiles ++;
|
numUploadedFiles ++;
|
||||||
|
|
Loading…
Reference in New Issue