- Fix the fileName-ID not being directly related to the datasourceID in the S3-ObjectStore file name. Add explanatory comments.

- Add missing error-logs.
This commit is contained in:
Lampros Smyrnaios 2022-04-05 16:22:02 +03:00
parent a23c918a42
commit 33fc61a8d9
2 changed files with 50 additions and 13 deletions

View File

@ -49,7 +49,7 @@ dependencies {
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3'
// This is required by the minio, as Spring uses a version which is not supported by minio.
// This is required by minio, as Spring < v2.7.x uses a version of okhttp which is not supported by minio.
// https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
implementation("com.cloudera.impala:jdbc:2.5.31") {

View File

@ -91,12 +91,10 @@ public class FileUtils {
private String baseTargetLocation;
public static DecimalFormat df = new DecimalFormat("0.00");
private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
// The following regex might be useful in a future scenario. It extracts the "plain-filename", the "file-ID" and the "file-extension". It may even be merged with the above regex.
// Possible full-filenames are: "ID.pdf", "ID(12).pdf"
//private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\.[\w]{2,10})$");
// The following regex might be useful in a future scenario. It extracts the "plain-filename", the "file-ID" and the "file-extension". TODO - It may even be merged with the above regex.
// Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:[^.()]+/)?((([^./()]+)[^./]*)(\\.[\\w]{2,10}))$");
private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
@ -160,12 +158,14 @@ public class FileUtils {
}
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
if ( ! matcher.matches() ) {
logger.error("Failed to match the \"" + fileLocation + "\" with the regex: " + FILENAME_ID_EXTENSION);
continue;
}
String fileNameWithExtension = matcher.group(1);
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
logger.error("Failed to extract the \"fileNameWithExtension\" from \"" + fileLocation + "\".");
continue;
}
@ -251,13 +251,50 @@ public class FileUtils {
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
try {
// Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
// All related payloads point to this exact same file and have the same datasourceId.
Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get();
String datasourceId = firstRelatedPayload.getDatasourceId();
String hash = firstRelatedPayload.getHash();
// All related payloads point to this exact same file, BUT they may be related to different urlIDs, which in turn may be related to different datasourceIDs.
// This file could have been found from different urlIds and thus be related to multiple datasourceIds.
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
String[] fileNameData = fileName.split("\\.");
fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1];
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
if ( ! matcher.matches() ) {
logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
continue;
}
String filenameWithoutExtension = matcher.group(2);
if ( (filenameWithoutExtension == null) || filenameWithoutExtension.isEmpty() ) {
logger.error("Failed to extract the \"filenameWithoutExtension\" from \"" + fileName + "\".");
continue;
}
String fileNameID = matcher.group(3);
if ( (fileNameID == null) || fileNameID.isEmpty() ) {
logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
continue;
}
String fileExtension = matcher.group(4);
if ( (fileExtension == null) || fileExtension.isEmpty() ) {
logger.error("Failed to extract the \"fileExtension\" from \"" + fileName + "\".");
continue;
}
String datasourceId = null;
String hash = null;
boolean isFound = false;
for ( Payload payload : fileRelatedPayloads ) {
if ( fileNameID.equals(payload.getId()) ) {
datasourceId = payload.getDatasourceId();
hash = payload.getHash();
isFound = true;
break;
}
}
if ( ! isFound ) { // This should never normally happen. If it does, then a very bad change will have taken place.
logger.error("The \"fileNameID\" (" + fileNameID + ") was not found inside the \"fileRelatedPayloads\" for fileName: " + fileName);
continue;
}
fileName = datasourceId + "/" + filenameWithoutExtension + "::" + hash + fileExtension;
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);