- Fix the fileName-ID not being directly related with the datasourceID, in the S3-ObjectStore name. Add explanatory comments.
- Add missing error-logs.
This commit is contained in:
parent
a23c918a42
commit
33fc61a8d9
|
@ -49,7 +49,7 @@ dependencies {
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
|
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
|
||||||
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3'
|
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3'
|
||||||
// This is required by the minio, as Spring uses a version which is not supported by minio.
|
// This is required by minio, as Spring < v2.7.x uses a version which is not supported by minio.
|
||||||
|
|
||||||
// https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
|
// https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
|
||||||
implementation("com.cloudera.impala:jdbc:2.5.31") {
|
implementation("com.cloudera.impala:jdbc:2.5.31") {
|
||||||
|
|
|
@ -91,12 +91,10 @@ public class FileUtils {
|
||||||
private String baseTargetLocation;
|
private String baseTargetLocation;
|
||||||
|
|
||||||
public static DecimalFormat df = new DecimalFormat("0.00");
|
public static DecimalFormat df = new DecimalFormat("0.00");
|
||||||
private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
|
|
||||||
|
|
||||||
// The following regex might be useful in a future scenario. It extracts the "plain-filename", the "file-ID" and the "file-extension". It may even be merged with the above regex.
|
|
||||||
// Possible full-filenames are: "ID.pdf", "ID(12).pdf"
|
|
||||||
//private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\.[\w]{2,10})$");
|
|
||||||
|
|
||||||
|
// The following regex might be useful in a future scenario. It extracts the "plain-filename", the "file-ID" and the "file-extension". TODO - It may even be merged with the above regex.
|
||||||
|
// Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
|
||||||
|
private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:[^.()]+/)?((([^./()]+)[^./]*)(\\.[\\w]{2,10}))$");
|
||||||
|
|
||||||
private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
||||||
|
|
||||||
|
@ -160,12 +158,14 @@ public class FileUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
|
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
|
||||||
Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
|
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
|
||||||
if ( ! matcher.matches() ) {
|
if ( ! matcher.matches() ) {
|
||||||
|
logger.error("Failed to match the \"" + fileLocation + "\" with the regex: " + FILENAME_ID_EXTENSION);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
String fileNameWithExtension = matcher.group(1);
|
String fileNameWithExtension = matcher.group(1);
|
||||||
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
|
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
|
||||||
|
logger.error("Failed to extract the \"fileNameWithExtension\" from \"" + fileLocation + "\".");
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -251,13 +251,50 @@ public class FileUtils {
|
||||||
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
|
// Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
|
||||||
try {
|
try {
|
||||||
// Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
|
// Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
|
||||||
// All related payloads point to this exact same file and have the same datasourceId.
|
// All related payloads point to this exact same file, BUT they may be related to different urlIDs, which in turn may be related to different datasourceIDs.
|
||||||
Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get();
|
// This file could have been found from different urlIds and thus be related to multiple datasourceIds.
|
||||||
String datasourceId = firstRelatedPayload.getDatasourceId();
|
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
|
||||||
String hash = firstRelatedPayload.getHash();
|
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
|
||||||
|
|
||||||
String[] fileNameData = fileName.split("\\.");
|
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
|
||||||
fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1];
|
if ( ! matcher.matches() ) {
|
||||||
|
logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String filenameWithoutExtension = matcher.group(2);
|
||||||
|
if ( (filenameWithoutExtension == null) || filenameWithoutExtension.isEmpty() ) {
|
||||||
|
logger.error("Failed to extract the \"filenameWithoutExtension\" from \"" + fileName + "\".");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String fileNameID = matcher.group(3);
|
||||||
|
if ( (fileNameID == null) || fileNameID.isEmpty() ) {
|
||||||
|
logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
String fileExtension = matcher.group(4);
|
||||||
|
if ( (fileExtension == null) || fileExtension.isEmpty() ) {
|
||||||
|
logger.error("Failed to extract the \"fileExtension\" from \"" + fileName + "\".");
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
String datasourceId = null;
|
||||||
|
String hash = null;
|
||||||
|
boolean isFound = false;
|
||||||
|
for ( Payload payload : fileRelatedPayloads ) {
|
||||||
|
if ( fileNameID.equals(payload.getId()) ) {
|
||||||
|
datasourceId = payload.getDatasourceId();
|
||||||
|
hash = payload.getHash();
|
||||||
|
isFound = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( ! isFound ) { // This should never normally happen. If it does, then a very bad change will have taken place.
|
||||||
|
logger.error("The \"fileNameID\" (" + fileNameID + ") was not found inside the \"fileRelatedPayloads\" for fileName: " + fileName);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
fileName = datasourceId + "/" + filenameWithoutExtension + "::" + hash + fileExtension;
|
||||||
|
|
||||||
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
|
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
|
||||||
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
|
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
|
||||||
|
|
Loading…
Reference in New Issue