- Remove the obsolete "parentheses" and "increasing duplicate-num" from the full-texts' names, before sending them to the S3-Object-Store. The names now end with the "file-hash", so it is guaranteed that they will be unique. The Worker continues to produce the previous kind of names, without any disruption.

- Improve logging.
- Update MinIO dependency.
This commit is contained in:
Lampros Smyrnaios 2022-04-11 21:15:22 +03:00
parent a81ed3c60f
commit 9b95eebb6c
4 changed files with 11 additions and 7 deletions

View File

@ -45,7 +45,7 @@ dependencies {
// https://mvnrepository.com/artifact/org.apache.commons/commons-lang3
implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.12.0'
implementation 'io.minio:minio:8.3.7'
implementation 'io.minio:minio:8.3.8'
// https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3'

View File

@ -53,7 +53,7 @@ public class ImpalaConnector {
private void createDatabase()
{
if ( isTestEnvironment ) {
logger.info("Going to create (if not exist) the database \"" + testDatabaseName + "\" and its tables. Also will fill some tables with data from database \"" + initialDatabaseName + "\".");
logger.info("Going to create (if not exist) the test-database \"" + testDatabaseName + "\" and its tables. Also will fill some tables with data from database \"" + initialDatabaseName + "\".");
jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS " + testDatabaseName);
jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication stored as parquet as select * from " + initialDatabaseName + ".publication");
@ -72,8 +72,10 @@ public class ImpalaConnector {
jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".assignment");
databaseName = testDatabaseName; // For the rest of the queries.
} else
} else {
logger.info("Going to create or validate the tables that are populated by the Controller, for the \"initialDatabase\" = \"" + initialDatabaseName + "\"");
databaseName = initialDatabaseName;
}
// For both cases, go check and create the tables which will be populated by the Controller.

View File

@ -306,7 +306,9 @@ public class FileUtils {
continue;
}
fileName = datasourceId + "/" + filenameWithoutExtension + "::" + hash + fileExtension;
// Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible parentheses with the increasing number (added in case of duplicate ID-fileNames).
// Now we append the file-hash, so it is guaranteed that the filename will be unique.
fileName = datasourceId + "/" + fileNameID + "::" + hash + fileExtension;
String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);

View File

@ -104,9 +104,9 @@ public class S3ObjectStore {
.object(fileObjKeyName).filename(fileFullPath)
.contentType(contentType).build());
// TODO - What if the fileObjKeyName already exists? Right now it gets overwritten (unless we add versioning0, which is not currently supported by our S3ObjectStore).
// Each Worker handles some of these cases, but in case of id-urls splitting between different workers or re-attempting some temporarily faulty urls later,
// duplicate fileNames may appear and cause file-overwriting from the part of S3ObjectStore.
// What if the fileObjKeyName already exists? (Default action from S3-Object-Store --> overwrite)
// Each Worker handles some of these cases, but in case of id-urls splitting between different workers or re-attempting some temporarily faulty urls later, duplicate fileNames may appear.
// However, the Controller uses the file-hash (instead of the duplicate-number), so it is 99.99% guaranteed that no overwrites will ever occur.
String s3Url = s3Protocol + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public.
//logger.debug("Uploaded file \"" + fileObjKeyName + "\". The s3Url is: " + s3Url);