From 9b95eebb6c80ff587053b2c8aa594e42e0eaf9b1 Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Mon, 11 Apr 2022 21:15:22 +0300 Subject: [PATCH] - Remove the obsolete "parenthesis" and "increasing duplicate-num" from the full-texts' names, before sending them to the S3-Object-Store. They now end with the "file-hash", so it is guaranteed that they will be unique. The Worker continues to produce the previous kind of names, without any disturbance. - Improve logging. - Update MinIO dependency. --- build.gradle | 2 +- .../urls_controller/configuration/ImpalaConnector.java | 6 ++++-- .../java/eu/openaire/urls_controller/util/FileUtils.java | 4 +++- .../eu/openaire/urls_controller/util/S3ObjectStore.java | 6 +++--- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/build.gradle b/build.gradle index ee24d64..e11f3db 100644 --- a/build.gradle +++ b/build.gradle @@ -45,7 +45,7 @@ dependencies { // https://mvnrepository.com/artifact/org.apache.commons/commons-lang3 implementation group: 'org.apache.commons', name: 'commons-lang3', version: '3.12.0' - implementation 'io.minio:minio:8.3.7' + implementation 'io.minio:minio:8.3.8' // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' diff --git a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java index ebfc9c0..be99e09 100644 --- a/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java +++ b/src/main/java/eu/openaire/urls_controller/configuration/ImpalaConnector.java @@ -53,7 +53,7 @@ public class ImpalaConnector { private void createDatabase() { if ( isTestEnvironment ) { - logger.info("Going to create (if not exist) the database \"" + testDatabaseName + "\" and its tables. 
Also will fill some tables with data from database \"" + initialDatabaseName + "\"."); + logger.info("Going to create (if not exist) the test-database \"" + testDatabaseName + "\" and its tables. Also will fill some tables with data from database \"" + initialDatabaseName + "\"."); jdbcTemplate.execute("CREATE DATABASE IF NOT EXISTS " + testDatabaseName); jdbcTemplate.execute("CREATE TABLE IF NOT EXISTS " + testDatabaseName + ".publication stored as parquet as select * from " + initialDatabaseName + ".publication"); @@ -72,8 +72,10 @@ public class ImpalaConnector { jdbcTemplate.execute("COMPUTE STATS " + testDatabaseName + ".assignment"); databaseName = testDatabaseName; // For the rest of the queries. - } else + } else { + logger.info("Going to create or validate the tables that are populated by the Controller, for the \"initialDatabase\" = \"" + initialDatabaseName + "\""); databaseName = initialDatabaseName; + } // For both cases, got check and create the tables which will be populated by the Controller. diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 95053fc..152ab50 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -306,7 +306,9 @@ public class FileUtils { continue; } - fileName = datasourceId + "/" + filenameWithoutExtension + "::" + hash + fileExtension; + // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName). + // Now we append the file-hash, so it is guaranteed that the filename will be unique. 
+ fileName = datasourceId + "/" + fileNameID + "::" + hash + fileExtension; String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath); setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url); diff --git a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java index e60fcfe..244d68d 100644 --- a/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java +++ b/src/main/java/eu/openaire/urls_controller/util/S3ObjectStore.java @@ -104,9 +104,9 @@ public class S3ObjectStore { .object(fileObjKeyName).filename(fileFullPath) .contentType(contentType).build()); - // TODO - What if the fileObjKeyName already exists? Right now it gets overwritten (unless we add versioning0, which is not currently supported by our S3ObjectStore). - // Each Worker handles some of these cases, but in case of id-urls splitting between different workers or re-attempting some temporarily faulty urls later, - // duplicate fileNames may appear and cause file-overwriting from the part of S3ObjectStore. + // What if the fileObjKeyName already exists? (Default action from S3-Object-Store --> overwrite) + // Each Worker handles some of these cases, but in case of id-urls splitting between different workers or re-attempting some temporarily faulty urls later, duplicate fileNames may appear. + // However, the Controller uses the file-hash (instead of the duplicate-number), so it is 99.99% guaranteed that no overwrites will ever occur. String s3Url = s3Protocol + bucketName + "/" + fileObjKeyName; // Be aware: This url works only if the access to the bucket is public. //logger.debug("Uploaded file \"" + fileObjKeyName + "\". The s3Url is: " + s3Url);