- Fix the fileName-ID not being directly related to the datasourceID in the S3-ObjectStore file name. Add explanatory comments.

- Add missing error-logs.
2022-04-05 16:22:02 +03:00 · 2022-04-05 16:22:02 +03:00 · 33fc61a8d9
parent a23c918a42
commit 33fc61a8d9
2 changed files with 50 additions and 13 deletions
--- a/build.gradle
+++ b/build.gradle
@ -49,7 +49,7 @@ dependencies {

    // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp
    implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3'
-    // This is required by the minio, as Spring uses a version which is not supported by minio.
+    // This is required by minio, as Spring < v2.7.x uses a version which is not supported by minio.

    // https://mvnrepository.com/artifact/com.cloudera.impala/jdbc
    implementation("com.cloudera.impala:jdbc:2.5.31") {
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@ -91,12 +91,10 @@ public class FileUtils {
    private String baseTargetLocation;

    public static DecimalFormat df = new DecimalFormat("0.00");
-    private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$");
-
-    // The following regex might be usefull in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension". It may even be merged with the above regex.
-    // Possible full-filenames are: "ID.pdf", "ID(12).pdf"
-    //private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\.[\w]{2,10})$");

+    // The following regex extracts the "full-filename" (group 1), the "plain-filename" (group 2), the "file-ID" (group 3) and the "file-extension" (group 4). It replaces the former "FILENAME_WITH_EXTENSION" regex.
+    // Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
+    private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:[^.()]+/)?((([^./()]+)[^./]*)(\\.[\\w]{2,10}))$");

    private final int numOfFullTextsPerBatch = 70;   // The HTTP-headers cannot be too large (It failed with 100 fileNames).

@ -160,12 +158,14 @@ public class FileUtils {
            }

            // Extract the "fileNameWithExtension" to be added in the HashMultimap.
-            Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation);
+            Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
            if ( ! matcher.matches() ) {
+                logger.error("Failed to match the \"" + fileLocation + "\" with the regex: " + FILENAME_ID_EXTENSION);
                continue;
            }
            String fileNameWithExtension = matcher.group(1);
            if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
+                logger.error("Failed to extract the \"fileNameWithExtension\" from \"" + fileLocation + "\".");
                continue;
            }

@ -251,13 +251,50 @@ public class FileUtils {
                    // Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
                    try {
                        // Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf"
-                        // All related payloads point to this exact same file and have the same datasourceId.
-                        Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get();
-                        String datasourceId = firstRelatedPayload.getDatasourceId();
-                        String hash = firstRelatedPayload.getHash();
+                        // All related payloads point to this exact same file, BUT they may be related to different urlIDs, which in turn may be related to different datasourceIDs.
+                        // This file could have been found from different urlIds and thus be related to multiple datasourceIds.
+                        // BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
+                        // So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).

-                        String[] fileNameData = fileName.split("\\.");
-                        fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1];
+                        Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
+                        if ( ! matcher.matches() ) {
+                            logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
+                            continue;
+                        }
+                        String filenameWithoutExtension = matcher.group(2);
+                        if ( (filenameWithoutExtension == null) || filenameWithoutExtension.isEmpty() ) {
+                            logger.error("Failed to extract the \"filenameWithoutExtension\" from \"" + fileName + "\".");
+                            continue;
+                        }
+                        String fileNameID = matcher.group(3);
+                        if ( (fileNameID == null) || fileNameID.isEmpty() ) {
+                            logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
+                            continue;
+                        }
+                        String fileExtension = matcher.group(4);
+                        if ( (fileExtension == null) || fileExtension.isEmpty() ) {
+                            logger.error("Failed to extract the \"fileExtension\" from \"" + fileName + "\".");
+                            continue;
+                        }
+
+                        String datasourceId = null;
+                        String hash = null;
+                        boolean isFound = false;
+                        for ( Payload payload : fileRelatedPayloads ) {
+                            if ( fileNameID.equals(payload.getId()) ) {
+                                datasourceId = payload.getDatasourceId();
+                                hash = payload.getHash();
+                                isFound = true;
+                                break;
+                            }
+                        }
+
+                        if ( ! isFound ) {  // This should never normally happen. If it does, then a very bad change will have taken place.
+                            logger.error("The \"fileNameID\" (" + fileNameID + ") was not found inside the \"fileRelatedPayloads\" for fileName: " + fileName);
+                            continue;
+                        }
+
+                        fileName = datasourceId + "/" + filenameWithoutExtension + "::" + hash + fileExtension;

                        String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath);
                        setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);