diff --git a/build.gradle b/build.gradle index 520201d..ee24d64 100644 --- a/build.gradle +++ b/build.gradle @@ -49,7 +49,7 @@ dependencies { // https://mvnrepository.com/artifact/com.squareup.okhttp3/okhttp implementation group: 'com.squareup.okhttp3', name: 'okhttp', version: '4.9.3' - // This is required by the minio, as Spring uses a version which is not supported by minio. + // This is required by minio, as Spring < v2.7.x uses a version which is not supported by minio. // https://mvnrepository.com/artifact/com.cloudera.impala/jdbc implementation("com.cloudera.impala:jdbc:2.5.31") { diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java index 7f70805..4137774 100644 --- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java +++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java @@ -91,12 +91,10 @@ public class FileUtils { private String baseTargetLocation; public static DecimalFormat df = new DecimalFormat("0.00"); - private static final Pattern FILENAME_WITH_EXTENSION = Pattern.compile(".*/([\\w_:()]+\\.[\\w]{2,10})$"); - - // The following regex might be usefull in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension". It may even be merged with the above regex. - // Possible full-filenames are: "ID.pdf", "ID(12).pdf" - //private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(([^.()]+)[^.]*)(\.[\w]{2,10})$"); + // The following regex extracts the "plain-filename" and "file-ID" and the "file-extension". It replaces the former "FILENAME_WITH_EXTENSION" regex, whose captured full-filename is now available as group 1. 
+ // Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf", "path.v1/ID3.pdf" (the directories may contain dots or parentheses, the file-name may not contain a slash). + private static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:.*/)?((([^./()]+)[^./]*)(\\.[\\w]{2,10}))$"); private final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames). @@ -160,12 +158,14 @@ public class FileUtils { } // Extract the "fileNameWithExtension" to be added in the HashMultimap. - Matcher matcher = FILENAME_WITH_EXTENSION.matcher(fileLocation); + Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation); if ( ! matcher.matches() ) { + logger.error("Failed to match the \"" + fileLocation + "\" with the regex: " + FILENAME_ID_EXTENSION); continue; } String fileNameWithExtension = matcher.group(1); if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) { + logger.error("Failed to extract the \"fileNameWithExtension\" from \"" + fileLocation + "\"."); continue; } @@ -251,13 +251,50 @@ public class FileUtils { // Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end). try { // Prepare the filename as: "datasourceid/publicationid(123)::hash.pdf" - // All related payloads point to this exact same file and have the same datasourceId. - Payload firstRelatedPayload = fileRelatedPayloads.stream().findFirst().get(); - String datasourceId = firstRelatedPayload.getDatasourceId(); - String hash = firstRelatedPayload.getHash(); + // All related payloads point to this exact same file, BUT, may be related with different urlIDs, which in turn may be related with different datasourceIDs. + // This file could have been found from different urlIds and thus be related to multiple datasourceIds. + // BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID. 
+ // So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID). - String[] fileNameData = fileName.split("\\."); - fileName = datasourceId + "/" + fileNameData[0] + "::" + hash + "." + fileNameData[1]; + Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName); + if ( ! matcher.matches() ) { + logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION); + continue; + } + String filenameWithoutExtension = matcher.group(2); + if ( (filenameWithoutExtension == null) || filenameWithoutExtension.isEmpty() ) { + logger.error("Failed to extract the \"filenameWithoutExtension\" from \"" + fileName + "\"."); + continue; + } + String fileNameID = matcher.group(3); + if ( (fileNameID == null) || fileNameID.isEmpty() ) { + logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\"."); + continue; + } + String fileExtension = matcher.group(4); + if ( (fileExtension == null) || fileExtension.isEmpty() ) { + logger.error("Failed to extract the \"fileExtension\" from \"" + fileName + "\"."); + continue; + } + + String datasourceId = null; + String hash = null; + boolean isFound = false; + for ( Payload payload : fileRelatedPayloads ) { + if ( fileNameID.equals(payload.getId()) ) { + datasourceId = payload.getDatasourceId(); + hash = payload.getHash(); + isFound = true; + break; + } + } + + if ( ! isFound ) { // This should never normally happen. If it does, then a very bad change will have taken place. + logger.error("The \"fileNameID\" (" + fileNameID + ") was not found inside the \"fileRelatedPayloads\" for fileName: " + fileName); + continue; + } + + fileName = datasourceId + "/" + filenameWithoutExtension + "::" + hash + fileExtension; String s3Url = s3ObjectStore.uploadToS3(fileName, fileFullPath); setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);