From 08de530f0381c4372b760a514fe73e90ec01673e Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Fri, 29 Mar 2024 17:23:01 +0200
Subject: [PATCH] Various improvements:

- Handle the case when "fileUtils.constructS3FilenameAndUploadToS3()" returns "null", in "processBulkImportedFile()".
- Avoid an "IllegalArgumentException" in "Lists.partition()" when the number of files to bulkImport is smaller than the number of threads available to handle them.
- Include the last directory's "/" divider in the fileDir group of the "FILEPATH_ID_EXTENSION" regex (renamed from "FILENAME_ID_EXTENSION").
- Fix an incomplete log-message.
- Provide the "fileLocation" argument to the "DocFileData" constructor, in "processBulkImportedFile()", even though it is not used afterwards.
---
 .../models/FileLocationData.java              |  4 ++--
 .../services/BulkImportServiceImpl.java       |  9 +++++++--
 .../urls_controller/util/FileUtils.java       | 18 +++++++++---------
 3 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java b/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
index 58c766d..82fd849 100644
--- a/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
+++ b/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
@@ -18,9 +18,9 @@ public class FileLocationData {
     public FileLocationData(String fileLocation) throws RuntimeException
     {
         // Extract and set LocationData.
-        Matcher matcher = FileUtils.FILENAME_ID_EXTENSION.matcher(fileLocation);
+        Matcher matcher = FileUtils.FILEPATH_ID_EXTENSION.matcher(fileLocation);
         if ( !matcher.matches() )
-            throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILENAME_ID_EXTENSION);
+            throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILEPATH_ID_EXTENSION);
         fileDir = matcher.group(1);
         if ( (fileDir == null) || fileDir.isEmpty() )
             throw new RuntimeException("Failed to extract the \"fileDir\" from \"" + fileLocation + "\".");
diff --git a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
index 52f85f0..beda9ea 100644
--- a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
@@ -133,6 +133,8 @@ public class BulkImportServiceImpl implements BulkImportService {
 
         List> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
         int sizeOfEachSegment = (numOfFiles / BulkImportController.numOfThreadsForBulkImportProcedures);
+        if ( sizeOfEachSegment == 0 )   // In case the numFiles are fewer than the numOfThreads to handle them.
+            sizeOfEachSegment = 1;
         List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment);    // Divide the initial list to "numOfThreadsForBulkImportProcedures" subLists. The last one may have marginally fewer files.
         int subListsSize = subLists.size();
 
@@ -363,7 +365,7 @@ public class BulkImportServiceImpl implements BulkImportService {
 
     private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg) throws ConnectException, UnknownHostException
     {
-        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
+        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, fileLocation);
         docFileData.calculateAndSetHashAndSize();
 
         // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
@@ -419,8 +421,11 @@ public class BulkImportServiceImpl implements BulkImportService {
 
             // The above analysis is educational, it does not need to take place and is not currently used.
             s3Url = alreadyFoundFileLocation;
-        } else
+        } else {
             s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
+            if ( s3Url == null )
+                return null;
+        }
 
         return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(), docFileData.getSize(), fileHash, s3Url, provenance, true);    // It may return null.
 
diff --git a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
index ce30a4c..dce131b 100644
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@@ -84,7 +84,7 @@ public class FileUtils {
 
     // The following regex might be useful in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension".
     // Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
-    public static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:([^.()]+)/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
+    public static final Pattern FILEPATH_ID_EXTENSION = Pattern.compile("([^.()]+/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
 
     private static final int numOfFullTextsPerBatch = 70;  // The HTTP-headers cannot be too large (It failed with 100 fileNames).
 
@@ -139,15 +139,15 @@ public class FileUtils {
             else {  // This file has not been found before..
                 // Extract the "fileNameWithExtension" to be added in the HashMultimap.
                 String fileLocation = payload.getLocation();
-                Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
+                Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileLocation);
                 if ( !matcher.matches() ) {
-                    logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
+                    logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
                     numFullTextsWithProblematicLocations ++;
                     continue;
                 }
                 String fileNameWithExtension = matcher.group(2);
                 if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
-                    logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
+                    logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
                     numFullTextsWithProblematicLocations ++;
                     continue;
                 }
@@ -367,7 +367,7 @@ public class FileUtils {
 
                 // We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to null, since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null.
             }
-            uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads);
+            uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads, batchCounter);
             return true;
         } catch (Exception e) {
             logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + GenericUtils.endOfLine + e.getMessage(), e);  // It shows the response body (after Spring v.2.5.6).
@@ -435,7 +435,7 @@ public class FileUtils {
     }
 
 
-    private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap allFileNamesWithPayloads)
+    private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap allFileNamesWithPayloads, int batchCounter)
     {
         // Iterate over the files and upload them to S3.
         //int numUploadedFiles = 0;
@@ -459,9 +459,9 @@ public class FileUtils {
 
                     // BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
                     // So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
-                    Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
+                    Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileName);
                     if ( !matcher.matches() ) {
-                        logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
+                        logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILEPATH_ID_EXTENSION);
                         continue;
                     }
                     // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
@@ -510,7 +510,7 @@ public class FileUtils {
                 //numUploadedFiles ++;
             }
         } catch (Exception e) {
-            logger.error("Avoid uploading the rest of the files of batch..");
+            logger.error("Avoid uploading the rest of the files of batch_" + batchCounter + " | " + e.getMessage());
            break;
         }
         // Else, the record will have its file-data set to "null", in the end of the caller method (as it will not have an s3Url as its location).
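
A note on the "Lists.partition()" guard: Guava's "Lists.partition(list, size)" throws an "IllegalArgumentException" whenever "size" is not positive, and the integer division "numOfFiles / numOfThreadsForBulkImportProcedures" yields 0 as soon as there are fewer files than threads, which is exactly the case the new check covers. A minimal, self-contained sketch of that behavior (class and variable names here are illustrative, not taken from the project):

import com.google.common.collect.Lists;
import java.util.Arrays;
import java.util.List;

public class PartitionGuardSketch {
    public static void main(String[] args) {
        List<String> fileLocations = Arrays.asList("a.pdf", "b.pdf", "c.pdf"); // 3 files
        int numOfThreads = 6;                                                  // more threads than files
        int sizeOfEachSegment = fileLocations.size() / numOfThreads;           // integer division -> 0
        if ( sizeOfEachSegment == 0 )   // the guard added by this patch
            sizeOfEachSegment = 1;
        // Without the guard, Lists.partition(fileLocations, 0) would throw an IllegalArgumentException.
        List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment);
        System.out.println(subLists);   // [[a.pdf], [b.pdf], [c.pdf]]
    }
}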
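
A note on the regex rename: the functional change in "FILEPATH_ID_EXTENSION" is that the directory part is now captured by a plain group that keeps the trailing "/" divider, so "FileLocationData.fileDir" now ends with "/"; the filename, ID and extension groups behave as before. The sketch below compares the old and new patterns on one of the example paths from the code comment (the test class itself is illustrative only):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class FilePathRegexSketch {
    static final Pattern OLD_FILENAME_ID_EXTENSION = Pattern.compile("(?:([^.()]+)/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
    static final Pattern NEW_FILEPATH_ID_EXTENSION = Pattern.compile("([^.()]+/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");

    public static void main(String[] args) {
        String fileLocation = "path1/path2/ID.pdf";
        Matcher oldMatcher = OLD_FILENAME_ID_EXTENSION.matcher(fileLocation);
        Matcher newMatcher = NEW_FILEPATH_ID_EXTENSION.matcher(fileLocation);
        if ( oldMatcher.matches() && newMatcher.matches() ) {
            System.out.println(oldMatcher.group(1));    // "path1/path2"  (old fileDir, without the divider)
            System.out.println(newMatcher.group(1));    // "path1/path2/" (new fileDir, keeps the divider)
            System.out.println(newMatcher.group(2));    // "ID.pdf"       (fileNameWithExtension)
            System.out.println(newMatcher.group(4));    // "ID"           (file-ID)
            System.out.println(newMatcher.group(5));    // ".pdf"         (dot-file-extension)
        }
    }
}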
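
The null-handling change in "processBulkImportedFile()" follows the pattern sketched below: when the S3 upload step cannot produce a URL, the method now gives up on that record instead of building a payload row without a valid location. This is a simplified, hypothetical skeleton; only the "s3Url == null" branch mirrors the patch, the helper methods are stand-ins for the real ones:

import org.apache.avro.generic.GenericData;

public class NullPropagationSketch {

    GenericData.Record processBulkImportedFile(String fileLocation) {
        String s3Url;
        String alreadyFoundFileLocation = lookupExistingLocation(fileLocation);
        if ( alreadyFoundFileLocation != null ) {
            s3Url = alreadyFoundFileLocation;   // reuse the location of an already-uploaded file
        } else {
            s3Url = uploadToS3(fileLocation);   // stand-in for fileUtils.constructS3FilenameAndUploadToS3(..)
            if ( s3Url == null )
                return null;                    // new in this patch: abort this record instead of continuing without a location
        }
        return buildPayloadRecord(fileLocation, s3Url); // stand-in for parquetFileUtils.getPayloadParquetRecord(..); may itself return null
    }

    // Stubs, only so that the sketch compiles.
    private String lookupExistingLocation(String location) { return null; }
    private String uploadToS3(String location) { return "s3://bucket/" + location; }
    private GenericData.Record buildPayloadRecord(String location, String s3Url) { return null; }
}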