Various improvements:

- Handle the case when "fileUtils.constructS3FilenameAndUploadToS3()" returns "null", in "processBulkImportedFile()". - Avoid an "IllegalArgumentException" in "Lists.partition()" when the number of files to bulkImport are fewer than the number of threads available to handle them. - Include the last directory's "/" divider in the fileDIR group of "FILEPATH_ID_EXTENSION" regex (renamed from "FILENAME_ID_EXTENSION"). - Fix an incomplete log-message. - Provide the "fileLocation" argument in the "DocFileData" constructor, in "processBulkImportedFile()", even though it's not used after.
2024-03-29 17:23:01 +02:00 · 2024-03-29 17:23:01 +02:00 · 08de530f03
parent 1d821ed803
commit 08de530f03
3 changed files with 18 additions and 13 deletions
--- a/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
+++ b/src/main/java/eu/openaire/urls_controller/models/FileLocationData.java
@ -18,9 +18,9 @@ public class FileLocationData {

    public FileLocationData(String fileLocation) throws RuntimeException {
        // Extract and set LocationData.
-        Matcher matcher = FileUtils.FILENAME_ID_EXTENSION.matcher(fileLocation);
+        Matcher matcher = FileUtils.FILEPATH_ID_EXTENSION.matcher(fileLocation);
        if ( !matcher.matches() )
-            throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILENAME_ID_EXTENSION);
+            throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILEPATH_ID_EXTENSION);
        fileDir = matcher.group(1);
        if ( (fileDir == null) || fileDir.isEmpty() )
            throw new RuntimeException("Failed to extract the \"fileDir\" from \"" + fileLocation + "\".");
--- a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
@ -133,6 +133,8 @@ public class BulkImportServiceImpl implements BulkImportService {

        List<Callable<Integer>> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
        int sizeOfEachSegment = (numOfFiles / BulkImportController.numOfThreadsForBulkImportProcedures);
+        if ( sizeOfEachSegment == 0 )   // In case the numFiles are fewer than the numOfThreads to handle them.
+            sizeOfEachSegment = 1;
        List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment);    // Divide the initial list to "numOfThreadsForBulkImportProcedures" subLists. The last one may have marginally fewer files.
        int subListsSize = subLists.size();

@ -363,7 +365,7 @@ public class BulkImportServiceImpl implements BulkImportService {
    private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
            throws ConnectException, UnknownHostException
    {
-        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
+        DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, fileLocation);
        docFileData.calculateAndSetHashAndSize();

        // Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
@ -419,8 +421,11 @@ public class BulkImportServiceImpl implements BulkImportService {
            // The above analysis is educational, it does not need to take place and is not currently used.

            s3Url = alreadyFoundFileLocation;
-        } else
+        } else {
            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
+            if ( s3Url == null )
+                return null;
+        }

        return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
                docFileData.getSize(), fileHash, s3Url, provenance, true);  // It may return null.
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@ -84,7 +84,7 @@ public class FileUtils {

    // The following regex might be useful in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension".
    // Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
-    public static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:([^.()]+)/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
+    public static final Pattern FILEPATH_ID_EXTENSION = Pattern.compile("([^.()]+/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");

    private static final int numOfFullTextsPerBatch = 70;   // The HTTP-headers cannot be too large (It failed with 100 fileNames).

@ -139,15 +139,15 @@ public class FileUtils {
                else {  // This file has not been found before..
                    // Extract the "fileNameWithExtension" to be added in the HashMultimap.
                    String fileLocation = payload.getLocation();
-                    Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
+                    Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileLocation);
                    if ( ! matcher.matches() ) {
-                        logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
+                        logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
                        numFullTextsWithProblematicLocations ++;
                        continue;
                    }
                    String fileNameWithExtension = matcher.group(2);
                    if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
-                        logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
+                        logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
                        numFullTextsWithProblematicLocations ++;
                        continue;
                    }
@ -367,7 +367,7 @@ public class FileUtils {
                // We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
                // since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null.
            }
-            uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads);
+            uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads, batchCounter);
            return true;
        } catch (Exception e) {
            logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + GenericUtils.endOfLine + e.getMessage(), e);   // It shows the response body (after Spring v.2.5.6).
@ -435,7 +435,7 @@ public class FileUtils {
    }


-    private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads)
+    private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads, int batchCounter)
    {
        // Iterate over the files and upload them to S3.
        //int numUploadedFiles = 0;
@ -459,9 +459,9 @@ public class FileUtils {
            // BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
            // So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).

-            Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
+            Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileName);
            if ( !matcher.matches() ) {
-                logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
+                logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILEPATH_ID_EXTENSION);
                continue;
            }
            // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
@ -510,7 +510,7 @@ public class FileUtils {
                    //numUploadedFiles ++;
                }
            } catch (Exception e) {
-                logger.error("Avoid uploading the rest of the files of batch..");
+                logger.error("Avoid uploading the rest of the files of batch_" + batchCounter + " | " + e.getMessage());
                break;
            }
            // Else, the record will have its file-data set to "null", in the end of the caller method (as it will not have an s3Url as its location).