- Add a warn-log for duplicate files inside a file-segment, when bulk-importing.

- Add error-handling in "ScheduledTasks.extractAssignmentsCounterAndDeleteRelatedAssignmentRecords()".
- Improve an error-message.
Lampros Smyrnaios 2024-06-17 13:16:38 +03:00
parent c45a172c21
commit 63cf63e6cc
2 changed files with 21 additions and 5 deletions

View File

@@ -134,7 +134,7 @@ public class ScheduledTasks {
logger.error("IOOBE for background_task_" + i + " in the futures-list! " + ioobe.getMessage());
// Only here, the "future" will be null.
} finally {
if ( future != null ) // It may be null in case we have a IOBE.
if ( future != null ) // It may be null in case we have a IOOBE.
futuresToDelete.add(future); // Do not delete them directly here, as the indexes will get messed up and we will get "IOOBE".
}
}
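(A side-note on the "futuresToDelete" comment above: removing entries from the futures-list while still iterating it by index is exactly what would trigger the IOOBE, hence the collect-then-remove pattern. A minimal, self-contained sketch of that pattern follows; only the names "futures" and "futuresToDelete" come from the diff, the rest is illustrative and not part of the actual ScheduledTasks class.)

import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.Future;

class FuturesCleanupSketch {

    static void waitForBackgroundTasks(List<Future<?>> futures) {
        List<Future<?>> futuresToDelete = new ArrayList<>();
        for ( int i = 0; i < futures.size(); ++i ) {
            Future<?> future = null;
            try {
                future = futures.get(i);
                future.get();   // Block until the background task finishes; rethrows the task's exception, if any.
            } catch (IndexOutOfBoundsException ioobe) {
                // Only in this case the "future" remains null.
            } catch (Exception e) {
                // ExecutionException / InterruptedException: the task failed, but the "future" itself is non-null.
            } finally {
                if ( future != null )
                    futuresToDelete.add(future);    // Calling "futures.remove(i)" here instead would shift the remaining indexes and cause an IOOBE.
            }
        }
        futures.removeAll(futuresToDelete); // Remove everything in one go, after the index-based iteration has finished.
    }
}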
@@ -426,7 +426,7 @@ public class ScheduledTasks {
else
logger.debug(initMsg);
} catch (Exception e) {
logger.error("", e);
logger.error("Failed to perform the \"" + actionForWorkerReports.toString() + "\" action!", e);
}
}
@@ -512,9 +512,9 @@ public class ScheduledTasks {
logger.debug("Will delete the assignments of the old, not-successful, workerReport: " + workerReportName);
DatabaseConnector.databaseLock.lock();
urlsService.deleteAssignmentsBatch(curReportAssignmentsCounter); // Any error-log is written inside.
String errorMsg = urlsService.deleteAssignmentsBatch(curReportAssignmentsCounter); // The potential error-log is written inside.
DatabaseConnector.databaseLock.unlock();
return true;
return (errorMsg == null);
}
}
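
(On the "extractAssignmentsCounterAndDeleteRelatedAssignmentRecords" change above: the method now reports the outcome of the batch-deletion instead of always returning true. A rough, hypothetical sketch of the contract it relies on is shown below, assuming "deleteAssignmentsBatch" returns null on success and the error-message otherwise; only the identifiers visible in the diff come from the codebase.)

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

// Hypothetical stand-in for the real service, only to illustrate the "null means success" contract.
class UrlsServiceSketch {

    private static final Logger logger = LoggerFactory.getLogger(UrlsServiceSketch.class);

    public String deleteAssignmentsBatch(long assignmentsCounter) {
        try {
            // ... execute the DELETE statement for this assignments-batch ...
            return null;    // null --> success
        } catch (RuntimeException re) {
            String errorMsg = "Failed to delete the assignments-batch with counter: " + assignmentsCounter;
            logger.error(errorMsg, re); // "The potential error-log is written inside", as the caller's comment says.
            return errorMsg;
        }
    }
}

(The caller then simply returns "(errorMsg == null)", so a failed deletion of the old assignments no longer gets reported as success.)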

View File

@@ -265,7 +265,23 @@ public class BulkImportServiceImpl implements BulkImportService {
docFileData.calculateAndSetHashAndSize();
String fileHash = docFileData.getHash();
if ( fileHash != null ) { // We will not work with files for which we cannot produce the hash, since the s3-file-location requires it.
fileHashes.add(fileHash);
if ( !fileHashes.add(fileHash) ) {
msg = "File \"" + fileLocation + "\" is duplicate (has the same md5-hash)!";
logger.warn(msg + additionalLoggingMsg);
bulkImportReport.addEvent(msg);
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
continue;
// The fileLocation will be different from any other, so this means that a different OpenAIRE-ID will be generated and it's ok to have 2 payload records.
// TODO - We should avoid uploading the duplicate to S3 though.. Add it to a "duplicateFiles" HashSet?
// Of course, other segments may have files that are duplicates of files in this segment. In that case, we should have a global concurrent set to check these issues.
// But then again, we should split the "processSegment" code into two parts: one to gather the hashes and check for duplicates, then close the threads and then reopen them and use this data to process each segment.
// (otherwise, one segment may proceed to bulkImport even though there are still hashes to be gathered by other segments --> TODO - is it better to use "SIGNALS" ??)
// TODO - Before making heavy changes, it's better to ask ICM to verify the following from arXiv documentation:
// 1) can it be the case that there are multiple files with the same name inside a single directory or across multiple directories?
// note that multiple bulkImport requests may be issued and executed in parallel, not just multiple segments from a single request.
// 2) can it be the case that duplicate files exist with different names inside one or more directories?
}
fileLocationsWithData.put(fileLocation, docFileData);
}
}
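
(On the TODOs in the new block: one possible shape for the "global concurrent set" idea is sketched below. Nothing in this sketch exists in the codebase; the class and method names are placeholders, and, as the TODO itself notes, a complete solution would also need a separate hash-gathering phase or some signalling, so that no segment starts uploading to S3 before all hashes from all segments and all parallel bulkImport requests are known.)

import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;

// Placeholder: a process-wide registry of file-hashes, shared by all segments and all parallel bulkImport requests.
class GlobalFileHashRegistrySketch {

    private static final Set<String> seenFileHashes = ConcurrentHashMap.newKeySet();

    // Returns true if this hash is seen for the first time (and records it), or false if some other segment/request has already registered it.
    static boolean registerHash(String fileHash) {
        return seenFileHashes.add(fileHash);    // "add" is atomic on this concurrent set.
    }
}

(Inside "processSegment", the per-segment "fileHashes.add(fileHash)" check could then additionally consult "GlobalFileHashRegistrySketch.registerHash(fileHash)", so that a cross-segment duplicate is also logged and its S3 upload skipped.)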