Handle the case when the "upload-file-to-S3" operation fails with a "ConnectException". In this case, all remaining upload operations for the files of that particular batch or segment are canceled.

2023-10-04 13:01:13 +03:00 · 2023-10-04 13:01:13 +03:00 · c9626de120
parent 865926fbc3
commit c9626de120
2 changed files with 102 additions and 76 deletions
--- a/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
+++ b/src/main/java/eu/openaire/urls_controller/services/BulkImportServiceImpl.java
@ -21,6 +21,7 @@ import org.springframework.stereotype.Service;

 import javax.xml.bind.DatatypeConverter;
 import java.io.File;
+import java.net.ConnectException;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import java.nio.file.Paths;
@ -243,8 +244,20 @@ public class BulkImportServiceImpl implements BulkImportService {
        int counter = 0;

        // Upload files to S3 and collect payloadRecords.
-        for ( String fileLocation: fileLocationsSegment ) {
-            GenericData.Record record = processBulkImportedFile(fileLocation, provenance, bulkImportSource, timeMillis, additionalLoggingMsg);
+        for ( int i=0; i < numOfFilesInSegment; ++i ) {
+            String fileLocation = fileLocationsSegment.get(i);
+            GenericData.Record record = null;
+            try {
+                record = processBulkImportedFile(fileLocation, provenance, bulkImportSource, timeMillis, additionalLoggingMsg);
+            } catch (ConnectException ce) {
+                String errorMsg = "ConnectException when uploading the files of segment_" + segmentCounter + " to the S3 Object Store. Will avoid uploading any file for this segment..";
+                logger.error(errorMsg + additionalLoggingMsg);
+                bulkImportReport.addEvent(errorMsg);
+                for ( int j=i; j < numOfFilesInSegment; ++j )
+                    failedFiles.add(fileLocationsSegment.get(j));   // The rest of the files are considered "failed".
+                break;
+            }
+
            if ( record != null )
                payloadRecords.add(record);
            else {
@ -343,6 +356,7 @@ public class BulkImportServiceImpl implements BulkImportService {


    private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
+            throws ConnectException
    {
        File fullTextFile = new File(fileLocation);
        DocFileData docFileData = new DocFileData(fullTextFile, null, null, null);
@ -401,16 +415,8 @@ public class BulkImportServiceImpl implements BulkImportService {
            // The above analysis is educational, it does not need to take place and is not currently used.

            s3Url = alreadyFoundFileLocation;
-        } else {
-            try {
-                s3Url = fileUtils.constructFileNameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);  // This throws Exception, in case the uploading failed.
-                if ( s3Url == null )
-                    return null;    // In case the 'datasourceID' or 'hash' is null. Which should never happen here, since both of them are checked before the execution reaches here.
-            } catch (Exception e) {
-                logger.error("Could not upload the file '" + fileLocationData.getFileName() + "' to the S3 ObjectStore!" + additionalLoggingMsg, e);
-                return null;
-            }
-        }
+        } else
+            s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), fileNameID, fileLocationData.getDotFileExtension(), datasourceId, fileHash);

        GenericData.Record record = new GenericData.Record(ParquetFileUtils.payloadsSchema);
        record.put("id", openAireId);
--- a/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
+++ b/src/main/java/eu/openaire/urls_controller/util/FileUtils.java
@ -18,6 +18,7 @@ import org.springframework.jdbc.core.JdbcTemplate;
 import org.springframework.stereotype.Component;

 import java.io.*;
+import java.net.ConnectException;
 import java.net.HttpURLConnection;
 import java.net.URL;
 import java.nio.file.Files;
@ -487,7 +488,7 @@ public class FileUtils {
            }

            // Let's try to upload the file to S3 and update the payloads, either in successful file-uploads (right-away) or not (in the end).
-            try {
+
            // Prepare the filename as: "datasourceid/publicationid::hash.pdf"
            // All related payloads point to this exact same file, BUT, may be related with different urlIDs, which in turn be related with different datasourceIDs.
            // This file could have been found from different urlIds and thus be related to multiple datasourceIds.
@ -500,6 +501,7 @@ public class FileUtils {
                continue;
            }
            // The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
+            // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
            String fileNameID = matcher.group(4);
            if ( (fileNameID == null) || fileNameID.isEmpty() ) {
                logger.error("Failed to extract the \"fileNameID\" from \"" + fileName + "\".");
@ -537,16 +539,17 @@ public class FileUtils {
                continue;
            }

-                String s3Url = constructFileNameAndUploadToS3(targetDirectory, fileName, fileNameID, dotFileExtension, datasourceId, hash);
-                if ( s3Url == null )
-                    continue;
-
+            try {
+                String s3Url = constructS3FilenameAndUploadToS3(targetDirectory, fileName, fileNameID, dotFileExtension, datasourceId, hash);
+                if (s3Url != null) {
                    setFullTextForMultiplePayloads(fileRelatedPayloads, s3Url);
                    //numUploadedFiles ++;
-            } catch (Exception e) {
-                logger.error("Could not upload the file \"" + fileName + "\" to the S3 ObjectStore!", e);
                }
-            // Else, the record will have its file-data set to "null", in the end of this method.
+            } catch (ConnectException ce) {
+                logger.error("Avoid uploading the rest of the files of this batch..");
+                break;
+            }
+            // Else, the record will have its file-data set to "null", in the end of the caller method (as it will not have an s3Url as its location).
        }//end filenames-for-loop

        //logger.debug("Finished uploading " + numUploadedFiles + " full-texts (out of " + (fileNames.length -2) + " distinct files) from assignments_" + assignmentsBatchCounter + ", batch_" + batchCounter + " on S3-ObjectStore.");
@ -554,7 +557,29 @@ public class FileUtils {
    }


-    public String constructFileNameAndUploadToS3(String fileDir, String fileName, String openAireID, String dotFileExtension, String datasourceId, String hash) throws Exception
+    public String constructS3FilenameAndUploadToS3(String targetDirectory, String fileName, String fileNameID,
+                                                   String dotFileExtension, String datasourceId, String hash) throws ConnectException
+    {   // Builds the S3 filename for a local file and uploads that file. Returns the value given by "uploadToS3()", or "null" when the S3 filename cannot be built or the upload fails with a non-connection error.
+        String filenameForS3 = constructS3FileName(fileName, fileNameID, dotFileExtension, datasourceId, hash);   // This name is for the uploaded file, in the S3 Object Store.
+        if ( filenameForS3 == null )    // The error is logged inside.
+            return null;
+
+        String fileFullPath = targetDirectory + File.separator + fileName;   // The fullPath to the local file (which has the previous name).
+        String s3Url = null;
+        try {
+            s3Url = s3ObjectStore.uploadToS3(filenameForS3, fileFullPath);
+        } catch (ConnectException ce) {   // Connection failure: log only the message and rethrow, so that the callers can stop uploading the remaining files of the batch / segment.
+            logger.error("Could not connect with the S3 Object Store! " + ce.getMessage());
+            throw ce;
+        } catch (Exception e) {   // Any other failure concerns only this file: log it (with the stack trace) and signal it through a "null" return value.
+            logger.error("Could not upload the local-file \"" + fileFullPath + "\" to the S3 ObjectStore, with S3-filename: \"" + filenameForS3 + "\"!", e);
+            return null;
+        }
+        return s3Url;
+    }
+
+
+    public String constructS3FileName(String fileName, String openAireID, String dotFileExtension, String datasourceId, String hash)
    {
        if ( datasourceId == null ) {
            logger.error("The retrieved \"datasourceId\" was \"null\" for file: " + fileName);
@ -565,13 +590,8 @@ public class FileUtils {
            return null;
        }

-        String fileFullPath = fileDir + File.separator + fileName;   // The fullPath to the local file.
-
-        // Use the "fileNameID" and not the "filenameWithoutExtension", as we want to avoid keeping the possible "parenthesis" with the increasing number (about the duplication of ID-fileName).
        // Now we append the file-hash, so it is guaranteed that the filename will be unique.
-        fileName = datasourceId + "/" + openAireID + "::" + hash + dotFileExtension;    // This is the fileName to be used in the objectStore, not of the local file!
-
-        return s3ObjectStore.uploadToS3(fileName, fileFullPath);
+        return datasourceId + "/" + openAireID + "::" + hash + dotFileExtension;    // This is the fileName to be used in the objectStore, not of the local file!
    }