Various improvements:

- Handle the case when "fileUtils.constructS3FilenameAndUploadToS3()" returns "null", in "processBulkImportedFile()".
- Avoid an "IllegalArgumentException" in "Lists.partition()" when the number of files to bulkImport are fewer than the number of threads available to handle them.
- Include the last directory's "/" divider in the fileDIR group of "FILEPATH_ID_EXTENSION" regex (renamed from "FILENAME_ID_EXTENSION").
- Fix an incomplete log-message.
- Provide the "fileLocation" argument in the "DocFileData" constructor, in "processBulkImportedFile()", even though it's not used after.
This commit is contained in:
Lampros Smyrnaios 2024-03-29 17:23:01 +02:00
parent 1d821ed803
commit 08de530f03
3 changed files with 18 additions and 13 deletions

View File

@ -18,9 +18,9 @@ public class FileLocationData {
public FileLocationData(String fileLocation) throws RuntimeException {
// Extract and set LocationData.
Matcher matcher = FileUtils.FILENAME_ID_EXTENSION.matcher(fileLocation);
Matcher matcher = FileUtils.FILEPATH_ID_EXTENSION.matcher(fileLocation);
if ( !matcher.matches() )
throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILENAME_ID_EXTENSION);
throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILEPATH_ID_EXTENSION);
fileDir = matcher.group(1);
if ( (fileDir == null) || fileDir.isEmpty() )
throw new RuntimeException("Failed to extract the \"fileDir\" from \"" + fileLocation + "\".");

View File

@ -133,6 +133,8 @@ public class BulkImportServiceImpl implements BulkImportService {
List<Callable<Integer>> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
int sizeOfEachSegment = (numOfFiles / BulkImportController.numOfThreadsForBulkImportProcedures);
if ( sizeOfEachSegment == 0 ) // In case the numFiles are fewer than the numOfThreads to handle them.
sizeOfEachSegment = 1;
List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment); // Divide the initial list to "numOfThreadsForBulkImportProcedures" subLists. The last one may have marginally fewer files.
int subListsSize = subLists.size();
@ -363,7 +365,7 @@ public class BulkImportServiceImpl implements BulkImportService {
private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
throws ConnectException, UnknownHostException
{
DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, fileLocation);
docFileData.calculateAndSetHashAndSize();
// Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
@ -419,8 +421,11 @@ public class BulkImportServiceImpl implements BulkImportService {
// The above analysis is educational, it does not need to take place and is not currently used.
s3Url = alreadyFoundFileLocation;
} else
} else {
s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
if ( s3Url == null )
return null;
}
return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.

View File

@ -84,7 +84,7 @@ public class FileUtils {
// The following regex might be useful in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension".
// Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
public static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:([^.()]+)/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
public static final Pattern FILEPATH_ID_EXTENSION = Pattern.compile("([^.()]+/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
@ -139,15 +139,15 @@ public class FileUtils {
else { // This file has not been found before..
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
String fileLocation = payload.getLocation();
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileLocation);
if ( ! matcher.matches() ) {
logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
numFullTextsWithProblematicLocations ++;
continue;
}
String fileNameWithExtension = matcher.group(2);
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
numFullTextsWithProblematicLocations ++;
continue;
}
@ -367,7 +367,7 @@ public class FileUtils {
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
// since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null.
}
uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads);
uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads, batchCounter);
return true;
} catch (Exception e) {
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + GenericUtils.endOfLine + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
@ -435,7 +435,7 @@ public class FileUtils {
}
private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads)
private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads, int batchCounter)
{
// Iterate over the files and upload them to S3.
//int numUploadedFiles = 0;
@ -459,9 +459,9 @@ public class FileUtils {
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileName);
if ( !matcher.matches() ) {
logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILEPATH_ID_EXTENSION);
continue;
}
// The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
@ -510,7 +510,7 @@ public class FileUtils {
//numUploadedFiles ++;
}
} catch (Exception e) {
logger.error("Avoid uploading the rest of the files of batch..");
logger.error("Avoid uploading the rest of the files of batch_" + batchCounter + " | " + e.getMessage());
break;
}
// Else, the record will have its file-data set to "null", in the end of the caller method (as it will not have an s3Url as its location).