Various improvements:
- Handle the case when "fileUtils.constructS3FilenameAndUploadToS3()" returns "null", in "processBulkImportedFile()". - Avoid an "IllegalArgumentException" in "Lists.partition()" when the number of files to bulkImport are fewer than the number of threads available to handle them. - Include the last directory's "/" divider in the fileDIR group of "FILEPATH_ID_EXTENSION" regex (renamed from "FILENAME_ID_EXTENSION"). - Fix an incomplete log-message. - Provide the "fileLocation" argument in the "DocFileData" constructor, in "processBulkImportedFile()", even though it's not used after.
This commit is contained in:
parent
1d821ed803
commit
08de530f03
|
@ -18,9 +18,9 @@ public class FileLocationData {
|
||||||
|
|
||||||
public FileLocationData(String fileLocation) throws RuntimeException {
|
public FileLocationData(String fileLocation) throws RuntimeException {
|
||||||
// Extract and set LocationData.
|
// Extract and set LocationData.
|
||||||
Matcher matcher = FileUtils.FILENAME_ID_EXTENSION.matcher(fileLocation);
|
Matcher matcher = FileUtils.FILEPATH_ID_EXTENSION.matcher(fileLocation);
|
||||||
if ( !matcher.matches() )
|
if ( !matcher.matches() )
|
||||||
throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILENAME_ID_EXTENSION);
|
throw new RuntimeException("Failed to match the \"" + fileLocation + "\" with the regex: " + FileUtils.FILEPATH_ID_EXTENSION);
|
||||||
fileDir = matcher.group(1);
|
fileDir = matcher.group(1);
|
||||||
if ( (fileDir == null) || fileDir.isEmpty() )
|
if ( (fileDir == null) || fileDir.isEmpty() )
|
||||||
throw new RuntimeException("Failed to extract the \"fileDir\" from \"" + fileLocation + "\".");
|
throw new RuntimeException("Failed to extract the \"fileDir\" from \"" + fileLocation + "\".");
|
||||||
|
|
|
@ -133,6 +133,8 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
|
|
||||||
List<Callable<Integer>> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
|
List<Callable<Integer>> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
|
||||||
int sizeOfEachSegment = (numOfFiles / BulkImportController.numOfThreadsForBulkImportProcedures);
|
int sizeOfEachSegment = (numOfFiles / BulkImportController.numOfThreadsForBulkImportProcedures);
|
||||||
|
if ( sizeOfEachSegment == 0 ) // In case the numFiles are fewer than the numOfThreads to handle them.
|
||||||
|
sizeOfEachSegment = 1;
|
||||||
List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment); // Divide the initial list to "numOfThreadsForBulkImportProcedures" subLists. The last one may have marginally fewer files.
|
List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment); // Divide the initial list to "numOfThreadsForBulkImportProcedures" subLists. The last one may have marginally fewer files.
|
||||||
int subListsSize = subLists.size();
|
int subListsSize = subLists.size();
|
||||||
|
|
||||||
|
@ -363,7 +365,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
|
private GenericData.Record processBulkImportedFile(String fileLocation, String provenance, BulkImport.BulkImportSource bulkImportSource, long timeMillis, String additionalLoggingMsg)
|
||||||
throws ConnectException, UnknownHostException
|
throws ConnectException, UnknownHostException
|
||||||
{
|
{
|
||||||
DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, null);
|
DocFileData docFileData = new DocFileData(new File(fileLocation), null, null, fileLocation);
|
||||||
docFileData.calculateAndSetHashAndSize();
|
docFileData.calculateAndSetHashAndSize();
|
||||||
|
|
||||||
// Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
|
// Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
|
||||||
|
@ -419,8 +421,11 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
// The above analysis is educational, it does not need to take place and is not currently used.
|
// The above analysis is educational, it does not need to take place and is not currently used.
|
||||||
|
|
||||||
s3Url = alreadyFoundFileLocation;
|
s3Url = alreadyFoundFileLocation;
|
||||||
} else
|
} else {
|
||||||
s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
|
s3Url = fileUtils.constructS3FilenameAndUploadToS3(fileLocationData.getFileDir(), fileLocationData.getFileName(), openAireId, fileLocationData.getDotFileExtension(), datasourceId, fileHash);
|
||||||
|
if ( s3Url == null )
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
|
return parquetFileUtils.getPayloadParquetRecord(openAireId, originalUrl, actualUrl, timeMillis, bulkImportSource.getMimeType(),
|
||||||
docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.
|
docFileData.getSize(), fileHash, s3Url, provenance, true); // It may return null.
|
||||||
|
|
|
@ -84,7 +84,7 @@ public class FileUtils {
|
||||||
|
|
||||||
// The following regex might be useful in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension".
|
// The following regex might be useful in a future scenario. It extracts the "plain-filename" and "file-ID" and the "file-extension".
|
||||||
// Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
|
// Possible full-filenames are: "path1/path2/ID.pdf", "ID2.pdf", "path1/path2/ID(12).pdf", "ID2(25).pdf"
|
||||||
public static final Pattern FILENAME_ID_EXTENSION = Pattern.compile("(?:([^.()]+)/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
|
public static final Pattern FILEPATH_ID_EXTENSION = Pattern.compile("([^.()]+/)?((([^/()]+)[^./]*)(\\.[\\w]{2,10}))$");
|
||||||
|
|
||||||
private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
private static final int numOfFullTextsPerBatch = 70; // The HTTP-headers cannot be too large (It failed with 100 fileNames).
|
||||||
|
|
||||||
|
@ -139,15 +139,15 @@ public class FileUtils {
|
||||||
else { // This file has not been found before..
|
else { // This file has not been found before..
|
||||||
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
|
// Extract the "fileNameWithExtension" to be added in the HashMultimap.
|
||||||
String fileLocation = payload.getLocation();
|
String fileLocation = payload.getLocation();
|
||||||
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileLocation);
|
Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileLocation);
|
||||||
if ( ! matcher.matches() ) {
|
if ( ! matcher.matches() ) {
|
||||||
logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
|
logger.error("Failed to match the \"fileLocation\": \"" + fileLocation + "\" of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
|
||||||
numFullTextsWithProblematicLocations ++;
|
numFullTextsWithProblematicLocations ++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
String fileNameWithExtension = matcher.group(2);
|
String fileNameWithExtension = matcher.group(2);
|
||||||
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
|
if ( (fileNameWithExtension == null) || fileNameWithExtension.isEmpty() ) {
|
||||||
logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILENAME_ID_EXTENSION);
|
logger.error("Failed to extract the \"fileNameWithExtension\" from \"fileLocation\": \"" + fileLocation + "\", of id: \"" + payload.getId() + "\", originalUrl: \"" + payload.getOriginal_url() + "\", using this regex: " + FILEPATH_ID_EXTENSION);
|
||||||
numFullTextsWithProblematicLocations ++;
|
numFullTextsWithProblematicLocations ++;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -367,7 +367,7 @@ public class FileUtils {
|
||||||
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
|
// We do NOT have to find and cross-reference the missing files with the urlReports, in order to set their locations to <null>,
|
||||||
// since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null.
|
// since, in the end of each assignments-batch, an iteration will be made and for all the non-retrieved and non-uploaded full-texts, the app will set them to null.
|
||||||
}
|
}
|
||||||
uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads);
|
uploadFullTexts(extractedFileNames, targetDirectory, allFileNamesWithPayloads, batchCounter);
|
||||||
return true;
|
return true;
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + GenericUtils.endOfLine + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
|
logger.error("Could not extract and upload the full-texts for batch_" + batchCounter + " of assignments_" + assignmentsBatchCounter + GenericUtils.endOfLine + e.getMessage(), e); // It shows the response body (after Spring v.2.5.6).
|
||||||
|
@ -435,7 +435,7 @@ public class FileUtils {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads)
|
private void uploadFullTexts(String[] fileNames, String targetDirectory, SetMultimap<String, Payload> allFileNamesWithPayloads, int batchCounter)
|
||||||
{
|
{
|
||||||
// Iterate over the files and upload them to S3.
|
// Iterate over the files and upload them to S3.
|
||||||
//int numUploadedFiles = 0;
|
//int numUploadedFiles = 0;
|
||||||
|
@ -459,9 +459,9 @@ public class FileUtils {
|
||||||
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
|
// BUT, since the filename contains a specific urlID, the datasourceId should be the one related to that specific urlID.
|
||||||
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
|
// So, we extract this urlID, search the payload inside the "fileRelatedPayloads" and get the related datasourceID (instead of taking the first or a random datasourceID).
|
||||||
|
|
||||||
Matcher matcher = FILENAME_ID_EXTENSION.matcher(fileName);
|
Matcher matcher = FILEPATH_ID_EXTENSION.matcher(fileName);
|
||||||
if ( !matcher.matches() ) {
|
if ( !matcher.matches() ) {
|
||||||
logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILENAME_ID_EXTENSION);
|
logger.error("Failed to match the \"" + fileName + "\" with the regex: " + FILEPATH_ID_EXTENSION);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
|
// The "matcher.group(3)" returns the "filenameWithoutExtension", which is currently not used.
|
||||||
|
@ -510,7 +510,7 @@ public class FileUtils {
|
||||||
//numUploadedFiles ++;
|
//numUploadedFiles ++;
|
||||||
}
|
}
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
logger.error("Avoid uploading the rest of the files of batch..");
|
logger.error("Avoid uploading the rest of the files of batch_" + batchCounter + " | " + e.getMessage());
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
// Else, the record will have its file-data set to "null", in the end of the caller method (as it will not have an s3Url as its location).
|
// Else, the record will have its file-data set to "null", in the end of the caller method (as it will not have an s3Url as its location).
|
||||||
|
|
Loading…
Reference in New Issue