Update the BulkImport API:
- Refactor the "bulkImportReportID". - Add the "bulk:" prefix in the provenance value, in the DB. - Fix not using correctly the "Lists.partition()" method. - Make sure the "bulkImportDir" is removed from the "bulkImportDirsUnderProcessing" Set, in case of an early-error. - Fix the "numFailedSegments"-calculation. - Improve some messages. - Code polishing.
This commit is contained in:
parent
a524375656
commit
b3e0d214fd
|
@ -14,7 +14,7 @@ public class BulkImport {
|
||||||
|
|
||||||
private String bulkImportReportLocation;
|
private String bulkImportReportLocation;
|
||||||
|
|
||||||
private int numOfThreadsPerBulkImportProcedure;
|
private int numOfThreadsForBulkImportProcedures;
|
||||||
|
|
||||||
private Map<String, BulkImportSource> bulkImportSources;
|
private Map<String, BulkImportSource> bulkImportSources;
|
||||||
|
|
||||||
|
@ -37,12 +37,12 @@ public class BulkImport {
|
||||||
this.bulkImportReportLocation = bulkImportReportLocation;
|
this.bulkImportReportLocation = bulkImportReportLocation;
|
||||||
}
|
}
|
||||||
|
|
||||||
public int getNumOfThreadsPerBulkImportProcedure() {
|
public int getNumOfThreadsForBulkImportProcedures() {
|
||||||
return numOfThreadsPerBulkImportProcedure;
|
return numOfThreadsForBulkImportProcedures;
|
||||||
}
|
}
|
||||||
|
|
||||||
public void setNumOfThreadsPerBulkImportProcedure(int numOfThreadsPerBulkImportProcedure) {
|
public void setNumOfThreadsForBulkImportProcedures(int numOfThreadsForBulkImportProcedures) {
|
||||||
this.numOfThreadsPerBulkImportProcedure = numOfThreadsPerBulkImportProcedure;
|
this.numOfThreadsForBulkImportProcedures = numOfThreadsForBulkImportProcedures;
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, BulkImportSource> getBulkImportSources() {
|
public Map<String, BulkImportSource> getBulkImportSources() {
|
||||||
|
@ -58,7 +58,7 @@ public class BulkImport {
|
||||||
return "BulkImport{" +
|
return "BulkImport{" +
|
||||||
"baseBulkImportLocation='" + baseBulkImportLocation + '\'' +
|
"baseBulkImportLocation='" + baseBulkImportLocation + '\'' +
|
||||||
", bulkImportReportLocation='" + bulkImportReportLocation + '\'' +
|
", bulkImportReportLocation='" + bulkImportReportLocation + '\'' +
|
||||||
", numOfThreadsPerBulkImportProcedure=" + numOfThreadsPerBulkImportProcedure +
|
", numOfThreadsForBulkImportProcedures=" + numOfThreadsForBulkImportProcedures +
|
||||||
", bulkImportSources=" + bulkImportSources +
|
", bulkImportSources=" + bulkImportSources +
|
||||||
'}';
|
'}';
|
||||||
}
|
}
|
||||||
|
|
|
@ -45,9 +45,9 @@ public class BulkImportController {
|
||||||
|
|
||||||
private final HashMap<String, BulkImport.BulkImportSource> bulkImportSources;
|
private final HashMap<String, BulkImport.BulkImportSource> bulkImportSources;
|
||||||
|
|
||||||
public static final Set<String> bulkImportDirs = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
public static final Set<String> bulkImportDirsUnderProcessing = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||||||
|
|
||||||
public static int numOfThreadsPerBulkImportProcedure;
|
public static int numOfThreadsForBulkImportProcedures;
|
||||||
public static ExecutorService bulkImportExecutor;
|
public static ExecutorService bulkImportExecutor;
|
||||||
|
|
||||||
|
|
||||||
|
@ -65,9 +65,9 @@ public class BulkImportController {
|
||||||
|
|
||||||
this.bulkImportService = bulkImportService;
|
this.bulkImportService = bulkImportService;
|
||||||
|
|
||||||
numOfThreadsPerBulkImportProcedure = bulkImport.getNumOfThreadsPerBulkImportProcedure();
|
numOfThreadsForBulkImportProcedures = bulkImport.getNumOfThreadsForBulkImportProcedures();
|
||||||
logger.info("Will use " + numOfThreadsPerBulkImportProcedure + " threads per bulk-import procedure.");
|
logger.info("Will use " + numOfThreadsForBulkImportProcedures + " threads per bulk-import procedure.");
|
||||||
bulkImportExecutor = Executors.newFixedThreadPool(numOfThreadsPerBulkImportProcedure); // At most < numOfThreadsPerBulkImportProcedure > threads will be used per bulk-import procedure..
|
bulkImportExecutor = Executors.newFixedThreadPool(numOfThreadsForBulkImportProcedures); // At most < numOfThreadsPerBulkImportProcedure > threads will be used per bulk-import procedure..
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -146,7 +146,7 @@ public class BulkImportController {
|
||||||
return ResponseEntity.badRequest().body(errorMsg);
|
return ResponseEntity.badRequest().body(errorMsg);
|
||||||
}
|
}
|
||||||
// The above check does not catch the case where the directory has at least one subdirectory, but no full-text files.
|
// The above check does not catch the case where the directory has at least one subdirectory, but no full-text files.
|
||||||
// The "iterator()" will have a "next" entry, but no full-text file will exist. Although, that case will be rare and will be caught later on, after this procedure being accepted.
|
// The "iterator()" will have a "next" entry, but no full-text file will exist. Although, that case will be rare and will be caught later on, after this procedure has been accepted.
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
String errorMsg = "Error when checking if the givenDir \"" + givenDir + "\" is empty!";
|
String errorMsg = "Error when checking if the givenDir \"" + givenDir + "\" is empty!";
|
||||||
logger.error(errorMsg);
|
logger.error(errorMsg);
|
||||||
|
@ -161,7 +161,7 @@ public class BulkImportController {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Detect if the same directory is scheduled for being processed. In that case, return a 429.
|
// Detect if the same directory is scheduled for being processed. In that case, return a 429.
|
||||||
if ( ! bulkImportDirs.add(bulkImportDir) ) {
|
if ( ! bulkImportDirsUnderProcessing.add(bulkImportDir) ) {
|
||||||
// We allow multiple jobs for the same provenance, running at the same time, but not multiple jobs for the same bulkImportDirectory.
|
// We allow multiple jobs for the same provenance, running at the same time, but not multiple jobs for the same bulkImportDirectory.
|
||||||
String errorMsg = "There is a bulk-import request for the directory \"" + bulkImportDir + "\" that is being handled at the moment. Please wait until it's finished being processed, before making another request.";
|
String errorMsg = "There is a bulk-import request for the directory \"" + bulkImportDir + "\" that is being handled at the moment. Please wait until it's finished being processed, before making another request.";
|
||||||
logger.error(errorMsg);
|
logger.error(errorMsg);
|
||||||
|
@ -172,13 +172,14 @@ public class BulkImportController {
|
||||||
try {
|
try {
|
||||||
Files.createDirectories(currentBulkImportReportLocationDir); // No-op if dir exists. It does not throw an "AlreadyExistsException"
|
Files.createDirectories(currentBulkImportReportLocationDir); // No-op if dir exists. It does not throw an "AlreadyExistsException"
|
||||||
} catch (Exception e) {
|
} catch (Exception e) {
|
||||||
String errorMsg = "Could nor create the \"bulkImportReportLocation\" for provenance \"" + provenance + "\" : " + currentBulkImportReportLocationDir;
|
String errorMsg = "Could not create the \"bulkImportReportLocation\" for provenance \"" + provenance + "\" : " + currentBulkImportReportLocationDir;
|
||||||
logger.error(errorMsg, e);
|
logger.error(errorMsg, e);
|
||||||
|
bulkImportDirsUnderProcessing.remove(bulkImportDir);
|
||||||
return ResponseEntity.internalServerError().body(errorMsg);
|
return ResponseEntity.internalServerError().body(errorMsg);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Generate the "bulkImportReportID". We are removing the ending "slash" ("/") from the "relativeBulkImportDir".
|
// Generate the "bulkImportReportID". We are removing the ending "slash" ("/") from the "relativeBulkImportDir".
|
||||||
String bulkImportReportID = provenance + "/" + relativeBulkImportDir.substring(0, (relativeBulkImportDir.length() -1)) + "_report_" + GenericUtils.getRandomNumber(10000, 99999);
|
String bulkImportReportID = provenance + "/" + relativeBulkImportDir + "report_" + GenericUtils.getRandomNumber(10000, 99999);
|
||||||
String bulkImportReportFullPath = this.bulkImportReportLocation + bulkImportReportID + ".json";
|
String bulkImportReportFullPath = this.bulkImportReportLocation + bulkImportReportID + ".json";
|
||||||
|
|
||||||
String msg = "The bulkImportFullTexts request for " + provenance + " procedure and bulkImportDir: " + givenBulkDir + " was accepted and will be scheduled for execution. "
|
String msg = "The bulkImportFullTexts request for " + provenance + " procedure and bulkImportDir: " + givenBulkDir + " was accepted and will be scheduled for execution. "
|
||||||
|
@ -189,8 +190,10 @@ public class BulkImportController {
|
||||||
bulkImportReport.addEvent(msg);
|
bulkImportReport.addEvent(msg);
|
||||||
|
|
||||||
String errorMsg = fileUtils.writeToFile(bulkImportReportFullPath, bulkImportReport.getJsonReport(), true);
|
String errorMsg = fileUtils.writeToFile(bulkImportReportFullPath, bulkImportReport.getJsonReport(), true);
|
||||||
if ( errorMsg != null )
|
if ( errorMsg != null ) {
|
||||||
|
bulkImportDirsUnderProcessing.remove(bulkImportDir);
|
||||||
return ResponseEntity.internalServerError().body(errorMsg);
|
return ResponseEntity.internalServerError().body(errorMsg);
|
||||||
|
}
|
||||||
|
|
||||||
logger.info(msg + " \"bulkImportReportID\": " + bulkImportReportID);
|
logger.info(msg + " \"bulkImportReportID\": " + bulkImportReportID);
|
||||||
|
|
||||||
|
@ -201,6 +204,7 @@ public class BulkImportController {
|
||||||
bulkImportService.bulkImportFullTextsFromDirectory(bulkImportReport, finalRelativeBulkImportDir, finalBulkImportDir, givenDir, provenance, bulkImportSource, shouldDeleteFilesOnFinish)
|
bulkImportService.bulkImportFullTextsFromDirectory(bulkImportReport, finalRelativeBulkImportDir, finalBulkImportDir, givenDir, provenance, bulkImportSource, shouldDeleteFilesOnFinish)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// This directory, will be removed from "bulkImportDirsUnderProcessing", when the background job finishes.
|
||||||
return ResponseEntity.ok().body(new BulkImportResponse(msg, bulkImportReportID)); // The response is automatically serialized to json and it's of type "application/json".
|
return ResponseEntity.ok().body(new BulkImportResponse(msg, bulkImportReportID)); // The response is automatically serialized to json and it's of type "application/json".
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -73,7 +73,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
logger.error(errorMsg);
|
logger.error(errorMsg);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -81,7 +81,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
if ( fileLocations == null ) {
|
if ( fileLocations == null ) {
|
||||||
bulkImportReport.addEvent("Could not retrieve the files for bulk-import!");
|
bulkImportReport.addEvent("Could not retrieve the files for bulk-import!");
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -91,7 +91,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
logger.warn(errorMsg);
|
logger.warn(errorMsg);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -106,7 +106,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
logger.error(errorMsg, e);
|
logger.error(errorMsg, e);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -117,17 +117,18 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
logger.error(errorMsg);
|
logger.error(errorMsg);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
long timeMillis = System.currentTimeMillis(); // Store it here, in order to have the same for all current records.
|
long timeMillis = System.currentTimeMillis(); // Store it here, in order to have the same for all current records.
|
||||||
|
|
||||||
List<Callable<Integer>> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
|
List<Callable<Integer>> callableTasksForFileSegments = new ArrayList<>(numOfFiles);
|
||||||
List<List<String>> subLists = Lists.partition(fileLocations, BulkImportController.numOfThreadsPerBulkImportProcedure); // Divide the initial list to "numOfThreadsPerBulkImportProcedure" subLists. The last one may have marginally fewer files.
|
int sizeOfEachSegment = (numOfFiles / BulkImportController.numOfThreadsForBulkImportProcedures);
|
||||||
|
List<List<String>> subLists = Lists.partition(fileLocations, sizeOfEachSegment); // Divide the initial list to "numOfThreadsPerBulkImportProcedure" subLists. The last one may have marginally fewer files.
|
||||||
int subListsSize = subLists.size();
|
int subListsSize = subLists.size();
|
||||||
|
|
||||||
bulkImportReport.addEvent("Going to import the files in " + subListsSize + " segments, in parallel.");
|
bulkImportReport.addEvent("Going to import the files in parallel, after dividing them in " + subListsSize + " segments.");
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
|
|
||||||
for ( int i = 0; i < subListsSize; ++i ) {
|
for ( int i = 0; i < subListsSize; ++i ) {
|
||||||
|
@ -138,16 +139,18 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
}
|
}
|
||||||
|
|
||||||
int numFailedSegments = 0;
|
int numFailedSegments = 0;
|
||||||
int numFailedFiles = 0;
|
int numFailedFilesForSegment = 0;
|
||||||
|
int numAllFailedFiles = 0;
|
||||||
try {
|
try {
|
||||||
List<Future<Integer>> futures = BulkImportController.bulkImportExecutor.invokeAll(callableTasksForFileSegments); // This waits for all tasks to finish.
|
List<Future<Integer>> futures = BulkImportController.bulkImportExecutor.invokeAll(callableTasksForFileSegments); // This waits for all tasks to finish.
|
||||||
int sizeOfFutures = futures.size();
|
int sizeOfFutures = futures.size(); // This is the same as the "subListsSize".
|
||||||
for ( int i = 0; i < sizeOfFutures; ++i ) {
|
for ( int i = 0; i < sizeOfFutures; ++i )
|
||||||
|
{ // For each segment..
|
||||||
try {
|
try {
|
||||||
numFailedFiles += futures.get(i).get();
|
numFailedFilesForSegment = futures.get(i).get();
|
||||||
if ( numFailedFiles == subLists.get(i).size() ) { // Get and see if it was successfully or not, or if an exception is thrown..
|
numAllFailedFiles += numFailedFilesForSegment;
|
||||||
|
if ( numFailedFilesForSegment == subLists.get(i).size() )
|
||||||
numFailedSegments++;
|
numFailedSegments++;
|
||||||
}
|
|
||||||
// In case all the files failed to be bulk-imported, then we will detect it in the "numSuccessfulSegments"-check later.
|
// In case all the files failed to be bulk-imported, then we will detect it in the "numSuccessfulSegments"-check later.
|
||||||
// The failed-to-be-imported files, will not be deleted, even if the user specifies that he wants to delete the directory.
|
// The failed-to-be-imported files, will not be deleted, even if the user specifies that he wants to delete the directory.
|
||||||
} catch (ExecutionException ee) {
|
} catch (ExecutionException ee) {
|
||||||
|
@ -164,7 +167,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
logger.error(errorMsg, e);
|
logger.error(errorMsg, e);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
} finally {
|
} finally {
|
||||||
logger.debug("Deleting local parquet directory: " + localParquetDir);
|
logger.debug("Deleting local parquet directory: " + localParquetDir);
|
||||||
|
@ -173,15 +176,15 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
|
|
||||||
// Check the results.
|
// Check the results.
|
||||||
String msg;
|
String msg;
|
||||||
if ( numFailedFiles == numOfFiles ) {
|
if ( numAllFailedFiles == numOfFiles ) {
|
||||||
String errorMsg = "None of the files inside the bulkImportDir: " + bulkImportDirName + " were imported!";
|
String errorMsg = "None of the files inside the bulkImportDir: " + bulkImportDirName + " were imported!";
|
||||||
logger.error(errorMsg);
|
logger.error(errorMsg);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
} else if ( numFailedFiles > 0 ) { // Some failed, but not all.
|
} else if ( numAllFailedFiles > 0 ) { // Some failed, but not all.
|
||||||
msg = numFailedFiles + " files" + (numFailedSegments > 0 ? (" and " + numFailedSegments + " whole segments") : "") + " failed to be bulk-imported, from the bulkImportDir: " + bulkImportDirName;
|
msg = numAllFailedFiles + " files" + (numFailedSegments > 0 ? (" and " + numFailedSegments + " whole segments") : "") + " failed to be bulk-imported, from the bulkImportDir: " + bulkImportDirName;
|
||||||
logger.warn(msg);
|
logger.warn(msg);
|
||||||
} else {
|
} else {
|
||||||
msg = "All " + numOfFiles + " files, from bulkImportDir: " + bulkImportDirName + " were bulkImported successfully.";
|
msg = "All " + numOfFiles + " files, from bulkImportDir: " + bulkImportDirName + " were bulkImported successfully.";
|
||||||
|
@ -197,7 +200,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
ImpalaConnector.databaseLock.unlock();
|
ImpalaConnector.databaseLock.unlock();
|
||||||
bulkImportReport.addEvent(mergeErrorMsg);
|
bulkImportReport.addEvent(mergeErrorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
ImpalaConnector.databaseLock.unlock();
|
ImpalaConnector.databaseLock.unlock();
|
||||||
|
@ -210,7 +213,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
// Also, we do not want to write the object in the end (in its final form), since we want the user to have the ability to request the report at any time,
|
// Also, we do not want to write the object in the end (in its final form), since we want the user to have the ability to request the report at any time,
|
||||||
// after submitting the bulk-import request, to see its progress (since the number of files may be very large and the processing may take many hours).
|
// after submitting the bulk-import request, to see its progress (since the number of files may be very large and the processing may take many hours).
|
||||||
|
|
||||||
BulkImportController.bulkImportDirs.remove(bulkImportDirName);
|
BulkImportController.bulkImportDirsUnderProcessing.remove(bulkImportDirName);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -222,7 +225,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
String bulkImportReportLocation = bulkImportReport.getReportLocation();
|
String bulkImportReportLocation = bulkImportReport.getReportLocation();
|
||||||
|
|
||||||
int numOfFilesInSegment = fileLocationsSegment.size();
|
int numOfFilesInSegment = fileLocationsSegment.size();
|
||||||
String msg = "Going to import " + numOfFilesInSegment + " files for segment-" + segmentCounter + " , of bulkImport procedure: " + provenance + " | dir: " + bulkImportDirName;
|
String msg = "Going to import " + numOfFilesInSegment + " files, for segment-" + segmentCounter + ", of bulkImport procedure: " + provenance + " | dir: " + bulkImportDirName;
|
||||||
logger.debug(msg);
|
logger.debug(msg);
|
||||||
bulkImportReport.addEvent(msg);
|
bulkImportReport.addEvent(msg);
|
||||||
|
|
||||||
|
@ -250,7 +253,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
|
|
||||||
int numOfPayloadRecords = payloadRecords.size();
|
int numOfPayloadRecords = payloadRecords.size();
|
||||||
if ( numOfPayloadRecords == 0 ) {
|
if ( numOfPayloadRecords == 0 ) {
|
||||||
String errorMsg = "No payload-records were generated for any of the files inside the bulkImportDir: " + bulkImportDirName;
|
String errorMsg = "No payload-records were generated for any of the files, of segment-" + segmentCounter + ", inside the bulkImportDir: " + bulkImportDirName;
|
||||||
logger.warn(errorMsg);
|
logger.warn(errorMsg);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
|
@ -258,13 +261,13 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
return numOfFilesInSegment;
|
return numOfFilesInSegment;
|
||||||
} else if ( numOfPayloadRecords != numOfFilesInSegment ) {
|
} else if ( numOfPayloadRecords != numOfFilesInSegment ) {
|
||||||
// Write this important note here, in order to certainly be in the report, even if a parquet-file failure happens and the method exits early.
|
// Write this important note here, in order to certainly be in the report, even if a parquet-file failure happens and the method exits early.
|
||||||
String errorMsg = failedFiles.size() + " out of " + numOfFilesInSegment + " files failed to be imported, for segment-" + segmentCounter + " !";
|
String errorMsg = failedFiles.size() + " out of " + numOfFilesInSegment + " files failed to be bulk-imported, for segment-" + segmentCounter + " !";
|
||||||
logger.warn(errorMsg);
|
logger.warn(errorMsg);
|
||||||
bulkImportReport.addEvent(errorMsg);
|
bulkImportReport.addEvent(errorMsg);
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Construct the parquet file, upload it to HDFS and load them it in the "payload_bulk_import" table.
|
// Construct the parquet file, upload it to HDFS and load it in the "payload_bulk_import" table.
|
||||||
String parquetFileName = "payloads_" + segmentCounter + ".parquet";
|
String parquetFileName = "payloads_" + segmentCounter + ".parquet";
|
||||||
String fullLocalParquetFilePath = localParquetDir + parquetFileName;
|
String fullLocalParquetFilePath = localParquetDir + parquetFileName;
|
||||||
|
|
||||||
|
@ -272,10 +275,10 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
logger.trace("Going to write " + numOfPayloadRecords + " payload-records to the parquet file: " + fullLocalParquetFilePath); // DEBUG!
|
logger.trace("Going to write " + numOfPayloadRecords + " payload-records to the parquet file: " + fullLocalParquetFilePath); // DEBUG!
|
||||||
|
|
||||||
if ( ! parquetFileUtils.writeToParquet(payloadRecords, ParquetFileUtils.payloadsSchema, fullLocalParquetFilePath) ) {
|
if ( ! parquetFileUtils.writeToParquet(payloadRecords, ParquetFileUtils.payloadsSchema, fullLocalParquetFilePath) ) {
|
||||||
bulkImportReport.addEvent("Could not write the payload-records to the parquet-file: " + parquetFileName + " !");
|
bulkImportReport.addEvent("Could not write the payload-records for segment-" + segmentCounter + " to the parquet-file: " + parquetFileName + " !");
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
// None of the files of this segment will be deleted, in any case.
|
// None of the files of this segment will be deleted, in any case.
|
||||||
return numOfFilesInSegment;
|
return numOfFilesInSegment; // All files of this segment have failed.
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( logger.isTraceEnabled() )
|
if ( logger.isTraceEnabled() )
|
||||||
|
@ -287,7 +290,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
bulkImportReport.addEvent("Could not upload the parquet-file " + parquetFileName + " to HDFS!");
|
bulkImportReport.addEvent("Could not upload the parquet-file " + parquetFileName + " to HDFS!");
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
// None of the files of this segment will be deleted, in any case.
|
// None of the files of this segment will be deleted, in any case.
|
||||||
return numOfFilesInSegment;
|
return numOfFilesInSegment; // All files of this segment have failed.
|
||||||
}
|
}
|
||||||
|
|
||||||
if ( logger.isTraceEnabled() )
|
if ( logger.isTraceEnabled() )
|
||||||
|
@ -296,14 +299,14 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
ImpalaConnector.databaseLock.lock();
|
ImpalaConnector.databaseLock.lock();
|
||||||
if ( !parquetFileUtils.loadParquetDataIntoTable((currentBulkImportHdfsDir + parquetFileName), "payload_bulk_import") ) {
|
if ( !parquetFileUtils.loadParquetDataIntoTable((currentBulkImportHdfsDir + parquetFileName), "payload_bulk_import") ) {
|
||||||
ImpalaConnector.databaseLock.unlock();
|
ImpalaConnector.databaseLock.unlock();
|
||||||
bulkImportReport.addEvent("Could not load the payload-records to the database!");
|
bulkImportReport.addEvent("Could not load the payload-records to the database, for segment-" + segmentCounter + "!");
|
||||||
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
fileUtils.writeToFile(bulkImportReportLocation, bulkImportReport.getJsonReport(), true);
|
||||||
// None of the files of this segment will be deleted, in any case.
|
// None of the files of this segment will be deleted, in any case.
|
||||||
return numOfFilesInSegment;
|
return numOfFilesInSegment; // All files of this segment have failed.
|
||||||
}
|
}
|
||||||
ImpalaConnector.databaseLock.unlock();
|
ImpalaConnector.databaseLock.unlock();
|
||||||
|
|
||||||
String segmentSuccessMsg = "Finished importing " + numOfPayloadRecords + " files, out of " + numOfFilesInSegment + " , for segment-" + segmentCounter + ".";
|
String segmentSuccessMsg = "Finished importing " + numOfPayloadRecords + " files, out of " + numOfFilesInSegment + ", for segment-" + segmentCounter + ".";
|
||||||
logger.info(segmentSuccessMsg);
|
logger.info(segmentSuccessMsg);
|
||||||
bulkImportReport.addEvent(segmentSuccessMsg);
|
bulkImportReport.addEvent(segmentSuccessMsg);
|
||||||
|
|
||||||
|
@ -407,7 +410,7 @@ public class BulkImportServiceImpl implements BulkImportService {
|
||||||
record.put("size", ((size != null) ? String.valueOf(size) : null));
|
record.put("size", ((size != null) ? String.valueOf(size) : null));
|
||||||
record.put("hash", fileHash); // This is already checked and will not be null here.
|
record.put("hash", fileHash); // This is already checked and will not be null here.
|
||||||
record.put("location", s3Url);
|
record.put("location", s3Url);
|
||||||
record.put("provenance", provenance);
|
record.put("provenance", ("bulk:" + provenance)); // Add a prefix in order to be more clear that this record comes from bulkImport, when looking all records in the "payload" VIEW.
|
||||||
|
|
||||||
return record;
|
return record;
|
||||||
}
|
}
|
||||||
|
|
|
@ -39,7 +39,7 @@ services:
|
||||||
bulk-import:
|
bulk-import:
|
||||||
baseBulkImportLocation: /mnt/bulk_import/
|
baseBulkImportLocation: /mnt/bulk_import/
|
||||||
bulkImportReportLocation: /reports/bulkImportReports/
|
bulkImportReportLocation: /reports/bulkImportReports/
|
||||||
numOfThreadsPerBulkImportProcedure: 6
|
numOfThreadsForBulkImportProcedures: 6
|
||||||
bulkImportSources: # These sources are accepted for bulk-import requests and are excluded from crawling.
|
bulkImportSources: # These sources are accepted for bulk-import requests and are excluded from crawling.
|
||||||
arxivImport:
|
arxivImport:
|
||||||
datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
|
datasourceID: opendoar____::6f4922f45568161a8cdf4ad2299f6d23
|
||||||
|
|
Loading…
Reference in New Issue