- Improve performance of uploading parquet-files to HDFS.

- Add some logs.
- Code polishing.
Lampros Smyrnaios 2023-05-11 19:40:48 +03:00
parent 992d4ffd5e
commit 8381df70c6
2 changed files with 89 additions and 80 deletions
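
The performance gain comes from replacing unbuffered, byte-at-a-time stream I/O in the HDFS upload with buffered streams (see the ParquetFileUtils hunk below). A minimal sketch of the technique, assuming FileUtils.tenMb is a 10 MB buffer size as the diff suggests:

import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;

public class BufferedCopySketch {
    private static final int TEN_MB = 10 * 1024 * 1024; // Assumed value of FileUtils.tenMb.

    // The loop still copies one byte at a time, but each read/write now hits
    // an in-memory buffer; the underlying file and socket are only touched in
    // 10 MB chunks, instead of once per byte as in the replaced code.
    static void copy(Path source, OutputStream sink) throws IOException {
        try ( InputStream inputS = new BufferedInputStream(Files.newInputStream(source), TEN_MB);
              OutputStream outS = new BufferedOutputStream(sink, TEN_MB) )
        {
            int readByte;
            while ( (readByte = inputS.read()) != -1 )
                outS.write(readByte);
        } // try-with-resources closes both streams, flushing the output buffer.
    }
}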


@@ -278,6 +278,8 @@ public class FullTextsServiceImpl implements FullTextsService {
}
//logger.trace("Parquet file '" + parquetFileName + "' was created and filled."); // DEBUG!
logger.trace("Going to upload the parquet file: " + fullLocalParquetFilePath + " to HDFS."); // DEBUG!
// Upload the data to HDFS and insert it into the "payload" Impala table. (no database-locking is required)
String errorMsg = parquetFileUtils.uploadParquetFileToHDFS(fullLocalParquetFilePath, parquetFileName, currentBulkImportHdfsDir);
if ( errorMsg != null ) { // The possible error-message returned is already logged by the Controller.
@@ -287,6 +289,8 @@ public class FullTextsServiceImpl implements FullTextsService {
return numOfFilesInSegment;
}
logger.trace("Going to load the data of parquet-file: \"" + parquetFileName + "\" to the database-table: \"payload_bulk_import\"."); // DEBUG!
ImpalaConnector.databaseLock.lock();
if ( !parquetFileUtils.loadParquetDataIntoTable((currentBulkImportHdfsDir + parquetFileName), "payload_bulk_import") ) {
ImpalaConnector.databaseLock.unlock();
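
For context, loadParquetDataIntoTable() itself is outside this diff. As a hedged sketch only, such a helper could issue Impala's LOAD DATA statement through a Spring JdbcTemplate; the naming and error handling here are assumptions:

import org.springframework.jdbc.core.JdbcTemplate;

public class LoadParquetSketch {
    private final JdbcTemplate jdbcTemplate;

    public LoadParquetSketch(JdbcTemplate jdbcTemplate) {
        this.jdbcTemplate = jdbcTemplate;
    }

    // Moves the already-uploaded HDFS parquet-file into the table's data
    // directory; Impala then serves the new rows without further copying.
    public boolean loadParquetDataIntoTable(String hdfsParquetPath, String tableName) {
        try {
            // Table names and paths cannot be bound as '?'-parameters, so they are concatenated.
            jdbcTemplate.execute("LOAD DATA INPATH '" + hdfsParquetPath + "' INTO TABLE " + tableName);
            return true;
        } catch (Exception e) {
            return false; // The caller unlocks the database-lock and logs the failure.
        }
    }
}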
@@ -327,10 +331,6 @@ public class FullTextsServiceImpl implements FullTextsService {
// Check if this file is already found by crawling. Even though we started excluding this datasource from crawling, many full-texts have already been downloaded.
// Also, it may be the case that this file was downloaded by another datasource.
String fileHash = docFileData.getHash();
if ( fileHash == null )
return null; // No check of past found full-text can be made nor the S3-fileName can be created.
FileLocationData fileLocationData;
try {
fileLocationData = new FileLocationData(fileLocation);
@@ -339,11 +339,15 @@ public class FullTextsServiceImpl implements FullTextsService {
return null;
}
String fileHash = docFileData.getHash();
if ( fileHash == null )
return null; // No check for a previously-found full-text can be made, nor can the S3-fileName be created.
String datasourceId = bulkImportSource.getDatasourceID();
String datasourcePrefix = bulkImportSource.getDatasourcePrefix();
String fileNameID = fileLocationData.getFileNameID();
String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID); // This is the urls with the ArvixId.
String actualUrl = (bulkImportSource.getPdfUrlPrefix() + fileNameID); // This string-concatenation works with the URLs of arXiv; a different construction may be needed for other datasources.
String originalUrl = actualUrl; // We have the full-text files from bulk-import, so let's assume the original-url is also the full-text-link.
final String getFileLocationForHashQuery = "select `location` from " + ImpalaConnector.databaseName + ".payload where `hash` = ? limit 1";
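
The code executing this prepared query is not part of the hunk. A plausible usage sketch with Spring's JdbcTemplate, where a missing row means the full-text was never seen before (method and variable names are illustrative):

import org.springframework.dao.EmptyResultDataAccessException;
import org.springframework.jdbc.core.JdbcTemplate;

public class HashLookupSketch {
    // Returns the stored file-location for this hash, or null when no payload row matches.
    static String getLocationForHash(JdbcTemplate jdbcTemplate, String getFileLocationForHashQuery, String fileHash) {
        try {
            return jdbcTemplate.queryForObject(getFileLocationForHashQuery, String.class, fileHash);
        } catch (EmptyResultDataAccessException erdae) {
            return null; // The hash was not found; this file was not downloaded before.
        }
    }
}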


@@ -29,10 +29,10 @@ import org.springframework.http.ResponseEntity;
import org.springframework.jdbc.core.JdbcTemplate;
import org.springframework.stereotype.Component;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
@@ -372,7 +372,7 @@ public class ParquetFileUtils {
// Check if the parquet file exists locally.
File parquetFile = new File(parquetFileFullLocalPath);
if ( ! parquetFile.isFile() ) {
String errorMsg = "The parquet file \"" + parquetFileFullLocalPath + "\" does not exist!";
String errorMsg = "The local parquet file \"" + parquetFileFullLocalPath + "\" does not exist!";
logger.error(errorMsg);
return errorMsg;
}
@@ -411,12 +411,16 @@ public class ParquetFileUtils {
conn.setRequestProperty("Authorization", hdfsHttpAuthString);
conn.setRequestProperty("content-type", "application/octet-stream");
conn.setDoOutput(true);
conn.setInstanceFollowRedirects(true); // It is possible that the "location was"
conn.setInstanceFollowRedirects(true); // It is possible that the "location" was an intermediate one.
conn.connect();
// Write the parquet file.
try ( InputStream inputS = Files.newInputStream(parquetFile.toPath()); OutputStream outS = conn.getOutputStream()) {
int readByte = -1; while ( (readByte = inputS.read()) != -1 ) outS.write(readByte);
try ( BufferedInputStream inputS = new BufferedInputStream(Files.newInputStream(parquetFile.toPath()), FileUtils.tenMb);
BufferedOutputStream outS = new BufferedOutputStream(conn.getOutputStream(), FileUtils.tenMb) )
{
int readByte = -1;
while ( (readByte = inputS.read()) != -1 )
outS.write(readByte);
} // Any exception will be caught at the end of this method.
statusCode = conn.getResponseCode();
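
The "location" comment above refers to WebHDFS's documented two-step file creation: a first, bodiless PUT to the NameNode is answered with a 307 redirect whose "Location" header points at a DataNode, and only the second PUT (the one shown in this hunk) carries the file bytes. A sketch of that first step, with the parameter names assumed from the diff:

import java.net.HttpURLConnection;
import java.net.URL;

public class WebHdfsCreateSketch {
    // Step 1 of the protocol: ask the NameNode where to write. The returned
    // "Location" is the DataNode URL that the data-carrying PUT writes to.
    static String getDataNodeLocation(String webHDFSBaseUrl, String filePath, String hdfsHttpAuthString) throws Exception {
        URL url = new URL(webHDFSBaseUrl + filePath + "?op=CREATE&overwrite=true");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("PUT");
        conn.setRequestProperty("Authorization", hdfsHttpAuthString);
        conn.setInstanceFollowRedirects(false); // Read the redirect ourselves instead of following it.
        conn.connect();
        String location = conn.getHeaderField("Location");
        conn.disconnect();
        return location;
    }
}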
@@ -500,8 +504,10 @@ public class ParquetFileUtils {
attemptCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathAttempts + mkDirsAndParams);
payloadAggregatedCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsAggregated + mkDirsAndParams);
payloadBulkImportCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsBulkImport + mkDirsAndParams);
return (attemptCreationSuccessful && payloadAggregatedCreationSuccessful && payloadBulkImportCreationSuccessful);
// We need all directories to be created in order for the app to function properly!
}
else {
// Check the json-response to see if all the subdirectories exist.
// Take the json-response.
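
applyHDFOperation() is not shown in this commit. For MKDIRS specifically, WebHDFS expects a bodiless PUT and replies 200 with a {"boolean": true} json-body, so a hedged sketch of such an operation could look like this (names are assumptions taken from the diff):

import java.net.HttpURLConnection;
import java.net.URL;

public class MkDirsSketch {
    // WebHDFS MKDIRS is a bodiless PUT, e.g.:
    //   <webHDFSBaseUrl><path>?op=MKDIRS&permission=755
    static boolean applyMkDirsOperation(String dirUrlWithMkDirsParams, String hdfsHttpAuthString) {
        try {
            HttpURLConnection conn = (HttpURLConnection) new URL(dirUrlWithMkDirsParams).openConnection();
            conn.setRequestMethod("PUT");
            conn.setRequestProperty("Authorization", hdfsHttpAuthString);
            conn.connect();
            return (conn.getResponseCode() == 200); // 200 means the directory exists or was created.
        } catch (Exception e) {
            return false; // Matches the diff's behavior of returning false on any exception.
        }
    }
}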
@@ -569,14 +575,13 @@ public class ParquetFileUtils {
payloadBulkImportCreationSuccessful = applyHDFOperation(webHDFSBaseUrl + parquetHDFSDirectoryPathPayloadsBulkImport + mkDirsAndParams);
} else
logger.info("The remote parquet directory \"" + parquetHDFSDirectoryPathPayloadsBulkImport + "\" exists.");
}
return (attemptCreationSuccessful && payloadAggregatedCreationSuccessful && payloadBulkImportCreationSuccessful);
// We need all directories to be created in order for the app to function properly!
} catch (Exception e) {
logger.error("", e);
return false;
}
return (attemptCreationSuccessful && payloadAggregatedCreationSuccessful && payloadBulkImportCreationSuccessful);
// We need all directories to be created in order for the app to function properly!
}