forked from lsmyrnaios/UrlsController
- Increase security by sanitizing the value of the "workerId" before using it in SQL statements, since Impala has bugs with some types of PreparedStatements.
- Improve reliability by dropping the "current_assignment" table in case of an error, so that the next "getUrls"-request will not fail. - Fix the "databaseLock" not being unlocked when the "addWorkerReport()" method returned early in some error-cases. - Delete the "assignment"-data after inserting the related payloads and attempts into the database.
This commit is contained in:
parent
a46ab84f10
commit
0178e44574
|
@ -18,6 +18,7 @@ import java.sql.*;
|
|||
|
||||
import java.util.*;
|
||||
import java.util.concurrent.atomic.AtomicLong;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
@RestController
|
||||
@RequestMapping("/urls")
|
||||
|
@ -27,10 +28,19 @@ public class UrlController {
|
|||
|
||||
private static final AtomicLong assignmentsBatchCounter = new AtomicLong(0); // Just for the "getTestUrls"-endpoint.
|
||||
|
||||
// Matches any input containing a single-quote, semicolon, backtick or double-quote, i.e. characters that could break out of a string-literal in the SQL statements built by concatenation.
// The DOTALL flag is required because this pattern is applied with "matcher(..).matches()" (a whole-input match): without it, "." does not match line terminators,
// so an input containing a newline together with one of these characters would NOT match and would slip through the check.
private static final Pattern MALICIOUS_INPUT_STRING = Pattern.compile(".*[';`\"]+.*", Pattern.DOTALL);
|
||||
|
||||
|
||||
@GetMapping("")
|
||||
public ResponseEntity<?> getUrls(@RequestParam String workerId, @RequestParam int workerAssignmentsLimit) {
|
||||
|
||||
// As the Impala-driver is buggy and struggles to support parameterized queries in some types of prepared-statements, we have to sanitize the "workerId" ourselves.
|
||||
if ( MALICIOUS_INPUT_STRING.matcher(workerId).matches() ) {
|
||||
String errorMsg = "Possibly malicious \"workerId\" received: " + workerId;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.status(HttpStatus.FORBIDDEN).body(errorMsg);
|
||||
}
|
||||
|
||||
logger.info("Worker with id: \"" + workerId + "\", requested " + workerAssignmentsLimit + " assignments. The assignments-limit of the controller is: " + ControllerConstants.ASSIGNMENTS_LIMIT);
|
||||
|
||||
// Create the Assignments from the id-urls stored in the database up to the < assignmentsLimit >.
|
||||
|
@ -78,9 +88,14 @@ public class UrlController {
|
|||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body("Problem when connecting with the Impala-database!");
|
||||
}
|
||||
|
||||
// All transactions in Impala automatically commit at the end of the statement. Currently, Impala does not support multi-statement transactions.
|
||||
// https://impala.apache.org/docs/build/html/topics/impala_transactions.html
|
||||
// We cannot use "savePoints" along with "autoCommit = false" to roll back to a previous state among multiple statements.
|
||||
|
||||
PreparedStatement createCurrentAssignmentsPreparedStatement = null;
|
||||
try {
|
||||
createCurrentAssignmentsPreparedStatement = con.prepareStatement(createAssignmentsQuery);
|
||||
// We cannot set the "limits" and the MAX_ATTEMPTS_PER_RECORD as preparedStatements parameters, as we get a "java.sql.SQLException: [Simba][JDBC](11420) Error, parameter metadata not populated."
|
||||
createCurrentAssignmentsPreparedStatement.execute();
|
||||
} catch (SQLException sqle) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
|
@ -100,8 +115,11 @@ public class UrlController {
|
|||
computeCurrentAssignmentsStatsPreparedStatement = con.prepareStatement(computeCurrentAssignmentsStatsQuery);
|
||||
computeCurrentAssignmentsStatsPreparedStatement.execute();
|
||||
} catch (SQLException sqle) {
|
||||
String errorMsg = dropCurrentAssignmentTable(con);
|
||||
if ( errorMsg != null )
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = ImpalaConnector.handlePreparedStatementException("computeCurrentAssignmentsStatsQuery", computeCurrentAssignmentsStatsQuery, "computeCurrentAssignmentsStatsPreparedStatement", computeCurrentAssignmentsStatsPreparedStatement, con, sqle);
|
||||
errorMsg = ImpalaConnector.handlePreparedStatementException("computeCurrentAssignmentsStatsQuery", computeCurrentAssignmentsStatsQuery, "computeCurrentAssignmentsStatsPreparedStatement", computeCurrentAssignmentsStatsPreparedStatement, con, sqle);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
} finally {
|
||||
try {
|
||||
|
@ -116,8 +134,11 @@ public class UrlController {
|
|||
try {
|
||||
getAssignmentsPreparedStatement = con.prepareStatement(getAssignmentsQuery);
|
||||
} catch (SQLException sqle) {
|
||||
String errorMsg = dropCurrentAssignmentTable(con);
|
||||
if ( errorMsg != null )
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = ImpalaConnector.handlePreparedStatementException("getAssignmentsQuery", getAssignmentsQuery, "getAssignmentsPreparedStatement", getAssignmentsPreparedStatement, con, sqle);
|
||||
errorMsg = ImpalaConnector.handlePreparedStatementException("getAssignmentsQuery", getAssignmentsQuery, "getAssignmentsPreparedStatement", getAssignmentsPreparedStatement, con, sqle);
|
||||
// The "getAssignmentsPreparedStatement" will always be null here, so we do not close it.
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
}
|
||||
|
@ -135,7 +156,7 @@ public class UrlController {
|
|||
|
||||
// The cursor is automatically before the first element in this configuration.
|
||||
while ( resultSet.next() ) {
|
||||
// The following few lines, cannot be outside the "while" loop, since the same record is returned, despite that we update the inner-values.
|
||||
// The following few lines, cannot be outside the "while" loop, since the same object is added, despite that we update the inner-values.
|
||||
Assignment assignment = new Assignment();
|
||||
assignment.setWorkerId(workerId);
|
||||
assignment.setTimestamp(timestamp);
|
||||
|
@ -153,8 +174,11 @@ public class UrlController {
|
|||
assignments.add(assignment);
|
||||
}
|
||||
} catch (Exception e) {
|
||||
String errorMsg = dropCurrentAssignmentTable(con);
|
||||
if ( errorMsg != null )
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = "Problem when executing the \"getAssignmentsQuery\"!\n";
|
||||
errorMsg = "Problem when executing the \"getAssignmentsQuery\"!\n";
|
||||
logger.error(errorMsg, e);
|
||||
ImpalaConnector.closeConnection(con);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
|
@ -168,8 +192,11 @@ public class UrlController {
|
|||
|
||||
int assignmentsSize = assignments.size();
|
||||
if ( assignmentsSize == 0 ) {
|
||||
String errorMsg = dropCurrentAssignmentTable(con);
|
||||
if ( errorMsg != null )
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = "No results retrieved from the \"findAssignmentsQuery\" for worker with id: " + workerId;
|
||||
errorMsg = "No results retrieved from the \"findAssignmentsQuery\" for worker with id: " + workerId;
|
||||
logger.error(errorMsg);
|
||||
ImpalaConnector.closeConnection(con);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
|
@ -180,6 +207,7 @@ public class UrlController {
|
|||
|
||||
// Write the Assignment details to the assignment-table.
|
||||
|
||||
// The "timestamp" is generated from the Java-code, so it's in no way provided by a 3rd party.
|
||||
String insertAssignmentsQuery = "insert into " + ImpalaConnector.databaseName + ".assignment \n select pub_data.pubid, pub_data.url, '" + workerId + "', cast('" + timestamp + "' as timestamp)\n"
|
||||
+ "from (\n select pubid, url from " + ImpalaConnector.databaseName + ".current_assignment) as pub_data";
|
||||
|
||||
|
@ -188,8 +216,11 @@ public class UrlController {
|
|||
insertAssignmentsPreparedStatement = con.prepareStatement(insertAssignmentsQuery);
|
||||
insertAssignmentsPreparedStatement.execute();
|
||||
} catch (SQLException sqle) {
|
||||
String errorMsg = dropCurrentAssignmentTable(con);
|
||||
if ( errorMsg != null )
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = ImpalaConnector.handlePreparedStatementException("insertAssignmentsQuery", insertAssignmentsQuery, "insertAssignmentsPreparedStatement", insertAssignmentsPreparedStatement, con, sqle);
|
||||
errorMsg = ImpalaConnector.handlePreparedStatementException("insertAssignmentsQuery", insertAssignmentsQuery, "insertAssignmentsPreparedStatement", insertAssignmentsPreparedStatement, con, sqle);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
} finally {
|
||||
try {
|
||||
|
@ -200,35 +231,21 @@ public class UrlController {
|
|||
}
|
||||
}
|
||||
|
||||
String dropCurrentAssignmentsQuery = "DROP TABLE " + ImpalaConnector.databaseName + ".current_assignment PURGE";
|
||||
PreparedStatement dropCurrentAssignmentsPreparedStatement = null;
|
||||
try {
|
||||
dropCurrentAssignmentsPreparedStatement = con.prepareStatement(dropCurrentAssignmentsQuery);
|
||||
dropCurrentAssignmentsPreparedStatement.execute();
|
||||
} catch (SQLException sqle) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = ImpalaConnector.handlePreparedStatementException("dropCurrentAssignmentsQuery", dropCurrentAssignmentsQuery, "dropCurrentAssignmentsPreparedStatement", dropCurrentAssignmentsPreparedStatement, con, sqle);
|
||||
String errorMsg = dropCurrentAssignmentTable(con);
|
||||
if ( errorMsg != null )
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||
} finally {
|
||||
try {
|
||||
if ( dropCurrentAssignmentsPreparedStatement != null )
|
||||
dropCurrentAssignmentsPreparedStatement.close();
|
||||
} catch (SQLException sqle2) {
|
||||
logger.error("Failed to close the \"dropCurrentAssignmentsPreparedStatement\"!\n" + sqle2.getMessage());
|
||||
}
|
||||
}
|
||||
|
||||
logger.debug("Finished inserting " + assignmentsSize + " assignments into the \"assignment\"-table. Going to merge the parquet files for this table.");
|
||||
|
||||
String mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con);
|
||||
String mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con, "", null);
|
||||
if ( mergeErrorMsg != null ) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
ImpalaConnector.closeConnection(con);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg);
|
||||
}
|
||||
|
||||
ImpalaConnector.closeConnection(con);
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
ImpalaConnector.closeConnection(con);
|
||||
|
||||
long curAssignmentsBatchCounter = assignmentsBatchCounter.incrementAndGet();
|
||||
logger.info("Sending batch-assignments_" + curAssignmentsBatchCounter + " with " + assignmentsSize + " assignments to worker with ID: " + workerId + ".");
|
||||
|
@ -245,18 +262,32 @@ public class UrlController {
|
|||
return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg);
|
||||
}
|
||||
|
||||
String curWorkerId = workerReport.getWorkerId();
|
||||
if ( curWorkerId == null ) {
|
||||
String errorMsg = "No \"workerId\" was included inside the \"WorkerReport\"!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg);
|
||||
}
|
||||
|
||||
// As the Impala-driver is buggy and struggles to support parameterized queries in some types of prepared-statements, we have to sanitize the "workerId" ourselves.
|
||||
if ( MALICIOUS_INPUT_STRING.matcher(curWorkerId).matches() ) {
|
||||
String errorMsg = "Possibly malicious \"workerId\" received: " + curWorkerId;
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.status(HttpStatus.FORBIDDEN).body(errorMsg);
|
||||
}
|
||||
|
||||
List<UrlReport> urlReports = workerReport.getUrlReports();
|
||||
if ( (urlReports == null) || urlReports.isEmpty() ) {
|
||||
String errorMsg = "The given \"WorkerReport\" from worker with ID \"" + workerReport.getWorkerId() + "\" was empty!";
|
||||
String errorMsg = "The given \"WorkerReport\" from worker with ID \"" + curWorkerId + "\" was empty (without any UrlReports)!";
|
||||
logger.error(errorMsg);
|
||||
return ResponseEntity.status(HttpStatus.BAD_REQUEST).body(errorMsg);
|
||||
}
|
||||
|
||||
long curReportAssignments = workerReport.getAssignmentRequestCounter();
|
||||
logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + workerReport.getWorkerId() + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
|
||||
logger.info("Received the WorkerReport for batch-assignments_" + curReportAssignments + ", from the worker with id: " + curWorkerId + ". It contains " + urlReports.size() + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database.");
|
||||
|
||||
// Before continuing with inserts, take and upload the fullTexts from the Worker. Also, update the file-"location".
|
||||
if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, workerReport.getWorkerId()) ) {
|
||||
if ( ! FileUtils.getAndUploadFullTexts(urlReports, request, curReportAssignments, curWorkerId) ) {
|
||||
logger.error("Failed to get and/or upload the fullTexts for assignments_" + curReportAssignments);
|
||||
// The docUrls were still found! Just update ALL the fileLocations, sizes and hashes, to show that the files are not available and continue with writing the attempts and the Payloads.
|
||||
FileUtils.updateUrlReportsToHaveNoFullTextFiles(urlReports);
|
||||
|
@ -280,6 +311,7 @@ public class UrlController {
|
|||
tempInsertQueryName = "insertIntoAttemptBaseQuery";
|
||||
preparedInsertAttemptStatement = con.prepareStatement(insertIntoAttemptBaseQuery);
|
||||
} catch (SQLException sqle) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = "Problem when creating the prepared statement for \"" + tempInsertQueryName + "\"!\n";
|
||||
logger.error(errorMsg + sqle.getMessage());
|
||||
closePreparedStatements(preparedInsertPayloadStatement, preparedInsertAttemptStatement, con);
|
||||
|
@ -289,6 +321,7 @@ public class UrlController {
|
|||
try {
|
||||
con.setAutoCommit(false); // Avoid writing to disk for each insert. Write them all in the end.
|
||||
} catch (SQLException sqle) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
String errorMsg = "Problem when setting Connection.AutoCommit to \"false\"!";
|
||||
logger.error(errorMsg + "\n" + sqle.getMessage());
|
||||
closePreparedStatements(preparedInsertPayloadStatement, preparedInsertAttemptStatement, con);
|
||||
|
@ -360,14 +393,23 @@ public class UrlController {
|
|||
|
||||
logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables. Going to merge the parquet files for those tables.");
|
||||
|
||||
String mergeErrorMsg = FileUtils.mergeParquetFiles("payload", con);
|
||||
String mergeErrorMsg = FileUtils.mergeParquetFiles("payload", con, "", null);
|
||||
if ( mergeErrorMsg != null ) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
ImpalaConnector.closeConnection(con);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg);
|
||||
}
|
||||
|
||||
mergeErrorMsg = FileUtils.mergeParquetFiles("attempt", con);
|
||||
mergeErrorMsg = FileUtils.mergeParquetFiles("attempt", con, "", null);
|
||||
if ( mergeErrorMsg != null ) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
ImpalaConnector.closeConnection(con);
|
||||
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(mergeErrorMsg);
|
||||
}
|
||||
|
||||
// This will delete the rows of the "assignment" table which refer to the curWorkerId. As we have non-kudu Impala tables, the Delete operation can only succeed through a "merge" operation of the rest of the data.
|
||||
// Only the rows referring to OTHER workerIDs get stored in a temp-table, while the "assignment" table gets deleted. Then, the temp_table becomes the "assignment" table.
|
||||
mergeErrorMsg = FileUtils.mergeParquetFiles("assignment", con, " WHERE workerid != ", curWorkerId);
|
||||
if ( mergeErrorMsg != null ) {
|
||||
ImpalaConnector.databaseLock.unlock();
|
||||
ImpalaConnector.closeConnection(con);
|
||||
|
@ -492,4 +534,26 @@ public class UrlController {
|
|||
return ResponseEntity.status(HttpStatus.OK).header("Content-Type", "application/json").body(new AssignmentsResponse(curAssignmentsBatchCounter, assignments));
|
||||
}
|
||||
|
||||
|
||||
/**
 * Drops the "current_assignment" table of the database named by "ImpalaConnector.databaseName", using a "DROP TABLE ... PURGE" statement.
 * Returns "null" when the statement was prepared and executed without an SQLException.
 * On an SQLException, it unlocks the "ImpalaConnector.databaseLock" and returns the value given by "ImpalaConnector.handlePreparedStatementException()".
 * The callers visible in this file return immediately on a non-null result and unlock the "databaseLock" themselves only after a "null" result.
 * In every case, the prepared statement is closed (if it was created) before this method returns.
 *
 * @param con the open database connection on which the statement is prepared.
 * @return "null" on success, otherwise the error-message to be sent back to the client.
 * */
private String dropCurrentAssignmentTable(Connection con)
|
||||
{
|
||||
String dropCurrentAssignmentsQuery = "DROP TABLE " + ImpalaConnector.databaseName + ".current_assignment PURGE";
|
||||
PreparedStatement dropCurrentAssignmentsPreparedStatement = null;
|
||||
try {
|
||||
dropCurrentAssignmentsPreparedStatement = con.prepareStatement(dropCurrentAssignmentsQuery);
|
||||
dropCurrentAssignmentsPreparedStatement.execute();
|
||||
return null;
|
||||
} catch (SQLException sqle) {
|
||||
// The lock is released here, on the failure-path, so the caller only has to return the error-message.
ImpalaConnector.databaseLock.unlock();
|
||||
return ImpalaConnector.handlePreparedStatementException("dropCurrentAssignmentsQuery", dropCurrentAssignmentsQuery, "dropCurrentAssignmentsPreparedStatement", dropCurrentAssignmentsPreparedStatement, con, sqle);
|
||||
} finally {
|
||||
// Runs on both the success-path and the failure-path. A failure to close the statement is only logged, it does not change the returned value.
try {
|
||||
if ( dropCurrentAssignmentsPreparedStatement != null )
|
||||
dropCurrentAssignmentsPreparedStatement.close();
|
||||
} catch (SQLException sqle2) {
|
||||
logger.error("Failed to close the \"dropCurrentAssignmentsPreparedStatement\"!\n" + sqle2.getMessage());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -65,7 +65,7 @@ public class FileUtils {
|
|||
* Renames the clone to the original's name.
|
||||
* Returns the errorMsg if an error occurs; otherwise it returns "null".
|
||||
* */
|
||||
public static String mergeParquetFiles(String tableName, Connection con)
|
||||
public static String mergeParquetFiles(String tableName, Connection con, String whereClause, String parameter)
|
||||
{
|
||||
String errorMsg;
|
||||
if ( tableName == null ) {
|
||||
|
@ -74,6 +74,15 @@ public class FileUtils {
|
|||
return errorMsg;
|
||||
}
|
||||
|
||||
// Make sure the following default to empty strings (in case another method calls this one in the future with a null-value).
|
||||
if ( whereClause == null )
|
||||
whereClause = "";
|
||||
|
||||
if ( parameter == null )
|
||||
parameter = "";
|
||||
else
|
||||
parameter = " '" + parameter + "'"; // This will be a "string-check".
|
||||
|
||||
Statement statement;
|
||||
try {
|
||||
statement = con.createStatement();
|
||||
|
@ -84,7 +93,7 @@ public class FileUtils {
|
|||
}
|
||||
|
||||
try {
|
||||
statement.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName);
|
||||
statement.execute("CREATE TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp stored as parquet AS SELECT * FROM " + ImpalaConnector.databaseName + "." + tableName + " " + whereClause + parameter);
|
||||
statement.execute("DROP TABLE " + ImpalaConnector.databaseName + "." + tableName + " PURGE");
|
||||
statement.execute("ALTER TABLE " + ImpalaConnector.databaseName + "." + tableName + "_tmp RENAME TO " + ImpalaConnector.databaseName + "." + tableName);
|
||||
statement.execute("COMPUTE STATS " + ImpalaConnector.databaseName + "." + tableName);
|
||||
|
|
Loading…
Reference in New Issue