- Improve performance when handling WorkerReports' database insertions, by using parallelism to insert to two different tables in the same time. Also, pre-cache the query-argument-types.
- Update the error-message and counting system, on partial insertion event.
This commit is contained in:
parent
be4898e43e
commit
6aab1d242b
|
@ -1,7 +1,8 @@
|
||||||
package eu.openaire.urls_controller;
|
package eu.openaire.urls_controller;
|
||||||
|
|
||||||
import eu.openaire.urls_controller.util.UriBuilder;
|
import eu.openaire.urls_controller.controllers.UrlController;
|
||||||
import org.springframework.boot.CommandLineRunner;
|
import org.slf4j.Logger;
|
||||||
|
import org.slf4j.LoggerFactory;
|
||||||
import org.springframework.boot.SpringApplication;
|
import org.springframework.boot.SpringApplication;
|
||||||
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
import org.springframework.boot.autoconfigure.SpringBootApplication;
|
||||||
import org.springframework.context.annotation.Bean;
|
import org.springframework.context.annotation.Bean;
|
||||||
|
@ -11,13 +12,17 @@ import org.springframework.web.cors.CorsConfiguration;
|
||||||
import org.springframework.web.cors.CorsConfigurationSource;
|
import org.springframework.web.cors.CorsConfigurationSource;
|
||||||
import org.springframework.web.cors.UrlBasedCorsConfigurationSource;
|
import org.springframework.web.cors.UrlBasedCorsConfigurationSource;
|
||||||
|
|
||||||
|
import javax.annotation.PreDestroy;
|
||||||
import java.util.Arrays;
|
import java.util.Arrays;
|
||||||
import java.util.Collections;
|
import java.util.Collections;
|
||||||
|
import java.util.concurrent.TimeUnit;
|
||||||
|
|
||||||
@SpringBootApplication
|
@SpringBootApplication
|
||||||
@EnableScheduling
|
@EnableScheduling
|
||||||
public class Application {
|
public class Application {
|
||||||
|
|
||||||
|
private static final Logger logger = LoggerFactory.getLogger(Application.class);
|
||||||
|
|
||||||
public static void main(String[] args) {
|
public static void main(String[] args) {
|
||||||
SpringApplication.run(Application.class, args);
|
SpringApplication.run(Application.class, args);
|
||||||
}
|
}
|
||||||
|
@ -33,6 +38,29 @@ public class Application {
|
||||||
source.registerCorsConfiguration("/**", configuration);
|
source.registerCorsConfiguration("/**", configuration);
|
||||||
return source;
|
return source;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@PreDestroy
|
||||||
|
public void closeThreads() {
|
||||||
|
logger.info("Shutting down the threads..");
|
||||||
|
UrlController.insertsExecutor.shutdown(); // Define that no new tasks will be scheduled.
|
||||||
|
try {
|
||||||
|
if ( ! UrlController.insertsExecutor.awaitTermination(1, TimeUnit.MINUTES) ) {
|
||||||
|
logger.warn("The working threads did not finish on time! Stopping them immediately..");
|
||||||
|
UrlController.insertsExecutor.shutdownNow();
|
||||||
|
}
|
||||||
|
} catch (SecurityException se) {
|
||||||
|
logger.error("Could not shutdown the threads in any way..!", se);
|
||||||
|
} catch (InterruptedException ie) {
|
||||||
|
try {
|
||||||
|
UrlController.insertsExecutor.shutdownNow();
|
||||||
|
} catch (SecurityException se) {
|
||||||
|
logger.error("Could not shutdown the threads in any way..!", se);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
//
|
//
|
||||||
// @Bean
|
// @Bean
|
||||||
// public CommandLineRunner setServerBaseUrl(Environment environment) {
|
// public CommandLineRunner setServerBaseUrl(Environment environment) {
|
||||||
|
|
|
@ -19,6 +19,10 @@ import javax.servlet.http.HttpServletRequest;
|
||||||
import java.sql.*;
|
import java.sql.*;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
|
import java.util.concurrent.Callable;
|
||||||
|
import java.util.concurrent.ExecutorService;
|
||||||
|
import java.util.concurrent.Executors;
|
||||||
|
import java.util.concurrent.atomic.AtomicInteger;
|
||||||
import java.util.concurrent.atomic.AtomicLong;
|
import java.util.concurrent.atomic.AtomicLong;
|
||||||
import java.util.regex.Pattern;
|
import java.util.regex.Pattern;
|
||||||
|
|
||||||
|
@ -212,6 +216,8 @@ public class UrlController {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
public static ExecutorService insertsExecutor = Executors.newFixedThreadPool(2);
|
||||||
|
|
||||||
@PostMapping("addWorkerReport")
|
@PostMapping("addWorkerReport")
|
||||||
public ResponseEntity<?> addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) {
|
public ResponseEntity<?> addWorkerReport(@RequestBody WorkerReport workerReport, HttpServletRequest request) {
|
||||||
|
|
||||||
|
@ -258,58 +264,84 @@ public class UrlController {
|
||||||
|
|
||||||
// Store the workerReport into the database. We use "PreparedStatements" to do insertions, for security and valid SQL syntax reasons.
|
// Store the workerReport into the database. We use "PreparedStatements" to do insertions, for security and valid SQL syntax reasons.
|
||||||
String insertIntoPayloadBaseQuery = "INSERT INTO " + databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)";
|
String insertIntoPayloadBaseQuery = "INSERT INTO " + databaseName + ".payload (id, original_url, actual_url, date, mimetype, size, hash, location, provenance) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)";
|
||||||
|
int[] payloadArgTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR};
|
||||||
|
|
||||||
String insertIntoAttemptBaseQuery = "INSERT INTO " + databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)";
|
String insertIntoAttemptBaseQuery = "INSERT INTO " + databaseName + ".attempt (id, original_url, date, status, error_class, error_message) VALUES (?, ?, ?, ?, ?, ?)";
|
||||||
String payloadErrorMsg = null;
|
int[] attemptArgTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR};
|
||||||
int failedCount = 0;
|
|
||||||
|
final AtomicInteger failedCount = new AtomicInteger(0);
|
||||||
|
|
||||||
|
List<Callable<Void>> callableTasks = new ArrayList<>(2);
|
||||||
|
// One thread will handle the inserts to the "payload" table adn the other to the "attempt" table. This way there will be as little blocking as possible (from the part of Impala).
|
||||||
|
|
||||||
|
callableTasks.add(() -> { // Handle inserts to the "payload" table.
|
||||||
|
for ( UrlReport urlReport : urlReports ) {
|
||||||
|
Payload payload = urlReport.getPayload();
|
||||||
|
if ( payload == null ) {
|
||||||
|
logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments + "\n" + urlReport);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
Long size = payload.getSize();
|
||||||
|
Object[] args = new Object[] {payload.getId(), payload.getOriginal_url(), payload.getActual_url(), payload.getTimestamp_acquired(),
|
||||||
|
payload.getMime_type(), (size != null) ? String.valueOf(size) : null, payload.getHash(),
|
||||||
|
payload.getLocation(), payload.getProvenance()};
|
||||||
|
|
||||||
|
jdbcTemplate.update(insertIntoPayloadBaseQuery, args, payloadArgTypes);
|
||||||
|
} catch (Exception sqle) {
|
||||||
|
logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": ", sqle);
|
||||||
|
failedCount.incrementAndGet();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
|
callableTasks.add(() -> { // Handle inserts to the "attempt" table.
|
||||||
|
for ( UrlReport urlReport : urlReports ) {
|
||||||
|
Payload payload = urlReport.getPayload();
|
||||||
|
if ( payload == null ) {
|
||||||
|
logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments + "\n" + urlReport);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
Error error = urlReport.getError();
|
||||||
|
if ( error == null ) { // A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of this loop).
|
||||||
|
logger.warn("Error was \"null\" for \"urlReport\": " + urlReport + "\nSetting an empty object with \"null\" members.");
|
||||||
|
error = new Error(null, null);
|
||||||
|
}
|
||||||
|
|
||||||
|
try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons.
|
||||||
|
Object[] args = new Object[] {payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(),
|
||||||
|
urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()};
|
||||||
|
|
||||||
|
jdbcTemplate.update(insertIntoAttemptBaseQuery, args, attemptArgTypes);
|
||||||
|
} catch (Exception sqle) {
|
||||||
|
logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage());
|
||||||
|
failedCount.incrementAndGet();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
});
|
||||||
|
|
||||||
ImpalaConnector.databaseLock.lock();
|
ImpalaConnector.databaseLock.lock();
|
||||||
// TODO - Think about handling this loop with multiple threads.. The Impala-server will handle the synchronization itself..
|
|
||||||
|
|
||||||
for ( UrlReport urlReport : urlReports ) {
|
try { // Invoke all the tasks and wait for them to finish before moving to the next batch.
|
||||||
Payload payload = urlReport.getPayload();
|
insertsExecutor.invokeAll(callableTasks);
|
||||||
if ( payload == null ) {
|
} catch (InterruptedException ie) { // In this case, any unfinished tasks are cancelled.
|
||||||
logger.warn("Payload was \"null\" for a \"urlReport\", in assignments_" + curReportAssignments);
|
logger.warn("The current thread was interrupted when waiting for the worker-threads to finish inserting into the tables: " + ie.getMessage());
|
||||||
payloadErrorMsg = (++failedCount) + " urlReports failed to be processed because they had no payload!";
|
// TODO - This is a very rare case, but what should be done..?
|
||||||
continue;
|
} catch (Exception e) {
|
||||||
}
|
ImpalaConnector.databaseLock.unlock();
|
||||||
|
String errorMsg = "Unexpected error when inserting into the \"payload\" and \"attempt\" tables in parallel! " + e.getMessage();
|
||||||
|
logger.error(errorMsg, e);
|
||||||
|
return ResponseEntity.status(HttpStatus.INTERNAL_SERVER_ERROR).body(errorMsg);
|
||||||
|
}
|
||||||
|
|
||||||
try {
|
int failedQueries = failedCount.get();
|
||||||
Long size = payload.getSize();
|
String failedQueriesMsg = failedQueries + " out of " + urlReports.size() + " failed to be processed!";
|
||||||
Object[] args = new Object[] {
|
logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables" + ((failedQueries > 0) ? (", although " + failedQueriesMsg) : ".")
|
||||||
payload.getId(), payload.getOriginal_url(), payload.getActual_url(), payload.getTimestamp_acquired(),
|
+ " Going to merge the parquet files for those tables.");
|
||||||
payload.getMime_type(), (size != null) ? String.valueOf(size) : null, payload.getHash(),
|
|
||||||
payload.getLocation(), payload.getProvenance()};
|
|
||||||
int[] argTypes = new int[] {
|
|
||||||
Types.VARCHAR, Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR,
|
|
||||||
Types.VARCHAR, Types.VARCHAR, Types.VARCHAR};
|
|
||||||
|
|
||||||
jdbcTemplate.update(insertIntoPayloadBaseQuery, args, argTypes);
|
|
||||||
} catch (Exception sqle) {
|
|
||||||
logger.error("Problem when executing the \"insertIntoPayloadBaseQuery\": ", sqle);
|
|
||||||
}
|
|
||||||
|
|
||||||
Error error = urlReport.getError();
|
|
||||||
if ( error == null ) { // A bit rare to happen, but we should fix it (otherwise NPEs will be thrown for the rest of this loop).
|
|
||||||
logger.warn("Error was \"null\" for \"urlReport\": " + urlReport + "\nSetting an empty object with \"null\" members.");
|
|
||||||
error = new Error(null, null);
|
|
||||||
}
|
|
||||||
|
|
||||||
try { // We use a "PreparedStatement" to do insertions, for security and valid SQL syntax reasons.
|
|
||||||
Object[] args = new Object[] {
|
|
||||||
payload.getId(), payload.getOriginal_url(), payload.getTimestamp_acquired(),
|
|
||||||
urlReport.getStatus().toString(), String.valueOf(error.getType()), error.getMessage()};
|
|
||||||
int[] argTypes = new int[] {Types.VARCHAR, Types.VARCHAR, Types.TIMESTAMP, Types.VARCHAR, Types.VARCHAR, Types.VARCHAR};
|
|
||||||
|
|
||||||
jdbcTemplate.update(insertIntoAttemptBaseQuery, args, argTypes);
|
|
||||||
} catch (Exception sqle) {
|
|
||||||
logger.error("Problem when executing the \"insertIntoAttemptBaseQuery\": " + sqle.getMessage());
|
|
||||||
}
|
|
||||||
}//end for-loop
|
|
||||||
|
|
||||||
if ( payloadErrorMsg != null )
|
|
||||||
logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables, although " + payloadErrorMsg + " Going to merge the parquet files for those tables.");
|
|
||||||
else
|
|
||||||
logger.debug("Finished inserting the payloads and the attempts into the \"payload\" and \"attempt\" tables. Going to merge the parquet files for those tables.");
|
|
||||||
|
|
||||||
String mergeErrorMsg = fileUtils.mergeParquetFiles("payload", "", null);
|
String mergeErrorMsg = fileUtils.mergeParquetFiles("payload", "", null);
|
||||||
if ( mergeErrorMsg != null ) {
|
if ( mergeErrorMsg != null ) {
|
||||||
|
@ -333,8 +365,9 @@ public class UrlController {
|
||||||
}
|
}
|
||||||
|
|
||||||
ImpalaConnector.databaseLock.unlock();
|
ImpalaConnector.databaseLock.unlock();
|
||||||
|
|
||||||
logger.debug("Finished merging the database tables.");
|
logger.debug("Finished merging the database tables.");
|
||||||
return ResponseEntity.status(HttpStatus.OK).body(payloadErrorMsg);
|
return ResponseEntity.status(HttpStatus.OK).body(failedQueriesMsg);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -362,9 +362,7 @@ public class FileUtils {
|
||||||
if ( j < (numFullTextsCurBatch -1) )
|
if ( j < (numFullTextsCurBatch -1) )
|
||||||
sb.append(",");
|
sb.append(",");
|
||||||
}
|
}
|
||||||
String requestUrl = sb.toString();
|
return sb.toString();
|
||||||
|
|
||||||
return requestUrl;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
private final int bufferSize = 20971520; // 20 MB
|
private final int bufferSize = 20971520; // 20 MB
|
||||||
|
|
Loading…
Reference in New Issue