2021-03-16 14:25:15 +01:00
package eu.openaire.urls_controller.controllers ;
2023-02-21 14:36:35 +01:00
import eu.openaire.urls_controller.models.UrlReport ;
2023-05-15 12:12:20 +02:00
import eu.openaire.urls_controller.models.WorkerInfo ;
2021-06-22 04:38:48 +02:00
import eu.openaire.urls_controller.payloads.requests.WorkerReport ;
2023-02-21 14:36:35 +01:00
import eu.openaire.urls_controller.services.UrlsService ;
2023-05-24 12:52:28 +02:00
import eu.openaire.urls_controller.util.FileUtils ;
2023-06-06 15:49:53 +02:00
import eu.openaire.urls_controller.util.GenericUtils ;
2023-05-15 12:12:20 +02:00
import eu.openaire.urls_controller.util.ParquetFileUtils ;
2021-03-16 14:25:15 +01:00
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
2022-01-30 21:14:52 +01:00
import org.springframework.beans.factory.annotation.Autowired ;
import org.springframework.beans.factory.annotation.Value ;
2021-06-22 04:38:48 +02:00
import org.springframework.http.HttpStatus ;
2021-03-16 14:25:15 +01:00
import org.springframework.http.ResponseEntity ;
2021-06-22 04:38:48 +02:00
import org.springframework.web.bind.annotation.* ;
2021-03-16 14:25:15 +01:00
2021-11-30 17:23:27 +01:00
import javax.servlet.http.HttpServletRequest ;
2023-05-24 12:52:28 +02:00
import java.nio.file.Files ;
import java.nio.file.Path ;
import java.nio.file.Paths ;
2023-05-29 11:21:48 +02:00
import java.util.ArrayList ;
import java.util.Collections ;
2022-01-30 21:14:52 +01:00
import java.util.List ;
2023-05-29 11:21:48 +02:00
import java.util.concurrent.Callable ;
2023-05-15 12:12:20 +02:00
import java.util.concurrent.ConcurrentHashMap ;
2023-05-29 11:21:48 +02:00
import java.util.concurrent.ExecutorService ;
import java.util.concurrent.Executors ;
2021-12-10 20:47:58 +01:00
import java.util.regex.Pattern ;
2021-03-16 14:25:15 +01:00
2022-02-02 19:19:46 +01:00
2021-03-16 14:25:15 +01:00
@RestController
@RequestMapping ( " /urls " )
2023-03-13 11:39:39 +01:00
public class UrlsController {
2021-03-16 14:25:15 +01:00
2023-03-13 11:39:39 +01:00
private static final Logger logger = LoggerFactory . getLogger ( UrlsController . class ) ;
2021-03-16 14:25:15 +01:00
2022-01-30 21:14:52 +01:00
@Autowired
2023-02-21 14:36:35 +01:00
private UrlsService urlsService ;
2022-01-30 21:14:52 +01:00
2023-05-24 12:52:28 +02:00
@Autowired
private FileUtils fileUtils ;
2023-05-15 12:12:20 +02:00
@Autowired
private ParquetFileUtils parquetFileUtils ;
2022-11-10 16:18:21 +01:00
2021-12-10 20:47:58 +01:00
private static final Pattern MALICIOUS_INPUT_STRING = Pattern . compile ( " .*[';` \" ]+.* " ) ;
2022-01-31 12:49:14 +01:00
@Value ( " ${services.pdfaggregation.controller.assignmentLimit} " )
2022-01-30 21:14:52 +01:00
private int assignmentLimit ;
2021-03-16 14:25:15 +01:00
2023-05-15 12:12:20 +02:00
public static final ConcurrentHashMap < String , WorkerInfo > workersInfoMap = new ConcurrentHashMap < > ( 6 ) ;
2023-05-29 11:21:48 +02:00
public static final ExecutorService backgroundExecutor = Executors . newFixedThreadPool ( 4 ) ; // At most 4 threads will be used.
public static final List < Callable < Boolean > > backgroundCallableTasks = Collections . synchronizedList ( new ArrayList < > ( ) ) ;
2023-05-24 15:59:42 +02:00
private final String workerReportsDirPath ;
2023-05-15 12:12:20 +02:00
2023-05-24 12:52:28 +02:00
public UrlsController ( @Value ( " ${services.pdfaggregation.controller.workerReportsDirPath} " ) String workerReportsDirPath )
{
if ( ! workerReportsDirPath . endsWith ( " / " ) )
workerReportsDirPath + = " / " ;
2023-05-15 12:12:20 +02:00
2023-05-24 12:52:28 +02:00
this . workerReportsDirPath = workerReportsDirPath ; // This dir will be created later.
}
2023-05-15 12:12:20 +02:00
2021-03-16 14:25:15 +01:00
@GetMapping ( " " )
2023-05-15 12:12:20 +02:00
public ResponseEntity < ? > getAssignments ( @RequestParam String workerId , @RequestParam int workerAssignmentsLimit , HttpServletRequest request ) {
2021-03-16 14:25:15 +01:00
2021-12-10 20:47:58 +01:00
// As the Impala-driver is buggy and struggles to support parameterized queries in some types of prepared-statements, we have to sanitize the "workerId" ourselves.
if ( MALICIOUS_INPUT_STRING . matcher ( workerId ) . matches ( ) ) {
String errorMsg = " Possibly malicious \" workerId \" received: " + workerId ;
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . FORBIDDEN ) . body ( errorMsg ) ;
}
2022-01-30 21:14:52 +01:00
logger . info ( " Worker with id: \" " + workerId + " \" , requested " + workerAssignmentsLimit + " assignments. The assignments-limit of the controller is: " + assignmentLimit ) ;
2021-03-16 14:25:15 +01:00
2021-11-09 22:59:27 +01:00
// Sanitize the "assignmentsLimit". Do not let an overload happen in the Controller's or the Impala's server.
int assignmentsLimit = workerAssignmentsLimit ;
if ( assignmentsLimit = = 0 ) {
String errorMsg = " The given \" workerAssignmentsLimit \" was ZERO! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . BAD_REQUEST ) . body ( errorMsg ) ;
2022-01-30 21:14:52 +01:00
} else if ( assignmentsLimit > assignmentLimit ) {
logger . warn ( " The given \" workerAssignmentsLimit \" ( " + workerAssignmentsLimit + " ) was larger than the Controller's limit ( " + assignmentLimit + " ). Will use the Controller's limit. " ) ;
assignmentsLimit = assignmentLimit ;
2021-11-09 22:59:27 +01:00
}
2021-08-05 14:43:37 +02:00
2023-05-24 12:42:29 +02:00
if ( ShutdownController . shouldShutdownService ) {
// There might be the case that the Controller has not sent shutDown requests to the Workers yet, or it has, BUT:
// 1) A worker requests for new assignments before the shutDown request in handled by its side.
// 2) A new Worker joins the Service (unexpected, but anyway).
String warnMsg = " The Service is about to shutdown, after all under-processing assignments are handled. No new requests are accepted! " ;
logger . warn ( warnMsg ) ; // It's likely not an actual error, but still it's not accepted.
return ResponseEntity . status ( HttpStatus . CONFLICT ) . body ( warnMsg ) ; // The worker will wait 15 mins and upon going to retry it will notice that it should not do a new request or it may have shutdown in the meantime.
}
2023-05-15 12:12:20 +02:00
if ( request = = null ) {
logger . error ( " The \" HttpServletRequest \" is null! " ) ;
return ResponseEntity . status ( HttpStatus . INTERNAL_SERVER_ERROR ) . build ( ) ;
}
2023-06-06 15:49:53 +02:00
String remoteAddr = GenericUtils . getRequestorAddress ( request ) ;
2023-05-15 12:12:20 +02:00
WorkerInfo workerInfo = workersInfoMap . get ( workerId ) ;
if ( workerInfo ! = null ) { // This worker has already been identified.
String savedWorkerIp = workerInfo . getWorkerIP ( ) ;
2023-05-23 13:57:15 +02:00
if ( ! savedWorkerIp . equals ( remoteAddr ) ) {
2023-05-15 12:12:20 +02:00
logger . warn ( " The worker with id \" " + workerId + " \" has changed IP from \" " + savedWorkerIp + " \" to \" " + remoteAddr + " \" . " ) ;
workerInfo . setWorkerIP ( remoteAddr ) ; // Set the new IP. The update will be reflected in the map.
2023-05-16 11:24:14 +02:00
} // In this case, the worker may has previously informed the Controller it has shutdown or it may have crashed.
if ( workerInfo . getHasShutdown ( ) ) {
logger . info ( " The worker with id \" " + workerId + " \" was restarted. " ) ;
workerInfo . setHasShutdown ( false ) ;
2023-05-15 12:12:20 +02:00
}
} else {
2023-05-29 11:12:08 +02:00
logger . info ( " The worker \" " + workerId + " \" is requesting assignments for the first time. Going to store its IP [ " + remoteAddr + " ]. " ) ;
2023-05-15 12:12:20 +02:00
workersInfoMap . put ( workerId , new WorkerInfo ( remoteAddr , false ) ) ;
}
2023-03-07 15:55:41 +01:00
return urlsService . getAssignments ( workerId , assignmentsLimit ) ;
2021-03-16 14:25:15 +01:00
}
2022-02-02 19:19:46 +01:00
2021-06-22 04:38:48 +02:00
@PostMapping ( " addWorkerReport " )
2023-05-27 01:36:05 +02:00
public ResponseEntity < ? > addWorkerReport ( @RequestBody WorkerReport workerReport )
{
2021-11-09 22:59:27 +01:00
if ( workerReport = = null ) {
String errorMsg = " No \" WorkerReport \" was given! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . BAD_REQUEST ) . body ( errorMsg ) ;
}
2021-12-10 20:47:58 +01:00
String curWorkerId = workerReport . getWorkerId ( ) ;
if ( curWorkerId = = null ) {
String errorMsg = " No \" workerId \" was included inside the \" WorkerReport \" ! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . BAD_REQUEST ) . body ( errorMsg ) ;
}
// As the Impala-driver is buggy and struggles to support parameterized queries in some types of prepared-statements, we have to sanitize the "workerId" ourselves.
if ( MALICIOUS_INPUT_STRING . matcher ( curWorkerId ) . matches ( ) ) {
String errorMsg = " Possibly malicious \" workerId \" received: " + curWorkerId ;
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . FORBIDDEN ) . body ( errorMsg ) ;
}
2022-04-04 23:01:44 +02:00
int sizeOUrlReports = 0 ;
2021-11-09 22:59:27 +01:00
List < UrlReport > urlReports = workerReport . getUrlReports ( ) ;
2022-04-04 23:01:44 +02:00
if ( ( urlReports = = null ) | | ( ( sizeOUrlReports = urlReports . size ( ) ) = = 0 ) ) {
2021-12-10 20:47:58 +01:00
String errorMsg = " The given \" WorkerReport \" from worker with ID \" " + curWorkerId + " \" was empty (without any UrlReports)! " ;
2021-11-09 22:59:27 +01:00
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . BAD_REQUEST ) . body ( errorMsg ) ;
}
2021-06-22 04:38:48 +02:00
2023-05-24 12:52:28 +02:00
long curReportAssignmentsCounter = workerReport . getAssignmentRequestCounter ( ) ;
logger . info ( " Received the WorkerReport for batch-assignments_ " + curReportAssignmentsCounter + " , from the worker with id: " + curWorkerId + " . It contains " + sizeOUrlReports + " urlReports. Going to request the fullTexts from the Worker and insert the UrlReports into the database. " ) ;
// Make sure this worker's report directory is created.
Path currentWorkerReportLocationDir = Paths . get ( this . workerReportsDirPath , curWorkerId ) ;
try {
Files . createDirectories ( currentWorkerReportLocationDir ) ; // No-op if dir exists. It does not throw a "alreadyExistsException"
} catch ( Exception e ) {
String errorMsg = " Could nor create the \" currentWorkerReportLocationDir \" for worker \" " + curWorkerId + " \" : " + currentWorkerReportLocationDir ;
logger . error ( errorMsg , e ) ;
return ResponseEntity . internalServerError ( ) . body ( errorMsg ) ;
}
// Create the report file and write the worker-report to it.
String workerReportFile = currentWorkerReportLocationDir + " / " + curWorkerId + " _assignments_ " + curReportAssignmentsCounter + " _report.json " ;
2023-05-24 15:59:42 +02:00
// In case a file with the same name already exists (e.g. from a previous run of the Service), then it will be overwritten.
2023-05-24 12:52:28 +02:00
logger . debug ( " Going to write the worker report to json-file: \" " + workerReportFile + " \" . " ) ;
fileUtils . writeToFile ( workerReportFile , workerReport . getJsonReport ( ) , false ) ; // Only one thread is writing to this specific file.
// The above method will overwrite a possibly existing file. So in case of a crash, it's better to back up the reports before starting the Controller again (as the assignments-counter will start over, from 0).
int finalSizeOUrlReports = sizeOUrlReports ;
2023-05-29 11:21:48 +02:00
UrlsController . backgroundCallableTasks . add ( ( ) - >
2023-05-24 12:52:28 +02:00
urlsService . addWorkerReport ( curWorkerId , curReportAssignmentsCounter , urlReports , finalSizeOUrlReports )
) ;
2021-06-22 04:38:48 +02:00
2023-05-24 12:52:28 +02:00
String msg = " The 'addWorkerReport' request for worker with id: ' " + curWorkerId + " ' and assignments_ " + curReportAssignmentsCounter + " , was accepted and will be scheduled for execution. " ;
logger . info ( msg ) ;
return ResponseEntity . ok ( ) . body ( msg ) ;
2021-11-09 22:59:27 +01:00
}
2021-03-16 14:25:15 +01:00
}