2021-05-20 02:28:48 +02:00
package eu.openaire.urls_worker.util ;
import com.google.common.collect.HashMultimap ;
import com.google.common.collect.Multimap ;
2022-06-27 16:58:02 +02:00
import eu.openaire.publications_retriever.util.http.HttpConnUtils ;
Bug fixes:
- Fix a bug, where, in case it took too long to get the assignments from the Controller (possible when there are too many workers requesting at the same time or if the database is responding slowly), the Worker's scheduler would request for new assignments, in the meantime.
- Fix a bug, where, if the "maxAssignmentsBatchesToHandleBeforeRestart" was set, the Worker's scheduler could request another batch, right before the Worker was about to shut down.
- Fix a bug, where the condition of when to clear the over-sized data-structures was based on the "assignmentRequestCounter" sent by the Controller (which is increased on each request by any worker and not for each individual one), and not on the "numHandledAssignmentsBatches" kept by each individual worker. This would result in much earlier cleanup, relative to the number of the Workers.
2022-02-19 16:09:02 +01:00
import eu.openaire.publications_retriever.util.url.GenericUtils ;
import eu.openaire.publications_retriever.util.url.UrlUtils ;
2021-09-21 15:21:39 +02:00
import eu.openaire.urls_worker.UrlsWorkerApplication ;
2022-06-22 17:53:27 +02:00
import eu.openaire.urls_worker.controllers.GeneralController ;
2021-05-20 02:28:48 +02:00
import eu.openaire.urls_worker.models.Assignment ;
2021-06-22 04:58:07 +02:00
import eu.openaire.urls_worker.models.UrlReport ;
2021-11-27 01:37:33 +01:00
import eu.openaire.urls_worker.payloads.requests.AssignmentsRequest ;
2021-06-22 04:58:07 +02:00
import eu.openaire.urls_worker.payloads.responces.WorkerReport ;
2021-07-29 08:01:53 +02:00
import eu.openaire.urls_worker.plugins.PublicationsRetrieverPlugin ;
2021-05-20 02:28:48 +02:00
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import org.springframework.boot.web.client.RestTemplateBuilder ;
2021-06-22 04:58:07 +02:00
import org.springframework.http.HttpStatus ;
import org.springframework.http.ResponseEntity ;
2021-09-23 15:23:49 +02:00
import org.springframework.web.client.RestClientException ;
import org.springframework.web.client.RestTemplate ;
2021-05-20 02:28:48 +02:00
2022-06-27 16:58:02 +02:00
import java.net.CookieStore ;
2021-09-23 15:23:49 +02:00
import java.time.Duration ;
2021-12-17 07:24:09 +01:00
import java.util.ArrayList ;
import java.util.List ;
import java.util.Set ;
2021-05-20 02:28:48 +02:00
2021-11-27 01:37:33 +01:00
public class AssignmentsHandler {
2021-05-20 02:28:48 +02:00
2021-11-27 01:37:33 +01:00
private static final Logger logger = LoggerFactory . getLogger ( AssignmentsHandler . class ) ;
2021-05-20 02:28:48 +02:00
2021-12-06 23:52:40 +01:00
public static List < UrlReport > urlReports = null ;
2021-10-30 16:14:18 +02:00
private static final int expectedDatasourcesPerRequest = 1400 ; // Per 10_000 assignments.
2021-12-06 23:52:40 +01:00
public static Multimap < String , Assignment > assignmentsForPlugins = null ;
2021-09-21 15:21:39 +02:00
private static final boolean askForTest = false ; // Enable this only for testing.
2022-02-21 11:48:21 +01:00
private static String requestUrl ;
2021-08-05 14:09:28 +02:00
2021-10-11 12:27:40 +02:00
private static final Duration requestConnectTimeoutDuration = Duration . ofMinutes ( 1 ) ; // 1 minute.
2022-02-22 12:29:02 +01:00
private static final Duration requestReadTimeoutDuration = Duration . ofHours ( 3 ) ; // 3 hours. Time to wait for the data to get transferred over the network. Many workers may try to get assignments from the Worker, so each worker might have to wait some 10s of minutes for work.
2021-09-23 15:23:49 +02:00
// The controller has to retrieve the data from the database, then prepare them in memory, insert them in the "assignment"-table and, finally, return them to the worker.
2021-10-30 16:14:18 +02:00
public static final RestTemplate restTemplate = new RestTemplateBuilder ( ) . setConnectTimeout ( requestConnectTimeoutDuration ) . setReadTimeout ( requestReadTimeoutDuration ) . build ( ) ;
2022-02-21 11:48:21 +01:00
public static boolean hadConnectionErrorOnRequest = false ;
2021-06-22 04:58:07 +02:00
2021-12-23 23:12:34 +01:00
public static long numHandledAssignmentsBatches = 0 ; // No need to be synchronized.
Bug fixes:
- Fix a bug, where, in case it took too long to get the assignments from the Controller (possible when there are too many workers requesting at the same time or if the database is responding slowly), the Worker's scheduler would request for new assignments, in the meantime.
- Fix a bug, where, if the "maxAssignmentsBatchesToHandleBeforeRestart" was set, the Worker's scheduler could request another batch, right before the Worker was about to shut down.
- Fix a bug, where the condition of when to clear the over-sized data-structures was based on the "assignmentRequestCounter" send by the Controller (which is increased on each request by any worker and not for each individual one), and not on the "numHandledAssignmentsBatches" kept by each individual worker. This would result in much earlier cleanup, relative to the number of the Workers.
2022-02-19 16:09:02 +01:00
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 10_000_000 ;
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 1_000_000 ;
2021-12-06 23:52:40 +01:00
2022-06-27 16:58:02 +02:00
private static CookieStore cookieStore = null ;
2021-12-06 23:52:40 +01:00
public AssignmentsHandler ( )
{
urlReports = new ArrayList < > ( UrlsWorkerApplication . maxAssignmentsLimitPerBatch ) ;
2021-12-17 07:24:09 +01:00
int expectedAssignmentsPerDatasource = ( UrlsWorkerApplication . maxAssignmentsLimitPerBatch / expectedDatasourcesPerRequest ) ;
2021-12-06 23:52:40 +01:00
assignmentsForPlugins = HashMultimap . create ( expectedDatasourcesPerRequest , expectedAssignmentsPerDatasource ) ;
2022-02-21 11:48:21 +01:00
requestUrl = UrlsWorkerApplication . controllerBaseUrl + ( askForTest ? " test/ " : " " ) + " urls?workerId= " + UrlsWorkerApplication . workerId + " &workerAssignmentsLimit= " + UrlsWorkerApplication . maxAssignmentsLimitPerBatch ;
2022-06-27 16:58:02 +02:00
cookieStore = HttpConnUtils . cookieManager . getCookieStore ( ) ;
2021-12-06 23:52:40 +01:00
}
2021-12-17 07:24:09 +01:00
2021-11-27 01:37:33 +01:00
public static AssignmentsRequest requestAssignments ( )
2021-06-10 13:29:20 +02:00
{
2021-09-21 15:21:39 +02:00
logger . info ( " Going to request assignments from the controller-server: " + requestUrl ) ;
2021-11-27 01:37:33 +01:00
AssignmentsRequest assignmentRequest = null ;
2021-10-30 16:14:18 +02:00
try { // Here, the HTTP-request is executed.
2021-11-27 01:37:33 +01:00
assignmentRequest = restTemplate . getForObject ( requestUrl , AssignmentsRequest . class ) ;
2021-10-30 16:14:18 +02:00
} catch ( RestClientException rce ) {
2022-02-22 12:29:02 +01:00
logger . error ( " Could not retrieve the assignments! \ n " + rce . getMessage ( ) ) ; // It shows the response body (from Spring v.2.5.6 onwards).
2022-02-21 11:48:21 +01:00
hadConnectionErrorOnRequest = true ;
2021-09-23 15:23:49 +02:00
return null ;
2021-05-20 02:28:48 +02:00
}
2021-06-09 04:45:07 +02:00
//logger.debug(assignmentRequest.toString()); // DEBUG!
2021-07-05 14:00:29 +02:00
return assignmentRequest ;
2021-05-20 02:28:48 +02:00
}
2021-07-05 14:00:29 +02:00
public static void handleAssignments ( )
2021-06-10 13:29:20 +02:00
{
2021-11-27 01:37:33 +01:00
AssignmentsRequest assignmentsRequest = requestAssignments ( ) ;
2022-02-21 11:48:21 +01:00
if ( assignmentsRequest = = null )
2021-08-05 14:09:28 +02:00
return ;
2021-07-05 14:00:29 +02:00
2021-11-27 01:37:33 +01:00
Long assignmentRequestCounter = assignmentsRequest . getAssignmentsCounter ( ) ;
List < Assignment > assignments = assignmentsRequest . getAssignments ( ) ;
2021-07-05 14:00:29 +02:00
if ( assignments = = null ) {
logger . warn ( " The assignments were found to be null for assignmentRequestCounter = " + assignmentRequestCounter ) ;
2021-05-20 02:28:48 +02:00
return ;
}
2021-06-11 12:44:33 +02:00
2021-10-14 02:03:47 +02:00
int assignmentsSize = assignments . size ( ) ;
if ( assignmentsSize = = 0 ) {
logger . warn ( " The assignmentsSize was < 0 > for assignmentRequestCounter = " + assignmentRequestCounter ) ;
return ;
}
logger . info ( " AssignmentRequest < " + assignmentRequestCounter + " > was received and it's ready to be processed. It contains " + assignmentsSize + " tasks. " ) ;
2021-10-30 16:14:18 +02:00
// Iterate over the tasks and add each task in its own list depending on the DATASOURCE in order to decide which plugin to use later.
2021-05-20 02:28:48 +02:00
2021-07-05 14:00:29 +02:00
for ( Assignment assignment : assignments ) {
2021-05-20 02:28:48 +02:00
// Add each task in its own HashSet.
2021-10-30 16:14:18 +02:00
try {
assignmentsForPlugins . put ( assignment . getDatasource ( ) . getId ( ) , assignment ) ;
} catch ( NullPointerException npe ) {
logger . warn ( " An NPE was thrown when splitting the assignments based on the datasource-types. The assignment was: " + assignment ) ; // Do not use "assignment.toString()", it may cause an NPE.
}
2021-05-20 02:28:48 +02:00
}
2021-10-30 16:14:18 +02:00
//countDatasourcesAndRecords(assignmentsSize); // Only for DEBUG! Keep it commented in normal run.
2021-06-22 04:58:07 +02:00
// TODO - Decide which tasks run with what plugin (depending on their datasource).
2021-05-20 02:28:48 +02:00
// First run -in parallel- the tasks which require some specific plugin.
// Then run the remaining tasks in the generic plugin (which handles parallelism itself).
2021-06-22 04:58:07 +02:00
// For now, let's just run all tasks in the generic plugin.
try {
2021-09-08 04:02:14 +02:00
PublicationsRetrieverPlugin . processAssignments ( assignmentRequestCounter , assignmentsForPlugins . values ( ) ) ;
2021-06-22 04:58:07 +02:00
} catch ( Exception e ) {
2021-12-03 03:09:40 +01:00
logger . error ( " Exception when processing the assignments_ " + assignmentRequestCounter , e ) ;
} // In this case, we will either have an empty WorkerReport or a half-filled one. Either way, we want to report back to the Controller.
2021-06-22 04:58:07 +02:00
2022-01-17 22:46:15 +01:00
// TODO - If we have more than one plugin running at the same time, then make the "AssignmentsHandler.urlReports"-list thread-safe.
2021-09-23 15:23:49 +02:00
if ( askForTest ) {
logger . debug ( " UrlReports: " ) ; // DEBUG!
for ( UrlReport urlReport : urlReports )
logger . debug ( urlReport . toString ( ) ) ;
} // Avoid posting the results in "askForTestUrls"-mode. We don't want for test-results to be written into the database by the controller.
else
postWorkerReport ( assignmentRequestCounter ) ;
2021-09-21 15:21:39 +02:00
2022-06-22 17:53:27 +02:00
numHandledAssignmentsBatches + + ; // This is used later to stop this app, if a user-defined upper limit is set and reached.
2021-12-23 23:12:34 +01:00
Bug fixes:
- Fix a bug, where, in case it took too long to get the assignments from the Controller (possible when there are too many workers requesting at the same time or if the database is responding slowly), the Worker's scheduler would request for new assignments, in the meantime.
- Fix a bug, where, if the "maxAssignmentsBatchesToHandleBeforeRestart" was set, the Worker's scheduler could request another batch, right before the Worker was about to shut down.
- Fix a bug, where the condition of when to clear the over-sized data-structures was based on the "assignmentRequestCounter" send by the Controller (which is increased on each request by any worker and not for each individual one), and not on the "numHandledAssignmentsBatches" kept by each individual worker. This would result in much earlier cleanup, relative to the number of the Workers.
2022-02-19 16:09:02 +01:00
// Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program.
// This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine.
long idUrlPairsHandled = ( numHandledAssignmentsBatches * UrlsWorkerApplication . maxAssignmentsLimitPerBatch ) ;
if ( idUrlPairsHandled > = idUrlsToHandleBeforeClearingDuplicateUrlsData )
UrlUtils . duplicateUrls . clear ( ) ;
if ( idUrlPairsHandled > = idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
GenericUtils . clearDomainAndPathTrackingData ( ) ;
2021-10-30 16:14:18 +02:00
2022-06-22 17:53:27 +02:00
if ( GeneralController . shouldShutdownWorker
| | ( AssignmentsHandler . numHandledAssignmentsBatches = = UrlsWorkerApplication . maxAssignmentsBatchesToHandleBeforeShutdown ) )
2021-12-31 02:51:58 +01:00
{
2022-06-22 17:53:27 +02:00
logger . info ( " The worker will now shutdown, as " + ( GeneralController . shouldShutdownWorker
? " it received a \" shutdownWorker \" request! "
: " the maximum assignments-batches ( " + UrlsWorkerApplication . maxAssignmentsBatchesToHandleBeforeShutdown + " ) to be handled was reached! " ) ) ;
2022-01-03 23:23:45 +01:00
UrlsWorkerApplication . gentleAppShutdown ( ) ;
Bug fixes:
- Fix a bug, where, in case it took too long to get the assignments from the Controller (possible when there are too many workers requesting at the same time or if the database is responding slowly), the Worker's scheduler would request for new assignments, in the meantime.
- Fix a bug, where, if the "maxAssignmentsBatchesToHandleBeforeRestart" was set, the Worker's scheduler could request another batch, right before the Worker was about to shut down.
- Fix a bug, where the condition of when to clear the over-sized data-structures was based on the "assignmentRequestCounter" send by the Controller (which is increased on each request by any worker and not for each individual one), and not on the "numHandledAssignmentsBatches" kept by each individual worker. This would result in much earlier cleanup, relative to the number of the Workers.
2022-02-19 16:09:02 +01:00
// The "gentleAppShutdown()" will exit the app, so the "isAvailableForWork" will not be set below.
2021-12-31 02:51:58 +01:00
}
Bug fixes:
- Fix a bug, where, in case it took too long to get the assignments from the Controller (possible when there are too many workers requesting at the same time or if the database is responding slowly), the Worker's scheduler would request for new assignments, in the meantime.
- Fix a bug, where, if the "maxAssignmentsBatchesToHandleBeforeRestart" was set, the Worker's scheduler could request another batch, right before the Worker was about to shut down.
- Fix a bug, where the condition of when to clear the over-sized data-structures was based on the "assignmentRequestCounter" send by the Controller (which is increased on each request by any worker and not for each individual one), and not on the "numHandledAssignmentsBatches" kept by each individual worker. This would result in much earlier cleanup, relative to the number of the Workers.
2022-02-19 16:09:02 +01:00
// Note: Cannot call this method, here, retrospectively, as if it runs 100s of times, the memory-stack may break..
// The scheduler will handle calling it every 15 mins, in case the Worker is available for work..
2021-06-22 04:58:07 +02:00
}
2021-05-20 02:28:48 +02:00
2022-06-22 17:53:27 +02:00
/ * *
* Post the worker report and wait for the Controller to request the publication - files .
* Once the Controller finishes with uploading the files to the S3 - ObjectStore , it returns an " HTTP-200-OK " response to the Worker .
* * /
2021-07-05 14:00:29 +02:00
public static boolean postWorkerReport ( Long assignmentRequestCounter )
2021-06-22 04:58:07 +02:00
{
2021-09-21 15:21:39 +02:00
String postUrl = UrlsWorkerApplication . controllerBaseUrl + " urls/addWorkerReport " ;
2021-10-14 02:03:47 +02:00
logger . info ( " Going to post the WorkerReport of assignment_ " + assignmentRequestCounter + " to the controller-server: " + postUrl ) ;
2021-05-20 02:28:48 +02:00
try {
2021-10-30 16:14:18 +02:00
ResponseEntity < String > responseEntity = restTemplate . postForEntity ( postUrl , new WorkerReport ( UrlsWorkerApplication . workerId , assignmentRequestCounter , urlReports ) , String . class ) ;
2021-10-11 12:27:40 +02:00
int responseCode = responseEntity . getStatusCodeValue ( ) ;
2021-12-23 23:12:34 +01:00
if ( responseCode = = HttpStatus . OK . value ( ) ) {
logger . info ( " The submission of the WorkerReport of assignments_ " + assignmentRequestCounter + " to the Controller, and the full-text delivering, were successful! " ) ;
return true ;
} else {
2021-10-30 16:14:18 +02:00
logger . error ( " HTTP-Connection problem with the submission of the WorkerReport of assignment_ " + assignmentRequestCounter + " to the Controller. Error-code was: " + responseCode ) ;
2021-06-22 04:58:07 +02:00
return false ;
}
} catch ( Exception e ) {
2021-07-05 14:00:29 +02:00
logger . error ( " Error when submitting the WorkerReport of assignment_ " + assignmentRequestCounter + " to the Controller: " , e ) ;
2021-06-22 04:58:07 +02:00
return false ;
2021-09-21 15:21:39 +02:00
} finally {
2021-10-30 16:14:18 +02:00
urlReports . clear ( ) ; // Reset, without de-allocating.
assignmentsForPlugins . clear ( ) ;
2022-06-27 16:58:02 +02:00
logger . debug ( " The number of cookies is: " + cookieStore . getCookies ( ) . size ( ) ) ;
boolean cookiesDeleted = cookieStore . removeAll ( ) ;
logger . debug ( cookiesDeleted ? " The cookies where removed! " : " No cookies where removed! " ) ;
2021-05-20 02:28:48 +02:00
}
}
2021-10-30 16:14:18 +02:00
/**
 * DEBUG-helper: logs the number of datasources inside "assignmentsForPlugins", the number of records of each datasource
 * and the average number of records per datasource.
 *
 * @param assignmentsSize the total number of assignments of the batch, which is used for computing the average
 */
public static void countDatasourcesAndRecords(int assignmentsSize)
{
    Set<String> datasources = assignmentsForPlugins.keySet();
    int numDatasources = datasources.size();
    logger.debug(" Num of datasources: " + numDatasources);
    if ( numDatasources == 0 )
        return;	// There is nothing more to log, and the average below would cause a division by zero.

    for ( String datasource : datasources ) {
        logger.debug(" Num of records for datasource \" " + datasource + " \" is: " + assignmentsForPlugins.get(datasource).size());
    }
    // Integer division: the average is truncated.
    logger.debug(" Average num of records per datasource: " + (assignmentsSize / numDatasources));
}
2021-05-20 02:28:48 +02:00
}