- Fix a bug that caused the domainAndPath-tracking data to be deleted after every batch, once the initial threshold was reached. Now the thresholds increase along with the number of processed id-urls, so the data is cleared periodically (e.g. every 300_000 processed id-urls), as intended.

- Use separate thresholds: one for clearing only the "domainAndPath" blocking-data and another for clearing all tracking-data.
This commit is contained in:
Lampros Smyrnaios 2022-09-28 19:10:01 +03:00
parent 373bfa810b
commit b051e10fd3
1 changed files with 20 additions and 4 deletions

View File

@ -44,8 +44,14 @@ public class AssignmentsHandler {
public static long numHandledAssignmentsBatches = 0; // No need to be synchronized.
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 300_000;
public static final long idUrlsToHandleBeforeClearingDomainAndPathBlockingData = 300_000;
public static long timesClearingDomainAndPathBlockingData = 0;
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 600_000;
public static long timesClearingDomainAndPathTrackingData = 0;
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 200_000;
public static long timesClearingDuplicateUrlsData = 0;
public AssignmentsHandler()
@ -135,11 +141,21 @@ public class AssignmentsHandler {
// Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program.
// This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine.
long idUrlPairsHandled = (numHandledAssignmentsBatches * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData )
UrlUtils.duplicateUrls.clear();
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) {
UrlUtils.duplicateUrls.clear();
timesClearingDuplicateUrlsData ++;
}
if ( idUrlPairsHandled >= ((timesClearingDomainAndPathTrackingData +1) * idUrlsToHandleBeforeClearingDomainAndPathTrackingData) ) {
GenericUtils.clearDomainAndPathTrackingData();
timesClearingDomainAndPathTrackingData ++;
timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless.
// This includes the "blocking data", we may say "if this condition is true, do not bother checking the just-blocking condition"
} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) {
GenericUtils.clearDomainAndPathBlockingData();
timesClearingDomainAndPathBlockingData ++;
}
if ( GeneralController.shouldShutdownWorker
|| (AssignmentsHandler.numHandledAssignmentsBatches == UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeShutdown) )