- Fix a bug that caused the domainAndPath-tracking data to be cleared after every batch once the initial threshold had been reached. The thresholds now increase along with the number of processed id-urls, so the data is cleared only at the intended intervals, e.g. every 300_000 processed id-urls.
- Use different thresholds for clearing only the "domainAndPath"-blocking-data and for clearing all of the "domainAndPath"-tracking-data.
This commit is contained in:
parent
373bfa810b
commit
b051e10fd3
|
@ -44,8 +44,14 @@ public class AssignmentsHandler {
|
||||||
|
|
||||||
public static long numHandledAssignmentsBatches = 0; // No need to be synchronized.
|
public static long numHandledAssignmentsBatches = 0; // No need to be synchronized.
|
||||||
|
|
||||||
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 300_000;
|
public static final long idUrlsToHandleBeforeClearingDomainAndPathBlockingData = 300_000;
|
||||||
|
public static long timesClearingDomainAndPathBlockingData = 0;
|
||||||
|
|
||||||
|
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 600_000;
|
||||||
|
public static long timesClearingDomainAndPathTrackingData = 0;
|
||||||
|
|
||||||
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 200_000;
|
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 200_000;
|
||||||
|
public static long timesClearingDuplicateUrlsData = 0;
|
||||||
|
|
||||||
|
|
||||||
public AssignmentsHandler()
|
public AssignmentsHandler()
|
||||||
|
@ -135,11 +141,21 @@ public class AssignmentsHandler {
|
||||||
// Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program.
|
// Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program.
|
||||||
// This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine.
|
// This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine.
|
||||||
long idUrlPairsHandled = (numHandledAssignmentsBatches * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
|
long idUrlPairsHandled = (numHandledAssignmentsBatches * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
|
||||||
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData )
|
|
||||||
UrlUtils.duplicateUrls.clear();
|
|
||||||
|
|
||||||
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
|
if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) {
|
||||||
|
UrlUtils.duplicateUrls.clear();
|
||||||
|
timesClearingDuplicateUrlsData ++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if ( idUrlPairsHandled >= ((timesClearingDomainAndPathTrackingData +1) * idUrlsToHandleBeforeClearingDomainAndPathTrackingData) ) {
|
||||||
GenericUtils.clearDomainAndPathTrackingData();
|
GenericUtils.clearDomainAndPathTrackingData();
|
||||||
|
timesClearingDomainAndPathTrackingData ++;
|
||||||
|
timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless.
|
||||||
|
// This includes the "blocking data", so when this condition is true there is no need to check the blocking-only condition below.
|
||||||
|
} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) {
|
||||||
|
GenericUtils.clearDomainAndPathBlockingData();
|
||||||
|
timesClearingDomainAndPathBlockingData ++;
|
||||||
|
}
|
||||||
|
|
||||||
if ( GeneralController.shouldShutdownWorker
|
if ( GeneralController.shouldShutdownWorker
|
||||||
|| (AssignmentsHandler.numHandledAssignmentsBatches == UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeShutdown) )
|
|| (AssignmentsHandler.numHandledAssignmentsBatches == UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeShutdown) )
|
||||||
|
|
Loading…
Reference in New Issue