diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java index 37c69cc..8286c7d 100644 --- a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java +++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java @@ -44,8 +44,14 @@ public class AssignmentsHandler { public static long numHandledAssignmentsBatches = 0; // No need to be synchronized. - public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 300_000; + public static final long idUrlsToHandleBeforeClearingDomainAndPathBlockingData = 300_000; + public static long timesClearingDomainAndPathBlockingData = 0; + + public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 600_000; + public static long timesClearingDomainAndPathTrackingData = 0; + public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 200_000; + public static long timesClearingDuplicateUrlsData = 0; public AssignmentsHandler() @@ -135,11 +141,21 @@ public class AssignmentsHandler { // Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program. // This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine. long idUrlPairsHandled = (numHandledAssignmentsBatches * UrlsWorkerApplication.maxAssignmentsLimitPerBatch); - if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData ) - UrlUtils.duplicateUrls.clear(); - if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData ) + if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) { + UrlUtils.duplicateUrls.clear(); + timesClearingDuplicateUrlsData ++; + } + + if ( idUrlPairsHandled >= ((timesClearingDomainAndPathTrackingData +1) * idUrlsToHandleBeforeClearingDomainAndPathTrackingData) ) { GenericUtils.clearDomainAndPathTrackingData(); + timesClearingDomainAndPathTrackingData ++; + timesClearingDomainAndPathBlockingData ++; // Increment this also, as we avoid the following check in this case, but the counter has to be increased nevertheless. + // This includes the "blocking data", we may say "if this condition is true, do not bother checking the just-blocking condition" + } else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) { + GenericUtils.clearDomainAndPathBlockingData(); + timesClearingDomainAndPathBlockingData ++; + } if ( GeneralController.shouldShutdownWorker || (AssignmentsHandler.numHandledAssignmentsBatches == UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeShutdown) )