From b051e10fd309c8bdd385c26c3decb74c912191fe Mon Sep 17 00:00:00 2001
From: LSmyrnaios
Date: Wed, 28 Sep 2022 19:10:01 +0300
Subject: [PATCH] - Fix a bug causing the domainAndPath-tracking data to be
 deleted after every batch, once the initial threshold was reached. Now the
 thresholds increase along with the number of processed id-urls, so the data
 is cleared e.g. every 300_000 processed id-urls, as intended.

- Use different thresholds for clearing just the "domainAndPath"-blocking-data
  and the full "domainAndPath"-tracking-data.
---
 .../urls_worker/util/AssignmentsHandler.java | 24 ++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java
index 37c69cc..8286c7d 100644
--- a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java
+++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java
@@ -44,8 +44,14 @@ public class AssignmentsHandler {
 
 	public static long numHandledAssignmentsBatches = 0;	// No need to be synchronized.
 
-	public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 300_000;
+	public static final long idUrlsToHandleBeforeClearingDomainAndPathBlockingData = 300_000;
+	public static long timesClearingDomainAndPathBlockingData = 0;
+
+	public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 600_000;
+	public static long timesClearingDomainAndPathTrackingData = 0;
+
 	public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 200_000;
+	public static long timesClearingDuplicateUrlsData = 0;
 
 
 	public AssignmentsHandler()
@@ -135,11 +141,21 @@ public class AssignmentsHandler {
 
 		// Every time we reach a "limit" of handled id-urls, clear some data-structures of the underlying "PublicationsRetriever" program.
 		// This helps reduce memory consumption over weeks or months, and also gives a 2nd chance to domains which may have been blocked due to connectivity issues but may be fine a month later.
 		long idUrlPairsHandled = (numHandledAssignmentsBatches * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
-		if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData )
-			UrlUtils.duplicateUrls.clear();
-		if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
+		if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) {
+			UrlUtils.duplicateUrls.clear();
+			timesClearingDuplicateUrlsData ++;
+		}
+
+		if ( idUrlPairsHandled >= ((timesClearingDomainAndPathTrackingData +1) * idUrlsToHandleBeforeClearingDomainAndPathTrackingData) ) {
 			GenericUtils.clearDomainAndPathTrackingData();
+			timesClearingDomainAndPathTrackingData ++;
+			timesClearingDomainAndPathBlockingData ++;	// Increment this as well: the "else if" below is skipped in this case, but its counter still has to advance,
+			// since clearing the tracking-data also clears the blocking-data, so the blocking-only check does not need to run.
+		} else if ( idUrlPairsHandled >= ((timesClearingDomainAndPathBlockingData +1) * idUrlsToHandleBeforeClearingDomainAndPathBlockingData) ) {
+			GenericUtils.clearDomainAndPathBlockingData();
+			timesClearingDomainAndPathBlockingData ++;
+		}
 
 		if ( GeneralController.shouldShutdownWorker || (AssignmentsHandler.numHandledAssignmentsBatches == UrlsWorkerApplication.maxAssignmentsBatchesToHandleBeforeShutdown) )
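
For reference, the escalating-threshold pattern this patch introduces can be
summed up in a small standalone sketch. The class and method names here
("EscalatingThreshold", "shouldClear") are hypothetical, chosen only for
illustration; the real logic lives in AssignmentsHandler above.

	// A minimal sketch of the escalating-threshold pattern, assuming the
	// "times triggered" counter starts at 0 (as in the patch).
	public class EscalatingThreshold {

		private final long base;        // e.g. 300_000 processed id-urls
		private long timesTriggered = 0;

		public EscalatingThreshold(long base) {
			this.base = base;
		}

		// Returns true once per "base"-sized window of processed items:
		// at >= base, then >= 2*base, then >= 3*base, and so on.
		public boolean shouldClear(long processedCount) {
			if ( processedCount >= ((timesTriggered + 1) * base) ) {
				timesTriggered++;
				return true;
			}
			return false;
		}

		public static void main(String[] args) {
			EscalatingThreshold t = new EscalatingThreshold(300_000);
			System.out.println(t.shouldClear(250_000));  // false
			System.out.println(t.shouldClear(310_000));  // true  (1st window crossed)
			System.out.println(t.shouldClear(320_000));  // false (the old code fired here on every batch)
			System.out.println(t.shouldClear(610_000));  // true  (2nd window crossed)
		}
	}

Compared to a single fixed threshold, scaling the comparison by
(timesTriggered + 1) makes the clearing fire once per window instead of on
every batch after the first crossing, which is the bug being fixed.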