- Reduce memory consumption in the long run by clearing some underlying data structures once a threshold of handled id-urls is reached.

- Update Gradle.
2022-02-18 20:02:34 +02:00 · 2022-02-18 20:02:34 +02:00 · 3d1faf4a8a
parent 4cadaf98fc
commit 3d1faf4a8a
3 changed files with 16 additions and 3 deletions
--- a/gradle/wrapper/gradle-wrapper.properties
+++ b/gradle/wrapper/gradle-wrapper.properties
@ -1,5 +1,5 @@
 distributionBase=GRADLE_USER_HOME
 distributionPath=wrapper/dists
-distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.3-bin.zip
+distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
 zipStoreBase=GRADLE_USER_HOME
 zipStorePath=wrapper/dists
--- a/installAndRun.sh
+++ b/installAndRun.sh
@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
  echo -e "\n\n"
 fi

-gradleVersion="7.3.3"
+gradleVersion="7.4"

 if [[ justInstall -eq 0 ]]; then

--- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java
@ -6,6 +6,7 @@ import eu.openaire.publications_retriever.util.file.FileUtils;
 import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
 import eu.openaire.publications_retriever.util.http.HttpConnUtils;
 import eu.openaire.publications_retriever.util.url.DataToBeLogged;
+import eu.openaire.publications_retriever.util.url.GenericUtils;
 import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
 import eu.openaire.publications_retriever.util.url.UrlUtils;
 import eu.openaire.urls_worker.UrlsWorkerApplication;
@ -34,6 +35,9 @@ public class PublicationsRetrieverPlugin {

    public static String assignmentsBasePath;

+    public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 10_000_000;
+    public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 1_000_000;
+

    public PublicationsRetrieverPlugin() {
        // Specify some configurations
@ -147,6 +151,15 @@ public class PublicationsRetrieverPlugin {

        UrlUtils.docOrDatasetUrlsWithIDs.clear();   // This HashTable is useful only for a single assignments-batch.
        // In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
+
+        // Every time we reach a "limit" of handled id-urls, clear some data-structures of the underlying "PublicationsRetriever" program.
+        // This helps reduce the memory consumption over a period of weeks or months, and also gives a 2nd chance to some domains which may have been blocked due to connectivity issues, but may be fine after a month.
+        long idUrlPairsHandled = (assignmentRequestCounter * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
+        if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData )
+            UrlUtils.duplicateUrls.clear();
+
+        if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
+            GenericUtils.clearDomainAndPathTrackingData();
    }


@ -179,7 +192,7 @@ public class PublicationsRetrieverPlugin {
                            fileLocation = data_2.getComment();
                            size = data_2.getSize();
                            hash = data_2.getHash();
-                            mimeType = "application/pdf";   // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
+                            mimeType = "application/pdf";   // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly.
                            foundAlreadyDownloadedFullText = true;
                            break;
                        }