diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 2e6e589..41dfb87 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,5 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.3-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/installAndRun.sh b/installAndRun.sh index 3d1821a..edcec00 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then echo -e "\n\n" fi -gradleVersion="7.3.3" +gradleVersion="7.4" if [[ justInstall -eq 0 ]]; then diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index 3d75680..ddaeacf 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -6,6 +6,7 @@ import eu.openaire.publications_retriever.util.file.FileUtils; import eu.openaire.publications_retriever.util.http.ConnSupportUtils; import eu.openaire.publications_retriever.util.http.HttpConnUtils; import eu.openaire.publications_retriever.util.url.DataToBeLogged; +import eu.openaire.publications_retriever.util.url.GenericUtils; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.UrlUtils; import eu.openaire.urls_worker.UrlsWorkerApplication; @@ -34,6 +35,12 @@ public class PublicationsRetrieverPlugin { public static String assignmentsBasePath; + public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 10_000_000; + public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 1_000_000; + // How many times each group of data-structures has been cleared so far; used to compute the next clearing-threshold. NOTE(review): plain (non-atomic) counters, this assumes the batches are processed by one thread at a time - confirm. + private static long timesClearingDomainAndPathTrackingData = 0; + private static long timesClearingDuplicateUrlsData = 0; + public 
PublicationsRetrieverPlugin() { // Specify some configurations @@ -147,6 +154,20 @@ public class PublicationsRetrieverPlugin { UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch. // In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway. + + // Every time we reach a new multiple of a "limit" of handled id-url pairs, clear some data-structures of the underlying "PublicationsRetriever" program. + // This helps with reducing the memory consumption over the period of weeks or months, and also gives a 2nd chance to some domains which may be blocked due to connectivity issues, but after a month they may be fine. + // Each "timesClearing..." counter moves the next threshold one "limit" further; without it, once the first limit is passed, the data would be cleared on every single batch. + long idUrlPairsHandled = (assignmentRequestCounter * UrlsWorkerApplication.maxAssignmentsLimitPerBatch); + if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData + 1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) { + UrlUtils.duplicateUrls.clear(); + timesClearingDuplicateUrlsData ++; + } + + if ( idUrlPairsHandled >= ((timesClearingDomainAndPathTrackingData + 1) * idUrlsToHandleBeforeClearingDomainAndPathTrackingData) ) { + GenericUtils.clearDomainAndPathTrackingData(); + timesClearingDomainAndPathTrackingData ++; + } } @@ -179,7 +200,7 @@ public class PublicationsRetrieverPlugin { fileLocation = data_2.getComment(); size = data_2.getSize(); hash = data_2.getHash(); - mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified. + mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly. foundAlreadyDownloadedFullText = true; break; }