- Reduce memory consumption in the long run by clearing some underlying data-structures after a threshold.

- Update Gradle.
This commit is contained in:
Lampros Smyrnaios 2022-02-18 20:02:34 +02:00
parent 4cadaf98fc
commit 3d1faf4a8a
3 changed files with 16 additions and 3 deletions

View File

@ -1,5 +1,5 @@
distributionBase=GRADLE_USER_HOME
distributionPath=wrapper/dists
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.3-bin.zip
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
zipStoreBase=GRADLE_USER_HOME
zipStorePath=wrapper/dists

View File

@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
echo -e "\n\n"
fi
gradleVersion="7.3.3"
gradleVersion="7.4"
if [[ justInstall -eq 0 ]]; then

View File

@ -6,6 +6,7 @@ import eu.openaire.publications_retriever.util.file.FileUtils;
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
import eu.openaire.publications_retriever.util.url.GenericUtils;
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
import eu.openaire.publications_retriever.util.url.UrlUtils;
import eu.openaire.urls_worker.UrlsWorkerApplication;
@ -34,6 +35,9 @@ public class PublicationsRetrieverPlugin {
public static String assignmentsBasePath;
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 10_000_000;
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 1_000_000;
public PublicationsRetrieverPlugin() {
// Specify some configurations
@ -147,6 +151,15 @@ public class PublicationsRetrieverPlugin {
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
// Every time we reach a "limit" of handled id-url pairs, clear some data-structures of the underlying "PublicationsRetriever" program.
// This helps reduce memory consumption over a period of weeks or months, and also gives a 2nd chance to some domains which may have been blocked due to connectivity issues but may be fine again after a month.
long idUrlPairsHandled = (assignmentRequestCounter * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData )
UrlUtils.duplicateUrls.clear();
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
GenericUtils.clearDomainAndPathTrackingData();
}
@ -179,7 +192,7 @@ public class PublicationsRetrieverPlugin {
fileLocation = data_2.getComment();
size = data_2.getSize();
hash = data_2.getHash();
mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly.
foundAlreadyDownloadedFullText = true;
break;
}