- Reduce memory consumption in the long run by clearing some underlying data structures after a threshold of handled id-url pairs is reached.
- Update Gradle.
This commit is contained in:
parent
4cadaf98fc
commit
3d1faf4a8a
|
@ -1,5 +1,5 @@
|
|||
distributionBase=GRADLE_USER_HOME
|
||||
distributionPath=wrapper/dists
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.3-bin.zip
|
||||
distributionUrl=https\://services.gradle.org/distributions/gradle-7.4-bin.zip
|
||||
zipStoreBase=GRADLE_USER_HOME
|
||||
zipStorePath=wrapper/dists
|
||||
|
|
|
@ -31,7 +31,7 @@ if [[ ! -f $inputDataFile ]]; then
|
|||
echo -e "\n\n"
|
||||
fi
|
||||
|
||||
gradleVersion="7.3.3"
|
||||
gradleVersion="7.4"
|
||||
|
||||
if [[ justInstall -eq 0 ]]; then
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@ import eu.openaire.publications_retriever.util.file.FileUtils;
|
|||
import eu.openaire.publications_retriever.util.http.ConnSupportUtils;
|
||||
import eu.openaire.publications_retriever.util.http.HttpConnUtils;
|
||||
import eu.openaire.publications_retriever.util.url.DataToBeLogged;
|
||||
import eu.openaire.publications_retriever.util.url.GenericUtils;
|
||||
import eu.openaire.publications_retriever.util.url.LoaderAndChecker;
|
||||
import eu.openaire.publications_retriever.util.url.UrlUtils;
|
||||
import eu.openaire.urls_worker.UrlsWorkerApplication;
|
||||
|
@ -34,6 +35,9 @@ public class PublicationsRetrieverPlugin {
|
|||
|
||||
public static String assignmentsBasePath;
|
||||
|
||||
public static final long idUrlsToHandleBeforeClearingDomainAndPathTrackingData = 10_000_000;
|
||||
public static final long idUrlsToHandleBeforeClearingDuplicateUrlsData = 1_000_000;
|
||||
|
||||
|
||||
public PublicationsRetrieverPlugin() {
|
||||
// Specify some configurations
|
||||
|
@ -147,6 +151,15 @@ public class PublicationsRetrieverPlugin {
|
|||
|
||||
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
||||
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
|
||||
|
||||
// Every time we reach a "limit" of handled id-url pairs, clear some data-structures of the underlying "PublicationsRetriever" program.
|
||||
// This helps reduce the memory consumption over a period of weeks or months, and also gives a 2nd chance to some domains which may have been blocked due to connectivity issues, but which may be fine after a month.
|
||||
long idUrlPairsHandled = (assignmentRequestCounter * UrlsWorkerApplication.maxAssignmentsLimitPerBatch);
|
||||
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDuplicateUrlsData )
|
||||
UrlUtils.duplicateUrls.clear();
|
||||
|
||||
if ( idUrlPairsHandled >= idUrlsToHandleBeforeClearingDomainAndPathTrackingData )
|
||||
GenericUtils.clearDomainAndPathTrackingData();
|
||||
}
|
||||
|
||||
|
||||
|
@ -179,7 +192,7 @@ public class PublicationsRetrieverPlugin {
|
|||
fileLocation = data_2.getComment();
|
||||
size = data_2.getSize();
|
||||
hash = data_2.getHash();
|
||||
mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified.
|
||||
mimeType = "application/pdf"; // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is assigned correctly.
|
||||
foundAlreadyDownloadedFullText = true;
|
||||
break;
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue