From 952bf7c035d9f181e402e16ad738daac4443f47a Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Thu, 6 Jul 2023 13:22:09 +0300 Subject: [PATCH] - Update dependencies. - Code polishing. --- build.gradle | 12 +++--- gradle/wrapper/gradle-wrapper.properties | 3 +- installAndRun.sh | 2 +- .../components/AssignmentsHandler.java | 38 ++++++++----------- .../plugins/PublicationsRetrieverPlugin.java | 14 +++---- 5 files changed, 31 insertions(+), 38 deletions(-) diff --git a/build.gradle b/build.gradle index fa31383..6c1361f 100644 --- a/build.gradle +++ b/build.gradle @@ -1,12 +1,14 @@ plugins { - id 'org.springframework.boot' version '2.7.12' + id 'org.springframework.boot' version '2.7.13' id 'io.spring.dependency-management' version '1.1.0' id 'java' } -group = 'eu.openaire.urls_worker' -version = '2.1.0-SNAPSHOT' -sourceCompatibility = '1.8' +java { + group = 'eu.openaire.urls_worker' + version = '2.1.0-SNAPSHOT' + sourceCompatibility = '1.8' +} repositories { mavenCentral() @@ -37,7 +39,7 @@ dependencies { exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems. } - implementation group: 'com.google.guava', name: 'guava', version: '32.0.1-jre' + implementation group: 'com.google.guava', name: 'guava', version: '32.1.1-jre' // https://mvnrepository.com/artifact/com.google.code.gson/gson implementation 'com.google.code.gson:gson:2.10.1' diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index 37aef8d..62f495d 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-8.1.1-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-8.2-bin.zip networkTimeout=10000 +validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/installAndRun.sh b/installAndRun.sh index 1359d5f..eb0a1a1 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -22,7 +22,7 @@ elif [[ $# -gt 2 ]]; then echo -e "Wrong number of arguments given: ${#} (more than 2)\nPlease execute it like: script.sh "; exit 2 fi -gradleVersion="8.1.1" +gradleVersion="8.2" shouldBeCarefulWithMaxHeap=0 # This is NOT a cmd-arg. diff --git a/src/main/java/eu/openaire/urls_worker/components/AssignmentsHandler.java b/src/main/java/eu/openaire/urls_worker/components/AssignmentsHandler.java index 7229f12..d62e392 100644 --- a/src/main/java/eu/openaire/urls_worker/components/AssignmentsHandler.java +++ b/src/main/java/eu/openaire/urls_worker/components/AssignmentsHandler.java @@ -195,7 +195,7 @@ public class AssignmentsHandler { // Every time we reach a "limit" of handled id-url clear some data-structures of the underlying "PublicationsRetriever" program. // This helps with reducing the memory consumption over the period of weeks or months, and also give a 2nd chance to some domains which may be blocked due to a connectivity issues, but after a month they may be fine. - long idUrlPairsHandled = (numHandledAssignmentsBatches * this.maxAssignmentsLimitPerBatch); + long idUrlPairsHandled = (numHandledAssignmentsBatches * maxAssignmentsLimitPerBatch); if ( idUrlPairsHandled >= ((timesClearingDuplicateUrlsData +1) * idUrlsToHandleBeforeClearingDuplicateUrlsData) ) { UrlUtils.duplicateUrls.clear(); @@ -212,11 +212,11 @@ public class AssignmentsHandler { } if ( GeneralController.shouldShutdownWorker - || (numHandledAssignmentsBatches == this.maxAssignmentsBatchesToHandleBeforeShutdown) ) + || (numHandledAssignmentsBatches == maxAssignmentsBatchesToHandleBeforeShutdown) ) { logger.info("The worker will shutdown, after the full-texts are delivered to the Controller, as " + (GeneralController.shouldShutdownWorker ? "it received a \"shutdownWorker\" request!" - : "the maximum assignments-batches (" + this.maxAssignmentsBatchesToHandleBeforeShutdown + ") to be handled was reached!")); + : "the maximum assignments-batches (" + maxAssignmentsBatchesToHandleBeforeShutdown + ") to be handled was reached!")); // Here, just specify that we do not want to request for more assignments. A scheduling job will check if the fulltexts were delivered to the Controller and then shutdown the Worker. shouldNotRequestMore = true; @@ -302,13 +302,12 @@ public class AssignmentsHandler { } if ( sb != null ) { - logger.debug("Before change:\n" + sb.toString()); // DEBUG! + logger.debug("Before change:\n" + sb); // DEBUG! sb.setLength(0); // Reset it without re-sizing it. } List domains = new ArrayList<>(domainsWithAssignments.keySet()); int domainsSize = domains.size(); - Integer domainsCounter = -1; for ( int i = 0; i < assignmentsSize; ++i ) @@ -324,7 +323,7 @@ public class AssignmentsHandler { } if ( sb != null ) - logger.debug("After change:\n" + sb.toString()); + logger.debug("After change:\n" + sb); return spacedOutAssignments; } @@ -337,13 +336,9 @@ public class AssignmentsHandler { * */ public static HashMap getFirstAvailableObjectForSpacedOutDomains(List domainsList, Integer domainsCounter, HashMultimap domainsWithAssignments, int domainsSize, StringBuilder sb) { - HashMap result = new HashMap<>(); - Object nextAssignment = null; - // Normally, this method does not need a recursion-break-safety, as the initial-caller method should call this method exactly N times, where N is the number of all the values of "domainsWithAssignments". // Although, for extra-safety and re-usability, let's have this check here. - Set domainsSet = domainsWithAssignments.keySet(); - if ( domainsSet.isEmpty() ) + if ( domainsWithAssignments.keySet().isEmpty() ) return null; // Break recursion when the domains run-out. if ( domainsCounter < (domainsSize -1) ) @@ -353,22 +348,21 @@ public class AssignmentsHandler { String currentDomain = domainsList.get(domainsCounter); Set assignmentsOfCurrentDomain = domainsWithAssignments.get(currentDomain); - if ( assignmentsOfCurrentDomain.isEmpty() ) { - // This domain is out of assignments, check the next available one. - result = getFirstAvailableObjectForSpacedOutDomains(domainsList, domainsCounter, domainsWithAssignments, domainsSize, sb); - } else { - nextAssignment = assignmentsOfCurrentDomain.toArray()[0]; - result.put(nextAssignment, domainsCounter); - domainsWithAssignments.remove(currentDomain, nextAssignment); - if ( sb != null ) - sb.append(currentDomain).append("\n"); // DEBUG! - } + if ( assignmentsOfCurrentDomain.isEmpty() ) // This domain is out of assignments, check the next available one. + return getFirstAvailableObjectForSpacedOutDomains(domainsList, domainsCounter, domainsWithAssignments, domainsSize, sb); + + Object nextAssignment = assignmentsOfCurrentDomain.toArray()[0]; + HashMap result = new HashMap<>(); + result.put(nextAssignment, domainsCounter); + domainsWithAssignments.remove(currentDomain, nextAssignment); + if ( sb != null ) + sb.append(currentDomain).append("\n"); // DEBUG! return result; } - Lock fileWriteLock = new ReentrantLock(true); + private static final Lock fileWriteLock = new ReentrantLock(true); public String writeToFile(String fileFullPath, String stringToWrite, boolean shouldLockThreads) { diff --git a/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java index d9345a7..ea62442 100644 --- a/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java @@ -7,9 +7,7 @@ import eu.openaire.publications_retriever.util.http.HttpConnUtils; import eu.openaire.publications_retriever.util.url.DataToBeLogged; import eu.openaire.publications_retriever.util.url.LoaderAndChecker; import eu.openaire.publications_retriever.util.url.UrlUtils; -import eu.openaire.urls_worker.UrlsWorkerApplication; import eu.openaire.urls_worker.components.AssignmentsHandler; -import eu.openaire.urls_worker.components.ConnWithController; import eu.openaire.urls_worker.controllers.GeneralController; import eu.openaire.urls_worker.models.Assignment; import eu.openaire.urls_worker.models.Error; @@ -18,13 +16,11 @@ import eu.openaire.urls_worker.models.UrlReport; import eu.openaire.urls_worker.services.FileStorageService; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.springframework.beans.factory.annotation.Autowired; import org.springframework.beans.factory.annotation.Value; import org.springframework.stereotype.Component; import java.io.File; import java.net.CookieStore; -import java.nio.charset.StandardCharsets; import java.sql.Timestamp; import java.util.ArrayList; import java.util.Collection; @@ -39,10 +35,6 @@ public class PublicationsRetrieverPlugin { private static final Logger logger = LoggerFactory.getLogger(PublicationsRetrieverPlugin.class); - @Autowired - private ConnWithController connWithController; - - public static String assignmentsBasePath; private static String workerId; @@ -179,6 +171,9 @@ public class PublicationsRetrieverPlugin { private static final int lengthOfAlreadyDownloadedFromSourceUrlContinuedMessage = ConnSupportUtils.alreadyDownloadedFromSourceUrlContinuedMessage.length(); private static final int lengthOfAlreadyDownloadedFromIDMessage = ConnSupportUtils.alreadyDownloadedFromIDMessage.length(); + private static final String provenance = "crawl:PublicationsRetriever"; + + public static void addUrlReportsToWorkerReport(Collection assignments) { if ( FileUtils.dataToBeLoggedList.size() != assignments.size() ) { @@ -246,6 +241,7 @@ public class PublicationsRetrieverPlugin { if ( !foundAlreadyDownloadedFullText ) { String addErrorMessage = ((!foundIDUrlInWorkerReport) ? " | That ID-sourceUrl was not found inside the WorkerReport!" : " | The file was not downloaded!"); error = new Error(Error.ErrorType.couldRetry, comment + addErrorMessage); // We can still try to download it from the found docUrl, in the future. + // The "fileLocation" is null. } } else if ( ! comment.startsWith(HttpConnUtils.docFileNotRetrievedMessage, 0) ) { // If it was downloaded without an error. @@ -279,7 +275,7 @@ public class PublicationsRetrieverPlugin { // Each record will have the urlID, the datasourceID and possibly one filename, which may contain a different urlID. // The Controller will select the correct datasourceID for before adding it inside the S3-ObjectStore filename. - Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, "crawl:PublicationsRetriever", datasourceId); + Payload payload = new Payload(urlId, data.getSourceUrl(), docOrDatasetUrl, timestamp, mimeType, size, hash, fileLocation, provenance, datasourceId); // TODO - If support is added for other doc-formats other than "pdf", then make sure the "mime_type" is correctly specified. AssignmentsHandler.urlReports.add(new UrlReport(status, payload, error));