diff --git a/README.md b/README.md
index 8d8d5d3..130327c 100644
--- a/README.md
+++ b/README.md
@@ -1,9 +1,10 @@
 # UrlsWorker
-The Worker's Application, requests assignments from the [Controller](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software and downloads the available full-texts.
+The Worker's application requests assignments from the [**Controller**](https://code-repo.d4science.org/lsmyrnaios/UrlsController), processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software, and downloads the available full-texts.
Then, it posts the results to the Controller, which in turn, requests from the Worker, the full-texts which are not already found by other workers, in batches.
The Worker responds by compressing and sending the requested files, in each batch.

+Multiple instances of this app are deployed in the cloud.
We use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings very big benefits in compression rate and speed.

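The Zstandard usage described in the README ties in with the `zstd-jni` dependency bump in the `build.gradle` hunk below. As a hedged illustration only (the Worker's real batch-packaging code is not part of this diff, and the class and payload names here are made up), a minimal compress/decompress round trip with the zstd-jni byte-array API looks like this:

```java
import com.github.luben.zstd.Zstd;

import java.nio.charset.StandardCharsets;

public class ZstdRoundTripSketch {

	public static void main(String[] args) {
		// Hypothetical payload, standing in for a full-text file shipped to the Controller.
		byte[] original = "Some full-text content, compressed before being sent over the network."
				.getBytes(StandardCharsets.UTF_8);

		byte[] compressed = Zstd.compress(original, 3);	// Compression level 3 is chosen arbitrarily for this sketch.
		byte[] restored = Zstd.decompress(compressed, original.length);	// The expected decompressed size must be supplied.

		System.out.println("original = " + original.length + " bytes, compressed = " + compressed.length
				+ " bytes, round-trip OK = " + new String(restored, StandardCharsets.UTF_8)
				.equals(new String(original, StandardCharsets.UTF_8)));
	}
}
```

For larger files, zstd-jni also offers streaming wrappers (`ZstdOutputStream` / `ZstdInputStream`), which avoid holding whole payloads in memory.
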
diff --git a/build.gradle b/build.gradle
index 13ef2cf..fa31383 100644
--- a/build.gradle
+++ b/build.gradle
@@ -25,8 +25,6 @@ dependencies {
 	implementation("org.springframework.security:spring-security-web")
 	implementation("org.springframework.security:spring-security-config")
 
-	implementation 'org.projectlombok:lombok:1.18.28'
-
 	//implementation group: 'io.jsonwebtoken', name: 'jjwt-api', version: '0.11.5'	// Use this in case we use auth-tokens later on.
 
 	// Enable the validation annotations.
@@ -39,13 +37,13 @@ dependencies {
 		exclude group: 'io.minio'	// This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
 	}
 
-	implementation group: 'com.google.guava', name: 'guava', version: '32.0.0-jre'
+	implementation group: 'com.google.guava', name: 'guava', version: '32.0.1-jre'
 
 	// https://mvnrepository.com/artifact/com.google.code.gson/gson
 	implementation 'com.google.code.gson:gson:2.10.1'
 
 	implementation 'org.apache.commons:commons-compress:1.23.0'
-	implementation 'com.github.luben:zstd-jni:1.5.5-3'	// Even though this is part of the above dependency, it is needed separately as well, specifically here, in the Worker.
+	implementation 'com.github.luben:zstd-jni:1.5.5-4'	// Even though this is part of the above dependency, it is needed separately as well, specifically here, in the Worker.
 
 	testImplementation 'org.springframework.security:spring-security-test'
 	testImplementation "org.springframework.boot:spring-boot-starter-test"
diff --git a/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java
index 2757b9e..d9345a7 100644
--- a/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java
+++ b/src/main/java/eu/openaire/urls_worker/components/plugins/PublicationsRetrieverPlugin.java
@@ -163,8 +163,8 @@ public class PublicationsRetrieverPlugin {
 			logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
 
 		addUrlReportsToWorkerReport(assignments);
-		callableTasks.clear();	// Reset the thread-tasks-list for the next batch.
+		callableTasks.clear();	// Reset the thread-tasks-list for the next batch.
 
 		UrlUtils.docOrDatasetUrlsWithIDs.clear();	// This HashTable is useful only for a single assignments-batch.
 		// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
@@ -185,7 +185,7 @@ public class PublicationsRetrieverPlugin {
 			logger.warn("The number of the results (" + FileUtils.dataToBeLoggedList.size() + ") is different from the number of the given assignments (" + assignments.size() + ")!");
 		}
 		// TODO - Should any other step be taken, except from just showing the log-message?
-		// Index the UrlIds with the DatasourceIds for quick-search later.
+		// Index the UrlIds with the DatasourceIds for quick-search later. The datasourceIds are not included in the "DataToBeLogged" objects.
 		HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
 		for ( Assignment assignment : assignments )
 			urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
@@ -287,18 +287,4 @@ public class PublicationsRetrieverPlugin {
 
 		FileUtils.dataToBeLoggedList.clear();	// Empty the list, to be re-populated by the next batch / assignment.
 	}
-
-	public static boolean connectWithUrlTest(String urlToCheck) {
-		String testID = "testID";
-		try {
-			return HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false);	// Sent the < null > in quotes to avoid an NPE in the concurrent data-structures.
-		} catch (Exception e) {
-			List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e, urlToCheck);
-			String wasUrlValid = list.get(0);
-			String couldRetry = list.get(1);
-			UrlUtils.logOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
-			return false;
-		}
-	}
-
 }
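
The extended comment in the `@@ -185,7 +185,7 @@` hunk explains why the url-ID-to-datasource-ID map is built from the assignments rather than from the logged results: the `DataToBeLogged` objects do not carry datasource IDs. Below is a minimal, self-contained sketch of that indexing pattern, using hypothetical simplified stand-ins for the Worker's `Assignment` and `Datasource` classes (the real ones live in the Worker's codebase and are not shown in this diff):

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class DatasourceIndexSketch {

	// Hypothetical stand-ins for the Worker's Assignment and Datasource classes.
	record Datasource(String id) {}
	record Assignment(String id, Datasource datasource) {}

	public static void main(String[] args) {
		List<Assignment> assignments = List.of(
				new Assignment("url-id-1", new Datasource("datasource-A")),
				new Assignment("url-id-2", new Datasource("datasource-B")));

		// Index the url-IDs with the datasource-IDs for quick lookup later,
		// pre-sizing the map to the number of assignments, as the plugin does.
		Map<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
		for ( Assignment assignment : assignments )
			urlIdsWithDatasourceIds.put(assignment.id(), assignment.datasource().id());

		System.out.println(urlIdsWithDatasourceIds.get("url-id-2"));	// Prints "datasource-B".
	}
}
```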