- Improve README.

- Update and clean up dependencies.
- Code polishing.
This commit is contained in:
Lampros Smyrnaios 2023-06-22 12:47:36 +03:00
parent 9c897b8bf4
commit 33df46f6f5
3 changed files with 6 additions and 21 deletions

View File

@ -1,9 +1,10 @@
# UrlsWorker
The Worker's Application, requests assignments from the [Controller](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software and downloads the available full-texts.<br>
The Worker application requests assignments from the [**Controller**](https://code-repo.d4science.org/lsmyrnaios/UrlsController), processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software, and downloads the available full-texts.<br>
Then, it posts the results to the Controller, which in turn requests from the Worker, in batches, the full-texts that have not already been found by other workers.<br>
The Worker responds by compressing and sending the requested files, in each batch.<br>
<br>
Multiple instances of this app are deployed on the cloud.<br>
We use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which brings significant benefits in compression rate and speed.
<br>
<br>

View File

@ -25,8 +25,6 @@ dependencies {
implementation("org.springframework.security:spring-security-web")
implementation("org.springframework.security:spring-security-config")
implementation 'org.projectlombok:lombok:1.18.28'
//implementation group: 'io.jsonwebtoken', name: 'jjwt-api', version: '0.11.5' // Use this in case we use auth-tokens later on.
// Enable the validation annotations.
@ -39,13 +37,13 @@ dependencies {
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
}
implementation group: 'com.google.guava', name: 'guava', version: '32.0.0-jre'
implementation group: 'com.google.guava', name: 'guava', version: '32.0.1-jre'
// https://mvnrepository.com/artifact/com.google.code.gson/gson
implementation 'com.google.code.gson:gson:2.10.1'
implementation 'org.apache.commons:commons-compress:1.23.0'
implementation 'com.github.luben:zstd-jni:1.5.5-3' // Even though this is part of the above dependency, it is needed separately as well, specifically here, in the Worker.
implementation 'com.github.luben:zstd-jni:1.5.5-4' // Even though this is part of the above dependency, it is needed separately as well, specifically here, in the Worker.
testImplementation 'org.springframework.security:spring-security-test'
testImplementation "org.springframework.boot:spring-boot-starter-test"

View File

@ -163,8 +163,8 @@ public class PublicationsRetrieverPlugin {
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
addUrlReportsToWorkerReport(assignments);
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
@ -185,7 +185,7 @@ public class PublicationsRetrieverPlugin {
logger.warn("The number of the results (" + FileUtils.dataToBeLoggedList.size() + ") is different from the number of the given assignments (" + assignments.size() + ")!");
} // TODO - Should any other step be taken, except from just showing the log-message?
// Index the UrlIds with the DatasourceIds for quick-search later.
// Index the UrlIds with the DatasourceIds for quick-search later. The datasourceIds are not included in the "DataToBeLogged" objects.
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
for ( Assignment assignment : assignments )
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
@ -287,18 +287,4 @@ public class PublicationsRetrieverPlugin {
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
}
/**
 * Runs a connectivity test against the given url, using a fixed test-ID.
 * <p>
 * The url is handed to {@code HttpConnUtils.connectAndCheckMimeType()} and the outcome of that call is returned.
 * If that call throws any exception, a record for the url is logged through {@code UrlUtils.logOutputData()}
 * (with the "unreachable" indicator and the validity / retry flags given by
 * {@code LoaderAndChecker.getWasValidAndCouldRetry()}) and the test is considered failed.
 *
 * @param urlToCheck the url to test; it is passed as each of the three url-arguments of the connection-call.
 * @return the value returned by the connection-call, or {@code false} if an exception was thrown.
 */
public static boolean connectWithUrlTest(String urlToCheck) {
	final String testID = "testID";
	boolean connectionResult;
	try {
		// NOTE(review): the previous comment here advised sending the < null > in quotes, to avoid an NPE in the concurrent data-structures,
		// yet a literal null is what is actually passed as the 5th argument. Confirm which one is intended.
		connectionResult = HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false);
	} catch (Exception e) {
		// The helper returns the two flags as Strings: index 0 --> "wasUrlValid", index 1 --> "couldRetry".
		List<String> validityAndRetryFlags = LoaderAndChecker.getWasValidAndCouldRetry(e, urlToCheck);
		String urlValidityFlag = validityAndRetryFlags.get(0);
		String retryFlag = validityAndRetryFlags.get(1);
		UrlUtils.logOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", urlValidityFlag, "false", "false", retryFlag, null, null);
		connectionResult = false;
	}
	return connectionResult;
}
}