- Improve README.
- Update and clean up dependencies. - Code polishing.
This commit is contained in:
parent
9c897b8bf4
commit
33df46f6f5
|
@ -1,9 +1,10 @@
|
|||
# UrlsWorker
|
||||
|
||||
The Worker's Application, requests assignments from the [Controller](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software and downloads the available full-texts.<br>
|
||||
The Worker's Application requests assignments from the [**Controller**](https://code-repo.d4science.org/lsmyrnaios/UrlsController), processes them with the help of the [__PublicationsRetriever__](https://github.com/LSmyrnaios/PublicationsRetriever) software, and downloads the available full-texts.<br>
|
||||
Then, it posts the results to the Controller, which in turn requests from the Worker, in batches, the full-texts which have not already been found by other workers.<br>
|
||||
The Worker responds by compressing and sending the requested files, in each batch.<br>
|
||||
<br>
|
||||
Multiple instances of this app are deployed on the cloud.<br>
|
||||
We use Facebook's [**Zstandard**](https://facebook.github.io/zstd/) compression algorithm, which provides significant benefits in both compression rate and speed.
|
||||
<br>
|
||||
<br>
|
||||
|
|
|
@ -25,8 +25,6 @@ dependencies {
|
|||
implementation("org.springframework.security:spring-security-web")
|
||||
implementation("org.springframework.security:spring-security-config")
|
||||
|
||||
implementation 'org.projectlombok:lombok:1.18.28'
|
||||
|
||||
//implementation group: 'io.jsonwebtoken', name: 'jjwt-api', version: '0.11.5' // Use this in case we use auth-tokens later on.
|
||||
|
||||
// Enable the validation annotations.
|
||||
|
@ -39,13 +37,13 @@ dependencies {
|
|||
exclude group: 'io.minio' // This is not used in the Worker, since it's the Controller which uploads the full-texts to S3. It also includes an older "commons-compress" version which causes problems.
|
||||
}
|
||||
|
||||
implementation group: 'com.google.guava', name: 'guava', version: '32.0.0-jre'
|
||||
implementation group: 'com.google.guava', name: 'guava', version: '32.0.1-jre'
|
||||
|
||||
// https://mvnrepository.com/artifact/com.google.code.gson/gson
|
||||
implementation 'com.google.code.gson:gson:2.10.1'
|
||||
|
||||
implementation 'org.apache.commons:commons-compress:1.23.0'
|
||||
implementation 'com.github.luben:zstd-jni:1.5.5-3' // Even though this is part of the above dependency, it is needed separately as well, specifically here, in the Worker.
|
||||
implementation 'com.github.luben:zstd-jni:1.5.5-4' // Even though this is part of the above dependency, it is needed separately as well, specifically here, in the Worker.
|
||||
|
||||
testImplementation 'org.springframework.security:spring-security-test'
|
||||
testImplementation "org.springframework.boot:spring-boot-starter-test"
|
||||
|
|
|
@ -163,8 +163,8 @@ public class PublicationsRetrieverPlugin {
|
|||
logger.warn(numFailedTasks + " tasks failed, from assignments_" + assignmentRequestCounter);
|
||||
|
||||
addUrlReportsToWorkerReport(assignments);
|
||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||
|
||||
callableTasks.clear(); // Reset the thread-tasks-list for the next batch.
|
||||
UrlUtils.docOrDatasetUrlsWithIDs.clear(); // This HashTable is useful only for a single assignments-batch.
|
||||
// In the next batch, the previously stored files might have been already uploaded by the Controller and deleted by the worker. Also, they will be stored in a different directory anyway.
|
||||
|
||||
|
@ -185,7 +185,7 @@ public class PublicationsRetrieverPlugin {
|
|||
logger.warn("The number of the results (" + FileUtils.dataToBeLoggedList.size() + ") is different from the number of the given assignments (" + assignments.size() + ")!");
|
||||
} // TODO - Should any other step be taken, except from just showing the log-message?
|
||||
|
||||
// Index the UrlIds with the DatasourceIds for quick-search later.
|
||||
// Index the UrlIds with the DatasourceIds for quick-search later. The datasourceIds are not included in the "DataToBeLogged" objects.
|
||||
HashMap<String, String> urlIdsWithDatasourceIds = new HashMap<>(assignments.size());
|
||||
for ( Assignment assignment : assignments )
|
||||
urlIdsWithDatasourceIds.put(assignment.getId(), assignment.getDatasource().getId());
|
||||
|
@ -287,18 +287,4 @@ public class PublicationsRetrieverPlugin {
|
|||
FileUtils.dataToBeLoggedList.clear(); // Empty the list, to be re-populated by the next batch / assignment.
|
||||
}
|
||||
|
||||
|
||||
/**
 * Tries to connect to the given url, using a fixed test-ID, and reports whether the check succeeded.
 * <p>
 * The call is delegated to {@code HttpConnUtils.connectAndCheckMimeType()} and its result is returned as is.
 * If any exception is thrown, a record is logged through {@code UrlUtils.logOutputData()} and {@code false} is returned.
 *
 * @param urlToCheck the url to test; it is passed as three consecutive arguments of the connection-call
 *                   (presumably the different "stages" of the same url; verify against {@code HttpConnUtils}).
 * @return the result of {@code HttpConnUtils.connectAndCheckMimeType()}, or {@code false} if an exception was thrown.
 */
public static boolean connectWithUrlTest(String urlToCheck) {
|
||||
String testID = "testID";	// Fixed placeholder ID, used both for the connection-call and for the logged record.
|
||||
try {
|
||||
return HttpConnUtils.connectAndCheckMimeType(testID, urlToCheck, urlToCheck, urlToCheck, null, true, false); // NOTE(review): the original comment said to send the < null > in quotes to avoid an NPE in the concurrent data-structures, but a bare null is passed here. Confirm which one is intended.
|
||||
} catch (Exception e) {
|
||||
// Ask the LoaderAndChecker for the two string-values to be logged for this failure.
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e, urlToCheck);
|
||||
String wasUrlValid = list.get(0);
|
||||
String couldRetry = list.get(1);
|
||||
// Log the url as unreachable, with a fixed error-message and the two values retrieved above.
UrlUtils.logOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue