From 90a69686cf97626a242739fca9b35c4fe8bd31bd Mon Sep 17 00:00:00 2001 From: LSmyrnaios Date: Wed, 2 Nov 2022 02:27:04 +0200 Subject: [PATCH] - When the Worker is about to shut-down, after deleting all the handled assignments' files, check for remaining full-texts in the local storage and warn the user. If no remaining files were found, then delete the parent fulltexts' directory. - Polish the code. --- README.md | 2 +- build.gradle | 4 +++ .../urls_worker/UrlsWorkerApplication.java | 1 + .../components/ScheduledTasks.java | 36 +++++++++++++++++-- .../controllers/FullTextsController.java | 4 +-- .../controllers/GeneralController.java | 4 +-- .../plugins/PublicationsRetrieverPlugin.java | 8 ++--- .../urls_worker/util/FilesZipper.java | 12 +++---- src/main/resources/application.properties | 2 ++ 9 files changed, 56 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index ca06072..7fcc352 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ The Worker's Application, requests assignments from the [Controller](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them, downloading the available full-texts.
Then, it posts the results to the Controller, which in turn, requests from the Worker, the full-texts which are not already found by other workers, in batches.
-The Worker responds by compressing and sending the requested files in each batch.
+The Worker responds by compressing and sending the requested files, in each batch.

To install and run the application: - Run ```git clone``` and then ```cd UrlsWorker```. diff --git a/build.gradle b/build.gradle index b3dd6ec..c5316e3 100644 --- a/build.gradle +++ b/build.gradle @@ -27,6 +27,10 @@ dependencies { implementation 'org.projectlombok:lombok:1.18.24' + // https://mvnrepository.com/artifact/commons-io/commons-io + implementation 'commons-io:commons-io:2.11.0' + + //implementation group: 'io.jsonwebtoken', name: 'jjwt-api', version: '0.11.5' // Use this in case we use auth-tokens later on. // Enable the validation annotations. diff --git a/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java b/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java index 89d8226..a639ed2 100644 --- a/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java +++ b/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java @@ -94,6 +94,7 @@ public class UrlsWorkerApplication { } } + ScheduledTasks.isLastTime = true; ScheduledTasks.deleteHandledAssignmentsFullTexts(); } diff --git a/src/main/java/eu/openaire/urls_worker/components/ScheduledTasks.java b/src/main/java/eu/openaire/urls_worker/components/ScheduledTasks.java index 5c1d8ec..1d37d11 100644 --- a/src/main/java/eu/openaire/urls_worker/components/ScheduledTasks.java +++ b/src/main/java/eu/openaire/urls_worker/components/ScheduledTasks.java @@ -13,8 +13,12 @@ import org.springframework.stereotype.Component; import java.io.File; import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; import java.util.Map; import java.util.Set; +import java.util.stream.Stream; @Component @@ -41,10 +45,13 @@ public class ScheduledTasks { } + public static boolean isLastTime = false; + + @Scheduled(fixedDelay = 43_200_000, initialDelay = 43_200_000) // Every 12 hours, after 12 hours from the start of this app. 
public static void deleteHandledAssignmentsFullTexts() { - Set> entrySet = FullTextsController.assignmentsNumsHandledAndLocallyDeleted.entrySet(); + Set> entrySet = FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.entrySet(); if ( entrySet.isEmpty() ) return; @@ -67,11 +74,36 @@ public class ScheduledTasks { try { FileUtils.deleteDirectory(curDir); - FullTextsController.assignmentsNumsHandledAndLocallyDeleted.put(curAssignments, true); // Set the is-handled to true. + FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.put(curAssignments, true); // Set the is-handled to true. } catch (IOException e) { logger.error("The following directory could not be deleted: " + currentAssignmentsBasePath, e); } } + + if ( isLastTime ) { // Delete the parent "assignments" directory if no files are left behind. + // In case something went wrong in the full-texts delivering to the controller, then the non-transferred files will remain in the Worker's local storage, for a future fix. + // So, delete the parent directory, only if it's empty! + + logger.info("Going to delete the parent \"" + PublicationsRetrieverPlugin.assignmentsBasePath + "\" directory."); + + boolean isAnEmptyDir = false; + try ( Stream stream = Files.list(Paths.get(PublicationsRetrieverPlugin.assignmentsBasePath)) ) { + isAnEmptyDir = ! stream.findAny().isPresent(); + } catch (IOException e) { + logger.error("Could not list the contents of the parent directory: " + PublicationsRetrieverPlugin.assignmentsBasePath, e); + return; + } + + if ( isAnEmptyDir ) { + try { + FileUtils.deleteDirectory(new File(PublicationsRetrieverPlugin.assignmentsBasePath)); + } catch (IOException e) { + logger.error("The following directory could not be deleted: " + PublicationsRetrieverPlugin.assignmentsBasePath, e); + } + } else + logger.warn("The parent directory \"" + PublicationsRetrieverPlugin.assignmentsBasePath + "\" was not empty! 
Which means there were some unhandled full-text batches!"); + } + } } diff --git a/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java b/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java index 3ce1bbb..cd89c28 100644 --- a/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java +++ b/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java @@ -25,7 +25,7 @@ public class FullTextsController { private static final Logger logger = LoggerFactory.getLogger(GeneralController.class); - public static HashMap assignmentsNumsHandledAndLocallyDeleted = new HashMap<>(); + public static HashMap assignmentsNumsHandledAndApprovedToBeDeleted = new HashMap<>(); public static String assignmentsBaseDir = null; @@ -76,7 +76,7 @@ public class FullTextsController { // If this is the last batch for this assignments-count, then make sure it is deleted in the next scheduled delete-operation. if ( zipBatchCounter == totalZipBatches ) { - assignmentsNumsHandledAndLocallyDeleted.put(assignmentsCounter, false); + assignmentsNumsHandledAndApprovedToBeDeleted.put(assignmentsCounter, false); logger.debug("Will return the " + ((totalZipBatches > 1) ? 
"last" : "only") + " batch (" + zipBatchCounter + ") of Assignments_" + assignmentsCounter + " to the Controller and these assignments will be deleted later."); } diff --git a/src/main/java/eu/openaire/urls_worker/controllers/GeneralController.java b/src/main/java/eu/openaire/urls_worker/controllers/GeneralController.java index e962a95..bf7d4f5 100644 --- a/src/main/java/eu/openaire/urls_worker/controllers/GeneralController.java +++ b/src/main/java/eu/openaire/urls_worker/controllers/GeneralController.java @@ -69,8 +69,8 @@ public class GeneralController { @GetMapping("getHandledAssignmentsCounts") public ResponseEntity getHandledAssignmentsCounts() { - List handledAssignmentsCounts = new ArrayList<>(FullTextsController.assignmentsNumsHandledAndLocallyDeleted.size()/2); - for ( Map.Entry entry : FullTextsController.assignmentsNumsHandledAndLocallyDeleted.entrySet() ) + List handledAssignmentsCounts = new ArrayList<>(FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.size()/2); + for ( Map.Entry entry : FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.entrySet() ) { if ( entry.getValue().equals(true) ) handledAssignmentsCounts.add(entry.getKey()); diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index 8bdfc4e..de3704b 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -100,7 +100,7 @@ public class PublicationsRetrieverPlugin { if ( (id == null) || id.isEmpty() || (url == null) || url.isEmpty() ) { String errorMsg = "Got null or empty pair! ID=" + id + " , url=" + url; logger.warn(errorMsg); - UrlUtils.logOutputData(id, url, null, "unreachable", "Discarded at loading time, due to input problems. 
" + errorMsg, null, true, "true", "false", "false", "false", "false", null, null); + UrlUtils.logOutputData(id, url, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null); return false; } @@ -112,7 +112,7 @@ public class PublicationsRetrieverPlugin { String sourceUrl = urlToCheck; // Hold it here for the logging-messages. if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) { logger.warn("Could not canonicalize url: " + sourceUrl); - UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null); + UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null); LoaderAndChecker.connProblematicUrls.incrementAndGet(); return false; } @@ -137,7 +137,7 @@ public class PublicationsRetrieverPlugin { List list = LoaderAndChecker.getWasValidAndCouldRetry(e); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null); + UrlUtils.logOutputData(id, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null); return false; } return true; @@ -272,7 +272,7 @@ public class PublicationsRetrieverPlugin { List list = LoaderAndChecker.getWasValidAndCouldRetry(e); String wasUrlValid = list.get(0); String couldRetry = list.get(1); - 
UrlUtils.logOutputData(testID, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null); + UrlUtils.logOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null); return false; } } diff --git a/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java b/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java index b824d1b..2d24e46 100644 --- a/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java +++ b/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java @@ -21,11 +21,11 @@ public class FilesZipper public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List filesToZip, String baseDirectory) { - String zipFilename = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip"; - // For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the controller. + String zipFileFullPath = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip"; + // For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the Controller. 
int numZippedFiles = 0; - File zipFile = new File(zipFilename); + File zipFile = new File(zipFileFullPath); try ( ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile.toPath()), StandardCharsets.UTF_8) ) { for ( String file : filesToZip ) { @@ -33,10 +33,10 @@ public class FilesZipper numZippedFiles ++; } } catch (Exception e) { - logger.error("Exception when creating the zip-file: " + zipFilename, e); + logger.error("Exception when creating the zip-file: " + zipFileFullPath, e); return null; } - logger.debug("Zipped " + numZippedFiles + " files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter); + logger.debug("Zipped " + numZippedFiles + " (out of " + filesToZip.size() + ") files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter); return zipFile; } @@ -62,7 +62,7 @@ public class FilesZipper } finally { if ( shouldCloseEntry ) { try { - zos.closeEntry(); // close the entry here (not the ZipOutputStream) + zos.closeEntry(); // close just the ZipEntry here (not the ZipOutputStream) } catch (IOException e) { logger.error("", e); } diff --git a/src/main/resources/application.properties b/src/main/resources/application.properties index edb61fd..9ddd43b 100644 --- a/src/main/resources/application.properties +++ b/src/main/resources/application.properties @@ -11,6 +11,8 @@ # HTTP CONFIGURATION server.port = 1881 +# Set the above value to < 0 >, in order to choose a random port (it will automatically choose a new random port, if the previously chosen one is already in use). + # Server api path server.servlet.context-path=/api