- When the Worker is about to shut down, after deleting all the handled assignments' files, check for remaining full-texts in the local storage and warn the user. If no remaining files are found, then delete the parent full-texts directory.

- Polish the code.
This commit is contained in:
Lampros Smyrnaios 2022-11-02 02:27:04 +02:00
parent 6450a4b8ac
commit 90a69686cf
9 changed files with 56 additions and 17 deletions

View File

@ -2,7 +2,7 @@
The Worker's Application, requests assignments from the [Controller](https://code-repo.d4science.org/lsmyrnaios/UrlsController) and processes them, downloading the available full-texts.<br>
Then, it posts the results to the Controller, which in turn, requests from the Worker, the full-texts which are not already found by other workers, in batches.<br>
The Worker responds by compressing and sending the requested files in each batch.<br>
The Worker responds by compressing and sending the requested files, in each batch.<br>
<br>
To install and run the application:
- Run ```git clone``` and then ```cd UrlsWorker```.

View File

@ -27,6 +27,10 @@ dependencies {
implementation 'org.projectlombok:lombok:1.18.24'
// https://mvnrepository.com/artifact/commons-io/commons-io
implementation 'commons-io:commons-io:2.11.0'
//implementation group: 'io.jsonwebtoken', name: 'jjwt-api', version: '0.11.5' // Use this in case we use auth-tokens later on.
// Enable the validation annotations.

View File

@ -94,6 +94,7 @@ public class UrlsWorkerApplication {
}
}
ScheduledTasks.isLastTime = true;
ScheduledTasks.deleteHandledAssignmentsFullTexts();
}

View File

@ -13,8 +13,12 @@ import org.springframework.stereotype.Component;
import java.io.File;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
@Component
@ -41,10 +45,13 @@ public class ScheduledTasks {
}
public static boolean isLastTime = false;
@Scheduled(fixedDelay = 43_200_000, initialDelay = 43_200_000) // Every 12 hours, after 12 hours from the start of this app.
public static void deleteHandledAssignmentsFullTexts()
{
Set<Map.Entry<Long, Boolean>> entrySet = FullTextsController.assignmentsNumsHandledAndLocallyDeleted.entrySet();
Set<Map.Entry<Long, Boolean>> entrySet = FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.entrySet();
if ( entrySet.isEmpty() )
return;
@ -67,11 +74,36 @@ public class ScheduledTasks {
try {
FileUtils.deleteDirectory(curDir);
FullTextsController.assignmentsNumsHandledAndLocallyDeleted.put(curAssignments, true); // Set the is-handled to true.
FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.put(curAssignments, true); // Set the is-handled to true.
} catch (IOException e) {
logger.error("The following directory could not be deleted: " + currentAssignmentsBasePath, e);
}
}
if ( isLastTime ) { // Delete the parent "assignments" directory if no files are left behind.
// In case something went wrong in the full-texts delivering to the controller, then the non-transferred files will remain in the Worker's local storage, for future fix.
// So, delete the parent directory, only if it's empty!
logger.info("Going to delete the parent \"" + PublicationsRetrieverPlugin.assignmentsBasePath + "\" directory.");
boolean isAnEmptyDir = false;
try ( Stream<Path> stream = Files.list(Paths.get(PublicationsRetrieverPlugin.assignmentsBasePath)) ) {
isAnEmptyDir = ! stream.findAny().isPresent();
} catch (IOException e) {
logger.error("Could not list the contents of the parent directory: " + PublicationsRetrieverPlugin.assignmentsBasePath, e);
return;
}
if ( isAnEmptyDir ) {
try {
FileUtils.deleteDirectory(new File(PublicationsRetrieverPlugin.assignmentsBasePath));
} catch (IOException e) {
logger.error("The following directory could not be deleted: " + PublicationsRetrieverPlugin.assignmentsBasePath, e);
}
} else
logger.warn("The parent directory \"" + PublicationsRetrieverPlugin.assignmentsBasePath + "\" was not empty! Which means there were some unhandled full-text batches!");
}
}
}

View File

@ -25,7 +25,7 @@ public class FullTextsController {
private static final Logger logger = LoggerFactory.getLogger(GeneralController.class);
public static HashMap<Long, Boolean> assignmentsNumsHandledAndLocallyDeleted = new HashMap<>();
public static HashMap<Long, Boolean> assignmentsNumsHandledAndApprovedToBeDeleted = new HashMap<>();
public static String assignmentsBaseDir = null;
@ -76,7 +76,7 @@ public class FullTextsController {
// If this is the last batch for this assignments-count, then make sure it is deleted in the next scheduled delete-operation.
if ( zipBatchCounter == totalZipBatches ) {
assignmentsNumsHandledAndLocallyDeleted.put(assignmentsCounter, false);
assignmentsNumsHandledAndApprovedToBeDeleted.put(assignmentsCounter, false);
logger.debug("Will return the " + ((totalZipBatches > 1) ? "last" : "only") + " batch (" + zipBatchCounter + ") of Assignments_" + assignmentsCounter + " to the Controller and these assignments will be deleted later.");
}

View File

@ -69,8 +69,8 @@ public class GeneralController {
@GetMapping("getHandledAssignmentsCounts")
public ResponseEntity<?> getHandledAssignmentsCounts()
{
List<Long> handledAssignmentsCounts = new ArrayList<>(FullTextsController.assignmentsNumsHandledAndLocallyDeleted.size()/2);
for ( Map.Entry<Long,Boolean> entry : FullTextsController.assignmentsNumsHandledAndLocallyDeleted.entrySet() )
List<Long> handledAssignmentsCounts = new ArrayList<>(FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.size()/2);
for ( Map.Entry<Long,Boolean> entry : FullTextsController.assignmentsNumsHandledAndApprovedToBeDeleted.entrySet() )
{
if ( entry.getValue().equals(true) )
handledAssignmentsCounts.add(entry.getKey());

View File

@ -100,7 +100,7 @@ public class PublicationsRetrieverPlugin {
if ( (id == null) || id.isEmpty() || (url == null) || url.isEmpty() ) {
String errorMsg = "Got null or empty pair! ID=" + id + " , url=" + url;
logger.warn(errorMsg);
UrlUtils.logOutputData(id, url, null, "unreachable", "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null);
UrlUtils.logOutputData(id, url, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to input problems. " + errorMsg, null, true, "true", "false", "false", "false", "false", null, null);
return false;
}
@ -112,7 +112,7 @@ public class PublicationsRetrieverPlugin {
String sourceUrl = urlToCheck; // Hold it here for the logging-messages.
if ( !sourceUrl.contains("#/") && (urlToCheck = URLCanonicalizer.getCanonicalURL(sourceUrl, null, StandardCharsets.UTF_8)) == null ) {
logger.warn("Could not canonicalize url: " + sourceUrl);
UrlUtils.logOutputData(id, sourceUrl, null, "unreachable", "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
UrlUtils.logOutputData(id, sourceUrl, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to canonicalization's problems.", null, true, "true", "false", "false", "false", "false", null, null);
LoaderAndChecker.connProblematicUrls.incrementAndGet();
return false;
}
@ -137,7 +137,7 @@ public class PublicationsRetrieverPlugin {
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
UrlUtils.logOutputData(id, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
UrlUtils.logOutputData(id, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
return false;
}
return true;
@ -272,7 +272,7 @@ public class PublicationsRetrieverPlugin {
List<String> list = LoaderAndChecker.getWasValidAndCouldRetry(e);
String wasUrlValid = list.get(0);
String couldRetry = list.get(1);
UrlUtils.logOutputData(testID, urlToCheck, null, "unreachable", "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
UrlUtils.logOutputData(testID, urlToCheck, null, UrlUtils.unreachableDocOrDatasetUrlIndicator, "Discarded at loading time, due to connectivity problems.", null, true, "true", wasUrlValid, "false", "false", couldRetry, null, null);
return false;
}
}

View File

@ -21,11 +21,11 @@ public class FilesZipper
public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List<String> filesToZip, String baseDirectory)
{
String zipFilename = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip";
// For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the controller.
String zipFileFullPath = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip";
// For example: assignments_2_full-texts_4.zip | where < 4 > refers to the 4th batch of files requested by the Controller.
int numZippedFiles = 0;
File zipFile = new File(zipFilename);
File zipFile = new File(zipFileFullPath);
try ( ZipOutputStream zos = new ZipOutputStream(Files.newOutputStream(zipFile.toPath()), StandardCharsets.UTF_8) )
{
for ( String file : filesToZip ) {
@ -33,10 +33,10 @@ public class FilesZipper
numZippedFiles ++;
}
} catch (Exception e) {
logger.error("Exception when creating the zip-file: " + zipFilename, e);
logger.error("Exception when creating the zip-file: " + zipFileFullPath, e);
return null;
}
logger.debug("Zipped " + numZippedFiles + " files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter);
logger.debug("Zipped " + numZippedFiles + " (out of " + filesToZip.size() + ") files for assignments_" + assignmentsCounter + ", batch_" + zipBatchCounter);
return zipFile;
}
@ -62,7 +62,7 @@ public class FilesZipper
} finally {
if ( shouldCloseEntry ) {
try {
zos.closeEntry(); // close the entry here (not the ZipOutputStream)
zos.closeEntry(); // close just the ZipEntry here (not the ZipOutputStream)
} catch (IOException e) {
logger.error("", e);
}

View File

@ -11,6 +11,8 @@
# HTTP CONFIGURATION
server.port = 1881
# Set the above value to < 0 >, in order to choose a random port (it will automatically choose a new random port, if the previously chosen one is already in use).
# Server api path
server.servlet.context-path=/api