diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties index e750102..84d1f85 100644 --- a/gradle/wrapper/gradle-wrapper.properties +++ b/gradle/wrapper/gradle-wrapper.properties @@ -1,5 +1,5 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionUrl=https\://services.gradle.org/distributions/gradle-7.3-bin.zip +distributionUrl=https\://services.gradle.org/distributions/gradle-7.3.1-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/installAndRun.sh b/installAndRun.sh index 9369275..4966cca 100755 --- a/installAndRun.sh +++ b/installAndRun.sh @@ -25,7 +25,7 @@ if [[ ! -f $inputDataFile ]]; then echo -e "\n\n" fi -gradleVersion="7.3" +gradleVersion="7.3.1" if [[ justInstall -eq 0 ]]; then diff --git a/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java b/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java index 14e9ac2..7bee1cc 100644 --- a/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java +++ b/src/main/java/eu/openaire/urls_worker/UrlsWorkerApplication.java @@ -43,6 +43,11 @@ public class UrlsWorkerApplication { new PublicationsRetrieverPlugin(); SpringApplication.run(UrlsWorkerApplication.class, args); + + Runtime javaRuntime = Runtime.getRuntime(); + logger.debug("HeapSize: " + javaRuntime.totalMemory()); + logger.debug("HeapMaxSize: " + javaRuntime.maxMemory()); + logger.debug("HeapFreeSize: " + javaRuntime.freeMemory()); } diff --git a/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java b/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java index 95b7da7..8a97663 100644 --- a/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java +++ b/src/main/java/eu/openaire/urls_worker/controllers/FullTextsController.java @@ -42,10 +42,24 @@ public class FullTextsController { @GetMapping("getFullTexts/{assignmentsCounter:[\\d]+}/{totalZipBatches:[\\d]+}/{zipBatchCounter:[\\d]+}/{fileNamesWithExtensions}") public ResponseEntity getMultipleFullTexts(@PathVariable long assignmentsCounter, @PathVariable int totalZipBatches, @PathVariable int zipBatchCounter, @PathVariable List fileNamesWithExtensions, HttpServletRequest request) { - logger.info("Received a \"getMultipleFullTexts\" request for returning a zip-file containing " + fileNamesWithExtensions.size() + " full-texts, from assignments-" + assignmentsCounter + ", for batch-" + zipBatchCounter); + int fileNamesListNum = fileNamesWithExtensions.size(); + if ( (fileNamesListNum == 1) && (fileNamesWithExtensions.get(0).length() == 0) ) { // In case the last "/" in the url was given, then this list will not be empty, but have one empty item instead. + // In case the url does not end in "/", then Spring will automatically return an "HTTP-BadRequest". + String errorMsg = "An empty \"fileNamesWithExtensions\" list was given from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter; + logger.warn(errorMsg); + return ResponseEntity.badRequest().body(errorMsg); + } + + logger.info("Received a \"getMultipleFullTexts\" request for returning a zip-file containing " + fileNamesListNum + " full-texts, from assignments_" + assignmentsCounter + ", for batch_" + zipBatchCounter); String currentAssignmentsBaseFullTextsPath = assignmentsBaseDir + "assignments_" + assignmentsCounter + "_fullTexts" + File.separator; + if ( ! (new File(currentAssignmentsBaseFullTextsPath).isDirectory()) ) { + String errorMsg = "The base directory for assignments_" + assignmentsCounter + " was not found: " + currentAssignmentsBaseFullTextsPath; + logger.error(errorMsg); + return ResponseEntity.badRequest().body(errorMsg); + } + File zipFile = FilesZipper.zipMultipleFilesAndGetZip(assignmentsCounter, zipBatchCounter, fileNamesWithExtensions, currentAssignmentsBaseFullTextsPath); if ( zipFile == null ) { String errorMsg = "Failed to create the zip file for \"zipBatchCounter\"-" + zipBatchCounter; diff --git a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java index d6d716f..8065a2e 100644 --- a/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java +++ b/src/main/java/eu/openaire/urls_worker/plugins/PublicationsRetrieverPlugin.java @@ -59,23 +59,25 @@ public class PublicationsRetrieverPlugin { private static final List> callableTasks = new ArrayList<>(FileUtils.jsonBatchSize); - public static void processAssignments(Long assignmentRequestCounter, Collection assignments) throws RuntimeException, FileNotFoundException + public static void processAssignments(Long assignmentRequestCounter, Collection assignments) throws RuntimeException { FileUtils.storeDocFilesDir = assignmentsBasePath + "assignments_" + assignmentRequestCounter + "_fullTexts" + File.separator; // It needs the last separator, because of how the docFiles are named and stored. File curAssignmentsDirs = new File(FileUtils.storeDocFilesDir); - if ( !curAssignmentsDirs.exists() ) { - if ( !curAssignmentsDirs.mkdirs() ) { // Create the directories. - String workingDir = System.getProperty("user.dir") + File.separator; - logger.error("Could not create the \"assignments_fullTexts directories\": \"" + FileUtils.storeDocFilesDir + "\". Using the \"workingDir\" instead (" + workingDir + ")."); - FileUtils.storeDocFilesDir = assignmentsBasePath = workingDir; + try { + if ( !curAssignmentsDirs.exists() ) { + if ( !curAssignmentsDirs.mkdirs() ) { // Create the directories. + String workingDir = System.getProperty("user.dir") + File.separator; + logger.error("Could not create the \"assignments_fullTexts directories\": \"" + FileUtils.storeDocFilesDir + "\". Using the \"workingDir\" instead (" + workingDir + ")."); + FileUtils.storeDocFilesDir = assignmentsBasePath = workingDir; + } } + } catch (Exception e) { + String errorMsg = "Failed to create the full-texts directory for assignments_" + assignmentRequestCounter; + logger.error(errorMsg, e); + throw new RuntimeException(errorMsg + ": " + e.getMessage()); } - int tasksNumber = assignments.size(); - int batchCount = 0; - int tasksCount = 0; - // Start loading and checking urls. for ( Assignment assignment : assignments ) { @@ -121,17 +123,13 @@ public class PublicationsRetrieverPlugin { } return true; }); + } - // Invoke the tasks every time we reach the "jsonBatchSize" tasks, or we are at the end of the list. - tasksCount ++; - if ( (tasksCount == FileUtils.jsonBatchSize) || (tasksCount == tasksNumber) ) - { - logger.info("Batch counter: " + (++batchCount) + " | progress: " + PublicationsRetriever.df.format((batchCount * tasksCount) * 100.0 / tasksNumber) + "% | every batch contains " + FileUtils.jsonBatchSize + " id-url pairs."); - LoaderAndChecker.invokeAllTasksAndWait(callableTasks); - addUrlReportsToWorkerReport(); - callableTasks.clear(); // Reset the thread-tasks-list for the next batch. - } - }// end tasks-for-loop + int numFailedTasks = LoaderAndChecker.invokeAllTasksAndWait(callableTasks); + if ( numFailedTasks > 0 ) + logger.warn(numFailedTasks + " tasks failed!"); + addUrlReportsToWorkerReport(); + callableTasks.clear(); // Reset the thread-tasks-list for the next batch. } diff --git a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java index 380edd5..f35b54a 100644 --- a/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java +++ b/src/main/java/eu/openaire/urls_worker/util/AssignmentsHandler.java @@ -103,8 +103,8 @@ public class AssignmentsHandler { try { PublicationsRetrieverPlugin.processAssignments(assignmentRequestCounter, assignmentsForPlugins.values()); } catch (Exception e) { - logger.error(e.getMessage(), e); - } + logger.error("Exception when processing the assignments_" + assignmentRequestCounter, e); + } // In this case, we will either have an empty WorkerReport or a half-filled one. Either way, we want to report back to the Controller. if ( askForTest ) { logger.debug("UrlReports:"); // DEBUG! @@ -115,9 +115,10 @@ public class AssignmentsHandler { postWorkerReport(assignmentRequestCounter); isAvailableForWork = true; // State this after posting, to avoid breaking the "UrlReports" in the current or the next run. + // Also, since the worker has limited resources, it's better to finish sending the full-texts first and then request a new batch of assignments. // Note: Cannot call this method here retrospectively, as if it runs 100s of times, the memory-stack may break.. - // The scheduler will handle calling it every half an hour, in case the Worker is available for work.. + // The scheduler will handle calling it every 15 mins, in case the Worker is available for work.. } diff --git a/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java b/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java index c01c050..a1c6975 100644 --- a/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java +++ b/src/main/java/eu/openaire/urls_worker/util/FilesZipper.java @@ -4,6 +4,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.*; +import java.nio.charset.StandardCharsets; import java.util.List; import java.util.zip.ZipEntry; import java.util.zip.ZipOutputStream; @@ -16,47 +17,35 @@ public class FilesZipper public static File zipMultipleFilesAndGetZip(long assignmentsCounter, int zipBatchCounter, List filesToZip, String baseDirectory) { - File zipFile = null; - ZipOutputStream zos = null; - try { - String zipFilename = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip"; - // For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the controller. - zipFile = new File(zipFilename); - zos = new ZipOutputStream(new FileOutputStream(zipFile)); + String zipFilename = baseDirectory + "assignments_" + assignmentsCounter + "_full-texts_" + zipBatchCounter + ".zip"; + // For example: assignments_2_full-texts_4.zip | where < 4 > is referred to the 4th batch of files requested by the controller. - // Iterate over the given full-texts and add them to the zip. - for ( String file : filesToZip ) - { + File zipFile = new File(zipFilename); + try ( ZipOutputStream zos = new ZipOutputStream(new FileOutputStream(zipFile), StandardCharsets.UTF_8) ) + { + for ( String file : filesToZip ) { zipAFile(file, zos, baseDirectory); } } catch (Exception e) { - logger.error("", e); + logger.error("Exception when creating the zip-file: " + zipFilename, e); return null; - } finally { - try { - if ( zos != null ) - zos.close(); - } catch (IOException e) { - logger.error(e.getMessage(), e); - } } return zipFile; } - private static boolean zipAFile(String fileName, ZipOutputStream zos, String baseDir) + private static final int BUFFER_SIZE = 3145728; // 3MB (average fullText-size) + private static final byte[] dataBuffer = new byte[BUFFER_SIZE]; + + // This method is "synchronized" to avoid any future problems with shared-buffer, if the requests are asynchronous. + private static synchronized boolean zipAFile(String fileName, ZipOutputStream zos, String baseDir) { - final int BUFFER = 1048576; // 1 MB - byte[] data = new byte[BUFFER]; - BufferedInputStream bis = null; String fullFileName = baseDir + fileName; - try { - FileInputStream fis = new FileInputStream(fullFileName); - bis = new BufferedInputStream(fis, BUFFER); + try ( BufferedInputStream bis = new BufferedInputStream(new FileInputStream(fullFileName), BUFFER_SIZE) ) { zos.putNextEntry(new ZipEntry(fileName)); int count; - while ( (count = bis.read(data, 0, BUFFER)) != -1 ) { - zos.write(data, 0, count); + while ( (count = bis.read(dataBuffer, 0, BUFFER_SIZE)) != -1 ) { + zos.write(dataBuffer, 0, count); } zos.closeEntry(); // close the entry here (not the ZipOutputStream) } catch (FileNotFoundException fnfe) { @@ -66,13 +55,6 @@ public class FilesZipper if ( ! e.getMessage().contains("duplicate") ) logger.error("Error zipping file: " + fullFileName, e); return false; - } finally { - try { - if ( bis != null ) - bis.close(); - } catch (IOException e) { - logger.error(e.getMessage(), e); - } } return true; } diff --git a/src/main/resources/logback-spring.xml b/src/main/resources/logback-spring.xml index 59ee12e..23d6069 100644 --- a/src/main/resources/logback-spring.xml +++ b/src/main/resources/logback-spring.xml @@ -1,10 +1,12 @@ - + logs/UrlsWorker.log logs/UrlsWorker.%i.log.zip + 1 + 20 @@ -24,7 +26,7 @@ - + \ No newline at end of file