205 lines
10 KiB
Java
205 lines
10 KiB
Java
|
package eu.openaire.urls_controller.controllers;
|
||
|
|
||
|
import eu.openaire.urls_controller.components.BulkImport;
|
||
|
import eu.openaire.urls_controller.models.BulkImportReport;
|
||
|
import eu.openaire.urls_controller.services.FullTextsService;
|
||
|
import eu.openaire.urls_controller.services.FullTextsServiceImpl;
|
||
|
import eu.openaire.urls_controller.util.FileUtils;
|
||
|
import eu.openaire.urls_controller.util.GenericUtils;
|
||
|
import org.slf4j.Logger;
|
||
|
import org.slf4j.LoggerFactory;
|
||
|
import org.springframework.beans.factory.annotation.Autowired;
|
||
|
import org.springframework.http.HttpStatus;
|
||
|
import org.springframework.http.ResponseEntity;
|
||
|
import org.springframework.web.bind.annotation.GetMapping;
|
||
|
import org.springframework.web.bind.annotation.RequestMapping;
|
||
|
import org.springframework.web.bind.annotation.RequestParam;
|
||
|
import org.springframework.web.bind.annotation.RestController;
|
||
|
|
||
|
import java.io.BufferedReader;
|
||
|
import java.io.File;
|
||
|
import java.io.InputStreamReader;
|
||
|
import java.nio.file.*;
|
||
|
import java.util.Collections;
|
||
|
import java.util.HashMap;
|
||
|
import java.util.Set;
|
||
|
import java.util.concurrent.ConcurrentHashMap;
|
||
|
import java.util.regex.Matcher;
|
||
|
import java.util.regex.Pattern;
|
||
|
|
||
|
@RestController
|
||
|
@RequestMapping("")
|
||
|
public class FullTextsController {
|
||
|
|
||
|
private static final Logger logger = LoggerFactory.getLogger(FullTextsController.class);
|
||
|
|
||
|
@Autowired
|
||
|
private FileUtils fileUtils;
|
||
|
|
||
|
private final FullTextsService fullTextsService;
|
||
|
|
||
|
private final String baseBulkImportLocation;
|
||
|
|
||
|
private final String bulkImportReportLocation;
|
||
|
|
||
|
private final HashMap<String, BulkImport.BulkImportSource> bulkImportSources;
|
||
|
|
||
|
public static final Set<String> bulkImportDirs = Collections.newSetFromMap(new ConcurrentHashMap<String, Boolean>());
|
||
|
|
||
|
|
||
|
|
||
|
public FullTextsController(FullTextsService fullTextsService, BulkImport bulkImport)
|
||
|
{
|
||
|
String bulkImportReportLocation1;
|
||
|
this.baseBulkImportLocation = bulkImport.getBaseBulkImportLocation();
|
||
|
|
||
|
this.bulkImportSources = new HashMap<>(bulkImport.getBulkImportSources());
|
||
|
|
||
|
bulkImportReportLocation1 = bulkImport.getBulkImportReportLocation();
|
||
|
if ( !bulkImportReportLocation1.endsWith("/") )
|
||
|
bulkImportReportLocation1 += "/";
|
||
|
this.bulkImportReportLocation = bulkImportReportLocation1;
|
||
|
|
||
|
this.fullTextsService = fullTextsService;
|
||
|
}
|
||
|
|
||
|
|
||
|
private static final Pattern LAST_DIR_REGEX = Pattern.compile("^.*/([^/]+[/]?)$");
|
||
|
|
||
|
@GetMapping("bulkImportFullTexts")
|
||
|
public ResponseEntity<?> bulkImportFullTexts(@RequestParam String provenance, @RequestParam String bulkImportDir, @RequestParam boolean shouldDeleteFilesOnFinish) {
|
||
|
|
||
|
BulkImport.BulkImportSource bulkImportSource = bulkImportSources.get(provenance);
|
||
|
if ( bulkImportSource == null ) {
|
||
|
String errorMsg = "The provided provenance \"" + provenance + "\" is not in the list of the bulk-imported sources, so no configuration-rules are available!";
|
||
|
logger.error(errorMsg);
|
||
|
return ResponseEntity.badRequest().body(errorMsg); // It's the user's fault that gave an unsupported provenance.
|
||
|
}
|
||
|
|
||
|
// Check if the given directory parameter exists.
|
||
|
if ( bulkImportDir.isEmpty() ) {
|
||
|
String errorMsg = "The \"bulkImportDir\" was missing from the request!";
|
||
|
logger.error(errorMsg);
|
||
|
return ResponseEntity.badRequest().body(errorMsg);
|
||
|
}
|
||
|
|
||
|
String givenBulkDir = bulkImportDir; // Keep the given value here, to not expose the full-path, in case the user has not provided an absolut path.
|
||
|
|
||
|
// Make sure the whole path ends with "/", so that we can easily append file-names later.
|
||
|
if ( !bulkImportDir.endsWith("/") )
|
||
|
bulkImportDir += "/";
|
||
|
|
||
|
String relativeBulkImportDir = null;
|
||
|
|
||
|
// Check if we have "relative-path" so that we can append it to the "baseBulkImportLocation".
|
||
|
if ( !bulkImportDir.startsWith("/") ) {
|
||
|
// A relative path was given.
|
||
|
relativeBulkImportDir = bulkImportDir;
|
||
|
bulkImportDir = baseBulkImportLocation + bulkImportDir;
|
||
|
} else {
|
||
|
String errMsg = "The bulkImportDir \"" + bulkImportDir + "\" was problematic!";
|
||
|
Matcher matcher = LAST_DIR_REGEX.matcher(bulkImportDir);
|
||
|
if ( !matcher.matches() ) {
|
||
|
logger.error(errMsg);
|
||
|
return ResponseEntity.badRequest().body(errMsg);
|
||
|
}
|
||
|
relativeBulkImportDir = matcher.group(1);
|
||
|
if ( (relativeBulkImportDir == null) || relativeBulkImportDir.isEmpty() ) {
|
||
|
logger.error(errMsg);
|
||
|
return ResponseEntity.badRequest().body(errMsg);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// The "relativeBulkImportDir" should always be guaranteed to end with "/"! Otherwise, the import-procedure will fail.
|
||
|
logger.info("Received a \"bulkImportFullTexts\" request for \"" + provenance + "\" procedure and bulkImportDir: \"" + bulkImportDir + "\".");
|
||
|
|
||
|
// Check whether the given directory is accessible.
|
||
|
File givenDir = new File(bulkImportDir);
|
||
|
if ( !givenDir.isDirectory() ) {
|
||
|
String errorMsg = "The bulkImportDir \"" + bulkImportDir + "\" is invalid!";
|
||
|
logger.error(errorMsg);
|
||
|
return ResponseEntity.badRequest().body(errorMsg);
|
||
|
}
|
||
|
|
||
|
// Efficiently check if the dir is empty, without loading all the file-entries in memory.
|
||
|
try ( DirectoryStream<Path> directory = Files.newDirectoryStream(givenDir.toPath()) ) {
|
||
|
if ( !directory.iterator().hasNext() ) {
|
||
|
String errorMsg = "The givenDir \"" + givenDir + "\" is empty!";
|
||
|
logger.warn(errorMsg);
|
||
|
return ResponseEntity.badRequest().body(errorMsg);
|
||
|
}
|
||
|
// The above check does not catch the case were the directory has at least one subdirectory, but no full-texts files.
|
||
|
// The "iterator()" will have a "next" entry, but no full-text file will exist. Although, that case will be rare.
|
||
|
} catch (Exception e) {
|
||
|
String errorMsg = "Error when checking if the givenDir \"" + givenDir + "\" is empty!";
|
||
|
logger.error(errorMsg);
|
||
|
return ResponseEntity.internalServerError().body(errorMsg);
|
||
|
}
|
||
|
|
||
|
// Detect if the same directory is scheduled for being processed. In that case, return a 429.
|
||
|
if ( ! bulkImportDirs.add(bulkImportDir) ) {
|
||
|
// We allow multiple jobs for the same provenance, running at the same time, but not multiple jobs for the same bulkImportDirectory.
|
||
|
String errorMsg = "There is a bulk-import request for the directory \"" + bulkImportDir + "\" that is being handled at the moment. Please wait until it's finished being processed, before making another request.";
|
||
|
logger.error(errorMsg);
|
||
|
return ResponseEntity.status(HttpStatus.TOO_MANY_REQUESTS).body(errorMsg);
|
||
|
}
|
||
|
|
||
|
Path currentBulkImportReportLocationDir = Paths.get(this.bulkImportReportLocation, provenance);
|
||
|
try {
|
||
|
Files.createDirectories(currentBulkImportReportLocationDir); // No-op if dir exists. It does not throw a "alreadyExistsException"
|
||
|
} catch (Exception e) {
|
||
|
String errorMsg = "Could nor create the \"bulkImportReportLocation\" for provenance \"" + provenance + "\" : " + currentBulkImportReportLocationDir;
|
||
|
logger.error(errorMsg, e);
|
||
|
return ResponseEntity.internalServerError().body(errorMsg);
|
||
|
}
|
||
|
|
||
|
String bulkImportReportID = provenance + "/" + relativeBulkImportDir.substring(0, (relativeBulkImportDir.length() -1)) + "_report_" + GenericUtils.getRandomNumber(10000, 99999);
|
||
|
String bulkImportReportFullPath = this.bulkImportReportLocation + bulkImportReportID + ".json";
|
||
|
|
||
|
String msg = "The 'bulkImportFullTexts' request for '" + provenance + "' procedure and bulkImportDir: '" + givenBulkDir + "' was accepted and will be scheduled for execution. "
|
||
|
+ (shouldDeleteFilesOnFinish ? "The successfully imported files will be deleted." : "All files will remain inside the directory after processing.")
|
||
|
+ " You can request a report at any moment, using this reportFileID: " + bulkImportReportID;
|
||
|
|
||
|
BulkImportReport bulkImportReport = new BulkImportReport(provenance, bulkImportReportFullPath, bulkImportReportID);
|
||
|
bulkImportReport.addEvent(msg);
|
||
|
|
||
|
String errorMsg = fileUtils.writeToFile(bulkImportReportFullPath, bulkImportReport.getJsonReport());
|
||
|
if ( errorMsg != null )
|
||
|
return ResponseEntity.internalServerError().body(errorMsg);
|
||
|
|
||
|
logger.info(msg);
|
||
|
|
||
|
// Add this to a background job, since it will take a lot of time to be completed, and the caller will get a "read-timeout" at least and a socket-timeout at most (in case of a network failure during those hours).
|
||
|
String finalBulkImportDir = bulkImportDir;
|
||
|
String finalRelativeBulkImportDir = relativeBulkImportDir;
|
||
|
FullTextsServiceImpl.backgroundCallableTasks.add(() ->
|
||
|
fullTextsService.bulkImportFullTextsFromDirectory(bulkImportReport, finalRelativeBulkImportDir, finalBulkImportDir, givenDir, provenance, bulkImportSource, shouldDeleteFilesOnFinish)
|
||
|
);
|
||
|
|
||
|
return ResponseEntity.ok().body(msg);
|
||
|
}
|
||
|
|
||
|
|
||
|
@GetMapping("getBulkImportReport")
|
||
|
public ResponseEntity<?> getBulkImportReport(@RequestParam("id") String bulkImportReportId)
|
||
|
{
|
||
|
// Write the contents of the report-file to a string (efficiently!) and return the whole content as an HTTP-response.
|
||
|
StringBuilder stringBuilder = new StringBuilder(2_000);
|
||
|
String line;
|
||
|
try ( BufferedReader in = new BufferedReader(new InputStreamReader(Files.newInputStream(Paths.get(this.bulkImportReportLocation, bulkImportReportId + ".json"))), FileUtils.tenMb) ) {
|
||
|
while ( (line = in.readLine()) != null )
|
||
|
stringBuilder.append(line).append("\n"); // The "readLine()" does not return the line-term char.
|
||
|
} catch (NoSuchFileException nsfe) {
|
||
|
logger.warn("The requested report-file with ID: \"" + bulkImportReportId + "\" was not found!");
|
||
|
return ResponseEntity.notFound().build();
|
||
|
} catch (Exception e) {
|
||
|
String errorMsg = "Failed to read the contents of report-file with ID: " + bulkImportReportId;
|
||
|
logger.error(errorMsg, e);
|
||
|
return ResponseEntity.internalServerError().body(errorMsg); // It's ok to give the file-path to the user, since the report already contains the file-path.
|
||
|
}
|
||
|
|
||
|
return ResponseEntity.ok().body(stringBuilder.toString());
|
||
|
}
|
||
|
|
||
|
}
|