2023-05-11 02:07:55 +02:00
package eu.openaire.urls_controller.controllers ;
import eu.openaire.urls_controller.components.BulkImport ;
import eu.openaire.urls_controller.models.BulkImportReport ;
2023-05-29 11:12:08 +02:00
import eu.openaire.urls_controller.services.BulkImportService ;
2023-05-11 02:07:55 +02:00
import eu.openaire.urls_controller.util.FileUtils ;
import eu.openaire.urls_controller.util.GenericUtils ;
import org.slf4j.Logger ;
import org.slf4j.LoggerFactory ;
import org.springframework.beans.factory.annotation.Autowired ;
import org.springframework.http.HttpStatus ;
import org.springframework.http.ResponseEntity ;
import org.springframework.web.bind.annotation.GetMapping ;
import org.springframework.web.bind.annotation.RequestMapping ;
import org.springframework.web.bind.annotation.RequestParam ;
import org.springframework.web.bind.annotation.RestController ;
import java.io.BufferedReader ;
import java.io.File ;
import java.io.InputStreamReader ;
import java.nio.file.* ;
import java.util.Collections ;
import java.util.HashMap ;
import java.util.Set ;
import java.util.concurrent.ConcurrentHashMap ;
2023-07-21 10:45:50 +02:00
import java.util.concurrent.ExecutorService ;
import java.util.concurrent.Executors ;
2023-05-11 02:07:55 +02:00
import java.util.regex.Matcher ;
import java.util.regex.Pattern ;
@RestController
@RequestMapping ( " " )
2023-05-29 11:12:08 +02:00
public class BulkImportController {
2023-05-11 02:07:55 +02:00
2023-05-29 11:12:08 +02:00
private static final Logger logger = LoggerFactory . getLogger ( BulkImportController . class ) ;
2023-05-11 02:07:55 +02:00
@Autowired
private FileUtils fileUtils ;
2023-05-29 11:12:08 +02:00
private final BulkImportService bulkImportService ;
2023-05-11 02:07:55 +02:00
private final String baseBulkImportLocation ;
private final String bulkImportReportLocation ;
private final HashMap < String , BulkImport . BulkImportSource > bulkImportSources ;
public static final Set < String > bulkImportDirs = Collections . newSetFromMap ( new ConcurrentHashMap < String , Boolean > ( ) ) ;
2023-07-21 10:45:50 +02:00
public static int numOfThreadsPerBulkImportProcedure ;
public static ExecutorService bulkImportExecutor ;
2023-05-11 02:07:55 +02:00
2023-05-29 11:12:08 +02:00
public BulkImportController ( BulkImportService bulkImportService , BulkImport bulkImport )
2023-05-11 02:07:55 +02:00
{
String bulkImportReportLocation1 ;
this . baseBulkImportLocation = bulkImport . getBaseBulkImportLocation ( ) ;
this . bulkImportSources = new HashMap < > ( bulkImport . getBulkImportSources ( ) ) ;
bulkImportReportLocation1 = bulkImport . getBulkImportReportLocation ( ) ;
if ( ! bulkImportReportLocation1 . endsWith ( " / " ) )
bulkImportReportLocation1 + = " / " ;
this . bulkImportReportLocation = bulkImportReportLocation1 ;
2023-05-29 11:12:08 +02:00
this . bulkImportService = bulkImportService ;
2023-07-21 10:45:50 +02:00
numOfThreadsPerBulkImportProcedure = bulkImport . getNumOfThreadsPerBulkImportProcedure ( ) ;
logger . info ( " Will use " + numOfThreadsPerBulkImportProcedure + " threads per bulk-import procedure. " ) ;
bulkImportExecutor = Executors . newFixedThreadPool ( numOfThreadsPerBulkImportProcedure ) ; // At most < numOfThreadsPerBulkImportProcedure > threads will be used per bulk-import procedure..
2023-05-11 02:07:55 +02:00
}
private static final Pattern LAST_DIR_REGEX = Pattern . compile ( " ^.*/([^/]+[/]?)$ " ) ;
@GetMapping ( " bulkImportFullTexts " )
public ResponseEntity < ? > bulkImportFullTexts ( @RequestParam String provenance , @RequestParam String bulkImportDir , @RequestParam boolean shouldDeleteFilesOnFinish ) {
BulkImport . BulkImportSource bulkImportSource = bulkImportSources . get ( provenance ) ;
if ( bulkImportSource = = null ) {
String errorMsg = " The provided provenance \" " + provenance + " \" is not in the list of the bulk-imported sources, so no configuration-rules are available! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . badRequest ( ) . body ( errorMsg ) ; // It's the user's fault that gave an unsupported provenance.
}
// Check if the given directory parameter exists.
if ( bulkImportDir . isEmpty ( ) ) {
String errorMsg = " The \" bulkImportDir \" was missing from the request! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . badRequest ( ) . body ( errorMsg ) ;
}
String givenBulkDir = bulkImportDir ; // Keep the given value here, to not expose the full-path, in case the user has not provided an absolut path.
// Make sure the whole path ends with "/", so that we can easily append file-names later.
if ( ! bulkImportDir . endsWith ( " / " ) )
bulkImportDir + = " / " ;
String relativeBulkImportDir = null ;
// Check if we have "relative-path" so that we can append it to the "baseBulkImportLocation".
if ( ! bulkImportDir . startsWith ( " / " ) ) {
// A relative path was given.
relativeBulkImportDir = bulkImportDir ;
bulkImportDir = baseBulkImportLocation + bulkImportDir ;
} else {
String errMsg = " The bulkImportDir \" " + bulkImportDir + " \" was problematic! " ;
Matcher matcher = LAST_DIR_REGEX . matcher ( bulkImportDir ) ;
if ( ! matcher . matches ( ) ) {
logger . error ( errMsg ) ;
return ResponseEntity . badRequest ( ) . body ( errMsg ) ;
}
relativeBulkImportDir = matcher . group ( 1 ) ;
if ( ( relativeBulkImportDir = = null ) | | relativeBulkImportDir . isEmpty ( ) ) {
logger . error ( errMsg ) ;
return ResponseEntity . badRequest ( ) . body ( errMsg ) ;
}
}
// The "relativeBulkImportDir" should always be guaranteed to end with "/"! Otherwise, the import-procedure will fail.
logger . info ( " Received a \" bulkImportFullTexts \" request for \" " + provenance + " \" procedure and bulkImportDir: \" " + bulkImportDir + " \" . " ) ;
// Check whether the given directory is accessible.
File givenDir = new File ( bulkImportDir ) ;
if ( ! givenDir . isDirectory ( ) ) {
String errorMsg = " The bulkImportDir \" " + bulkImportDir + " \" is invalid! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . badRequest ( ) . body ( errorMsg ) ;
}
// Efficiently check if the dir is empty, without loading all the file-entries in memory.
try ( DirectoryStream < Path > directory = Files . newDirectoryStream ( givenDir . toPath ( ) ) ) {
if ( ! directory . iterator ( ) . hasNext ( ) ) {
String errorMsg = " The givenDir \" " + givenDir + " \" is empty! " ;
logger . warn ( errorMsg ) ;
return ResponseEntity . badRequest ( ) . body ( errorMsg ) ;
}
// The above check does not catch the case were the directory has at least one subdirectory, but no full-texts files.
// The "iterator()" will have a "next" entry, but no full-text file will exist. Although, that case will be rare.
} catch ( Exception e ) {
String errorMsg = " Error when checking if the givenDir \" " + givenDir + " \" is empty! " ;
logger . error ( errorMsg ) ;
return ResponseEntity . internalServerError ( ) . body ( errorMsg ) ;
}
// Detect if the same directory is scheduled for being processed. In that case, return a 429.
if ( ! bulkImportDirs . add ( bulkImportDir ) ) {
// We allow multiple jobs for the same provenance, running at the same time, but not multiple jobs for the same bulkImportDirectory.
String errorMsg = " There is a bulk-import request for the directory \" " + bulkImportDir + " \" that is being handled at the moment. Please wait until it's finished being processed, before making another request. " ;
logger . error ( errorMsg ) ;
return ResponseEntity . status ( HttpStatus . TOO_MANY_REQUESTS ) . body ( errorMsg ) ;
}
Path currentBulkImportReportLocationDir = Paths . get ( this . bulkImportReportLocation , provenance ) ;
try {
Files . createDirectories ( currentBulkImportReportLocationDir ) ; // No-op if dir exists. It does not throw a "alreadyExistsException"
} catch ( Exception e ) {
String errorMsg = " Could nor create the \" bulkImportReportLocation \" for provenance \" " + provenance + " \" : " + currentBulkImportReportLocationDir ;
logger . error ( errorMsg , e ) ;
return ResponseEntity . internalServerError ( ) . body ( errorMsg ) ;
}
String bulkImportReportID = provenance + " / " + relativeBulkImportDir . substring ( 0 , ( relativeBulkImportDir . length ( ) - 1 ) ) + " _report_ " + GenericUtils . getRandomNumber ( 10000 , 99999 ) ;
String bulkImportReportFullPath = this . bulkImportReportLocation + bulkImportReportID + " .json " ;
String msg = " The 'bulkImportFullTexts' request for ' " + provenance + " ' procedure and bulkImportDir: ' " + givenBulkDir + " ' was accepted and will be scheduled for execution. "
+ ( shouldDeleteFilesOnFinish ? " The successfully imported files will be deleted. " : " All files will remain inside the directory after processing. " )
+ " You can request a report at any moment, using this reportFileID: " + bulkImportReportID ;
BulkImportReport bulkImportReport = new BulkImportReport ( provenance , bulkImportReportFullPath , bulkImportReportID ) ;
bulkImportReport . addEvent ( msg ) ;
2023-05-24 12:52:28 +02:00
String errorMsg = fileUtils . writeToFile ( bulkImportReportFullPath , bulkImportReport . getJsonReport ( ) , true ) ;
2023-05-11 02:07:55 +02:00
if ( errorMsg ! = null )
return ResponseEntity . internalServerError ( ) . body ( errorMsg ) ;
logger . info ( msg ) ;
// Add this to a background job, since it will take a lot of time to be completed, and the caller will get a "read-timeout" at least and a socket-timeout at most (in case of a network failure during those hours).
String finalBulkImportDir = bulkImportDir ;
String finalRelativeBulkImportDir = relativeBulkImportDir ;
2023-05-29 11:21:48 +02:00
UrlsController . backgroundCallableTasks . add ( ( ) - >
2023-05-29 11:12:08 +02:00
bulkImportService . bulkImportFullTextsFromDirectory ( bulkImportReport , finalRelativeBulkImportDir , finalBulkImportDir , givenDir , provenance , bulkImportSource , shouldDeleteFilesOnFinish )
2023-05-11 02:07:55 +02:00
) ;
return ResponseEntity . ok ( ) . body ( msg ) ;
}
@GetMapping ( " getBulkImportReport " )
public ResponseEntity < ? > getBulkImportReport ( @RequestParam ( " id " ) String bulkImportReportId )
{
// Write the contents of the report-file to a string (efficiently!) and return the whole content as an HTTP-response.
StringBuilder stringBuilder = new StringBuilder ( 2_000 ) ;
String line ;
try ( BufferedReader in = new BufferedReader ( new InputStreamReader ( Files . newInputStream ( Paths . get ( this . bulkImportReportLocation , bulkImportReportId + " .json " ) ) ) , FileUtils . tenMb ) ) {
while ( ( line = in . readLine ( ) ) ! = null )
stringBuilder . append ( line ) . append ( " \ n " ) ; // The "readLine()" does not return the line-term char.
} catch ( NoSuchFileException nsfe ) {
logger . warn ( " The requested report-file with ID: \" " + bulkImportReportId + " \" was not found! " ) ;
return ResponseEntity . notFound ( ) . build ( ) ;
} catch ( Exception e ) {
String errorMsg = " Failed to read the contents of report-file with ID: " + bulkImportReportId ;
logger . error ( errorMsg , e ) ;
return ResponseEntity . internalServerError ( ) . body ( errorMsg ) ; // It's ok to give the file-path to the user, since the report already contains the file-path.
}
return ResponseEntity . ok ( ) . body ( stringBuilder . toString ( ) ) ;
}
}